About these ads

Strip HTML Tags from Text

Have you ever wondered how to display text on a web form with its HTML tags stripped, while still accepting HTML tags when the form is saved? Sounds confusing?

To make it clear, I will give a good example.  Let's say you have a form with a Rich Text Box (FTB or FCK) into which you allow users to cut and paste content containing HTML tags, so that you can display it properly, like a Blog Article. There are some instances, however, where you want the HTML tags stripped, such as when displaying a summary on a Grid.  My solution is to strip the HTML codes before displaying the text when needed.

So when you copy this

Hello, World!

it won't show on your grid as this

<html>
 <head>
 <title>
 Hello World
 </title>
 </head>
 <body>
 <font size ="4" color="blue">
 Hello, World!
 </font>
 </body>
</html>

but as this

"Hello, World!"

Now, with a mix of Replace and Regular Expressions, I created a class to handle that, and here it is:

/// <summary>
/// Converts an HTML fragment into plain text by applying the ordered
/// regex replacement rules returned by <c>GetTableDefinition</c>.
/// </summary>
/// <param name="sInputString">The HTML text to clean. May be null or empty.</param>
/// <returns>
/// The text after all replacement rules have been applied, with runs of
/// three or more carriage returns reduced to two and runs of five or more
/// tabs reduced to four. A null or empty input is returned unchanged. If
/// any step throws, the original input is returned as-is.
/// </returns>
public string StripHTML(string sInputString)
{
    // Nothing to strip; returning early also avoids relying on the
    // catch block below to absorb a NullReferenceException.
    if (string.IsNullOrEmpty(sInputString))
    {
        return sInputString;
    }

    try
    {
        // Initial cleaning step: line breaks become spaces and tabs are
        // dropped, so that any '\r' or '\t' present after the rules run
        // was produced by a rule (e.g. for <br> or <td>).
        string sOutputString = sInputString
            .Replace("\r", " ")
            .Replace("\n", " ")
            .Replace("\t", string.Empty);

        // Tag removal. Select(...) returns the rows ordered by iID, so the
        // rules run in their intended sequence regardless of the order in
        // which they were inserted into the table.
        DataTable myDataTable = GetTableDefinition();
        foreach (DataRow drCleaningItem in myDataTable.Select(string.Empty, "iID ASC"))
        {
            string sOriginalString = drCleaningItem["sOriginalString"].ToString();
            string sReplacementString = drCleaningItem["sReplacementString"].ToString();
            sOutputString = Regex.Replace(sOutputString, sOriginalString, sReplacementString, RegexOptions.IgnoreCase);
        }

        // Cap consecutive line breaks at two and consecutive tabs at four.
        // One pass per character class is enough; no loop over the string
        // length with growing search strings is needed.
        sOutputString = Regex.Replace(sOutputString, "\r{3,}", "\r\r");
        sOutputString = Regex.Replace(sOutputString, "\t{5,}", "\t\t\t\t");

        return sOutputString;
    }
    catch
    {
        // Deliberate best-effort: callers never see an exception.
        // NOTE(review): this fallback returns the input WITHOUT stripping,
        // so callers that render the result as HTML must still encode it.
        return sInputString;
    }
}

 /// <summary>
 /// Builds the ordered list of regex replacement rules used to turn HTML
 /// into plain text.
 /// </summary>
 /// <returns>
 /// A table with three columns: <c>iID</c> (int, the rule's position in
 /// the sequence), <c>sOriginalString</c> (the regex pattern) and
 /// <c>sReplacementString</c> (the replacement text). Rows are added in
 /// ascending <c>iID</c> order. The tag and entity patterns are written
 /// in lower case, so they only match upper- or mixed-case markup when
 /// applied with <c>RegexOptions.IgnoreCase</c>.
 /// </returns>
 private DataTable GetTableDefinition()
 {
     DataTable dtCleaningCollection = new DataTable();
     dtCleaningCollection.Columns.Add("iID", typeof(int));
     dtCleaningCollection.Columns.Add("sOriginalString", typeof(string));
     dtCleaningCollection.Columns.Add("sReplacementString", typeof(string));

     // Replace repeating spaces with single space
     dtCleaningCollection.Rows.Add(1, @"( )+", " ");

     // Prepare and clean Header Tag.
     // \b keeps the rule from matching longer tag names such as <header>;
     // the lazy .*? stops at the first closing tag instead of the last.
     dtCleaningCollection.Rows.Add(2, @"<( )*head\b([^>])*>", "<head>");
     dtCleaningCollection.Rows.Add(3, @"(<( )*(/)( )*head( )*>)", "</head>");
     dtCleaningCollection.Rows.Add(4, "(<head>).*?(</head>)", string.Empty);

     // Prepare and clean Script Tag.
     // Lazy matching keeps visible text that sits between two script blocks.
     dtCleaningCollection.Rows.Add(5, @"<( )*script\b([^>])*>", "<script>");
     dtCleaningCollection.Rows.Add(6, @"(<( )*(/)( )*script( )*>)", "</script>");
     dtCleaningCollection.Rows.Add(7, @"(<script>).*?(</script>)", string.Empty);

     // Prepare and clean Style Tag
     dtCleaningCollection.Rows.Add(8, @"<( )*style\b([^>])*>", "<style>");
     dtCleaningCollection.Rows.Add(9, @"(<( )*(/)( )*style( )*>)", "</style>");
     dtCleaningCollection.Rows.Add(10, "(<style>).*?(</style>)", string.Empty);

     // Replace <td> with tabs
     dtCleaningCollection.Rows.Add(11, @"<( )*td\b([^>])*>", "\t");

     // Replace <BR> and <LI> with line breaks.
     // ([^>])* also accepts attributes and the self-closing forms
     // <br/> and <br />; \b keeps <li from matching <link>.
     dtCleaningCollection.Rows.Add(12, @"<( )*br\b([^>])*>", "\r");
     dtCleaningCollection.Rows.Add(13, @"<( )*li\b([^>])*>", "\r");

     // Replace <P>, <DIV> and <TR> with double line breaks.
     // \b keeps these from matching <pre>, <param>, <track>, etc.
     dtCleaningCollection.Rows.Add(14, @"<( )*div\b([^>])*>", "\r\r");
     dtCleaningCollection.Rows.Add(15, @"<( )*tr\b([^>])*>", "\r\r");
     dtCleaningCollection.Rows.Add(16, @"<( )*p\b([^>])*>", "\r\r");

     // Remove remaining tags enclosed in < >
     dtCleaningCollection.Rows.Add(17, @"<[^>]*>", string.Empty);

     // Replace special characters:
     dtCleaningCollection.Rows.Add(18, @"&nbsp;", " ");
     dtCleaningCollection.Rows.Add(19, @"&bull;", " * ");
     dtCleaningCollection.Rows.Add(20, @"&lsaquo;", "<");
     dtCleaningCollection.Rows.Add(21, @"&rsaquo;", ">");
     dtCleaningCollection.Rows.Add(22, @"&trade;", "(tm)");
     dtCleaningCollection.Rows.Add(23, @"&frasl;", "/");
     dtCleaningCollection.Rows.Add(24, @"&lt;", "<");
     dtCleaningCollection.Rows.Add(25, @"&gt;", ">");
     dtCleaningCollection.Rows.Add(26, @"&copy;", "(c)");
     dtCleaningCollection.Rows.Add(27, @"&reg;", "(r)");
     dtCleaningCollection.Rows.Add(28, @"&frac14;", "1/4");
     dtCleaningCollection.Rows.Add(29, @"&frac12;", "1/2");
     dtCleaningCollection.Rows.Add(30, @"&frac34;", "3/4");
     dtCleaningCollection.Rows.Add(31, @"&lsquo;", "'");
     dtCleaningCollection.Rows.Add(32, @"&rsquo;", "'");
     dtCleaningCollection.Rows.Add(33, @"&ldquo;", "\"");
     dtCleaningCollection.Rows.Add(34, @"&rdquo;", "\"");

     // Remove all other remaining entities you don't want to replace with
     // another string. The pattern is limited to real entity syntax
     // (named, decimal or hex) so ordinary text such as "R&D is; ..." is
     // left alone. &amp; is skipped here and decoded by the next rule.
     dtCleaningCollection.Rows.Add(35, @"&(?!amp;)(#[0-9]+|#x[0-9a-f]+|[a-z][a-z0-9]{1,31});", string.Empty);

     // Decode &amp; last so that text like "&amp;lt;" ends up as the
     // literal "&lt;" instead of being decoded a second time.
     dtCleaningCollection.Rows.Add(36, @"&amp;", "&");

     // Remove extra line breaks and tabs
     dtCleaningCollection.Rows.Add(37, "(\r)( )+(\r)", "\r\r");
     dtCleaningCollection.Rows.Add(38, "(\t)( )+(\t)", "\t\t");
     dtCleaningCollection.Rows.Add(39, "(\t)( )+(\r)", "\t\r");
     dtCleaningCollection.Rows.Add(40, "(\r)( )+(\t)", "\r\t");
     dtCleaningCollection.Rows.Add(41, "(\r)(\t)+(\r)", "\r\r");
     dtCleaningCollection.Rows.Add(42, "(\r)(\t)+", "\r\t");

     return dtCleaningCollection;
 }
About these ads

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

Follow

Get every new post delivered to your Inbox.

Join 197 other followers

%d bloggers like this: