依然是为了处理抓取的 HTML 文件内容,让抓取的数据不影响页面排版。
这只是一个不完全的处理。
去除 script , noscript , style 和 title 块(连同其中的内容)。
将 Hn , div , p , li , tr 这些标签换成 br.
留下不影响排版的HTML标签。
用了扩展方法,所以需要 .NET Framework 3.5 或更高版本才能运行。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
namespace RegularHTML {
    /// <summary>
    /// Reduces scraped HTML to a handful of layout-neutral tags so that the
    /// fragment can be embedded in another page without disturbing its layout.
    /// </summary>
    /// <remarks>
    /// This is regex-based, best-effort cleanup. It is neither an HTML parser nor
    /// an XSS sanitizer: attributes on the tags that are kept (br, strong, b) are
    /// passed through unchanged.
    /// </remarks>
    public static class RegularHTML {
        // Elements that are removed together with everything between their
        // opening and closing tags.
        private static readonly string[] removeWholeTags = { "script", "noscript", "style", "title" };

        // Tags that survive the generic tag-stripping pass (entries are regex fragments).
        // script/style/title are deliberately absent: any such tag still present after
        // the paired-block removal is unbalanced and must be stripped like any other tag.
        private static readonly string[] holdTags = { "br", "strong", "b", "li", "tr", "td", "div", "p", @"h\d+" };

        // Block-level tags: the opening tag is dropped and the closing tag becomes <br />.
        private static readonly string[] toBrTags = { "tr", "div", "p", "li", @"h\d+" };

        // Cell-level tags: the opening tag is dropped and the closing tag becomes a space.
        private static readonly string[] toBlankTags = { "td" };

        private static readonly Regex removeReg, removeWholeReg;
        private static readonly Regex toBrRegLeft, toBrRegRight;
        private static readonly Regex toBlankRegLeft, toBlankRegRight;

        private static readonly Regex htmlCommentReg = new Regex(@"<!--[\s\S]*?-->");
        private static readonly Regex removeBlankReg = new Regex(@"\s{2,}");
        // No pattern here uses ^ or $, so RegexOptions.Multiline is not needed.
        private static readonly Regex removeBrReg = new Regex(@"(<br[^>]*>\s*){3,}", RegexOptions.Compiled | RegexOptions.IgnoreCase);

        static RegularHTML() {
            const RegexOptions options = RegexOptions.Compiled | RegexOptions.IgnoreCase;

            string hold = string.Join("|", holdTags);
            string whole = string.Join("|", removeWholeTags);
            string toBr = string.Join("|", toBrTags);
            string toBlank = string.Join("|", toBlankTags);

            // Any tag whose name is NOT one of the held tags.
            removeReg = new Regex(@"<(?!/?\s?(?:" + hold + @")\b)[^>]+>", options);

            // <(script|style|...)\b ...> ... </\1 ...>
            // The \b keeps look-alike names such as <styled> from being treated as <style>.
            removeWholeReg = new Regex(@"<(" + whole + @")\b[^>]*>[\s\S]*?</\1\b[^>]*>", options);

            toBrRegLeft = new Regex(@"<(?:" + toBr + @")\b[^>]*>", options);
            toBrRegRight = new Regex(@"</(?:" + toBr + @")\b[^>]*>", options);

            // The alternation is grouped so the patterns stay correct if more
            // tags are ever added to toBlankTags.
            toBlankRegLeft = new Regex(@"<(?:" + toBlank + @")\b[^>]*>", options);
            toBlankRegRight = new Regex(@"</(?:" + toBlank + @")\b[^>]*>", options);
        }

        /// <summary>
        /// Strips comments, script/noscript/style/title blocks and all tags except
        /// br, strong and b; turns the end of tr/div/p/li/hN into &lt;br /&gt; and
        /// the end of td into a space; then collapses whitespace and br runs.
        /// </summary>
        /// <param name="ctx">The HTML text to clean. May be null or empty.</param>
        /// <returns>
        /// The cleaned HTML fragment, or an empty string when <paramref name="ctx"/>
        /// is null or empty.
        /// </returns>
        public static string GetRegularHTML(this string ctx) {
            if (string.IsNullOrEmpty(ctx)) {
                return string.Empty;
            }

            // The order matters: paired blocks must go before the generic tag
            // stripping, and tag rewriting must follow the stripping.
            ctx = htmlCommentReg.Replace(ctx, "");
            ctx = removeWholeReg.Replace(ctx, "");
            ctx = removeReg.Replace(ctx, "");

            // One line break per block element: drop the opening tag, convert the closing one.
            ctx = toBrRegLeft.Replace(ctx, "");
            ctx = toBrRegRight.Replace(ctx, "<br />");

            // Opening <td> must be dropped too; a stray <td> in the output could
            // disturb the table layout of the page the fragment is embedded in.
            ctx = toBlankRegLeft.Replace(ctx, "");
            ctx = toBlankRegRight.Replace(ctx, " ");

            // Any run of two or more whitespace characters becomes a single CRLF.
            ctx = removeBlankReg.Replace(ctx, "\r\n");
            // Three or more consecutive <br> tags are reduced to two.
            ctx = removeBrReg.Replace(ctx, "<br /><br />");
            return ctx;
        }
    }
}
| < Prev | Next > |
|---|
Last Updated ( Wednesday, 14 April 2010 17:53 )



