Html2Text

简介:

private static string StripHTML(string source)
        {
            try
            {
                string result;

                // Remove HTML Development formatting
                result = source.Replace("\r", " ");        // Replace line breaks with space because browsers inserts space
                result = result.Replace("\n", " ");        // Replace line breaks with space because browsers inserts space
                result = result.Replace("\t", string.Empty);    // Remove step-formatting
                result = System.Text.RegularExpressions.Regex.Replace(result, @"( )+", " ");    // Remove repeating speces becuase browsers ignore them

                // Remove the header (prepare first by clearing attributes)
                result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*head([^>])*>", "<head>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                result = System.Text.RegularExpressions.Regex.Replace(result, @"(<( )*(/)( )*head( )*>)", "</head>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                result = System.Text.RegularExpressions.Regex.Replace(result, "(<head>).*(</head>)", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);

                // remove all scripts (prepare first by clearing attributes)
                result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*script([^>])*>", "<script>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                result = System.Text.RegularExpressions.Regex.Replace(result, @"(<( )*(/)( )*script( )*>)", "</script>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                //result = System.Text.RegularExpressions.Regex.Replace(result, @"(<script>)([^(<script>\.</script>)])*(</script>)",string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                result = System.Text.RegularExpressions.Regex.Replace(result, @"(<script>).*(</script>)", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);

                // remove all styles (prepare first by clearing attributes)
                result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*style([^>])*>", "<style>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                result = System.Text.RegularExpressions.Regex.Replace(result, @"(<( )*(/)( )*style( )*>)", "</style>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                result = System.Text.RegularExpressions.Regex.Replace(result, "(<style>).*(</style>)", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);

                // insert tabs in spaces of <td> tags
                result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*td([^>])*>", "\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase);

                // insert line breaks in places of <BR> and <LI> tags
                result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*br( )*>", "\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*li( )*>", "\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);

                // insert line paragraphs (double line breaks) in place if <P>, <DIV> and <TR> tags
                result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*div([^>])*>", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*tr([^>])*>", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*p([^>])*>", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);

                // Remove remaining tags like <a>, links, images, comments etc - anything thats enclosed inside < >
                result = System.Text.RegularExpressions.Regex.Replace(result, @"<[^>]*>", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);

                // replace special characters:
                result = System.Text.RegularExpressions.Regex.Replace(result, @"&nbsp;", " ", System.Text.RegularExpressions.RegexOptions.IgnoreCase);

                result = System.Text.RegularExpressions.Regex.Replace(result, @"&bull;", " * ", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                result = System.Text.RegularExpressions.Regex.Replace(result, @"&lsaquo;", "<", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                result = System.Text.RegularExpressions.Regex.Replace(result, @"&rsaquo;", ">", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                result = System.Text.RegularExpressions.Regex.Replace(result, @"&trade;", "(tm)", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                result = System.Text.RegularExpressions.Regex.Replace(result, @"&frasl;", "/", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                result = System.Text.RegularExpressions.Regex.Replace(result, @"<", "<", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                result = System.Text.RegularExpressions.Regex.Replace(result, @">", ">", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                result = System.Text.RegularExpressions.Regex.Replace(result, @"&copy;", "(c)", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                result = System.Text.RegularExpressions.Regex.Replace(result, @"&reg;", "(r)", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                result = System.Text.RegularExpressions.Regex.Replace(result, @"&(.{2,6});", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);

                // for testng
                //System.Text.RegularExpressions.Regex.Replace(result, this.txtRegex.Text,string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase);

                // make line breaking consistent
                result = result.Replace("\n", "\r");

                // Remove extra line breaks and tabs: replace over 2 breaks with 2 and over 4 tabs with 4. 
                // Prepare first to remove any whitespaces inbetween the escaped characters and remove redundant tabs inbetween linebreaks
                result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)( )+(\r)", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                result = System.Text.RegularExpressions.Regex.Replace(result, "(\t)( )+(\t)", "\t\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                result = System.Text.RegularExpressions.Regex.Replace(result, "(\t)( )+(\r)", "\t\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)( )+(\t)", "\r\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)(\t)+(\r)", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);    // Remove redundant tabs
                result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)(\t)+", "\r\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase);        // Remove multible tabs followind a linebreak with just one tab
                //string breaks = "\r\r\r";        // Initial replacement target string for linebreaks
                //string tabs = "\t\t\t\t\t";        // Initial replacement target string for tabs
                //for (int index = 0; index < result.Length; index++)
                //{
                //    result = result.Replace(breaks, "\r\r");
                //    result = result.Replace(tabs, "\t\t\t\t");
                //    breaks = breaks + "\r";
                //    tabs = tabs + "\t";
                //}
                result = System.Text.RegularExpressions.Regex.Replace(result, "\r{2,}", "\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                result = System.Text.RegularExpressions.Regex.Replace(result, "\t{2,}", "\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                return result;
            }
            catch
            {
            }
            return source;
        }




本文转自94cool博客园博客,原文链接:http://www.cnblogs.com/94cool/archive/2011/10/20/2218578.html,如需转载请自行联系原作者

相关文章
|
8月前
|
安全 Go
Golang深入浅出之-Go语言模板(text/template):动态生成HTML
【4月更文挑战第24天】Go语言标准库中的`text/template`包用于动态生成HTML和文本,但不熟悉其用法可能导致错误。本文探讨了三个常见问题:1) 忽视模板执行错误,应确保正确处理错误;2) 忽视模板安全,应使用`html/template`包防止XSS攻击;3) 模板结构不合理,应合理组织模板以提高可维护性。理解并运用这些最佳实践,能提升Go语言模板编程的效率和安全性,助力构建稳健的Web应用。
104 0
|
3月前
|
机器学习/深度学习 JSON JavaScript
LangChain-21 Text Splitters 内容切分器 支持多种格式 HTML JSON md Code(JS/Py/TS/etc) 进行切分并输出 方便将数据进行结构化后检索
LangChain-21 Text Splitters 内容切分器 支持多种格式 HTML JSON md Code(JS/Py/TS/etc) 进行切分并输出 方便将数据进行结构化后检索
46 0
|
JavaScript 前端开发
前端基础 -JQuery之val,text,html
前端基础 -JQuery之val,text,html
86 1
|
8月前
|
Java 数据库
【java】RTF转HTML或者TEXT
【java】RTF转HTML或者TEXT
40 1
|
8月前
|
安全 Go 开发者
Golang深入浅出之-Go语言模板(text/template):动态生成HTML
【4月更文挑战第25天】Go语言的`text/template`和`html/template`库提供动态HTML生成。本文介绍了模板基础,如基本语法和数据绑定,以及常见问题和易错点,如忘记转义、未初始化变量、复杂逻辑处理和错误处理。建议使用`html/template`防止XSS攻击,初始化数据结构,分离业务逻辑,并严谨处理错误。示例展示了条件判断和循环结构。通过遵循最佳实践,开发者能更安全、高效地生成HTML。
300 0
|
8月前
|
JSON Java 数据格式
Could not extract response: no suitable HttpMessageConverter found for ..content type [text/html...]
Could not extract response: no suitable HttpMessageConverter found for ..content type [text/html...]
793 0
|
PHP
php:html富文本提取text普通文本内容
php:html富文本提取text普通文本内容
153 0
|
JavaScript
jQuery中的.html() .text() .val() .attr()获取元素内容、值、属性
本文目录 1. 元素 2. 内容、值、属性 3. html() text() val() attr()用法
209 0
jQuery中的.html() .text() .val() .attr()获取元素内容、值、属性
|
Spring Java 安全
Refused to execute script from 'http://127.0.0.1:8004/login' because its MIME type ('text/html') ...
Refused to execute script from 'http://127.0.0.1:8004/login' because its MIME type ('text/html') is not executable, and strict MIME type checking is enabled. Refused to execute script …”,为什么会被拒绝执行呢?想到可能是权限的控制问题,亦即是 Spring Security 的静态资源访问配置问题。
3090 0