C# 将 HTML 转成纯文本详解编程语言
2023-06-13 09:20:30 时间
// Static lookup tables, filled once by the static constructor.
// _tags maps a lower-case tag name (end tags carry a leading "/") to the
// text written to the output when that tag is encountered.
protected static Dictionary<string, string> _tags;
// _ignoreTags holds the tags whose inner content is skipped entirely.
protected static HashSet<string> _ignoreTags;
// Instance variables
protected TextBuilder _text;    // Output buffer for the current conversion
protected string _html;         // HTML being converted
protected int _pos;             // Index of the current character in _html
// Static constructor (one time only): fills the shared lookup tables.
static HtmlToText()
{
    // Tags that are replaced by a line break (or a tab after a table
    // cell). Keys are lower-case tag names; end tags carry a leading "/".
    _tags = new Dictionary<string, string>
    {
        { "address", "\n" },
        { "blockquote", "\n" },
        { "div", "\n" },
        { "dl", "\n" },
        { "fieldset", "\n" },
        { "form", "\n" },
        { "h1", "\n" },
        { "/h1", "\n" },
        { "h2", "\n" },
        { "/h2", "\n" },
        { "h3", "\n" },
        { "/h3", "\n" },
        { "h4", "\n" },
        { "/h4", "\n" },
        { "h5", "\n" },
        { "/h5", "\n" },
        { "h6", "\n" },
        { "/h6", "\n" },
        { "p", "\n" },
        { "/p", "\n" },
        { "table", "\n" },
        { "/table", "\n" },
        { "ul", "\n" },
        { "/ul", "\n" },
        { "ol", "\n" },
        { "/ol", "\n" },
        { "/li", "\n" },
        { "br", "\n" },
        { "/td", "\t" },
        { "/tr", "\n" },
        { "/pre", "\n" }
    };

    // Tags whose inner content never reaches the output.
    _ignoreTags = new HashSet<string>
    {
        "script",
        "noscript",
        "style",
        "object"
    };
}
/// <summary>
/// Converts the given HTML to plain text and returns the result.
/// </summary>
/// <param name="html">HTML to be converted. A null value is treated
/// as an empty document.</param>
/// <returns>Resulting plain text, with HTML entities decoded.</returns>
public string Convert(string html)
{
    // Initialize state variables. Null input is normalized to an empty
    // string so the scanning helpers never dereference a null _html.
    _text = new TextBuilder();
    _html = html ?? String.Empty;
    _pos = 0;

    // Process input
    while (!EndOfText)
    {
        if (Peek() == '<')
        {
            // HTML tag
            bool selfClosing;
            string tag = ParseTag(out selfClosing);

            // Handle special tag cases
            if (tag == "body")
            {
                // Discard content before <body>
                _text.Clear();
            }
            else if (tag == "/body")
            {
                // Discard content after </body>
                _pos = _html.Length;
            }
            else if (tag == "pre")
            {
                // Enter preformatted mode
                _text.Preformatted = true;
                EatWhitespaceToNextLine();
            }
            else if (tag == "/pre")
            {
                // Exit preformatted mode
                _text.Preformatted = false;
            }

            // Emit the replacement text (line break or tab) for this tag
            string value;
            if (_tags.TryGetValue(tag, out value))
            {
                _text.Write(value);
            }

            // Skip everything inside tags such as <script> and <style>
            if (_ignoreTags.Contains(tag))
            {
                EatInnerContent(tag);
            }
        }
        else if (Char.IsWhiteSpace(Peek()))
        {
            // Whitespace (treat all as space unless preformatted)
            _text.Write(_text.Preformatted ? Peek() : ' ');
            MoveAhead();
        }
        else
        {
            // Other text
            _text.Write(Peek());
            MoveAhead();
        }
    }

    // Return result. Entities are decoded only here, after all
    // whitespace handling has been applied to the raw text.
    return HttpUtility.HtmlDecode(_text.ToString());
}
// Eats all characters that are part of the current tag
// and returns information about that tag.
//
// Returns the lower-case tag name; an end tag keeps its leading "/".
// Returns String.Empty (and consumes nothing) when the current
// character is not '<'. selfClosing is set to true when a '/' appears
// outside a quoted value after the tag name.
protected string ParseTag(out bool selfClosing)
{
    string tag = String.Empty;
    selfClosing = false;

    if (Peek() == '<')
    {
        MoveAhead();

        // Parse tag name
        EatWhitespace();
        int start = _pos;
        if (Peek() == '/')
        {
            MoveAhead();
        }
        while (!EndOfText && !Char.IsWhiteSpace(Peek()) &&
            Peek() != '/' && Peek() != '>')
        {
            MoveAhead();
        }
        // Invariant lower-casing: the name is compared against fixed
        // ASCII keys, so the current culture must not affect the result
        // (Turkish culture would otherwise map "DIV" to "dıv").
        tag = _html.Substring(start, _pos - start).ToLowerInvariant();

        // Parse rest of tag
        while (!EndOfText && Peek() != '>')
        {
            if (Peek() == '"' || Peek() == '\'')
            {
                // Attribute values may contain '>' or '/', so skip them whole
                EatQuotedValue();
            }
            else
            {
                if (Peek() == '/')
                {
                    selfClosing = true;
                }
                MoveAhead();
            }
        }

        // Step over the closing '>'
        MoveAhead();
    }
    return tag;
}
// Consumes inner content from the current tag, up to and including
// the end tag that matches it.
//
// tag is the lower-case name of the start tag that was just parsed.
// Nested occurrences of the same tag are tracked with a depth counter
// so that the matching end tag is the one that stops the scan. Other
// tags found inside are simply skipped: recursing on every nested tag
// made the scan keep looking for a second end tag and swallow the rest
// of the document.
protected void EatInnerContent(string tag)
{
    string endTag = "/" + tag;
    int depth = 1;

    while (!EndOfText)
    {
        if (Peek() == '<')
        {
            // Consume a tag
            bool selfClosing;
            string inner = ParseTag(out selfClosing);

            if (inner == endTag)
            {
                depth--;
                if (depth == 0)
                {
                    return;
                }
            }
            else if (inner == tag && !selfClosing)
            {
                // Same tag opened again inside itself
                depth++;
            }
        }
        else
        {
            MoveAhead();
        }
    }
}
// Returns true if the current position is at the end of
// the string
protected bool EndOfText
{
    get { return (_pos >= _html.Length); }
}
// Safely returns the character at the current position, or the
// NUL character when the position is at or past the end of the input
protected char Peek()
{
    return (_pos < _html.Length) ? _html[_pos] : (char)0;
}
// Safely advances the current position to the next character;
// the position never moves past the end of the input
protected void MoveAhead()
{
    _pos = Math.Min(_pos + 1, _html.Length);
}
// Moves the current position to the next non-whitespace
// character. Terminates at end of input because Peek() then
// returns NUL, which is not whitespace.
protected void EatWhitespace()
{
    while (Char.IsWhiteSpace(Peek()))
    {
        MoveAhead();
    }
}
// Moves the current position to the next non-whitespace
// character or the start of the next line, whichever
// comes first
protected void EatWhitespaceToNextLine()
{
    while (Char.IsWhiteSpace(Peek()))
    {
        char c = Peek();
        MoveAhead();
        // The line feed itself is consumed, then scanning stops
        if (c == '\n')
        {
            break;
        }
    }
}
// Moves the current position past a quoted value. Does nothing
// unless the current character is a single or double quote.
protected void EatQuotedValue()
{
    char c = Peek();
    if (c == '"' || c == '\'')
    {
        // Opening quote
        MoveAhead();

        // Find end of value: the matching quote, or a line break so
        // that an unterminated quote cannot run past the current line
        _pos = _html.IndexOfAny(new char[] { c, '\r', '\n' }, _pos);
        if (_pos < 0)
        {
            // No terminator at all: consume the rest of the input
            _pos = _html.Length;
        }
        else
        {
            MoveAhead();    // Closing quote
        }
    }
}
/// <summary>
/// A StringBuilder class that helps eliminate excess whitespace.
/// </summary>
protected class TextBuilder
{
    private StringBuilder _text;        // Completed output
    private StringBuilder _currLine;    // Line currently being assembled
    private int _emptyLines;            // Consecutive empty lines seen so far
    private bool _preformatted;         // True while whitespace is passed through

    // Construction
    public TextBuilder()
    {
        _text = new StringBuilder();
        _currLine = new StringBuilder();
        _emptyLines = 0;
        _preformatted = false;
    }

    /// <summary>
    /// Normally, extra whitespace characters are discarded.
    /// If this property is set to true, they are passed
    /// through unchanged.
    /// </summary>
    public bool Preformatted
    {
        get
        {
            return _preformatted;
        }
        set
        {
            if (value)
            {
                // Clear line buffer if changing to
                // preformatted mode
                if (_currLine.Length > 0)
                {
                    FlushCurrLine();
                }
                _emptyLines = 0;
            }
            _preformatted = value;
        }
    }

    /// <summary>
    /// Clears all current text.
    /// </summary>
    public void Clear()
    {
        _text.Length = 0;
        _currLine.Length = 0;
        _emptyLines = 0;
    }

    /// <summary>
    /// Writes the given string to the output buffer.
    /// </summary>
    /// <param name="s">String to write, one character at a time</param>
    public void Write(string s)
    {
        foreach (char c in s)
        {
            Write(c);
        }
    }

    /// <summary>
    /// Writes the given character to the output buffer.
    /// </summary>
    /// <param name="c">Character to write</param>
    public void Write(char c)
    {
        if (_preformatted)
        {
            // Write preformatted character
            _text.Append(c);
        }
        else
        {
            if (c == '\r')
            {
                // Ignore carriage returns. We'll process
                // '\n' if it comes next
            }
            else if (c == '\n')
            {
                // Flush current line
                FlushCurrLine();
            }
            else if (Char.IsWhiteSpace(c))
            {
                // Write single space character
                int len = _currLine.Length;
                if (len == 0 || !Char.IsWhiteSpace(_currLine[len - 1]))
                {
                    _currLine.Append(' ');
                }
            }
            else
            {
                // Add character to current line
                _currLine.Append(c);
            }
        }
    }

    // Appends the current line to output buffer
    protected void FlushCurrLine()
    {
        // Get current line
        string line = _currLine.ToString().Trim();

        // Determine if line contains non-space characters. Entities are
        // still encoded at this point (the caller decodes the finished
        // text), so a line holding only "&nbsp;" entities counts as empty.
        string tmp = line.Replace("&nbsp;", String.Empty);
        if (tmp.Length == 0)
        {
            // An empty line: keep at most one in a row, and none at the
            // very start of the output
            _emptyLines++;
            if (_emptyLines < 2 && _text.Length > 0)
            {
                _text.AppendLine(line);
            }
        }
        else
        {
            // A non-empty line
            _emptyLines = 0;
            _text.AppendLine(line);
        }

        // Reset current line
        _currLine.Length = 0;
    }

    /// <summary>
    /// Returns the current output as a string.
    /// </summary>
    public override string ToString()
    {
        // Include any text still pending in the line buffer
        if (_currLine.Length > 0)
        {
            FlushCurrLine();
        }
        return _text.ToString();
    }
}
}
使用方法
HtmlToText convert = new HtmlToText(); textBox2.Text = convert.Convert(textBox1.Text);
原创文章,作者:Maggie-Hunter,如若转载,请注明出处:https://blog.ytso.com/11095.html
c相关文章
- C#编程入门_ToArray和CopyTo的区别_22「建议收藏」
- 用html做简单的日记,学习HTML日记[通俗易懂]
- js把HTML转成对象,将js对象转换为html「建议收藏」
- HTML添加背景图片_html背景图片铺满网页
- c# 多线程并发-金三银四面试:C#.NET面试题高级篇2-多线程
- c#面试题抽象类和接口的区别-金三银四面试:C#程序员经常遇到的30道基础面试题,想你所想
- resumable.js —— 基于 HTML 5 File API 的文件上传组件 支持续传后台c#实现详解编程语言
- 使用HTML连接MySQL数据库实现功能(html连接mysql数据库)
- Linux上跑起来的HTML之旅(linux运行html)
- MySQL与HTML的无缝连接(html与mysql连接)
- MySQL存储HTML页面的简单技巧(mysql存储html)
- Linux环境实现HTML文件编辑(linux编辑html)
- c与oracle配合实现复杂数据查询游标实践(c# oracle 游标)
- HTML连接Oracle利用JavaScript即可实现(html连接oracle)
- c#与mysql的连接
- 提取HTML代码中文字的C#函数
- php压缩HTML函数轻松实现压缩html/js/Css及注意事项
- c#获取CookieContainer的所有cookies函数代码
- C#下解析HTML的两种方法介绍
- C#图片与二进制转换的简单实例
- c#获取windows桌面背景代码示例
- c#的params参数使用示例
- C#中Html.RenderPartial与Html.RenderAction的区别分析
- C#基础之Lambda表达式用法实例教程