zl程序教程

您现在的位置是:首页 >  后端

当前栏目

C# 将 HTML 转成纯文本详解编程语言

c# HTML 编程语言 详解 文本 转成
2023-06-13 09:20:30 时间
// Required namespaces: System, System.Collections.Generic, System.Text, System.Web.

/// <summary>
/// Converts HTML markup to plain text. Block-level tags are turned into line
/// breaks, the content of script/style-like tags is discarded, runs of
/// whitespace are collapsed (except inside &lt;pre&gt;), and character
/// entities are decoded at the very end.
/// </summary>
public class HtmlToText
{
    // Static data tables
    // Maps a lower-case tag name (closing tags include the leading '/')
    // to the text that is written when that tag is encountered.
    protected static Dictionary<string, string> _tags;
    // Tags whose entire inner content is skipped.
    protected static HashSet<string> _ignoreTags;

    // Instance variables
    protected TextBuilder _text;
    protected string _html;
    protected int _pos;

    // Static constructor (one time only)
    static HtmlToText()
    {
        _tags = new Dictionary<string, string>
        {
            { "address", "\n" },
            { "blockquote", "\n" },
            { "div", "\n" },
            { "dl", "\n" },
            { "fieldset", "\n" },
            { "form", "\n" },
            { "h1", "\n" },
            { "/h1", "\n" },
            { "h2", "\n" },
            { "/h2", "\n" },
            { "h3", "\n" },
            { "/h3", "\n" },
            { "h4", "\n" },
            { "/h4", "\n" },
            { "h5", "\n" },
            { "/h5", "\n" },
            { "h6", "\n" },
            { "/h6", "\n" },
            { "p", "\n" },
            { "/p", "\n" },
            { "table", "\n" },
            { "/table", "\n" },
            { "ul", "\n" },
            { "/ul", "\n" },
            { "ol", "\n" },
            { "/ol", "\n" },
            { "/li", "\n" },
            { "br", "\n" },
            { "/td", "\t" },
            { "/tr", "\n" },
            { "/pre", "\n" }
        };

        _ignoreTags = new HashSet<string>
        {
            "script",
            "noscript",
            "style",
            "object"
        };
    }

    /// <summary>
    /// Converts the given HTML to plain text and returns the result.
    /// </summary>
    /// <param name="html">HTML to be converted. A null value is treated as
    /// an empty document.</param>
    /// <returns>Resulting plain text</returns>
    public string Convert(string html)
    {
        // Initialize state variables
        _text = new TextBuilder();
        _html = html ?? String.Empty;
        _pos = 0;

        // Process input
        while (!EndOfText)
        {
            if (Peek() == '<')
            {
                // HTML tag
                bool selfClosing;
                string tag = ParseTag(out selfClosing);

                // Handle special tag cases
                if (tag == "body")
                {
                    // Discard content before <body>
                    _text.Clear();
                }
                else if (tag == "/body")
                {
                    // Discard content after </body>
                    _pos = _html.Length;
                }
                else if (tag == "pre")
                {
                    // Enter preformatted mode
                    _text.Preformatted = true;
                    EatWhitespaceToNextLine();
                }
                else if (tag == "/pre")
                {
                    // Exit preformatted mode
                    _text.Preformatted = false;
                }

                string value;
                if (_tags.TryGetValue(tag, out value))
                {
                    _text.Write(value);
                }

                if (_ignoreTags.Contains(tag))
                {
                    EatInnerContent(tag);
                }
            }
            else if (Char.IsWhiteSpace(Peek()))
            {
                // Whitespace (treat all as space unless preformatted)
                _text.Write(_text.Preformatted ? Peek() : ' ');
                MoveAhead();
            }
            else
            {
                // Other text
                _text.Write(Peek());
                MoveAhead();
            }
        }

        // Entities are decoded last, so the buffer holds them in raw form
        // (e.g. "&nbsp;") while the text is being assembled.
        return HttpUtility.HtmlDecode(_text.ToString());
    }

    /// <summary>
    /// Eats all characters that are part of the tag at the current position
    /// and returns the tag name in lower case (closing tags keep their
    /// leading '/'). Returns an empty string if the current character is
    /// not '&lt;'.
    /// </summary>
    /// <param name="selfClosing">Set to true when the last significant
    /// character before the closing '&gt;' is '/'.</param>
    protected string ParseTag(out bool selfClosing)
    {
        string tag = String.Empty;
        selfClosing = false;

        if (Peek() == '<')
        {
            MoveAhead();

            // Parse tag name
            EatWhitespace();
            int start = _pos;
            if (Peek() == '/')
            {
                MoveAhead();
            }
            while (!EndOfText && !Char.IsWhiteSpace(Peek()) &&
                Peek() != '/' && Peek() != '>')
            {
                MoveAhead();
            }
            // Invariant casing: tag names are compared ordinally against
            // the lookup tables, so culture rules must not apply.
            tag = _html.Substring(start, _pos - start).ToLowerInvariant();

            // Parse rest of tag
            while (!EndOfText && Peek() != '>')
            {
                char c = Peek();
                if (c == '"' || c == '\'')
                {
                    EatQuotedValue();
                    selfClosing = false;
                }
                else
                {
                    if (c == '/')
                    {
                        selfClosing = true;
                    }
                    else if (!Char.IsWhiteSpace(c))
                    {
                        // A '/' inside an unquoted attribute value does
                        // not make the tag self-closing.
                        selfClosing = false;
                    }
                    MoveAhead();
                }
            }
            MoveAhead();    // Closing '>'
        }
        return tag;
    }

    /// <summary>
    /// Consumes the inner content of the given tag, up to and including its
    /// matching end tag (or the end of the input if there is none).
    /// </summary>
    /// <param name="tag">Lower-case name of the tag that was just opened.</param>
    protected void EatInnerContent(string tag)
    {
        string endTag = "/" + tag;

        // Count nested occurrences of the same tag so that only the matching
        // end tag terminates the skip. Other child tags are simply consumed.
        int depth = 1;

        while (!EndOfText)
        {
            if (Peek() == '<')
            {
                // Consume a tag
                bool selfClosing;
                string name = ParseTag(out selfClosing);
                if (name == endTag)
                {
                    depth--;
                    if (depth == 0)
                    {
                        return;
                    }
                }
                else if (name == tag && !selfClosing)
                {
                    depth++;
                }
            }
            else
            {
                MoveAhead();
            }
        }
    }

    // Returns true if the current position is at the end of
    // the string
    protected bool EndOfText
    {
        get { return (_pos >= _html.Length); }
    }

    // Safely returns the character at the current position
    // ((char)0 when positioned at the end of the input)
    protected char Peek()
    {
        return (_pos < _html.Length) ? _html[_pos] : (char)0;
    }

    // Safely advances the current position to the next character
    protected void MoveAhead()
    {
        _pos = Math.Min(_pos + 1, _html.Length);
    }

    // Moves the current position to the next non-whitespace
    // character.
    protected void EatWhitespace()
    {
        while (Char.IsWhiteSpace(Peek()))
        {
            MoveAhead();
        }
    }

    // Moves the current position to the next non-whitespace
    // character or the start of the next line, whichever
    // comes first
    protected void EatWhitespaceToNextLine()
    {
        while (Char.IsWhiteSpace(Peek()))
        {
            char c = Peek();
            MoveAhead();
            if (c == '\n')
            {
                break;
            }
        }
    }

    // Moves the current position past a quoted value. The value ends at the
    // matching quote character or at a line break, whichever comes first.
    protected void EatQuotedValue()
    {
        char c = Peek();
        if (c == '"' || c == '\'')
        {
            // Opening quote
            MoveAhead();

            // Find end of value
            _pos = _html.IndexOfAny(new char[] { c, '\r', '\n' }, _pos);
            if (_pos < 0)
            {
                _pos = _html.Length;
            }
            else
            {
                MoveAhead();    // Closing quote
            }
        }
    }

    /// <summary>
    /// A StringBuilder class that helps eliminate excess whitespace.
    /// </summary>
    protected class TextBuilder
    {
        private StringBuilder _text;
        private StringBuilder _currLine;
        private int _emptyLines;
        private bool _preformatted;

        // Construction
        public TextBuilder()
        {
            _text = new StringBuilder();
            _currLine = new StringBuilder();
            _emptyLines = 0;
            _preformatted = false;
        }

        /// <summary>
        /// Normally, extra whitespace characters are discarded.
        /// If this property is set to true, they are passed
        /// through unchanged.
        /// </summary>
        public bool Preformatted
        {
            get
            {
                return _preformatted;
            }
            set
            {
                if (value)
                {
                    // Clear line buffer if changing to
                    // preformatted mode
                    if (_currLine.Length > 0)
                    {
                        FlushCurrLine();
                    }
                    _emptyLines = 0;
                }
                _preformatted = value;
            }
        }

        /// <summary>
        /// Clears all current text.
        /// </summary>
        public void Clear()
        {
            _text.Length = 0;
            _currLine.Length = 0;
            _emptyLines = 0;
        }

        /// <summary>
        /// Writes the given string to the output buffer.
        /// </summary>
        /// <param name="s">String to write</param>
        public void Write(string s)
        {
            foreach (char c in s)
            {
                Write(c);
            }
        }

        /// <summary>
        /// Writes the given character to the output buffer.
        /// </summary>
        /// <param name="c">Character to write</param>
        public void Write(char c)
        {
            if (_preformatted)
            {
                // Write preformatted character
                _text.Append(c);
            }
            else if (c == '\r')
            {
                // Ignore carriage returns. We'll process
                // '\n' if it comes next
            }
            else if (c == '\n')
            {
                // Flush current line
                FlushCurrLine();
            }
            else if (Char.IsWhiteSpace(c))
            {
                // Write single space character
                int len = _currLine.Length;
                if (len == 0 || !Char.IsWhiteSpace(_currLine[len - 1]))
                {
                    _currLine.Append(' ');
                }
            }
            else
            {
                // Add character to current line
                _currLine.Append(c);
            }
        }

        // Appends the current line to output buffer
        protected void FlushCurrLine()
        {
            // Get current line
            string line = _currLine.ToString().Trim();

            // Determine if line contains non-space characters. Entities are
            // still undecoded at this point, so a line made up only of
            // "&nbsp;" entities counts as empty.
            string tmp = line.Replace("&nbsp;", String.Empty);
            if (tmp.Length == 0)
            {
                // An empty line: keep at most one in a row, and none at
                // the very start of the output
                _emptyLines++;
                if (_emptyLines < 2 && _text.Length > 0)
                {
                    _text.AppendLine(line);
                }
            }
            else
            {
                // A non-empty line
                _emptyLines = 0;
                _text.AppendLine(line);
            }

            // Reset current line
            _currLine.Length = 0;
        }

        /// <summary>
        /// Returns the current output as a string.
        /// </summary>
        public override string ToString()
        {
            if (_currLine.Length > 0)
            {
                FlushCurrLine();
            }
            return _text.ToString();
        }
    }
}


使用方法

// Convert the HTML held in textBox1 and show the plain-text result in textBox2.
var converter = new HtmlToText();
textBox2.Text = converter.Convert(textBox1.Text);

原创文章,作者:Maggie-Hunter,如若转载,请注明出处:https://blog.ytso.com/11095.html

c