I’ve found myself having to convert HTML to plain text a few times in a row now and thought I would post my very simple solution for it.
It’s got a few caveats: for example, it won’t handle <pre> tags, margins on divs, or similar things, and it doesn’t even handle paragraphs. But I just wanted a simple conversion, so it was enough for me; feel free to extend it. You can try it out using this dotnetfiddle.
/// <summary>
/// Minimal HTML-to-plain-text converter: turns closing <c>div</c> tags and
/// <c>br</c> tags into line breaks, strips every other tag, and decodes HTML
/// entities. It does not handle <c>pre</c> blocks, paragraphs, or CSS-driven layout.
/// </summary>
public static class Html2PlainText
{
    // Literal CR/LF characters in the markup; they are removed before explicit breaks are inserted.
    private static readonly Regex NonExplicitLines =
        new Regex("\r|\n", RegexOptions.Multiline | RegexOptions.Compiled);

    // A closing </div> ends a block and becomes a line break.
    private static readonly Regex DivEndings =
        new Regex("</div>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // Matches <br>, <br/>, <br />, <br attr="...">, and the malformed </br>.
    // Verbatim string: "\s"/"\b" are not valid escapes in a regular C# string literal.
    // The \b keeps tags that merely start with "br" (e.g. <brand>) from matching.
    private static readonly Regex LineBreaks =
        new Regex(@"</?br\b[^>]*>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // Any remaining tag; removed without replacement.
    private static readonly Regex Tags =
        new Regex("<[^>]*>", RegexOptions.Compiled);

    // Used only to decide whether the input contains markup at all. Requires a letter,
    // '/' or '!' right after '<' so that text such as "1 < 2 > 0" is not mistaken for a tag.
    private static readonly Regex TagDetector =
        new Regex("<[a-zA-Z/!][^>]*>", RegexOptions.Compiled);

    /// <summary>
    /// Converts an HTML fragment to plain text.
    /// </summary>
    /// <param name="html">The HTML to convert. May be <c>null</c> or empty.</param>
    /// <returns>
    /// <paramref name="html"/> unchanged when it is <c>null</c>, empty, or contains no tags;
    /// otherwise the trimmed text with tags removed, <c>&lt;/div&gt;</c> and <c>&lt;br&gt;</c>
    /// replaced by <see cref="Environment.NewLine"/>, and HTML entities decoded.
    /// </returns>
    public static string Decode(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return html;
        }

        var decoded = html.Trim();
        if (!HasTags(decoded))
        {
            // Input without markup is handed back untouched (not trimmed, entities not decoded).
            return html;
        }

        // Order matters: drop source line breaks first, then insert the explicit ones,
        // and only then strip the remaining tags.
        decoded = NonExplicitLines.Replace(decoded, string.Empty);
        decoded = DivEndings.Replace(decoded, Environment.NewLine);
        decoded = LineBreaks.Replace(decoded, Environment.NewLine);
        decoded = Tags.Replace(decoded, string.Empty).Trim();

        return WebUtility.HtmlDecode(decoded);
    }

    // True when the string contains at least one tag anywhere, not only when it is
    // wrapped in a tag from its first to its last character.
    private static bool HasTags(string str)
    {
        return TagDetector.IsMatch(str);
    }
}
nice one.
Thank you