/******************************************************************************* * You may amend and distribute as you like, but don't remove this header! * * EPPlus provides server-side generation of Excel 2007/2010 spreadsheets. * See http://www.codeplex.com/EPPlus for details. * * Copyright (C) 2011 Jan Källman * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * See the GNU Lesser General Public License for more details. * * The GNU Lesser General Public License can be viewed at http://www.opensource.org/licenses/lgpl-license.php * If you unfamiliar with this license or have questions about it, here is an http://www.gnu.org/licenses/gpl-faq.html * * All code and executables are provided "as is" with no warranty either express or implied. * The author accepts no liability for any damage or loss of business that this product may cause. * * Code change notes: * * Author Change Date * ****************************************************************************** * Richard Tallent Initial Release 2012-08-13 *******************************************************************************/ using System; using System.Collections.Generic; using System.Text; using System.Text.RegularExpressions; namespace OfficeOpenXml.Style { public class ExcelRichTextHtmlUtility { /// /// Provides basic HTML support by converting well-behaved HTML into appropriate RichText blocks. /// HTML support is limited, and does not include font colors, sizes, or typefaces at this time, /// and also does not support CSS style attributes. It does support line breaks using the BR tag. /// /// This routine parses the HTML into RegEx pairings of an HTML tag and the text until the NEXT /// tag (if any). The tag is parsed to determine the setting change to be applied to the last set /// of settings, and if the text is not blank, a new block is added to rich text. /// /// /// The HTML to parse into RichText /// /// public static void SetRichTextFromHtml(ExcelRange range, string html, string defaultFontName, short defaultFontSize) { // Reset the cell value, just in case there is an existing RichText value. range.Value = ""; // Sanity check for blank values, skips creating Regex objects for performance. if (String.IsNullOrEmpty(html)) { range.IsRichText = false; return; } // Change all BR tags to line breaks. http://epplus.codeplex.com/discussions/238692/ // Cells with line breaks aren't necessarily considered rich text, so this is performed // before parsing the HTML tags. html = System.Text.RegularExpressions.Regex.Replace(html, @"]*>", "\r\n", RegexOptions.Compiled | RegexOptions.IgnoreCase); string tag; string text; ExcelRichText thisrt = null; bool isFirst = true; // Get all pairs of legitimate tags and the text between them. This loop will // only execute if there is at least one start or end tag. foreach (Match m in System.Text.RegularExpressions.Regex.Matches(html, @"<(/?[a-z]+)[^<>]*>([\s\S]*?)(?=]*>|$)", RegexOptions.Compiled | RegexOptions.IgnoreCase)) { if (isFirst) { // On the very first match, set up the initial rich text object with // the defaults for the text BEFORE the match. range.IsRichText = true; thisrt = range.RichText.Add(CleanText(html.Substring(0, m.Index))); // May be 0-length thisrt.Size = defaultFontSize; // Set the default font size thisrt.FontName = defaultFontName; // Set the default font name isFirst = false; } // Get the tag and the block of text until the NEXT tag or EOS. If there are HTML entities // encoded, unencode them, they should be passed to RichText as normal characters (other // than non-breaking spaces, which should be replaced with normal spaces, they break Excel. tag = m.Groups[1].Captures[0].Value; text = CleanText(m.Groups[2].Captures[0].Value); if (thisrt.Text == "") { // The most recent rich text block wasn't *actually* used last time around, so update // the text and keep it as the "current" block. This happens with the first block if // it starts with a tag, and may happen later if tags come one right after the other. thisrt.Text = text; } else { // The current rich text block has some text, so create a new one. RichText.Add() // automatically applies the settings from the previous block, other than vertical // alignment. thisrt = range.RichText.Add(text); } // Override the settings based on the current tag, keep all other settings. SetStyleFromTag(tag, thisrt); } if (thisrt == null) { // No HTML tags were found, so treat this as a normal text value. range.IsRichText = false; range.Value = CleanText(html); } else if (String.IsNullOrEmpty(thisrt.Text)) { // Rich text was found, but the last node contains no text, so remove it. This can happen if, // say, the end of the string is an end tag or unsupported tag (common). range.RichText.Remove(thisrt); // Failsafe -- the HTML may be just tags, no text, in which case there may be no rich text // directives that remain. If that is the case, turn off rich text and treat this like a blank // cell value. if (range.RichText.Count == 0) { range.IsRichText = false; range.Value = ""; } } } private static void SetStyleFromTag(string tag, ExcelRichText settings) { switch (tag.ToLower()) { case "b": case "strong": settings.Bold = true; break; case "i": case "em": settings.Italic = true; break; case "u": settings.UnderLine = true; break; case "s": case "strike": settings.Strike = true; break; case "sup": settings.VerticalAlign = ExcelVerticalAlignmentFont.Superscript; break; case "sub": settings.VerticalAlign = ExcelVerticalAlignmentFont.Subscript; break; case "/b": case "/strong": settings.Bold = false; break; case "/i": case "/em": settings.Italic = false; break; case "/u": settings.UnderLine = false; break; case "/s": case "/strike": settings.Strike = false; break; case "/sup": case "/sub": settings.VerticalAlign = ExcelVerticalAlignmentFont.None; break; default: // unsupported HTML, no style change break; } } private static string CleanText(string s) { // Need to convert HTML entities (named or numbered) into actual Unicode characters s = System.Web.HttpUtility.HtmlDecode(s); // Remove any non-breaking spaces, kills Excel s = s.Replace("\u00A0", " "); return s; } } }