/*******************************************************************************
* You may amend and distribute as you like, but don't remove this header!
*
* EPPlus provides server-side generation of Excel 2007/2010 spreadsheets.
* See http://www.codeplex.com/EPPlus for details.
*
* Copyright (C) 2011 Jan Källman
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Lesser General Public License for more details.
*
* The GNU Lesser General Public License can be viewed at http://www.opensource.org/licenses/lgpl-license.php
* If you unfamiliar with this license or have questions about it, here is an http://www.gnu.org/licenses/gpl-faq.html
*
* All code and executables are provided "as is" with no warranty either express or implied.
* The author accepts no liability for any damage or loss of business that this product may cause.
*
* Code change notes:
*
* Author Change Date
* ******************************************************************************
* Richard Tallent Initial Release 2012-08-13
*******************************************************************************/
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
namespace OfficeOpenXml.Style
{
public class ExcelRichTextHtmlUtility
{
///
/// Provides basic HTML support by converting well-behaved HTML into appropriate RichText blocks.
/// HTML support is limited, and does not include font colors, sizes, or typefaces at this time,
/// and also does not support CSS style attributes. It does support line breaks using the BR tag.
///
/// This routine parses the HTML into RegEx pairings of an HTML tag and the text until the NEXT
/// tag (if any). The tag is parsed to determine the setting change to be applied to the last set
/// of settings, and if the text is not blank, a new block is added to rich text.
///
///
/// The HTML to parse into RichText
///
///
public static void SetRichTextFromHtml(ExcelRange range, string html, string defaultFontName, short defaultFontSize)
{
// Reset the cell value, just in case there is an existing RichText value.
range.Value = "";
// Sanity check for blank values, skips creating Regex objects for performance.
if (String.IsNullOrEmpty(html))
{
range.IsRichText = false;
return;
}
// Change all BR tags to line breaks. http://epplus.codeplex.com/discussions/238692/
// Cells with line breaks aren't necessarily considered rich text, so this is performed
// before parsing the HTML tags.
html = System.Text.RegularExpressions.Regex.Replace(html, @"
]*>", "\r\n", RegexOptions.Compiled | RegexOptions.IgnoreCase);
string tag;
string text;
ExcelRichText thisrt = null;
bool isFirst = true;
// Get all pairs of legitimate tags and the text between them. This loop will
// only execute if there is at least one start or end tag.
foreach (Match m in System.Text.RegularExpressions.Regex.Matches(html, @"<(/?[a-z]+)[^<>]*>([\s\S]*?)(?=?[a-z]+[^<>]*>|$)", RegexOptions.Compiled | RegexOptions.IgnoreCase))
{
if (isFirst)
{
// On the very first match, set up the initial rich text object with
// the defaults for the text BEFORE the match.
range.IsRichText = true;
thisrt = range.RichText.Add(CleanText(html.Substring(0, m.Index))); // May be 0-length
thisrt.Size = defaultFontSize; // Set the default font size
thisrt.FontName = defaultFontName; // Set the default font name
isFirst = false;
}
// Get the tag and the block of text until the NEXT tag or EOS. If there are HTML entities
// encoded, unencode them, they should be passed to RichText as normal characters (other
// than non-breaking spaces, which should be replaced with normal spaces, they break Excel.
tag = m.Groups[1].Captures[0].Value;
text = CleanText(m.Groups[2].Captures[0].Value);
if (thisrt.Text == "")
{
// The most recent rich text block wasn't *actually* used last time around, so update
// the text and keep it as the "current" block. This happens with the first block if
// it starts with a tag, and may happen later if tags come one right after the other.
thisrt.Text = text;
}
else
{
// The current rich text block has some text, so create a new one. RichText.Add()
// automatically applies the settings from the previous block, other than vertical
// alignment.
thisrt = range.RichText.Add(text);
}
// Override the settings based on the current tag, keep all other settings.
SetStyleFromTag(tag, thisrt);
}
if (thisrt == null)
{
// No HTML tags were found, so treat this as a normal text value.
range.IsRichText = false;
range.Value = CleanText(html);
}
else if (String.IsNullOrEmpty(thisrt.Text))
{
// Rich text was found, but the last node contains no text, so remove it. This can happen if,
// say, the end of the string is an end tag or unsupported tag (common).
range.RichText.Remove(thisrt);
// Failsafe -- the HTML may be just tags, no text, in which case there may be no rich text
// directives that remain. If that is the case, turn off rich text and treat this like a blank
// cell value.
if (range.RichText.Count == 0)
{
range.IsRichText = false;
range.Value = "";
}
}
}
private static void SetStyleFromTag(string tag, ExcelRichText settings)
{
switch (tag.ToLower())
{
case "b":
case "strong":
settings.Bold = true;
break;
case "i":
case "em":
settings.Italic = true;
break;
case "u":
settings.UnderLine = true;
break;
case "s":
case "strike":
settings.Strike = true;
break;
case "sup":
settings.VerticalAlign = ExcelVerticalAlignmentFont.Superscript;
break;
case "sub":
settings.VerticalAlign = ExcelVerticalAlignmentFont.Subscript;
break;
case "/b":
case "/strong":
settings.Bold = false;
break;
case "/i":
case "/em":
settings.Italic = false;
break;
case "/u":
settings.UnderLine = false;
break;
case "/s":
case "/strike":
settings.Strike = false;
break;
case "/sup":
case "/sub":
settings.VerticalAlign = ExcelVerticalAlignmentFont.None;
break;
default:
// unsupported HTML, no style change
break;
}
}
private static string CleanText(string s)
{
// Need to convert HTML entities (named or numbered) into actual Unicode characters
s = System.Web.HttpUtility.HtmlDecode(s);
// Remove any non-breaking spaces, kills Excel
s = s.Replace("\u00A0", " ");
return s;
}
}
}