DEV Community

Masui Masanori
Masui Masanori

Posted on

[ASP.NET Core] Try reading a word processing file by OpenXML 2

Intro

This time, I will try getting text styles and fonts.

Getting specified styles and fonts

First, I will try getting the text styles and fonts that I specified myself.

Image description

DocFileReader.cs

using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;

namespace OfficeFileAccessor.OfficeFiles.Readers;

/// <summary>
/// Reads a word processing document and logs the text of every non-empty
/// paragraph together with the font, color, bold and size settings that are
/// written directly on its runs.
/// </summary>
public class DocFileReader : IOfficeFileReader
{
    private readonly NLog.Logger logger;
    public DocFileReader()
    {
        this.logger = NLog.LogManager.GetCurrentClassLogger();
    }
    /// <summary>
    /// Opens the uploaded file read-only and logs the run-level formatting of
    /// each paragraph found directly under the document body.
    /// </summary>
    /// <param name="file">Uploaded word processing file.</param>
    public void Read(IFormFile file)
    {
        // Keep the upload stream in its own using declaration so it is released
        // together with the document, even if opening the package throws.
        using var stream = file.OpenReadStream();
        using WordprocessingDocument wordDoc = WordprocessingDocument.Open(stream, false);
        Body? body = wordDoc.MainDocumentPart?.Document?.Body;
        if (body == null)
        {
            logger.Warn("Failed reading the document");
            return;
        }
        foreach (OpenXmlElement elm in body.Elements())
        {
            if (elm is Table table)
            {
...
            }
            else if (elm is Paragraph paragraph)
            {
                // Paragraphs without any visible text are skipped.
                if (string.IsNullOrWhiteSpace(paragraph.InnerText))
                {
                    continue;
                }
                // Get full text from paragraph.InnerText
                logger.Info($"Paragraph Text: {paragraph.InnerText}");
                PrintFontInfoFromParagraph(wordDoc.MainDocumentPart, paragraph);
            }
        }
    }
    /// <summary>
    /// Logs the text of every run in the paragraph and the formatting that is
    /// set directly in that run's RunProperties.
    /// </summary>
    /// <param name="mainPart">Main document part; only passed through to <see cref="GetFontName"/>.</param>
    /// <param name="paragraph">Paragraph whose runs are inspected.</param>
    private void PrintFontInfoFromParagraph(MainDocumentPart? mainPart, Paragraph paragraph)
    {
        // One paragraph is separated as multiple Run elements by styles and fonts
        foreach (Run run in paragraph.Elements<Run>())
        {
            logger.Info($"Run Text: {run.InnerText}");
            // Get text style and font from RunProperties.
            RunProperties? runProperties = run.RunProperties;
            if (runProperties != null)
            {
                logger.Info("RunProperties found:");
                RunFonts? fonts = runProperties.RunFonts;
                if (fonts != null)
                {
                    logger.Info($"Font Name: {GetFontName(fonts, mainPart)}");
                }
                if (runProperties.Color != null)
                {
                    logger.Info($"Color: {runProperties.Color.Val}");
                }
                if (runProperties.Bold != null)
                {
                    // Val is null when the bold element carries no explicit value,
                    // in which case this logs "Bold:" with nothing after it.
                    logger.Info($"Bold: {runProperties.Bold.Val}");
                }
                if (runProperties.FontSize == null)
                {
                    logger.Info("FontSize was null");
                }
                else if (int.TryParse(runProperties.FontSize.Val, out int halfPoints))
                {
                    // FontSize.Val represents half-points. Divide as a double so
                    // that odd values (e.g. 21 -> 10.5) keep their half point.
                    logger.Info($"FontSize: {halfPoints / 2.0}");
                }
            }
            logger.Info("------------");
        }
    }
    /// <summary>
    /// Picks the first font name available on the run fonts, checking Ascii,
    /// HighAnsi, EastAsia and ComplexScript in that order.
    /// </summary>
    /// <param name="runFonts">Fonts element of the run; may be null.</param>
    /// <param name="mainPart">Not used by this method.</param>
    /// <returns>The font name, or "No font set" when none of the attributes has a value.</returns>
    private string GetFontName(RunFonts? runFonts, MainDocumentPart? mainPart)
    {
        // Compare the attribute values rather than the attribute objects so that
        // an attribute without a value does not hide a later one that has a name.
        string? name = runFonts?.Ascii?.Value ??
            runFonts?.HighAnsi?.Value ??
            runFonts?.EastAsia?.Value ??
            runFonts?.ComplexScript?.Value;
        return string.IsNullOrEmpty(name) ? "No font set" : name;
    }
}
Enter fullscreen mode Exit fullscreen mode

Result

Paragraph Text: カスタムfontに設定した場合
Run Text: カス
RunProperties found:
Font Name: No font set
FontSize was null
------------
Run Text: タムfo
RunProperties found:
Font Name: Noto Sans JP Black
FontSize was null
------------
Run Text: ntに
RunProperties found:
Font Name: No font set
FontSize was null
------------
Run Text: 設定
RunProperties found:
Font Name: No font set
Bold:
FontSize was null
------------
Run Text: した
RunProperties found:
Font Name: Meiryo UI
FontSize: 16
------------
Run Text: 場合
RunProperties found:
Font Name: No font set
Color: 60CAF3
FontSize: 22
------------
Enter fullscreen mode Exit fullscreen mode

Getting default styles and fonts

Unless I change the font, color, size, etc. myself, the above code can't get that information.

Image description

So I have to get them from the base style or the theme fonts.
I can get the information for styles such as "見出し(Headline)".
However, some styles such as "標準(Normal)" don't carry any style information, so I fall back to default values when I can't get a "ParagraphStyleId" from the paragraph.

DocFileReader.cs

...
public class DocFileReader : IOfficeFileReader
{
    private readonly NLog.Logger logger;
    /// <summary>
    /// Identifies which font slot a font name was read from.
    /// </summary>
    private enum FontType
    {
        // Ascii, HighAnsi and EastAsia are used for names read from the
        // matching RunFonts attributes.
        Ascii = 0,
        HighAnsi,
        EastAsia,
        // Latin is used for the Latin typefaces taken from the theme.
        Latin,
    }
    // NOTE(review): not referenced by any of the code shown here. Presumably it
    // distinguishes the major and minor theme fonts - confirm before relying on it.
    private enum FontPriority {
        Major = 0,
        Minor
    }
    /// <summary>
    /// Typeface names read from the major and minor font schemes of the document theme.
    /// A member is null when the theme does not provide that typeface.
    /// </summary>
    private record ThemeFont(string? EastAsiaMajorFont, string? EastAsiaMinorFont, string? LatinMajorFont, string? LatinMinorFont);
    /// <summary>
    /// A font name together with the font slot it belongs to.
    /// </summary>
    private record TextFont (FontType FontType, string FontName);
    /// <summary>
    /// Text formatting resolved for a paragraph. A new instance starts with no
    /// fonts, size 11, not bold and color "000000".
    /// </summary>
    private class TextProps
    {
        /// <summary>Fonts that apply to the text; empty until fonts are resolved.</summary>
        public List<TextFont> Fonts { get; set; } = new();

        /// <summary>Font size as a whole number.</summary>
        public int FontSize { get; set; } = 11;

        /// <summary>Whether the text is bold.</summary>
        public bool Bold { get; set; }

        /// <summary>Text color string; starts as "000000".</summary>
        public string Color { get; set; } = "000000";
    }
...
    /// <summary>
    /// Opens the uploaded file read-only, resolves the theme fonts once, and logs
    /// the text and formatting of each paragraph found directly under the body.
    /// </summary>
    /// <param name="file">Uploaded word processing file.</param>
    public void Read(IFormFile file)
    {
        // NOTE(review): the stream returned by file.OpenReadStream() is not held
        // in a using of its own here - confirm whether it needs explicit disposal.
        using WordprocessingDocument wordDoc = WordprocessingDocument.Open(file.OpenReadStream(), false);

        Body? body = wordDoc.MainDocumentPart?.Document?.Body;
...
        // The theme fonts are the same for every paragraph, so read them once up front.
        ThemeFont themeFont = GetThemeFont(wordDoc.MainDocumentPart);
        foreach (OpenXmlElement elm in body.Elements())
        {
            if (elm is Table table)
            {
...
            }
            else if (elm is Paragraph paragraph)
            {
                // Get full text from paragraph.InnerText
                logger.Info($"Paragraph Text: {paragraph.InnerText}");
                PrintFontInfoFromParagraph(wordDoc.MainDocumentPart, paragraph, themeFont);
            }
        }
    }    
    /// <summary>
    /// Collects the major/minor East Asian and Latin typeface names declared by
    /// the theme of the document.
    /// </summary>
    /// <param name="mainPart">Main document part whose theme part is read; may be null.</param>
    /// <returns>
    /// The four typeface names. Every member is null when the main part, its theme
    /// part, the theme elements or the font scheme is missing.
    /// </returns>
    private ThemeFont GetThemeFont(MainDocumentPart? mainPart)
    {
        // Any missing link leaves the scheme null, which in turn makes all four
        // typefaces below null.
        var fontScheme = mainPart?.ThemePart is { } themePart
            ? themePart.Theme.ThemeElements?.FontScheme
            : null;
        var major = fontScheme?.MajorFont;
        var minor = fontScheme?.MinorFont;

        return new ThemeFont(
            EastAsiaMajorFont: major?.EastAsianFont?.Typeface,
            EastAsiaMinorFont: minor?.EastAsianFont?.Typeface,
            LatinMajorFont: major?.LatinFont?.Typeface,
            LatinMinorFont: minor?.LatinFont?.Typeface);
    }
    /// <summary>
    /// Logs the effective fonts, color, bold state and size of every run in the
    /// paragraph. Values set directly on a run win; anything the run leaves unset
    /// falls back to the properties resolved from the paragraph style.
    /// </summary>
    /// <param name="mainPart">Main document part used to look up styles.</param>
    /// <param name="paragraph">Paragraph whose runs are inspected.</param>
    /// <param name="themeFont">Theme fonts used when a style names no concrete font.</param>
    private void PrintFontInfoFromParagraph(MainDocumentPart? mainPart, Paragraph paragraph, ThemeFont themeFont)
    {
        TextProps? props = GetTextProps(mainPart, paragraph, themeFont);

        // One paragraph is separated as multiple Run elements by styles and font types
        foreach (Run run in paragraph.Elements<Run>())
        {
            logger.Info($"Run Text: {run.InnerText}");
            RunProperties? runProperties = run.RunProperties;
            if (runProperties == null)
            {
                // No direct formatting on this run; everything logged below
                // comes from the paragraph-level properties.
                logger.Info("runProperties was null");
            }

            List<TextFont> fonts = GetFonts(runProperties?.RunFonts);
            if (fonts.Count <= 0 && props?.Fonts != null)
            {
                fonts = props.Fonts;
            }
            foreach (TextFont f in fonts)
            {
                logger.Info($"Font Name: {f.FontName} Type: {f.FontType}");
            }

            string? color = runProperties?.Color?.Val?.Value ?? props?.Color;
            if (color != null)
            {
                logger.Info($"Color: {color}");
            }

            // A bold element without an explicit value means "on", so a missing
            // Val is reported as true instead of an empty string.
            bool? bold = runProperties?.Bold is { } boldElement
                ? (boldElement.Val?.Value ?? true)
                : props?.Bold;
            if (bold != null)
            {
                logger.Info($"Bold: {bold}");
            }

            if (int.TryParse(runProperties?.FontSize?.Val?.Value, out int halfPoints))
            {
                // FontSize.Val represents half-points. Divide as a double so
                // that odd values (e.g. 21 -> 10.5) keep their half point.
                logger.Info($"FontSize: {halfPoints / 2.0}");
            }
            else if (props != null)
            {
                logger.Info($"FontSize: {props.FontSize}");
            }
            logger.Info("------------");
        }
    }
    /// <summary>
    /// Resolves the text properties that the paragraph gets from its style.
    /// </summary>
    /// <remarks>
    /// When the paragraph has no style, the theme-based defaults are returned.
    /// When the style itself resolves at least one font, its own properties are
    /// returned. Otherwise the fonts come from the nearest base style that has
    /// run properties (or from the defaults), and any color, bold or size that
    /// the style sets itself is kept on top of that.
    /// </remarks>
    /// <param name="mainPart">Main document part used to look up styles.</param>
    /// <param name="paragraph">Paragraph whose style is resolved.</param>
    /// <param name="themeFont">Theme fonts used when no concrete font is named.</param>
    /// <returns>The resolved properties.</returns>
    private TextProps? GetTextProps(MainDocumentPart? mainPart, Paragraph paragraph, ThemeFont themeFont)
    {
        string? styleId = paragraph.ParagraphProperties?.ParagraphStyleId?.Val?.Value;
        Style? style = GetStyleById(mainPart, styleId);
        if (style == null)
        {
            return GenerateDefaultProps(themeFont);
        }

        StyleRunProperties? ownRunProperties = style.StyleRunProperties;
        TextProps? own = GetTextPropsFromRunProperties(ownRunProperties, themeFont);
        if (own != null && own.Fonts != null && own.Fonts.Count > 0)
        {
            return own;
        }

        // The style names no font of its own: take the base style (or defaults) as the starting point.
        TextProps? resolved = null;
        StyleRunProperties? inheritedRunProperties = GetInheritedRunProperties(style, mainPart);
        if (inheritedRunProperties != null)
        {
            logger.Info("Inherited from Base Style:");
            resolved = GetTextPropsFromRunProperties(inheritedRunProperties, themeFont);
        }
        resolved ??= GenerateDefaultProps(themeFont);

        // Keep whatever the style sets explicitly; only values it leaves unset
        // should come from the base style or the defaults.
        if (ownRunProperties != null && own != null)
        {
            if (ownRunProperties.Color?.Val != null)
            {
                resolved.Color = own.Color;
            }
            if (ownRunProperties.Bold != null)
            {
                resolved.Bold = own.Bold;
            }
            if (int.TryParse(ownRunProperties.FontSize?.Val?.Value, out _))
            {
                resolved.FontSize = own.FontSize;
            }
        }
        return resolved;
    }
    /// <summary>
    /// Walks the "based on" chain of a style and returns the run properties of
    /// the nearest base style that defines any.
    /// </summary>
    /// <param name="style">Style whose base styles are searched.</param>
    /// <param name="mainPart">Main document part that holds the style definitions.</param>
    /// <returns>
    /// The first run properties found on a base style, or null when the chain
    /// ends, refers to a style that does not exist, or loops back on itself.
    /// </returns>
    private static StyleRunProperties? GetInheritedRunProperties(Style style, MainDocumentPart? mainPart)
    {
        Styles? styles = mainPart?.StyleDefinitionsPart?.Styles;
        if (styles == null)
        {
            return null;
        }

        // The document is uploaded content, so a circular "based on" chain must
        // not be allowed to loop forever; remember every style id already seen.
        HashSet<string> visited = new();
        if (style.StyleId?.Value is string startId)
        {
            visited.Add(startId);
        }

        Style current = style;
        while (current.BasedOn?.Val?.Value is string baseStyleId)
        {
            if (!visited.Add(baseStyleId))
            {
                return null;
            }
            Style? baseStyle = styles.Elements<Style>()
                .FirstOrDefault(s => s.StyleId?.Value == baseStyleId);
            if (baseStyle == null)
            {
                return null;
            }
            if (baseStyle.StyleRunProperties != null)
            {
                return baseStyle.StyleRunProperties;
            }
            current = baseStyle;
        }
        return null;
    }
    /// <summary>
    /// Lists the font names set on the Ascii, HighAnsi and EastAsia attributes of
    /// a run's fonts, in that order.
    /// </summary>
    /// <param name="runFonts">Fonts element of a run; may be null.</param>
    /// <returns>One entry per attribute that has a non-empty name; empty when there is none.</returns>
    private static List<TextFont> GetFonts(RunFonts? runFonts)
    {
        (FontType Type, string? Name)[] candidates =
        [
            (FontType.Ascii, runFonts?.Ascii?.Value),
            (FontType.HighAnsi, runFonts?.HighAnsi?.Value),
            (FontType.EastAsia, runFonts?.EastAsia?.Value),
        ];

        List<TextFont> found = [];
        foreach (var (type, name) in candidates)
        {
            if (!string.IsNullOrEmpty(name))
            {
                found.Add(new TextFont(type, name));
            }
        }
        return found;
    }
    /// <summary>
    /// Builds the fallback properties used when no style information can be
    /// obtained: the initial TextProps values plus the minor theme fonts.
    /// </summary>
    /// <param name="themeFont">Theme fonts; only the minor Latin and East Asian names are used.</param>
    /// <returns>A new TextProps whose fonts are the non-empty minor theme fonts.</returns>
    private static TextProps GenerateDefaultProps(ThemeFont themeFont)
    {
        TextProps defaults = new();
        if (!string.IsNullOrEmpty(themeFont.LatinMinorFont))
        {
            defaults.Fonts.Add(new TextFont(FontType.Latin, themeFont.LatinMinorFont));
        }
        if (!string.IsNullOrEmpty(themeFont.EastAsiaMinorFont))
        {
            defaults.Fonts.Add(new TextFont(FontType.EastAsia, themeFont.EastAsiaMinorFont));
        }
        return defaults;
    }
    /// <summary>
    /// Looks up a style definition by its style id.
    /// </summary>
    /// <param name="mainPart">Main document part that holds the style definitions; may be null.</param>
    /// <param name="styleId">Id of the style to find; may be null or empty.</param>
    /// <returns>
    /// The first style with the given id, or null when the id is null or empty,
    /// the style definitions are missing, or no style matches.
    /// </returns>
    private static Style? GetStyleById(MainDocumentPart? mainPart, string? styleId)
    {
        return string.IsNullOrEmpty(styleId)
            ? null
            : mainPart?.StyleDefinitionsPart?.Styles?.Elements<Style>()
                .FirstOrDefault(s => s.StyleId == styleId);
    }
    /// <summary>
    /// Converts the run properties of a style into TextProps.
    /// </summary>
    /// <param name="runProperties">Run properties of a style; may be null.</param>
    /// <param name="themeFont">Theme fonts used when the run fonts name no concrete font.</param>
    /// <returns>
    /// Null when <paramref name="runProperties"/> is null. Otherwise a TextProps
    /// where every value the run properties do not set keeps its initial value.
    /// </returns>
    private TextProps? GetTextPropsFromRunProperties(StyleRunProperties? runProperties, ThemeFont themeFont)
    {
        if (runProperties == null)
        {
            return null;
        }
        TextProps result = new();
        RunFonts? runFonts = runProperties.RunFonts;
        if (runFonts != null)
        {
            // Prefer fonts that are named explicitly; fall back to the theme fonts otherwise.
            result.Fonts = GetTextFonts(runFonts);
            if (result.Fonts.Count <= 0)
            {
                result.Fonts = GetTextFonts(themeFont, runFonts);
            }
        }
        if (runProperties.Color?.Val?.Value is string color)
        {
            result.Color = color;
        }
        if (runProperties.Bold is { } bold)
        {
            // A bold element without an explicit value means "on", but an
            // explicit false value must switch bold off rather than on.
            result.Bold = bold.Val?.Value ?? true;
        }
        // FontSize.Val represents half-points. TextProps.FontSize is an int, so
        // an odd half-point value loses its .5 here.
        if (int.TryParse(runProperties.FontSize?.Val?.Value, out int halfPoints))
        {
            result.FontSize = halfPoints / 2;
        }
        return result;
    }
    /// <summary>
    /// Lists the font names that the run fonts name explicitly through the
    /// Ascii, HighAnsi and EastAsia attributes, in that order.
    /// </summary>
    /// <param name="runFonts">Fonts element to read.</param>
    /// <returns>One entry per attribute that holds a value; empty when there is none.</returns>
    private static List<TextFont> GetTextFonts(RunFonts runFonts)
    {
        List<TextFont> named = [];
        Collect(FontType.Ascii, runFonts.Ascii);
        Collect(FontType.HighAnsi, runFonts.HighAnsi);
        Collect(FontType.EastAsia, runFonts.EastAsia);
        return named;

        // Adds the attribute to the list when it actually holds a value.
        void Collect(FontType type, StringValue? attribute)
        {
            if (attribute is { Value: string name, HasValue: true })
            {
                named.Add(new TextFont(type, name));
            }
        }
    }
    /// <summary>
    /// Picks the Latin and East Asian font names from the theme fonts.
    /// </summary>
    /// <remarks>
    /// The major theme fonts are used when the East Asian theme attribute of the
    /// run fonts is the major East Asian value; in every other case the minor
    /// theme fonts are used. Only that one attribute is consulted.
    /// </remarks>
    /// <param name="themeFont">Typeface names read from the theme.</param>
    /// <param name="runFonts">Fonts element whose East Asian theme attribute selects major or minor.</param>
    /// <returns>The non-empty Latin and East Asian names of the selected group, Latin first.</returns>
    private static List<TextFont> GetTextFonts(ThemeFont themeFont, RunFonts runFonts)
    {
        // ThemeFont is divided into MajorFont and MinorFont.
        bool useMajor = runFonts.EastAsiaTheme?.Value == ThemeFontValues.MajorEastAsia;
        string? latin = useMajor ? themeFont.LatinMajorFont : themeFont.LatinMinorFont;
        string? eastAsia = useMajor ? themeFont.EastAsiaMajorFont : themeFont.EastAsiaMinorFont;

        List<TextFont> selected = [];
        if (!string.IsNullOrEmpty(latin))
        {
            selected.Add(new TextFont(FontType.Latin, latin));
        }
        if (!string.IsNullOrEmpty(eastAsia))
        {
            selected.Add(new TextFont(FontType.EastAsia, eastAsia));
        }
        return selected;
    }
}
Enter fullscreen mode Exit fullscreen mode

Result

Found a Paragraph with text: This is みだし1
Paragraph Text: This is みだし1
Run Text: This is みだし1
Font Name: 游ゴシック Light Type: Latin
Color: 000000
Bold: False
FontSize: 16
------------
Found a Paragraph with text: あいう
Paragraph Text: あいう
Run Text: あいう
Font Name: 游明朝 Type: Latin
Color: 000000
Bold: False
FontSize: 11
------------
Found a Paragraph with text: 見出し2
Paragraph Text: 見出し2
Run Text: 見出し2
Font Name: 游ゴシック Light Type: Latin
Color: 000000
Bold: False
FontSize: 14
------------
Found a Paragraph with text: えおか
Paragraph Text: えおか
Run Text: えおか
Font Name: 游明朝 Type: Latin
Color: 000000
Bold: False
FontSize: 11
------------
Found a Paragraph with text: きくけ
Paragraph Text: きくけ
Run Text: きくけ
Font Name: 游明朝 Type: Latin
Color: 000000
Bold: False
FontSize: 11
------------
Found a Paragraph with text: こさし
Paragraph Text: こさし
Run Text: こさし
Font Name: 游明朝 Type: Latin
Color: 000000
Bold: False
FontSize: 11
------------
...
Enter fullscreen mode Exit fullscreen mode

Top comments (0)