DEV Community

Masui Masanori
Masui Masanori

Posted on

1

[ASP.NET Core] Try reading a word processing file by OpenXML 2

Intro

This time, I will try getting text styles and fonts.

Getting specified styles and fonts

First, I will try getting the text styles and fonts that I have specified myself.

Image description

DocFileReader.cs

using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;

namespace OfficeFileAccessor.OfficeFiles.Readers;

public class DocFileReader : IOfficeFileReader
{
    private readonly NLog.Logger logger;
    /// <summary>
    /// Creates the reader and stores the logger returned by
    /// <c>NLog.LogManager.GetCurrentClassLogger()</c>.
    /// </summary>
    public DocFileReader() => logger = NLog.LogManager.GetCurrentClassLogger();
    /// <summary>
    /// Opens the uploaded file as a word processing document and logs the text and the
    /// run-level font information of every non-empty paragraph directly under the body.
    /// </summary>
    /// <param name="file">Uploaded form file whose content stream is opened as the document.</param>
    public void Read(IFormFile file)
    {
        // The second argument (false) asks for a non-editable document.
        // NOTE(review): the stream returned by OpenReadStream() is not disposed explicitly here;
        // confirm whether disposing the document also releases it.
        using WordprocessingDocument wordDoc = WordprocessingDocument.Open(file.OpenReadStream(), false);
        Body? body = wordDoc.MainDocumentPart?.Document?.Body;
        if (body == null)
        {
            // No main part, document or body: nothing can be read, so only warn and stop.
            logger.Warn("Failed reading the document");
            return;
        }
        // Only the direct children of the body are visited.
        foreach (OpenXmlElement elm in body.Elements())
        {
            if (elm is Table table)
            {
                // Table handling is elided in this listing.
...
            }
            else if (elm is Paragraph paragraph)
            {
                // Skip paragraphs that contain only whitespace.
                if (elm.InnerText.Trim().Length <= 0)
                {
                    continue;
                }
                // Get full text from paragraph.InnerText
                logger.Info($"Paragraph Text: {paragraph.InnerText}");
                PrintFontInfoFromParagraph(wordDoc.MainDocumentPart, paragraph);
            }
        }
    }
    /// <summary>
    /// Logs the text of every direct <see cref="Run"/> child of the paragraph together with the
    /// font, color, bold flag and font size that are set directly on that run.
    /// </summary>
    /// <param name="mainPart">Main document part; only forwarded to <c>GetFontName</c>.</param>
    /// <param name="paragraph">Paragraph whose runs are inspected.</param>
    private void PrintFontInfoFromParagraph(MainDocumentPart? mainPart, Paragraph paragraph)
    {
        // One paragraph is separated as multiple Run elements by styles and fonts
        foreach (Run run in paragraph.Elements<Run>())
        {
            logger.Info($"Run Text: {run.InnerText}");
            // Get text style and font from RunProperties.
            RunProperties? runProperties = run.RunProperties;
            if (runProperties != null)
            {
                logger.Info("RunProperties found:");
                var fonts = runProperties.RunFonts;
                if (fonts != null)
                {
                    logger.Info($"Font Name: {GetFontName(fonts, mainPart)}");
                }
                if (runProperties.Color != null)
                {
                    logger.Info($"Color: {runProperties.Color.Val}");
                }
                if (runProperties.Bold != null)
                {
                    // A bold element without a val attribute (<w:b/>) means "on",
                    // so a missing Val is reported as true instead of an empty string.
                    bool isBold = runProperties.Bold.Val?.Value ?? true;
                    logger.Info($"Bold: {isBold}");
                }
                if (runProperties.FontSize == null)
                {
                    logger.Info("FontSize was null");
                }
                else if (int.TryParse(runProperties.FontSize.Val, out int halfPoints))
                {
                    // runProperties.FontSize.Val represents half-points.
                    // Divide as floating point so that odd values (e.g. 21 -> 10.5pt) are not truncated.
                    logger.Info($"FontSize: {halfPoints / 2.0}");
                }
            }
            logger.Info("------------");
        }
    }
    /// <summary>
    /// Picks the first font attribute that is present on the run fonts, checked in the order
    /// Ascii, HighAnsi, EastAsia, ComplexScript.
    /// </summary>
    /// <param name="runFonts">Run fonts element to inspect; may be null.</param>
    /// <param name="mainPart">Not used by this method.</param>
    /// <returns>The font name, or "No font set" when no name is available.</returns>
    private string GetFontName(RunFonts? runFonts, MainDocumentPart? mainPart)
    {
        StringValue? firstPresent = runFonts?.Ascii
            ?? runFonts?.HighAnsi
            ?? runFonts?.EastAsia
            ?? runFonts?.ComplexScript;
        string? fontName = firstPresent;
        return string.IsNullOrEmpty(fontName) ? "No font set" : fontName;
    }
}
Enter fullscreen mode Exit fullscreen mode

Result

Paragraph Text: カスタムfontに設定した場合
Run Text: カス
RunProperties found:
Font Name: No font set
FontSize was null
------------
Run Text: タムfo
RunProperties found:
Font Name: Noto Sans JP Black
FontSize was null
------------
Run Text: ntに
RunProperties found:
Font Name: No font set
FontSize was null
------------
Run Text: 設定
RunProperties found:
Font Name: No font set
Bold:
FontSize was null
------------
Run Text: した
RunProperties found:
Font Name: Meiryo UI
FontSize: 16
------------
Run Text: 場合
RunProperties found:
Font Name: No font set
Color: 60CAF3
FontSize: 22
------------
Enter fullscreen mode Exit fullscreen mode

Getting default styles and fonts

Unless I change the font, color, size, etc. myself, the above code can't get that information.

Image description

So I have to get them from the base style or the theme fonts.
I can get the information for styles such as "見出し(Headline)".
However, some styles such as "標準(Normal)" have no style information, so I use default values when I can't get a "ParagraphStyleId" from the paragraph.

DocFileReader.cs

...
public class DocFileReader : IOfficeFileReader
{
    private readonly NLog.Logger logger;
    /// <summary>
    /// Identifies where a <see cref="TextFont"/> entry came from.
    /// </summary>
    private enum FontType
    {
        /// <summary>Taken from the Ascii attribute of the run fonts.</summary>
        Ascii,

        /// <summary>Taken from the HighAnsi attribute of the run fonts.</summary>
        HighAnsi,

        /// <summary>Taken from the EastAsia attribute of the run fonts, or from a theme East Asian font.</summary>
        EastAsia,

        /// <summary>Taken from a theme Latin font.</summary>
        Latin,
    }
    /// <summary>
    /// Major / minor selector. It is not referenced by the code visible in this listing;
    /// presumably it mirrors the theme's major and minor font schemes (confirm against the full file).
    /// </summary>
    private enum FontPriority
    {
        Major,
        Minor,
    }
    /// <summary>
    /// Typeface names read from the theme's font scheme: East Asian and Latin fonts of the
    /// major and the minor scheme. A member is null when the theme does not provide it.
    /// </summary>
    private record ThemeFont(string? EastAsiaMajorFont, string? EastAsiaMinorFont, string? LatinMajorFont, string? LatinMinorFont);
    /// <summary>
    /// A single font name together with the slot (<see cref="FontType"/>) it was taken from.
    /// </summary>
    private record TextFont (FontType FontType, string FontName);
    /// <summary>
    /// Text properties resolved for a paragraph; the initial values act as defaults
    /// when a style does not set the corresponding property.
    /// </summary>
    private class TextProps
    {
        /// <summary>Fonts that apply to the text; empty until fonts are resolved.</summary>
        public List<TextFont> Fonts { get; set; } = new();

        /// <summary>Font size in whole points (default 11).</summary>
        public int FontSize { get; set; } = 11;

        /// <summary>Whether the text is bold (default false).</summary>
        public bool Bold { get; set; }

        /// <summary>Color value as written in the document (default "000000").</summary>
        public string Color { get; set; } = "000000";
    }
...
    /// <summary>
    /// Opens the uploaded file as a word processing document, reads the theme fonts once and
    /// logs the text and resolved font information of every paragraph directly under the body.
    /// </summary>
    /// <param name="file">Uploaded form file whose content stream is opened as the document.</param>
    public void Read(IFormFile file)
    {
        // The second argument (false) asks for a non-editable document.
        // NOTE(review): the stream returned by OpenReadStream() is not disposed explicitly here;
        // confirm whether disposing the document also releases it.
        using WordprocessingDocument wordDoc = WordprocessingDocument.Open(file.OpenReadStream(), false);

        Body? body = wordDoc.MainDocumentPart?.Document?.Body;
        // The handling of a missing body is elided in this listing.
...
        // The theme fonts are the same for every paragraph, so they are read once up front.
        ThemeFont themeFont = GetThemeFont(wordDoc.MainDocumentPart);
        foreach (OpenXmlElement elm in body.Elements())
        {
            if (elm is Table table)
            {
                // Table handling is elided in this listing.
...
            }
            else if (elm is Paragraph paragraph)
            {
                // Get full text from paragraph.InnerText
                logger.Info($"Paragraph Text: {paragraph.InnerText}");
                PrintFontInfoFromParagraph(wordDoc.MainDocumentPart, paragraph, themeFont);
            }
        }
    }    
    /// <summary>
    /// Reads the East Asian and Latin typefaces of the major and minor font schemes from the theme.
    /// </summary>
    /// <param name="mainPart">Main document part that owns the theme part; may be null.</param>
    /// <returns>
    /// The theme typefaces. Every member is null when the theme part, its theme root element,
    /// the theme elements or the font scheme is missing.
    /// </returns>
    private ThemeFont GetThemeFont(MainDocumentPart? mainPart)
    {
        // The chain is null-safe at every step, including a theme part without a Theme root element.
        var fontScheme = mainPart?.ThemePart?.Theme?.ThemeElements?.FontScheme;
        var majorFontScheme = fontScheme?.MajorFont;
        var minorFontScheme = fontScheme?.MinorFont;

        // A missing scheme simply yields null members, which callers already treat as "no theme font".
        return new ThemeFont(EastAsiaMajorFont: majorFontScheme?.EastAsianFont?.Typeface,
            EastAsiaMinorFont: minorFontScheme?.EastAsianFont?.Typeface,
            LatinMajorFont: majorFontScheme?.LatinFont?.Typeface,
            LatinMinorFont: minorFontScheme?.LatinFont?.Typeface);
    }
    /// <summary>
    /// Logs every direct run of the paragraph with its font, color, bold flag and font size.
    /// Values set on the run itself win; anything the run does not set falls back to the
    /// paragraph-level properties resolved by <c>GetTextProps</c>.
    /// </summary>
    /// <param name="mainPart">Main document part used to look up the paragraph style.</param>
    /// <param name="paragraph">Paragraph whose runs are inspected.</param>
    /// <param name="themeFont">Theme fonts used when a style does not name fonts explicitly.</param>
    private void PrintFontInfoFromParagraph(MainDocumentPart? mainPart, Paragraph paragraph, ThemeFont themeFont)
    {
        TextProps? props = GetTextProps(mainPart, paragraph, themeFont);

        // One paragraph is separated as multiple Run elements by styles and font types
        foreach (Run run in paragraph.Elements<Run>())
        {
            logger.Info($"Run Text: {run.InnerText}");
            RunProperties? runProperties = run.RunProperties;
            if (runProperties == null)
            {
                // A run without its own properties still takes its values from the paragraph style,
                // so keep going and report the fallback values below.
                logger.Info("runProperties was null");
            }

            // Fonts: explicit run fonts first, otherwise the paragraph-level fonts.
            List<TextFont> fonts = GetFonts(runProperties?.RunFonts);
            if (fonts.Count == 0 && props?.Fonts != null)
            {
                fonts = props.Fonts;
            }
            foreach (TextFont f in fonts)
            {
                logger.Info($"Font Name: {f.FontName} Type: {f.FontType}");
            }

            string? color = runProperties?.Color?.Val?.Value ?? props?.Color;
            if (color != null)
            {
                logger.Info($"Color: {color}");
            }

            // A bold element without a val attribute (<w:b/>) means "on", so a missing Val is true.
            Bold? boldElement = runProperties?.Bold;
            bool? bold = boldElement != null ? (boldElement.Val?.Value ?? true) : props?.Bold;
            if (bold != null)
            {
                logger.Info($"Bold: {bold.Value}");
            }

            string? rawSize = runProperties?.FontSize?.Val?.Value;
            if (rawSize == null)
            {
                if (props != null)
                {
                    logger.Info($"FontSize: {props.FontSize}");
                }
            }
            else if (int.TryParse(rawSize, out int halfPoints))
            {
                // The run's font size value represents half-points.
                // Divide as floating point so that odd values (e.g. 21 -> 10.5pt) are not truncated.
                logger.Info($"FontSize: {halfPoints / 2.0}");
            }
            logger.Info("------------");
        }
    }
    /// <summary>
    /// Resolves the paragraph-level text properties from the paragraph's style.
    /// </summary>
    /// <param name="mainPart">Main document part used to look up style definitions.</param>
    /// <param name="paragraph">Paragraph whose style id is read.</param>
    /// <param name="themeFont">Theme fonts used when no explicit font name is available.</param>
    /// <returns>
    /// The style's own properties when they include fonts; otherwise the properties of the nearest
    /// base style that has run properties; otherwise defaults built from the theme's minor fonts.
    /// </returns>
    private TextProps? GetTextProps(MainDocumentPart? mainPart, Paragraph paragraph, ThemeFont themeFont)
    {
        string? styleId = paragraph.ParagraphProperties?.ParagraphStyleId?.Val?.Value;
        Style? style = GetStyleById(mainPart, styleId);
        if (style == null)
        {
            // No style id on the paragraph, or no matching style definition.
            return GenerateDefaultProps(themeFont);
        }

        TextProps? own = GetTextPropsFromRunProperties(style.StyleRunProperties, themeFont);
        if (own is { Fonts.Count: > 0 })
        {
            return own;
        }

        // The style itself names no fonts: look at the styles it is based on.
        StyleRunProperties? inheritedRunProperties = GetInheritedRunProperties(style, mainPart);
        if (inheritedRunProperties == null)
        {
            return GenerateDefaultProps(themeFont);
        }

        logger.Info("Inherited from Base Style:");
        TextProps? inherited = GetTextPropsFromRunProperties(inheritedRunProperties, themeFont);
        if (inherited != null && (inherited.Fonts == null || inherited.Fonts.Count == 0))
        {
            // The base style has run properties but no fonts either; use the same theme defaults
            // as every other "no font found" path so callers never get an empty font list here.
            inherited.Fonts = GenerateDefaultProps(themeFont).Fonts;
        }
        return inherited;
    }
    /// <summary>
    /// Walks the "based on" chain of a style and returns the run properties of the nearest
    /// base style that defines any.
    /// </summary>
    /// <param name="style">Style whose base styles are searched; its own run properties are not considered.</param>
    /// <param name="mainPart">Main document part that holds the style definitions; may be null.</param>
    /// <returns>
    /// The first run properties found along the chain, or null when the chain ends, a base style
    /// cannot be found, or the chain loops back onto a style that was already visited.
    /// </returns>
    private static StyleRunProperties? GetInheritedRunProperties(Style style, MainDocumentPart? mainPart)
    {
        Styles? styles = mainPart?.StyleDefinitionsPart?.Styles;
        if (styles == null)
        {
            return null;
        }

        // The document comes from an upload, so a circular "based on" chain must not be able to
        // recurse or loop forever; every visited style id is remembered and never followed twice.
        HashSet<string> visited = new(StringComparer.Ordinal);
        if (style.StyleId?.Value is string ownId)
        {
            visited.Add(ownId);
        }

        Style current = style;
        while (current.BasedOn?.Val?.Value is string baseStyleId && visited.Add(baseStyleId))
        {
            Style? baseStyle = styles.Elements<Style>()
                .FirstOrDefault(s => s.StyleId?.Value == baseStyleId);
            if (baseStyle == null)
            {
                return null;
            }
            if (baseStyle.StyleRunProperties != null)
            {
                return baseStyle.StyleRunProperties;
            }
            current = baseStyle;
        }
        return null;
    }
    /// <summary>
    /// Collects the non-empty Ascii, HighAnsi and EastAsia font names of a run fonts element.
    /// </summary>
    /// <param name="runFonts">Run fonts element to read; may be null.</param>
    /// <returns>One entry per non-empty font name, in the order Ascii, HighAnsi, EastAsia; empty when nothing is set.</returns>
    private static List<TextFont> GetFonts(RunFonts? runFonts)
    {
        List<TextFont> fonts = [];
        if (runFonts?.Ascii?.Value is { Length: > 0 } ascii)
        {
            fonts.Add(new TextFont(FontType.Ascii, ascii));
        }
        if (runFonts?.HighAnsi?.Value is { Length: > 0 } highAnsi)
        {
            fonts.Add(new TextFont(FontType.HighAnsi, highAnsi));
        }
        if (runFonts?.EastAsia?.Value is { Length: > 0 } eastAsia)
        {
            fonts.Add(new TextFont(FontType.EastAsia, eastAsia));
        }
        return fonts;
    }
    /// <summary>
    /// Builds the fallback text properties that are used when no style information can be obtained:
    /// the default size, bold flag and color of <c>TextProps</c> plus the theme's minor fonts.
    /// </summary>
    /// <param name="themeFont">Theme fonts; only the non-empty minor Latin and East Asian names are used.</param>
    /// <returns>A new instance whose fonts are listed in the order Latin, East Asian.</returns>
    private static TextProps GenerateDefaultProps(ThemeFont themeFont)
    {
        TextProps defaults = new();
        if (themeFont.LatinMinorFont is { Length: > 0 } latin)
        {
            defaults.Fonts.Add(new TextFont(FontType.Latin, latin));
        }
        if (themeFont.EastAsiaMinorFont is { Length: > 0 } eastAsia)
        {
            defaults.Fonts.Add(new TextFont(FontType.EastAsia, eastAsia));
        }
        return defaults;
    }    
    /// <summary>
    /// Looks up a style definition by its style id.
    /// </summary>
    /// <param name="mainPart">Main document part that holds the style definitions; may be null.</param>
    /// <param name="styleId">Style id to search for; null or empty yields null.</param>
    /// <returns>The first style whose id equals <paramref name="styleId"/>, or null when there is none.</returns>
    private static Style? GetStyleById(MainDocumentPart? mainPart, string? styleId)
    {
        return string.IsNullOrEmpty(styleId)
            ? null
            : mainPart?.StyleDefinitionsPart?.Styles?.Elements<Style>()
                .FirstOrDefault(s => s.StyleId == styleId);
    }
    /// <summary>
    /// Converts the run properties of a style into <c>TextProps</c>.
    /// Properties the style does not set keep the <c>TextProps</c> defaults.
    /// </summary>
    /// <param name="runProperties">Run properties of a style; null yields null.</param>
    /// <param name="themeFont">Theme fonts used when the run fonts name no explicit font.</param>
    /// <returns>The converted properties, or null when <paramref name="runProperties"/> is null.</returns>
    private TextProps? GetTextPropsFromRunProperties(StyleRunProperties? runProperties, ThemeFont themeFont)
    {
        if (runProperties == null)
        {
            return null;
        }
        TextProps result = new();
        RunFonts? runFonts = runProperties.RunFonts;
        if (runFonts != null)
        {
            // Explicit font names win; otherwise resolve the fonts through the theme.
            result.Fonts = GetTextFonts(runFonts);
            if (result.Fonts.Count <= 0)
            {
                result.Fonts = GetTextFonts(themeFont, runFonts);
            }
        }
        if (runProperties.Color?.Val?.Value is string color)
        {
            result.Color = color;
        }
        if (runProperties.Bold != null)
        {
            // A bold element without a val attribute means "on", but an explicit false/0 value
            // switches bold off, so the attribute has to be honored rather than only the element.
            result.Bold = runProperties.Bold.Val?.Value ?? true;
        }
        // runProperties.FontSize.Val represents half-points.
        // TextProps.FontSize is an int, so odd half-point values are truncated to whole points here.
        if (int.TryParse(runProperties.FontSize?.Val?.Value, out int halfPoints))
        {
            result.FontSize = halfPoints / 2;
        }
        return result;
    }
    /// <summary>
    /// Collects the explicitly named Ascii, HighAnsi and EastAsia fonts of a run fonts element.
    /// </summary>
    /// <param name="runFonts">Run fonts element to read.</param>
    /// <returns>One entry per attribute that has a value, in the order Ascii, HighAnsi, EastAsia.</returns>
    private static List<TextFont> GetTextFonts(RunFonts runFonts)
    {
        List<TextFont> fonts = [];
        if (runFonts.Ascii is { HasValue: true, Value: { } ascii })
        {
            fonts.Add(new TextFont(FontType.Ascii, ascii));
        }
        if (runFonts.HighAnsi is { HasValue: true, Value: { } highAnsi })
        {
            fonts.Add(new TextFont(FontType.HighAnsi, highAnsi));
        }
        if (runFonts.EastAsia is { HasValue: true, Value: { } eastAsia })
        {
            fonts.Add(new TextFont(FontType.EastAsia, eastAsia));
        }
        return fonts;
    }
    /// <summary>
    /// Resolves font names through the theme when the run fonts carry no explicit font name.
    /// </summary>
    /// <param name="themeFont">Typefaces read from the theme's major and minor font schemes.</param>
    /// <param name="runFonts">Run fonts element whose theme attributes select the major or minor scheme.</param>
    /// <returns>
    /// The non-empty Latin and East Asian typefaces of the selected scheme, in that order.
    /// The major scheme is selected when any of the East Asian, Ascii or HighAnsi theme attributes
    /// refers to a major theme font; otherwise the minor scheme is used.
    /// </returns>
    private static List<TextFont> GetTextFonts(ThemeFont themeFont, RunFonts runFonts)
    {
        // ThemeFont is divided into MajorFont and MinorFont.
        // Looking only at the East Asian attribute would miss styles that mark just their
        // Ascii / HighAnsi font as a major theme font, so all three attributes are checked.
        bool useMajor = runFonts.EastAsiaTheme?.Value == ThemeFontValues.MajorEastAsia
            || IsMajorLatin(runFonts.AsciiTheme?.Value)
            || IsMajorLatin(runFonts.HighAnsiTheme?.Value);

        string? latin = useMajor ? themeFont.LatinMajorFont : themeFont.LatinMinorFont;
        string? eastAsia = useMajor ? themeFont.EastAsiaMajorFont : themeFont.EastAsiaMinorFont;

        List<TextFont> results = [];
        if (string.IsNullOrEmpty(latin) == false)
        {
            results.Add(new(FontType.Latin, latin));
        }
        if (string.IsNullOrEmpty(eastAsia) == false)
        {
            results.Add(new(FontType.EastAsia, eastAsia));
        }
        return results;

        // True when the attribute refers to one of the major Latin theme fonts.
        static bool IsMajorLatin(ThemeFontValues? value) =>
            value == ThemeFontValues.MajorAscii || value == ThemeFontValues.MajorHighAnsi;
    }
}
Enter fullscreen mode Exit fullscreen mode

Result

Found a Paragraph with text: This is みだし1
Paragraph Text: This is みだし1
Run Text: This is みだし1
Font Name: 游ゴシック Light Type: Latin
Color: 000000
Bold: False
FontSize: 16
------------
Found a Paragraph with text: あいう
Paragraph Text: あいう
Run Text: あいう
Font Name: 游明朝 Type: Latin
Color: 000000
Bold: False
FontSize: 11
------------
Found a Paragraph with text: 見出し2
Paragraph Text: 見出し2
Run Text: 見出し2
Font Name: 游ゴシック Light Type: Latin
Color: 000000
Bold: False
FontSize: 14
------------
Found a Paragraph with text: えおか
Paragraph Text: えおか
Run Text: えおか
Font Name: 游明朝 Type: Latin
Color: 000000
Bold: False
FontSize: 11
------------
Found a Paragraph with text: きくけ
Paragraph Text: きくけ
Run Text: きくけ
Font Name: 游明朝 Type: Latin
Color: 000000
Bold: False
FontSize: 11
------------
Found a Paragraph with text: こさし
Paragraph Text: こさし
Run Text: こさし
Font Name: 游明朝 Type: Latin
Color: 000000
Bold: False
FontSize: 11
------------
...
Enter fullscreen mode Exit fullscreen mode

Heroku

This site is built on Heroku

Join the ranks of developers at Salesforce, Airbase, DEV, and more who deploy their mission critical applications on Heroku. Sign up today and launch your first app!

Get Started

Top comments (0)

Billboard image

The Next Generation Developer Platform

Coherence is the first Platform-as-a-Service you can control. Unlike "black-box" platforms that are opinionated about the infra you can deploy, Coherence is powered by CNC, the open-source IaC framework, which offers limitless customization.

Learn more

👋 Kindness is contagious

Dive into an ocean of knowledge with this thought-provoking post, revered deeply within the supportive DEV Community. Developers of all levels are welcome to join and enhance our collective intelligence.

Saying a simple "thank you" can brighten someone's day. Share your gratitude in the comments below!

On DEV, sharing ideas eases our path and fortifies our community connections. Found this helpful? Sending a quick thanks to the author can be profoundly valued.

Okay