DEV Community

Masui Masanori
Masui Masanori

Posted on

1

[ASP.NET Core] Try reading a spreadsheet file by OpenXML 1

Intro

This time, I will try reading a spreadsheet file(MS Excel file) by DocumentFormat.OpenXml.
I will use this file to check the result.

Image description

Getting cells

To load a spreadsheet using OpenXML, I will get a WorkbookPart from the spreadsheet file first.
And then I will get a WorksheetPart from it, finally I will be able to get Worksheets from the WorksheetPart.

XlsFileReader.cs

using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Spreadsheet;
using DocumentFormat.OpenXml.Drawing.Spreadsheet;
using Drawing = DocumentFormat.OpenXml.Drawing;
using OfficeFileAccessor.Apps;
using System.Text.RegularExpressions;

namespace OfficeFileAccessor.OfficeFiles.Readers;

public class XlsFileReader(ILogger<XlsFileReader> Logger) : IXlsFileReader
{
...
    public void Read(IFormFile file)
    {
        using SpreadsheetDocument spreadsheet = SpreadsheetDocument.Open(file.OpenReadStream(), false);
        WorkbookPart? bookPart = spreadsheet.WorkbookPart;
        if(bookPart == null) {
            return;
        }
        List<string> sheetNames = GetSheetNameList(bookPart);
        foreach(var name in sheetNames)
        {
            Worksheet? targetSheet = GetWorksheet(bookPart, name);
            if(targetSheet == null) {
                return;
            }
            WorksheetPart? sheetPart = targetSheet.WorksheetPart;
            foreach(Row row in targetSheet.Descendants<Row>())
            {
                foreach(Cell cell in row.Cast<Cell>())
                {
                    // Get cell values
                    Worksheets.Cell? cellValue = GetCellValue(bookPart, cell);                    
                }
            }
            // Load first sheet for testing.
            break;
        }
    }
...
    private static List<string> GetSheetNameList(WorkbookPart bookPart) =>
        [.. bookPart.Workbook.Descendants<Sheet>().Where(s => string.IsNullOrEmpty(s.Name) == false).Select(s => s.Name?.Value ?? "")];
    private static Worksheet? GetWorksheet(WorkbookPart bookPart, string sheetName)
    {
        foreach(Sheet s in bookPart.Workbook.Descendants<Sheet>())
        {
            if(s.Name == sheetName && string.IsNullOrEmpty(s.Id) == false)
            {
                if(bookPart.TryGetPartById(s.Id!, out var part))
                {
                    if (part is WorksheetPart result)
                    {
                        return result.Worksheet;
                    }
                }
            }
        }
        return null;
    }
...
}
Enter fullscreen mode Exit fullscreen mode

Getting cell values

After getting cell instances, I can get values from them.

Text

Cell.cs

namespace OfficeFileAccessor.OfficeFiles.Worksheets;
public class Cell
{
    public required string Address { get; init; }
    public required CellValueType Type { get; init; }
    public required string Value { get; init; }
    public string? Formula { get; init; }
...
}
Enter fullscreen mode Exit fullscreen mode

XlsFileReader.cs

...
    private static Worksheets.Cell GetCellValue(WorkbookPart bookPart, Cell cell)
    {    
        // Get value
        string value = cell.InnerText;
        // if the data type is SharedString, find the value from Shared String Table
        if (cell.DataType != null && cell.DataType.Value == CellValues.SharedString)
        {
            SharedStringTablePart? sharedStringTablePart = bookPart.GetPartsOfType<SharedStringTablePart>()
                ?.FirstOrDefault();
            if (sharedStringTablePart != null)
            {
                OpenXmlElement sharedStringItem = sharedStringTablePart.SharedStringTable
                    .ElementAt(int.Parse(value));
                // Concatenate all text except phonetic reading
                string result = string.Concat(
                    sharedStringItem.Descendants<DocumentFormat.OpenXml.Spreadsheet.Text>()
                                    .Where(t => CheckIsPhonetic(t) == false)
                                    .Select(t => t.Text)
                );
                return new Worksheets.Cell
                {
                    Address = cell.CellReference?.Value ?? "A1",
                    Type = Worksheets.CellValueType.Text,
                    Value = result,
                };
            }
        }
        Worksheets.CellValueType valueType = Worksheets.CellValueType.Text;
        return new Worksheets.Cell
        {
            Address = cell.CellReference?.Value ?? "A1",
            Type = valueType,
            Value = value, 
        };
    }
    /// <summary>
    /// Check if the parent element is "PhoneticRun"
    /// </summary>
    /// <param name="textElement"></param>
    /// <returns></returns>
    private static bool CheckIsPhonetic(DocumentFormat.OpenXml.Spreadsheet.Text textElement)
    {
        return textElement.Ancestors<PhoneticRun>().Any();
    }
...
Enter fullscreen mode Exit fullscreen mode

SharedStringTablePart

When I get the cell value from the SharedStringTablePart, if I get it from InnerText, the phonetic will also be combined.

example
  • Cell value: 建築物に関する事項(集約版)(参考様式)
  • InnerText: 建築物に関する事項(集約版)(参考様式)ケンチクブツシュウヤクバンサンコウヨウシキ

So I should filter the values.

XlsFileReader.cs

...
    private static Worksheets.Cell GetCellValue(WorkbookPart bookPart, Cell cell)
    {
...
                OpenXmlElement sharedStringItem = sharedStringTablePart.SharedStringTable
                    .ElementAt(int.Parse(value));
                // Concatenate all text except phonetic reading
                string result = string.Concat(
                    sharedStringItem.Descendants<DocumentFormat.OpenXml.Spreadsheet.Text>()
                                    .Where(t => CheckIsPhonetic(t) == false)
                                    .Select(t => t.Text)
                );
...
    }
...
    private static bool CheckIsPhonetic(DocumentFormat.OpenXml.Spreadsheet.Text textElement)
    {
        return textElement.Ancestors<PhoneticRun>().Any();
    }
...
Enter fullscreen mode Exit fullscreen mode

Double

I can get numeric values as same as getting texts.
But some values may not be correct if they have decimal points.

XlsFileReader.cs

...
        Worksheets.CellValueType valueType = Worksheets.CellValueType.Text;
        if (string.IsNullOrEmpty(value) == false &&
            double.TryParse(value, out double nv))
        {
            valueType = Worksheets.CellValueType.Double;
            value = nv.ToString("G");
        }
        return new Worksheets.Cell
        {
            Address = cell.CellReference?.Value ?? "A1",
            Type = valueType,
            Value = value, 
        };
...
Enter fullscreen mode Exit fullscreen mode

Formula

XlsFileReader.cs

...
        // Formula
        string? formula = cell.CellFormula?.Text;
        // Get calculation result
        string? calcResult = cell.CellValue?.InnerText;

        if(string.IsNullOrEmpty(formula) == false && string.IsNullOrEmpty(calcResult) == false)
        {
            if (double.TryParse(calcResult, out double n))
            {
                calcResult = n.ToString("G");
            }
            return new Worksheets.Cell
            {
                Address = cell.CellReference?.Value ?? "A1",
                Type = Worksheets.CellValueType.Formula,
                Value = calcResult,
                Formula = formula,
            };
        }
...
Enter fullscreen mode Exit fullscreen mode

Getting cell styles

Border

Get cell borders from WorkbookStylesPart.

Cell.cs

namespace OfficeFileAccessor.OfficeFiles.Worksheets;
public class Cell
{
    public required string Address { get; init; }
    public required CellValueType Type { get; init; }
    public required string Value { get; init; }
    public string? Formula { get; init; }
    public required double Width { get; init; }
    public required double Height { get; init; }
    public string? BackgroundColor { get; init; }
    public required CellBorders Borders { get; init; }
...
}
Enter fullscreen mode Exit fullscreen mode

CellBorders.cs

namespace OfficeFileAccessor.OfficeFiles.Worksheets;

public record CellBorders
{
    public BorderType Left { get; init; }
    public BorderType Top { get; init; }
    public BorderType Right { get; init; }
    public BorderType Bottom { get; init; }
...
    public static CellBorders GetNoBorders()
    {
        return new ()
        {
            Left = BorderType.None,
            Top = BorderType.None,
            Right = BorderType.None,
            Bottom = BorderType.None,
        };
    }
}
Enter fullscreen mode Exit fullscreen mode

BorderType.cs

namespace OfficeFileAccessor.OfficeFiles.Worksheets;
public enum BorderType
{
    None = 0,
    Thin,
}
public static class BorderTypeFactory
{
    public static BorderType Get(string? borderValue)
    {
        if(string.IsNullOrEmpty(borderValue))
        {
            return BorderType.None;
        }
        return borderValue.ToLower() switch
        {
            "thin" => BorderType.Thin,
            _ => BorderType.None,
        };
    }
}
Enter fullscreen mode Exit fullscreen mode

XlsFileReader.cs

...
    private Worksheets.Cell GetCellValue(WorkbookPart bookPart, Cell cell, double width, double height)
    {
        // Borders
        Worksheets.CellBorders borders = GetBorders(bookPart, cell);
...
        return new Worksheets.Cell
        {
            Address = cell.CellReference?.Value ?? "A1",
            Type = valueType,
            Value = value,
            Borders = borders,
        };
    }
...
    private static Worksheets.CellBorders GetBorders(WorkbookPart bookPart, Cell cell)
    {
        if(cell.StyleIndex?.Value == null)
        {
            return Worksheets.CellBorders.GetNoBorders();
        }
        CellFormat? cellFormat = bookPart.WorkbookStylesPart?.Stylesheet?.CellFormats?.ElementAt((int)cell.StyleIndex.Value) as CellFormat;
        if(cellFormat?.BorderId?.Value != null)
        {
            Border? border = bookPart.WorkbookStylesPart?.Stylesheet?.Borders?.ElementAt(
                    (int)cellFormat.BorderId.Value) as Border;
            if(border != null)
            {
                return new ()
                {
                    Left = Worksheets.BorderTypeFactory.Get(border?.LeftBorder?.Style?.InnerText),
                    Top = Worksheets.BorderTypeFactory.Get(border?.TopBorder?.Style?.InnerText),
                    Right = Worksheets.BorderTypeFactory.Get(border?.RightBorder?.Style?.InnerText),
                    Bottom = Worksheets.BorderTypeFactory.Get(border?.BottomBorder?.Style?.InnerText),
                };
            }            
        }
        return Worksheets.CellBorders.GetNoBorders();
    }
...
Enter fullscreen mode Exit fullscreen mode

Background color

XlsFileReader.cs

...
    private static Worksheets.Cell GetCellValue(WorkbookPart bookPart, Cell cell)
    {
        // Cell color
        string? backgroundColor = GetCellColor(cell, bookPart);
...
        return new Worksheets.Cell
        {
            Address = cell.CellReference?.Value ?? "A1",
            Type = valueType,
            Value = value,
            BackgroundColor = backgroundColor,
            Borders = borders,
        };
    }
...
    private static string? GetCellColor(Cell cell, WorkbookPart bookPart)
    {
        uint? styleIndex = cell.StyleIndex?.Value;
        if(styleIndex == null)
        {
            return null;
        }
        CellFormat? cellFormat = bookPart.WorkbookStylesPart?.Stylesheet?.CellFormats?.ElementAt((int)styleIndex) as CellFormat;
        if (cellFormat?.FillId != null)
        {
            Fill? fill = bookPart.WorkbookStylesPart?.Stylesheet?.Fills?.ElementAt((int)cellFormat.FillId.Value) as Fill;
            PatternFill? patternFill = fill?.PatternFill;
            string? rgbColor = GetRgbColor(patternFill?.ForegroundColor?.Rgb);
            if(string.IsNullOrEmpty(rgbColor) == false)
            {
                return rgbColor;
            }
            string? themeColor = GetThemeColor(bookPart, patternFill?.ForegroundColor?.Theme?.Value);
            if(string.IsNullOrEmpty(themeColor) == false)
            {
                return themeColor;
            }
        }    
        return null;
    }
    private static string? GetRgbColor(HexBinaryValue? rgb)
    {
        if(rgb?.InnerText == null)
        {
            return null;
        }
        // Remove alpha value
        return rgb.InnerText[2..];
    }
    private static string? GetThemeColor(WorkbookPart bookPart, uint? themeColorIndex)
    {
        if(themeColorIndex == null || themeColorIndex <= 0)
        {
            return null;
        }
        ThemePart? themePart = bookPart.ThemePart;
        Drawing.Theme? theme = themePart?.Theme;
        if(theme != null)
        {
            Drawing.Color2Type? color2Type = theme.ThemeElements?.ColorScheme?.ElementAt((int)themeColorIndex) as Drawing.Color2Type;
            return color2Type?.RgbColorModelHex?.Val;
        }
        return null;
    }
...
Enter fullscreen mode Exit fullscreen mode

Width, Height

By default, cells don't have own widths and heights.
So I should get them from columns and rows.

PrintArea

To get column widths, I should get PrintArea first to get last column index.

PageArea.cs

namespace OfficeFileAccessor.OfficeFiles.Worksheets;
public record PageArea(CellAddress Start, CellAddress End);
Enter fullscreen mode Exit fullscreen mode

XlsFileReader.cs

...
    private List<Worksheets.PageArea> GetPrintArea(WorkbookPart bookPart, Worksheet worksheet)
    {
        DefinedNames? definedNames = bookPart.Workbook.DefinedNames;
        if(definedNames == null)
        {
            return [];
        }
        List<Worksheets.PageArea> results = [];
        foreach (DefinedName definedName in definedNames.Elements<DefinedName>())
        {
            if(string.IsNullOrEmpty(definedName.Name?.Value))
            {
                continue;
            }
            if (definedName.Name.Value.StartsWith("_xlnm.Print_Area"))
            {
                string sheetName = "default sheet";
                if(definedName.LocalSheetId != null)
                {
                    Sheet? sheet = bookPart.Workbook.Sheets?.Elements<Sheet>()
                        ?.FirstOrDefault(s => s.SheetId?.Value != null && s.SheetId.Value == definedName.LocalSheetId.Value + 1);
                    if(sheet?.Name != null)
                    {
                        sheetName = sheet.Name!;
                    }
                }
                string printAreaValue = definedName.Text;
                // SheetName is like SheetName!$A$1:$Z$20
                string[] ranges = printAreaValue.Split('!');                
                foreach(var r in ranges)
                {
                    string[] addresses = r.Split(":");
                    if(addresses.Length < 2)
                    {
                        continue;
                    }
                    (string columnNameStart, int rowStart) = GetCellAddress(addresses[0]);
                    if(string.IsNullOrEmpty(columnNameStart) || rowStart <= 0)
                    {
                        continue;
                    }
                    Worksheets.CellAddress startAddress = new (columnNameStart, ConvertAlphabetToIndex(columnNameStart),
                        rowStart);
                    (string columnNameEnd, int rowEnd) = GetCellAddress(addresses[1]);
                    if(string.IsNullOrEmpty(columnNameEnd) || rowEnd <= 0)
                    {
                        continue;
                    }
                    Worksheets.CellAddress endAddress = new (columnNameEnd, ConvertAlphabetToIndex(columnNameEnd),
                        rowEnd);

                    results.Add(new (startAddress, endAddress)); 
                }
            }
        }
        return results;
    }
...
Enter fullscreen mode Exit fullscreen mode

Width, Height

XlsFileReader.cs

...
    public class XlsFileReader(ILogger<XlsFileReader> Logger) : IXlsFileReader
    {
        private readonly double DefaultWidth = Numbers.ConvertFromPixelToCentimeter(8.38 * 7.0);
        private readonly double DefaultHeight = Numbers.ConvertFromPointToCentimeter(18.75);
        private static readonly Regex CellAddressRegex = new (@"\$([a-zA-Z]+)\$([0-9]+)");
        private static readonly Regex ColumnNameRegex = new ("([a-zA-Z]+)");
        public void Read(IFormFile file)
        {
            using SpreadsheetDocument spreadsheet = SpreadsheetDocument.Open(file.OpenReadStream(), false);
            WorkbookPart? bookPart = spreadsheet.WorkbookPart;
            List<string> sheetNames = GetSheetNameList(bookPart);
            foreach(var name in sheetNames)
            {
                Worksheet? targetSheet = GetWorksheet(bookPart, name);
                if(targetSheet == null)
                {
                    return;
                }
                List<Worksheets.PageArea> printAreas = GetPrintArea(bookPart, targetSheet);
                Worksheets.PageArea? target = printAreas.FirstOrDefault();
                List<Worksheets.ColumnWidth> widths = [];
                if(target != null)
                {
                    widths = GetColumnWidths(targetSheet, target.Start.Column, target.End.Column);
                }
                foreach(Row row in targetSheet.Descendants<Row>())
                {
                    double height = DefaultHeight;
                    if(row.Height?.Value != null)
                    {
                        // The row.Height value is expressed in points
                        // convert it to centimeters
                        height = Numbers.ConvertFromPointToCentimeter(row.Height.Value);
                    }
                    foreach(Cell cell in row.Cast<Cell>())
                    {
                        string columnName = GetColumnNameFromAddress(cell.CellReference);
                        double? width = widths.FirstOrDefault(w => w.ColumnName == columnName)?.Width;
                        width ??= DefaultWidth;
                        Worksheets.Cell? cellValue = GetCellValue(bookPart, cell, (double)width, height);
                    }
                }
                // Load first sheet for testing.
                break;
            }
        }
        private static Worksheets.Cell GetCellValue(WorkbookPart bookPart, Cell cell, double width, double height)
        {
...
            return new Worksheets.Cell
            {
                Address = cell.CellReference?.Value ?? "A1",
                Type = valueType,
                Value = value,            
                Width = width,
                Height = height,
                BackgroundColor = backgroundColor,
                Borders = borders,
            };
        }
...
        private List<Worksheets.ColumnWidth> GetColumnWidths(Worksheet sheet, int startColumn, int lastColumn)
        {
            Columns? columns = sheet.Descendants<Columns>().FirstOrDefault();
            if (columns == null)
            {
                return [];
            }
            List<Worksheets.ColumnWidth> results = [];
            for (int i = startColumn; i <= lastColumn; i++)
            {
                double columnWidth = DefaultWidth;
                if (columns != null)
                {
                    uint idx = (uint)i;
                    Column? column = columns.Elements<Column>().FirstOrDefault(c => 
                        c?.Min != null && c.Max != null && c.Min <= idx && c.Max >= idx);
                    if (column?.Width != null)
                    {
                        // The column.width value represents the number of characters (7 pixels by default)
                        // convert it to centimeters.
                        columnWidth = Numbers.ConvertFromPixelToCentimeter(column.Width * 7.0);
                    }
                }
                results.Add(new Worksheets.ColumnWidth(i, ConvertIndexToAlphabet(i), columnWidth));
            }
            return results;
        }
...
Enter fullscreen mode Exit fullscreen mode

Numbers.cs

namespace OfficeFileAccessor.Apps;
public static class Numbers
{
    private const double EmuToCm = 1.0 / 360000.0;
    private const double PointToCm = 0.0352778;
    private const double PixelToCm = 0.0264583;

    public static int ParseInt(string? value, int defaultValue)
    {
        if(string.IsNullOrEmpty(value))
        {
            return defaultValue;
        }
        if(int.TryParse(value, out var result))
        {
            return result;
        }
        return defaultValue;
    }
    public static double ConvertFromEMUToCentimeter(int emuValue)
    {
        return EmuToCm * (double)emuValue;
    }
    public static double ConvertFromPointToCentimeter(double pointValue)
    {
        return PointToCm * pointValue;
    }
    public static double ConvertFromPixelToCentimeter(double pixelValue)
    {
        return PixelToCm * pixelValue;
    }
}
Enter fullscreen mode Exit fullscreen mode

Heroku

This site is built on Heroku

Join the ranks of developers at Salesforce, Airbase, DEV, and more who deploy their mission critical applications on Heroku. Sign up today and launch your first app!

Get Started

Top comments (0)

Image of Timescale

Timescale – the developer's data platform for modern apps, built on PostgreSQL

Timescale Cloud is PostgreSQL optimized for speed, scale, and performance. Over 3 million IoT, AI, crypto, and dev tool apps are powered by Timescale. Try it free today! No credit card required.

Try free

👋 Kindness is contagious

Please leave a ❤️ or a friendly comment on this post if you found it helpful!

Okay