1 module xlsx;
2 
3 import std.file, std.format, std.regex, std.conv, std.algorithm, std.range, std.utf;
4 import archive.core, archive.zip;
5 
6 import dxml.parser;
7 
8 private struct Coord {
9     int row;
10     int column;
11 }
12 
13 /// Aliases a Sheet as a two-dimensional array of strings.
14 alias Sheet = string[][];
15 
16 private enum configSplitYes = makeConfig(SplitEmpty.yes);
17 
18 /++
19 Reads a sheet from an XLSX file. 
20 
21 Params:
22 fileName = The path of the XLSX file to read
23 sheetNum = The sheet number to read (one-indexed)
24 
25 Returns: The contents of the sheet, as a two-dimensional array of strings.
26 +/
27 Sheet readSheet(string fileName, int sheetNum) {
28     assert(sheetNum > 0);
29     
30     auto zip = new ZipArchive(read(fileName));
31     auto sheet = zip.getFile(format!"xl/worksheets/sheet%d.xml"(sheetNum));
32     if(sheet is null) throw new Exception("Invalid sheet number");
33     //std.utf.validate!(ubyte[])(sheet.data);
34     string xml = cast(string) sheet.data;
35     
36     validate(xml);
37 
38     auto sstFile = zip.getFile("xl/sharedStrings.xml");
39     string sstXML = cast(string) sstFile.data;
40     string[] sst = parseStringTable(sstXML);
41     
42     return parseSheetXML(xml, sst);
43 }
44 
45 /++
46 Reads a sheet from an XLSX file by its name.
47 
48 Params:
49 fileName = the path of the XLSX file to read
50 sheetNum = The name of the sheet to read
51 
52 Returns: the contents of the sheet, as a two-dimensional array of strings.
53 +/
54 Sheet readSheetByName(string fileName, string sheetName) {
55     auto zip = new ZipArchive(read(fileName));
56     auto workbook = zip.getFile("xl/workbook.xml");
57     if(workbook is null) throw new Exception("Invalid XLSX file");
58     //std.utf.validate!(ubyte[])(sheet.data);
59     string xml = cast(string) workbook.data;
60 
61     const int id = getSheetId(xml, sheetName);
62 
63     return readSheet(fileName, id);
64 }
65 
66 /++
67 Parses an XLSX sheet from XML.
68 
69 Params:
70 xmlString = The XML to parse.
71 
72 Returns: the equivalent sheet, as a two dimensional array of strings.
73 +/
74 Sheet parseSheetXML(string xmlString, string[] sst) {
75     Sheet temp;
76     
77     int cols = 0;
78 
79     string[] theRow;
80 
81     auto range = parseXML!configSplitYes(xmlString);
82     while(!range.empty) {
83         if(range.front.type == EntityType.elementStart) {
84             if(range.front.name == "dimension") {
85                 auto attr = range.front.attributes.front;
86                 assert(attr.name == "ref");
87                 auto dims = parseDimensions(attr.value);
88                 cols = dims.column;
89                 assert(cols > 0);
90             }
91             else if(range.front.name == "row") {
92                 assert(cols > 0);
93                 theRow = new string[cols];
94             }
95             else if(range.front.name == "c") {
96                 auto attrs = range.front.attributes;
97                 Coord loc;
98                 bool isref;
99                 foreach(attr; attrs) {
100                     if(attr.name == "r") {
101                         loc = parseLocation(attr.value);
102                     }
103                     else if(attr.name == "t") {
104                         if(attr.value == "s") isref=true;
105                         else isref = false;
106                     }
107                 }
108                 assert(loc.row >= 0 && loc.column >= 0);
109                 range.popFront;
110                 if(range.front.type != EntityType.elementEnd) {
111                     range.popFront;
112                     assert(range.front.type == EntityType.text);
113                     string text = range.front.text;
114                     assert(theRow.length > 0);
115                     assert(loc.column < theRow.length);
116                     if(isref) theRow[loc.column] = sst[parse!int(text)];
117                     else theRow[loc.column] = text;
118                 }
119             }
120         }
121         else if(range.front.type == EntityType.elementEnd) {
122             if(range.front.name == "row") {
123                 temp ~= theRow;
124             }
125         }
126         range.popFront;
127     }
128     
129     
130     return temp;
131 }
132 unittest {
133     const string test = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
134 <worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" mc:Ignorable="x14ac" xmlns:x14ac="http://schemas.microsoft.com/office/spreadsheetml/2009/9/ac"><dimension ref="A1:C3"/><sheetViews><sheetView tabSelected="1" workbookViewId="0"><selection activeCell="A4" sqref="A4"/></sheetView></sheetViews><sheetFormatPr defaultRowHeight="15" x14ac:dyDescent="0.25"/><sheetData><row r="1" spans="1:3" x14ac:dyDescent="0.25"><c r="A1"><v>1</v></c><c r="B1"><v>5</v></c><c r="C1"><v>7</v></c></row><row r="2" spans="1:3" x14ac:dyDescent="0.25"><c r="A2"><v>2</v></c><c r="B2"><v>4</v></c><c r="C2"><v>3</v></c></row><row r="3" spans="1:3" x14ac:dyDescent="0.25"><c r="A3"><v>7</v></c><c r="B3"><v>82</v></c><c r="C3"><v>1</v></c></row></sheetData><pageMargins left="0.7" right="0.7" top="0.75" bottom="0.75" header="0.3" footer="0.3"/></worksheet>`;
135     const Sheet testSheet = [["1","5","7"],["2","4","3"],["7","82","1"]];
136     const Sheet result = parseSheetXML(test, null);
137     assert(result == testSheet);
138 }
139 unittest {
140     const string test = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
141 <worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" mc:Ignorable="x14ac" xmlns:x14ac="http://schemas.microsoft.com/office/spreadsheetml/2009/9/ac"><dimension ref="A1:C6"/><sheetViews><sheetView tabSelected="1" workbookViewId="0"><selection activeCell="A7" sqref="A7"/></sheetView></sheetViews><sheetFormatPr defaultRowHeight="15" x14ac:dyDescent="0.25"/><sheetData><row r="1" spans="1:3" x14ac:dyDescent="0.25"><c r="A1" t="s"><v>0</v></c><c r="B1" t="s"><v>1</v></c></row><row r="2" spans="1:3" x14ac:dyDescent="0.25"><c r="A2"><v>1</v></c><c r="B2" s="1"><v>6.2</v></c><c r="C2" s="1"/></row><row r="3" spans="1:3" x14ac:dyDescent="0.25"><c r="A3"><v>2</v></c><c r="B3" s="1"><v>3.4</v></c><c r="C3" s="1"/></row><row r="4" spans="1:3" x14ac:dyDescent="0.25"><c r="A4"><v>3</v></c><c r="B4" s="1"><v>87.1</v></c><c r="C4" s="1"/></row><row r="5" spans="1:3" x14ac:dyDescent="0.25"><c r="A5"><v>4</v></c><c r="B5" s="2"><v>83.2</v></c><c r="C5" s="2"/></row><row r="6" spans="1:3" x14ac:dyDescent="0.25"><c r="A6"><v>5</v></c><c r="B6" s="1"><v>3</v></c><c r="C6" s="1"/></row></sheetData><mergeCells count="1"><mergeCell ref="B5:C5"/></mergeCells><pageMargins left="0.7" right="0.7" top="0.75" bottom="0.75" header="0.3" footer="0.3"/></worksheet>`;
142     const Sheet expected = [["Param", "Value", ""], ["1", "6.2", ""], ["2", "3.4", ""], ["3", "87.1", ""], ["4", "83.2", ""], ["5", "3", ""]];
143     const Sheet result = parseSheetXML(test, ["Param", "Value"]);
144     assert(result == expected);
145 }
146 
147 /++
148 Gets the numeric (> 1) id of a sheet with a given name.
149 
150 Params: 
151 wbXml = The XML content of the workbook.xml file in the main XLSX zip archive.
152 sheetName = The name of the sheet to find.
153 
154 Returns: the id of the given sheet
155 
156 Exceptions:
157 Exception if the given sheet name is not in the workbook.
158 +/
159 private int getSheetId(string wbXml, string sheetName) {
160     auto range = parseXML!configSplitYes(wbXml);
161     while(!range.empty) {
162         if(range.front.type == EntityType.elementStart && range.front.name == "sheet") {
163             auto attrs = range.front.attributes;
164             bool nameFound;
165             foreach(attr; attrs) {
166                 if(attr.name == "name" && attr.value == sheetName) nameFound = true;
167                 else if(nameFound && attr.name == "sheetId") {
168                     return parse!int(attr.value);
169                 }
170             }
171         } 
172         range.popFront;
173     }
174     
175     throw new Exception("No sheet with that name!");
176 }
177 unittest {
178     const string test = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
179 <workbook xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" mc:Ignorable="x15 xr2" xmlns:x15="http://schemas.microsoft.com/office/spreadsheetml/2010/11/main" xmlns:xr2="http://schemas.microsoft.com/office/spreadsheetml/2015/revision2"><fileVersion appName="xl" lastEdited="7" lowestEdited="7" rupBuild="18625"/><workbookPr defaultThemeVersion="166925"/><mc:AlternateContent xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"><mc:Choice Requires="x15"><x15ac:absPath url="C:\Users\rraab.ADVILL\Documents\Code\D\dlang-xlsx\" xmlns:x15ac="http://schemas.microsoft.com/office/spreadsheetml/2010/11/ac"/></mc:Choice></mc:AlternateContent><bookViews><workbookView xWindow="0" yWindow="0" windowWidth="28800" windowHeight="12210" activeTab="1" xr2:uid="{045295F2-59E2-495E-BB00-149A1C289780}"/></bookViews><sheets><sheet name="Test1" sheetId="1" r:id="rId1"/><sheet name="Test2" sheetId="2" r:id="rId2"/></sheets><calcPr calcId="171027"/><extLst><ext uri="{140A7094-0E35-4892-8432-C4D2E57EDEB5}" xmlns:x15="http://schemas.microsoft.com/office/spreadsheetml/2010/11/main"><x15:workbookPr chartTrackingRefBase="1"/></ext></extLst></workbook>`;
180     assert(getSheetId(test, "Test1") == 1);
181     assert(getSheetId(test, "Test2") == 2);
182 }
183 
184 /++
185 Parses a Shared String Table XML string into an array of strings. 
186 
187 Params:
188 sst = The Shared String Table XML string to parse
189 Returns: 
190 A simple string array that can be indexed into from an "s" type cell's value (which is a zero-based index, thankfully)
191 +/
192 private string[] parseStringTable(string sst) {
193     //auto doc = new DocumentParser(sst);
194     string[] table;
195 
196     auto range = parseXML!configSplitYes(sst);
197 
198     while(!range.empty) {
199         if(range.front.type == EntityType.elementStart && range.front.name == "t") {
200             range.popFront;
201             table ~= range.front.text;
202         }
203         range.popFront;
204     }
205     return table;
206 }
207 
208 /++
209 Gets a column from a sheet, with each element parsed as the templated type.
210 
211 Params:
212 sheet = The Sheet from which to extract the column.
213 col = The (zero-based) index of the column to extract.
214 
215 Returns: the given column, as a newly-allocated array.
216 +/
217 T[] getColumn(T)(Sheet sheet, size_t col) {
218     return sheet.map!(x => parse!T(x[col])).array;
219 }
220 unittest {
221     assert(getColumn!int([["1","5","7"],["2","4","3"],["7","82","1"]], 1) == [5, 4, 82]);
222 }
223 
224 ///Parses an Excel letter-number column-major one-based location string to a pair of row-major zero-based indeces
225 private Coord parseLocation(string location) {
226     auto pat = ctRegex!"([A-Z]+)([0-9]+)";
227     auto m = location.matchFirst(pat);
228     Coord temp;
229     string rrow = m[2];
230     temp.row = parse!int(rrow)-1;
231     temp.column = m[1].columnNameToNumber-1;
232     return temp;
233 }
234 unittest {
235     const Coord C3 = {2,2};
236     const Coord B15 = {14,1};
237     assert(parseLocation("C3") == C3);
238     assert(parseLocation("B15") == B15);
239 }
240 
241 private Coord parseDimensions(string dims) {
242     auto pat = ctRegex!"([A-Z]+)([0-9]+):([A-Z]+)([0-9]+)";
243     auto m = dims.matchFirst(pat);
244     Coord temp;
245     string m4 = m[4];
246     string m2 = m[2];
247     temp.row = parse!int(m4) - parse!int(m2) + 1;
248     temp.column = m[3].columnNameToNumber - m[1].columnNameToNumber + 1;
249     return temp;
250 }
251 
252 private int columnNameToNumber(string col) {
253     reverse(col.dup);
254     int num;
255     foreach(i, c; col) {
256         num += (c - 'A' + 1)*26^^i;
257     }
258     return num;
259 }
260 
261 unittest {
262     assert(columnNameToNumber("AA") == 27);
263 }