1 module xlsx;
2 
3 import std.file, std.format, std.regex, std.conv, std.algorithm, std.range, std.utf;
4 import archive.core, archive.zip;
5 
6 import dxml.parser;
7 
8 private struct Coord {
9     int row;
10     int column;
11 }
12 
13 /// Aliases a Sheet as a two-dimensional array of strings.
14 alias Sheet = string[][];
15 
16 private enum configSplitYes = makeConfig(SplitEmpty.yes);
17 
18 /++
19 Reads a sheet from an XLSX file. 
20 
21 Params:
22 fileName = The path of the XLSX file to read
23 sheetNum = The sheet number to read (one-indexed)
24 
25 Returns: The contents of the sheet, as a two-dimensional array of strings.
26 +/
27 Sheet readSheet(string fileName, int sheetNum) {
28     assert(sheetNum > 0);
29     
30     auto zip = new ZipArchive(read(fileName));
31     auto sheet = zip.getFile(format!"xl/worksheets/sheet%d.xml"(sheetNum));
32     if(sheet is null) throw new Exception("Invalid sheet number");
33     //std.utf.validate!(ubyte[])(sheet.data);
34     string xml = cast(string) sheet.data;
35     
36     validate(xml);
37 
38     auto sstFile = zip.getFile("xl/sharedStrings.xml");
39     string sstXML = cast(string) sstFile.data;
40     string[] sst = parseStringTable(sstXML);
41     
42     return parseSheetXML(xml, sst);
43 }
44 
45 /++
46 Reads a sheet from an XLSX file by its name.
47 
48 Params:
49 fileName = the path of the XLSX file to read
50 sheetNum = The name of the sheet to read
51 
52 Returns: the contents of the sheet, as a two-dimensional array of strings.
53 +/
54 Sheet readSheetByName(string fileName, string sheetName) {
55     auto zip = new ZipArchive(read(fileName));
56     auto workbook = zip.getFile("xl/workbook.xml");
57     if(workbook is null) throw new Exception("Invalid XLSX file");
58     //std.utf.validate!(ubyte[])(sheet.data);
59     string xml = cast(string) workbook.data;
60 
61     const int id = getSheetId(xml, sheetName);
62 
63     return readSheet(fileName, id);
64 }
65 
66 /++
67 Parses an XLSX sheet from XML.
68 
69 Params:
70 xmlString = The XML to parse.
71 
72 Returns: the equivalent sheet, as a two dimensional array of strings.
73 +/
74 Sheet parseSheetXML(string xmlString, string[] sst) {
75     Sheet temp;
76     
77     int cols = 0;
78 
79     string[] theRow;
80 
81     auto range = parseXML!configSplitYes(xmlString);
82     while(!range.empty) {
83         if(range.front.type == EntityType.elementStart) {
84             if(range.front.name == "dimension") {
85                 auto attr = range.front.attributes.front;
86                 assert(attr.name == "ref");
87                 auto dims = parseDimensions(attr.value);
88                 cols = dims.column;
89                 assert(cols > 0);
90             }
91             else if(range.front.name == "row") {
92                 assert(cols > 0);
93                 theRow = new string[cols];
94             }
95             else if(range.front.name == "c") {
96                 auto attrs = range.front.attributes;
97                 Coord loc;
98                 bool isref;
99                 foreach(attr; attrs) {
100                     if(attr.name == "r") {
101                         loc = parseLocation(attr.value);
102                     }
103                     else if(attr.name == "t") {
104                         if(attr.value == "s") isref=true;
105                         else isref = false;
106                     }
107                 }
108                 assert(loc.row >= 0 && loc.column >= 0);
109                 range.popFront;
110                 if(range.front.type == EntityType.elementStart) {
111                     if(range.front.name == "f") {
112                         range.popFront; 
113                         if(range.front.type == EntityType.text) range.popFront; 
114                         range.popFront;
115                     }
116                     range.popFront;
117                     assert(range.front.type == EntityType.text);
118                     string text = range.front.text;
119                     assert(theRow.length > 0);
120                     assert(loc.column < theRow.length);
121                     if(isref) theRow[loc.column] = sst[parse!int(text)];
122                     else theRow[loc.column] = text;
123                 }
124                 else {
125                     theRow[loc.column] = "";
126                 }
127             }
128         }
129         else if(range.front.type == EntityType.elementEnd) {
130             if(range.front.name == "row") {
131                 temp ~= theRow;
132             }
133         }
134         range.popFront;
135     }
136     
137     
138     return temp;
139 }
140 unittest {
141     const string test = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
142 <worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" mc:Ignorable="x14ac" xmlns:x14ac="http://schemas.microsoft.com/office/spreadsheetml/2009/9/ac"><dimension ref="A1:C3"/><sheetViews><sheetView tabSelected="1" workbookViewId="0"><selection activeCell="A4" sqref="A4"/></sheetView></sheetViews><sheetFormatPr defaultRowHeight="15" x14ac:dyDescent="0.25"/><sheetData><row r="1" spans="1:3" x14ac:dyDescent="0.25"><c r="A1"><v>1</v></c><c r="B1"><v>5</v></c><c r="C1"><v>7</v></c></row><row r="2" spans="1:3" x14ac:dyDescent="0.25"><c r="A2"><v>2</v></c><c r="B2"><v>4</v></c><c r="C2"><f t="shared" si="0" /><v>3</v></c></row><row r="3" spans="1:3" x14ac:dyDescent="0.25"><c r="A3"><v>7</v></c><c r="B3"><v>82</v></c><c r="C3"><v>1</v></c></row></sheetData><pageMargins left="0.7" right="0.7" top="0.75" bottom="0.75" header="0.3" footer="0.3"/></worksheet>`;
143     const Sheet testSheet = [["1","5","7"],["2","4","3"],["7","82","1"]];
144     const Sheet result = parseSheetXML(test, null);
145     assert(result == testSheet);
146 }
147 unittest {
148     const string test = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
149 <worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" mc:Ignorable="x14ac" xmlns:x14ac="http://schemas.microsoft.com/office/spreadsheetml/2009/9/ac"><dimension ref="A1:C6"/><sheetViews><sheetView tabSelected="1" workbookViewId="0"><selection activeCell="A7" sqref="A7"/></sheetView></sheetViews><sheetFormatPr defaultRowHeight="15" x14ac:dyDescent="0.25"/><sheetData><row r="1" spans="1:3" x14ac:dyDescent="0.25"><c r="A1" t="s"><v>0</v></c><c r="B1" t="s"><v>1</v></c></row><row r="2" spans="1:3" x14ac:dyDescent="0.25"><c r="A2"><v>1</v></c><c r="B2" s="1"><f>ABS(-6.2)</f><v>6.2</v></c><c r="C2" s="1"/></row><row r="3" spans="1:3" x14ac:dyDescent="0.25"><c r="A3"><v>2</v></c><c r="B3" s="1"><v>3.4</v></c><c r="C3" s="1"/></row><row r="4" spans="1:3" x14ac:dyDescent="0.25"><c r="A4"><v>3</v></c><c r="B4" s="1"><v>87.1</v></c><c r="C4" s="1"/></row><row r="5" spans="1:3" x14ac:dyDescent="0.25"><c r="A5"><v>4</v></c><c r="B5" s="2"><v>83.2</v></c><c r="C5" s="2"/></row><row r="6" spans="1:3" x14ac:dyDescent="0.25"><c r="A6"><v>5</v></c><c r="B6" s="1"><v>3</v></c><c r="C6" s="1"/></row></sheetData><mergeCells count="1"><mergeCell ref="B5:C5"/></mergeCells><pageMargins left="0.7" right="0.7" top="0.75" bottom="0.75" header="0.3" footer="0.3"/></worksheet>`;
150     const Sheet expected = [["Param", "Value", ""], ["1", "6.2", ""], ["2", "3.4", ""], ["3", "87.1", ""], ["4", "83.2", ""], ["5", "3", ""]];
151     const Sheet result = parseSheetXML(test, ["Param", "Value"]);
152     assert(result == expected);
153 }
154 
155 /++
156 Gets the numeric (> 1) id of a sheet with a given name.
157 
158 Params: 
159 wbXml = The XML content of the workbook.xml file in the main XLSX zip archive.
160 sheetName = The name of the sheet to find.
161 
162 Returns: the id of the given sheet
163 
164 Exceptions:
165 Exception if the given sheet name is not in the workbook.
166 +/
167 private int getSheetId(string wbXml, string sheetName) {
168     auto range = parseXML!configSplitYes(wbXml);
169     while(!range.empty) {
170         if(range.front.type == EntityType.elementStart && range.front.name == "sheet") {
171             auto attrs = range.front.attributes;
172             bool nameFound;
173             foreach(attr; attrs) {
174                 if(attr.name == "name" && attr.value == sheetName) nameFound = true;
175                 else if(nameFound && attr.name == "sheetId") {
176                     return parse!int(attr.value);
177                 }
178             }
179         } 
180         range.popFront;
181     }
182     
183     throw new Exception("No sheet with that name!");
184 }
185 unittest {
186     const string test = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
187 <workbook xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" mc:Ignorable="x15 xr2" xmlns:x15="http://schemas.microsoft.com/office/spreadsheetml/2010/11/main" xmlns:xr2="http://schemas.microsoft.com/office/spreadsheetml/2015/revision2"><fileVersion appName="xl" lastEdited="7" lowestEdited="7" rupBuild="18625"/><workbookPr defaultThemeVersion="166925"/><mc:AlternateContent xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"><mc:Choice Requires="x15"><x15ac:absPath url="C:\Users\rraab.ADVILL\Documents\Code\D\dlang-xlsx\" xmlns:x15ac="http://schemas.microsoft.com/office/spreadsheetml/2010/11/ac"/></mc:Choice></mc:AlternateContent><bookViews><workbookView xWindow="0" yWindow="0" windowWidth="28800" windowHeight="12210" activeTab="1" xr2:uid="{045295F2-59E2-495E-BB00-149A1C289780}"/></bookViews><sheets><sheet name="Test1" sheetId="1" r:id="rId1"/><sheet name="Test2" sheetId="2" r:id="rId2"/></sheets><calcPr calcId="171027"/><extLst><ext uri="{140A7094-0E35-4892-8432-C4D2E57EDEB5}" xmlns:x15="http://schemas.microsoft.com/office/spreadsheetml/2010/11/main"><x15:workbookPr chartTrackingRefBase="1"/></ext></extLst></workbook>`;
188     assert(getSheetId(test, "Test1") == 1);
189     assert(getSheetId(test, "Test2") == 2);
190 }
191 
192 /++
193 Parses a Shared String Table XML string into an array of strings. 
194 
195 Params:
196 sst = The Shared String Table XML string to parse
197 Returns: 
198 A simple string array that can be indexed into from an "s" type cell's value (which is a zero-based index, thankfully)
199 +/
200 private string[] parseStringTable(string sst) {
201     //auto doc = new DocumentParser(sst);
202     string[] table;
203 
204     auto range = parseXML!configSplitYes(sst);
205 
206     while(!range.empty) {
207         if(range.front.type == EntityType.elementStart && range.front.name == "t") {
208             range.popFront;
209             table ~= range.front.text;
210         }
211         range.popFront;
212     }
213     return table;
214 }
215 
216 /++
217 Gets a column from a sheet, with each element parsed as the templated type.
218 
219 Params:
220 sheet = The Sheet from which to extract the column.
221 col = The (zero-based) index of the column to extract.
222 
223 Returns: the given column, as a newly-allocated array.
224 +/
225 T[] getColumn(T)(Sheet sheet, size_t col) {
226     return sheet.map!(x => parse!T(x[col])).array;
227 }
228 unittest {
229     assert(getColumn!int([["1","5","7"],["2","4","3"],["7","82","1"]], 1) == [5, 4, 82]);
230 }
231 
232 ///Parses an Excel letter-number column-major one-based location string to a pair of row-major zero-based indeces
233 private Coord parseLocation(string location) {
234     auto pat = ctRegex!"([A-Z]+)([0-9]+)";
235     auto m = location.matchFirst(pat);
236     Coord temp;
237     string rrow = m[2];
238     temp.row = parse!int(rrow)-1;
239     temp.column = m[1].columnNameToNumber-1;
240     return temp;
241 }
242 unittest {
243     const Coord C3 = {2,2};
244     const Coord B15 = {14,1};
245     assert(parseLocation("C3") == C3);
246     assert(parseLocation("B15") == B15);
247 }
248 
249 private Coord parseDimensions(string dims) {
250     auto pat = ctRegex!"([A-Z]+)([0-9]+):([A-Z]+)([0-9]+)";
251     auto m = dims.matchFirst(pat);
252     Coord temp;
253     string m4 = m[4];
254     string m2 = m[2];
255     temp.row = parse!int(m4) - parse!int(m2) + 1;
256     temp.column = m[3].columnNameToNumber - m[1].columnNameToNumber + 1;
257     return temp;
258 }
259 
260 private int columnNameToNumber(string col) {
261     reverse(col.dup);
262     int num;
263     foreach(i, c; col) {
264         num += (c - 'A' + 1)*26^^i;
265     }
266     return num;
267 }
268 
269 unittest {
270     assert(columnNameToNumber("AA") == 27);
271 }