之前是用 node-xlsx 来处理excel文件,主要是读取数据或者根据数据生成excel文件。不过,node-xlsx 似乎无法处理超大的excel(100MB以上),例如:
var xlsx = require('node-xlsx'); var sheets = xlsx.parse('./test.xlsx'); //获取所有sheets
文件中有一个sheet的体积比较大,得到的是一个空的数组,估计是因为内存加载不进去。想要解决这个问题,似乎只有一种方法,就是用stream的方式,一段一段提取excel里面的数据。
然而,node-xlsx 不支持用流的方式读取Excel,因为excel文件编码的原因,只有把excel转为xml或者csv,才能用流的方式处理。

感谢这位老哥,找到了xlsx-extract 这个库,完美解决用流的方式读取excel
var powXLSX = require('xlsx-extract').XLSX; new powXLSX().extract('./test.xlsx', { sheet_all: true }) // 读取文件所有sheet,默认只读取第一张sheet,参数配置如下 .on('sheet', function (sheet) { console.log('sheet', sheet); // sheet is array [sheetname, sheetid, sheetnr] }) .on('row', function (row) { console.log('row', row); // row is a array of values or [] }) .on('cell', function (cell) { // console.log('cell', cell); //cell is a value or null }) .on('error', function (err) { console.error('error', err); }) .on('end', function (err) { console.log('eof'); });
options = {
// sheet selection (provide one of the following)
sheet_name?: string; // select by sheet name
sheet_nr?: string; // default "1" - select by number of the sheet starting on 1
sheet_id?: string; // select by sheet id, e.g. "1"
sheet_rid?: string; // select by internal sheet rid, e.g. "rId1'
sheet_all?: boolean; // default false - select all sheets
// sax parser selection
parser?: string; // default "sax" - 'sax'|'expat'
// row selection
ignore_header?: number; // default 0 - the number of header lines to ignore
include_empty_rows?: boolean; // default false - include empty rows in the middle/at start
// how to output sheet, rows and cells
format?: string; // default array - convert to 'array'||'json'||'tsv'||'obj'
// tsv output options
tsv_float_comma?: boolean; // default false - use "," als decimal point for floats
tsv_delimiter?: string; // default '\t' - use specified character to field delimiter
tsv_endofline?: string; // default depending on your operating system (node os.EOL) e.g. '\n'
// cell value formats
raw_values?: boolean; // default false - do not apply cell formats (get values as string as in xlsx)
round_floats?: boolean; // default true - round float values as the cell format defines (values will be reported as parsed floats otherwise)
date1904?: boolean; // default false - use date 1904 conversion
ignore_timezone?: boolean; // default false - ignore timezone in date parsing
convert_values?: { // apply cell number formats or not (values will be reported as strings otherwise)
ints?: boolean; // rounds to int if number format is for int
floats?: boolean; // rounds floats according to float number format
dates?: boolean; // converts xlsx date to js date
bools?: boolean; // converts xlsx bool to js boolean
};
// xlsx structure options
workfolder?: string; // default 'xl' - the workbook subfolder in zip structure
}
具体的实现原理可以去github 探究 xlsx-extract