全文轉載自:jinshuaiwang的博客
目前處理Excel的開源Java API主要有兩種,一是Jxl(Java Excel API),Jxl只支持Excel2003及以下的版本。另外一種是Apache的Jakarta POI,相比於Jxl,POI對微軟辦公文檔的支持更加強大,但是它使用複雜,上手慢。POI可支持更高的Excel版本2007。對Excel的讀取,POI有兩種模式,一是用戶模式,這種方式同Jxl的使用很類似,使用簡單,都是將文件一次性讀到內存,文件小的時候,沒有什麼問題,當文件大的時候,就會出現OutOfMemory的內存溢出問題。第二種是事件驅動模式,拿Excel2007來說,其內容採用XML的格式來存儲,所以處理excel就是解析XML,而目前使用事件驅動模式解析XML的API是SAX(Simple API for XML),這種模型在讀取XML文檔時,並沒有將整個文檔讀入內存,而是按順序將整個文檔解析完,在解析過程中,會主動產生事件交給程序中相應的處理函數來處理當前內容。因此這種方式對系統資源要求不高,可以處理海量數據。筆者曾經做過測試,這種方法處理一千萬條,每條五列的數據花費大約11分鐘。可見處理海量數據的文件事件驅動是一個很好的方式。而本文中用到的Excel2003Reader、Excel2007Reader對Excel的讀取都是採用這種POI的事件驅動模式。至於Excel的寫操作,對較高版本的Excel2007,POI提供了很好的支持,主要流程是第一步構建工作簿和電子表格對象,第二步在一個流中構建文本文件,第三步使用流中產生的數據替換模板中的電子表格。這種方式也可以處理海量數據文件。AbstractExcel2007Writer就是使用這種方式進行寫操作。對於寫入較低版本的Excel2003,POI使用了用戶模式來處理,就是將整個文檔加載進內存,如果數據量大的話就會出現內存溢出的問題,Excel2003Writer就是使用這種方式。據筆者的測試,如果數據量大於3萬條,每條8列的話,就會報OutOfMemory的錯誤。Excel2003中每個電子表格的記錄數不得超過65536,否則就會發生異常。目前還沒有好的解決方案,建議對於海量數據寫入操作,盡量使用Excel2007。
/** * 抽象Excel2003讀取器,通過實現HSSFListener監聽器,采用事件驅動模式解析excel2003 * 中的內容,遇到特定事件才會觸發,大大減少了內存的使用。 * */ public class Excel2003Reader implements HSSFListener{ private int minColumns = -1; private POIFSFileSystem fs; private int lastRowNumber; private int lastColumnNumber; /** Should we output the formula, or the value it has? */ private boolean outputFormulaValues = true; /** For parsing Formulas */ private SheetRecordCollectingListener workbookBuildingListener; //excel2003工作薄 private HSSFWorkbook stubWorkbook; // Records we pick up as we process private SSTRecord sstRecord; private FormatTrackingHSSFListener formatListener; //表索引 private int sheetIndex = -1; private BoundSheetRecord[] orderedBSRs; @SuppressWarnings("unchecked") private ArrayList boundSheetRecords = new ArrayList(); // For handling formulas with string results private int nextRow; private int nextColumn; private boolean outputNextStringRecord; //當前行 private int curRow = 0; //存儲行記錄的容器 private List<String> rowlist = new ArrayList<String>();; @SuppressWarnings( "unused") private String sheetName; private IRowReader rowReader; public void setRowReader(IRowReader rowReader){ this.rowReader = rowReader; } /** * 遍歷excel下所有的sheet * @throws IOException */ public void process(String fileName) throws IOException { this.fs = new POIFSFileSystem(new FileInputStream(fileName)); MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener( this); formatListener = new FormatTrackingHSSFListener(listener); HSSFEventFactory factory = new HSSFEventFactory(); HSSFRequest request = new HSSFRequest(); if (outputFormulaValues) { request.addListenerForAllRecords(formatListener); } else { workbookBuildingListener = new SheetRecordCollectingListener( formatListener); request.addListenerForAllRecords(workbookBuildingListener); } factory.processWorkbookEvents(request, fs); } /** * HSSFListener 監聽方法,處理 Record */ @SuppressWarnings("unchecked") public void processRecord(Record record) { int thisRow = -1; int thisColumn = 
-1; String thisStr = null; String value = null; switch (record.getSid()) { case BoundSheetRecord.sid: boundSheetRecords.add(record); break; case BOFRecord.sid: BOFRecord br = (BOFRecord) record; if (br.getType() == BOFRecord.TYPE_WORKSHEET) { // 如果有需要,則建立子工作薄 if (workbookBuildingListener != null && stubWorkbook == null) { stubWorkbook = workbookBuildingListener .getStubHSSFWorkbook(); } sheetIndex++; if (orderedBSRs == null) { orderedBSRs = BoundSheetRecord .orderByBofPosition(boundSheetRecords); } sheetName = orderedBSRs[sheetIndex].getSheetname(); } break; case SSTRecord.sid: sstRecord = (SSTRecord) record; break; case BlankRecord.sid: BlankRecord brec = (BlankRecord) record; thisRow = brec.getRow(); thisColumn = brec.getColumn(); thisStr = ""; rowlist.add(thisColumn, thisStr); break; case BoolErrRecord.sid: //單元格為布爾類型 BoolErrRecord berec = (BoolErrRecord) record; thisRow = berec.getRow(); thisColumn = berec.getColumn(); thisStr = berec.getBooleanValue()+""; rowlist.add(thisColumn, thisStr); break; case FormulaRecord.sid: //單元格為公式類型 FormulaRecord frec = (FormulaRecord) record; thisRow = frec.getRow(); thisColumn = frec.getColumn(); if (outputFormulaValues) { if (Double.isNaN(frec.getValue())) { // Formula result is a string // This is stored in the next record outputNextStringRecord = true; nextRow = frec.getRow(); nextColumn = frec.getColumn(); } else { thisStr = formatListener.formatNumberDateCell(frec); } } else { thisStr = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook, frec.getParsedExpression()) + '"'; } rowlist.add(thisColumn,thisStr); break; case StringRecord.sid://單元格中公式的字符串 if (outputNextStringRecord) { // String for formula StringRecord srec = (StringRecord) record; thisStr = srec.getString(); thisRow = nextRow; thisColumn = nextColumn; outputNextStringRecord = false; } break; case LabelRecord.sid: LabelRecord lrec = (LabelRecord) record; curRow = thisRow = lrec.getRow(); thisColumn = lrec.getColumn(); value = lrec.getValue().trim(); value = 
value.equals("")?" ":value; this.rowlist.add(thisColumn, value); break; case LabelSSTRecord.sid: //單元格為字符串類型 LabelSSTRecord lsrec = (LabelSSTRecord) record; curRow = thisRow = lsrec.getRow(); thisColumn = lsrec.getColumn(); if (sstRecord == null) { rowlist.add(thisColumn, " "); } else { value = sstRecord .getString(lsrec.getSSTIndex()).toString().trim(); value = value.equals("")?" ":value; rowlist.add(thisColumn,value); } break; case NumberRecord.sid: //單元格為數字類型 NumberRecord numrec = (NumberRecord) record; curRow = thisRow = numrec.getRow(); thisColumn = numrec.getColumn(); value = formatListener.formatNumberDateCell(numrec).trim(); value = value.equals("")?" ":value; // 向容器加入列值 rowlist.add(thisColumn, value); break; default: break; } // 遇到新行的操作 if (thisRow != -1 && thisRow != lastRowNumber) { lastColumnNumber = -1; } // 空值的操作 if (record instanceof MissingCellDummyRecord) { MissingCellDummyRecord mc = (MissingCellDummyRecord) record; curRow = thisRow = mc.getRow(); thisColumn = mc.getColumn(); rowlist.add(thisColumn," "); } // 更新行和列的值 if (thisRow > -1) lastRowNumber = thisRow; if (thisColumn > -1) lastColumnNumber = thisColumn; // 行結束時的操作 if (record instanceof LastCellOfRowDummyRecord) { if (minColumns > 0) { // 列值重新置空 if (lastColumnNumber == -1) { lastColumnNumber = 0; } } lastColumnNumber = -1; // 每行結束時, 調用getRows() 方法 rowReader.getRows(sheetIndex,curRow, rowlist); // 清空容器 rowlist.clear(); } } }
/** * 抽象Excel2007讀取器,excel2007的底層數據結構是xml文件,采用SAX的事件驅動的方法解析 * xml,需要繼承DefaultHandler,在遇到文件內容時,事件會觸發,這種做法可以大大降低 * 內存的耗費,特別使用於大數據量的文件。 * */ public class Excel2007Reader extends DefaultHandler { //共享字符串表 private SharedStringsTable sst; //上一次的內容 private String lastContents; private boolean nextIsString; private int sheetIndex = -1; private List<String> rowlist = new ArrayList<String>(); //當前行 private int curRow = 0; //當前列 private int curCol = 0; //日期標志 private boolean dateFlag; //數字標志 private boolean numberFlag; private boolean isTElement; private IRowReader rowReader; public void setRowReader(IRowReader rowReader){ this.rowReader = rowReader; } /**只遍歷一個電子表格,其中sheetId為要遍歷的sheet索引,從1開始,1-3 * @param filename * @param sheetId * @throws Exception */ public void processOneSheet(String filename,int sheetId) throws Exception { OPCPackage pkg = OPCPackage.open(filename); XSSFReader r = new XSSFReader(pkg); SharedStringsTable sst = r.getSharedStringsTable(); XMLReader parser = fetchSheetParser(sst); // 根據 rId# 或 rSheet# 查找sheet InputStream sheet2 = r.getSheet("rId"+sheetId); sheetIndex++; InputSource sheetSource = new InputSource(sheet2); parser.parse(sheetSource); sheet2.close(); } /** * 遍歷工作簿中所有的電子表格 * @param filename * @throws Exception */ public void process(String filename) throws Exception { OPCPackage pkg = OPCPackage.open(filename); XSSFReader r = new XSSFReader(pkg); SharedStringsTable sst = r.getSharedStringsTable(); XMLReader parser = fetchSheetParser(sst); Iterator<InputStream> sheets = r.getSheetsData(); while (sheets.hasNext()) { curRow = 0; sheetIndex++; InputStream sheet = sheets.next(); InputSource sheetSource = new InputSource(sheet); parser.parse(sheetSource); sheet.close(); } } public XMLReader fetchSheetParser(SharedStringsTable sst) throws SAXException { XMLReader parser = XMLReaderFactory .createXMLReader("org.apache.xerces.parsers.SAXParser"); this.sst = sst; parser.setContentHandler(this); return parser; } public void startElement(String uri, String 
localName, String name, Attributes attributes) throws SAXException { // c => 單元格 if ("c".equals(name)) { // 如果下一個元素是 SST 的索引,則將nextIsString標記為true String cellType = attributes.getValue("t"); if ("s".equals(cellType)) { nextIsString = true; } else { nextIsString = false; } //日期格式 String cellDateType = attributes.getValue("s"); if ("1".equals(cellDateType)){ dateFlag = true; } else { dateFlag = false; } String cellNumberType = attributes.getValue("s"); if("2".equals(cellNumberType)){ numberFlag = true; } else { numberFlag = false; } } //當元素為t時 if("t".equals(name)){ isTElement = true; } else { isTElement = false; } // 置空 lastContents = ""; } public void endElement(String uri, String localName, String name) throws SAXException { // 根據SST的索引值的到單元格的真正要存儲的字符串 // 這時characters()方法可能會被調用多次 if (nextIsString) { try { int idx = Integer.parseInt(lastContents); lastContents = new XSSFRichTextString(sst.getEntryAt(idx)) .toString(); } catch (Exception e) { } } //t元素也包含字符串 if(isTElement){ String value = lastContents.trim(); rowlist.add(curCol, value); curCol++; isTElement = false; // v => 單元格的值,如果單元格是字符串則v標簽的值為該字符串在SST中的索引 // 將單元格內容加入rowlist中,在這之前先去掉字符串前后的空白符 } else if ("v".equals(name)) { String value = lastContents.trim(); value = value.equals("")?" ":value; //日期格式處理 if(dateFlag){ Date date = HSSFDateUtil.getJavaDate(Double.valueOf(value)); SimpleDateFormat dateFormat = new SimpleDateFormat( "dd/MM/yyyy"); value = dateFormat.format(date); } //數字類型處理 if(numberFlag){ BigDecimal bd = new BigDecimal(value); value = bd.setScale(3,BigDecimal.ROUND_UP).toString(); } rowlist.add(curCol, value); curCol++; }else { //如果標簽名稱為 row ,這說明已到行尾,調用 optRows() 方法 if (name.equals("row")) { rowReader.getRows(sheetIndex,curRow,rowlist); rowlist.clear(); curRow++; curCol = 0; } } } public void characters(char[] ch, int start, int length) throws SAXException { //得到單元格內容的值 lastContents += new String(ch, start, length); } }
public class ExcelReaderUtil { //excel2003擴展名 public static final String EXCEL03_EXTENSION = ".xls"; //excel2007擴展名 public static final String EXCEL07_EXTENSION = ".xlsx"; /** * 讀取Excel文件,可能是03也可能是07版本 * @param excel03 * @param excel07 * @param fileName * @throws Exception */ public static void readExcel(IRowReader reader,String fileName) throws Exception{ // 處理excel2003文件 if (fileName.endsWith(EXCEL03_EXTENSION)){ Excel2003Reader excel03 = new Excel2003Reader(); excel03.setRowReader(reader); excel03.process(fileName); // 處理excel2007文件 } else if (fileName.endsWith(EXCEL07_EXTENSION)){ Excel2007Reader excel07 = new Excel2007Reader(); excel07.setRowReader(reader); excel07.process(fileName); } else { throw new Exception("文件格式錯誤,fileName的擴展名只能是xls或xlsx。"); } } }
/**
 * Callback through which the Excel readers deliver one row at a time.
 */
public interface IRowReader {

    /**
     * Handles a single row; implementations put their business logic here.
     *
     * @param sheetIndex index of the sheet the row belongs to
     * @param curRow     number of the row within the sheet
     * @param rowlist    cell values of the row
     */
    void getRows(int sheetIndex, int curRow, List<String> rowlist);
}
public class RowReader implements IRowReader{ /* 業務邏輯實現方法 * @see com.eprosun.util.excel.IRowReader#getRows(int, int, java.util.List) */ public void getRows(int sheetIndex, int curRow, List<String> rowlist) { // TODO Auto-generated method stub System.out.print(curRow+" "); for (int i = 0; i < rowlist.size(); i++) { System.out.print(rowlist.get(i) + " "); } System.out.println(); } }
public class Main { public static void main(String[] args) throws Exception { IRowReader reader = new RowReader(); //ExcelReaderUtil.readExcel(reader, "F://te03.xls"); ExcelReaderUtil.readExcel(reader, "F://test07.xlsx"); } }
public class Excel2003Writer { /** * @param args */ public static void main(String[] args) { try{ System.out.println("開始寫入excel2003...."); writeExcel("tes2003.xls"); System.out.println("寫完xcel2003"); } catch (IOException e) { } } /** * 寫入excel並填充內容,一個sheet只能寫65536行以下,超出會報異常,寫入時建議使用AbstractExcel2007Writer * @param fileName * @throws IOException */ public static void writeExcel(String fileName) throws IOException{ // 創建excel2003對象 Workbook wb = new HSSFWorkbook(); // 設置文件放置路徑和文件名 FileOutputStream fileOut = new FileOutputStream(fileName); // 創建新的表單 Sheet sheet = wb.createSheet("newsheet"); // 創建新行 for(int i=0;i<20000;i++){ Row row = sheet.createRow(i); // 創建單元格 Cell cell = row.createCell(0); // 設置單元格值 cell.setCellValue(1); row.createCell(1).setCellValue(1+i); row.createCell(2).setCellValue(true); row.createCell(3).setCellValue(0.43d); row.createCell(4).setCellValue('d'); row.createCell(5).setCellValue(""); row.createCell(6).setCellValue("第七列"+i); row.createCell(7).setCellValue("第八列"+i); } wb.write(fileOut); fileOut.close(); } }
/** * 抽象excel2007讀入器,先構建.xlsx一張模板,改寫模板中的sheet.xml,使用這種方法 * 寫入.xlsx文件,不需要太大的內存 * */ public abstract class AbstractExcel2007Writer { private SpreadsheetWriter sw; /** * 寫入電子表格的主要流程 * @param fileName * @throws Exception */ public void process(String fileName) throws Exception{ // 建立工作簿和電子表格對象 XSSFWorkbook wb = new XSSFWorkbook(); XSSFSheet sheet = wb.createSheet("sheet1"); // 持有電子表格數據的xml文件名 例如 /xl/worksheets/sheet1.xml String sheetRef = sheet.getPackagePart().getPartName().getName(); // 保存模板 FileOutputStream os = new FileOutputStream("template.xlsx"); wb.write(os); os.close(); // 生成xml文件 File tmp = File.createTempFile("sheet", ".xml"); Writer fw = new FileWriter(tmp); sw = new SpreadsheetWriter(fw); generate(); fw.close(); // 使用產生的數據替換模板 File templateFile = new File("template.xlsx"); FileOutputStream out = new FileOutputStream(fileName); substitute(templateFile, tmp, sheetRef.substring(1), out); out.close(); //刪除文件之前調用一下垃圾回收器,否則無法刪除模板文件 System.gc(); // 刪除臨時模板文件 if (templateFile.isFile()&&templateFile.exists()){ templateFile.delete(); } } /** * 類使用者應該使用此方法進行寫操作 * @throws Exception */ public abstract void generate() throws Exception; public void beginSheet() throws IOException { sw.beginSheet(); } public void insertRow(int rowNum) throws IOException { sw.insertRow(rowNum); } public void createCell(int columnIndex, String value) throws IOException { sw.createCell(columnIndex, value, -1); } public void createCell(int columnIndex, double value) throws IOException { sw.createCell(columnIndex, value, -1); } public void endRow() throws IOException { sw.endRow(); } public void endSheet() throws IOException { sw.endSheet(); } /** * * @param zipfile the template file * @param tmpfile the XML file with the sheet data * @param entry the name of the sheet entry to substitute, e.g. 
xl/worksheets/sheet1.xml * @param out the stream to write the result to */ private static void substitute(File zipfile, File tmpfile, String entry, OutputStream out) throws IOException { ZipFile zip = new ZipFile(zipfile); ZipOutputStream zos = new ZipOutputStream(out); @SuppressWarnings("unchecked") Enumeration<ZipEntry> en = (Enumeration<ZipEntry>) zip.entries(); while (en.hasMoreElements()) { ZipEntry ze = en.nextElement(); if (!ze.getName().equals(entry)) { zos.putNextEntry(new ZipEntry(ze.getName())); InputStream is = zip.getInputStream(ze); copyStream(is, zos); is.close(); } } zos.putNextEntry(new ZipEntry(entry)); InputStream is = new FileInputStream(tmpfile); copyStream(is, zos); is.close(); zos.close(); } private static void copyStream(InputStream in, OutputStream out) throws IOException { byte[] chunk = new byte[1024]; int count; while ((count = in.read(chunk)) >= 0) { out.write(chunk, 0, count); } } /** * 在寫入器中寫入電子表格 * */ public static class SpreadsheetWriter { private final Writer _out; private int _rownum; private static String LINE_SEPARATOR = System.getProperty("line.separator"); public SpreadsheetWriter(Writer out) { _out = out; } public void beginSheet() throws IOException { _out.write("<?xml version=\"1.0\" encoding=\"GB2312\"?>" + "<worksheet xmlns=\"http://schemas.openxmlformats.org/spreadsheetml/2006/main\">"); _out.write("<sheetData>"+LINE_SEPARATOR); } public void endSheet() throws IOException { _out.write("</sheetData>"); _out.write("</worksheet>"); } /** * 插入新行 * * @param rownum 以0開始 */ public void insertRow(int rownum) throws IOException { _out.write("<row r=\"" + (rownum + 1) + "\">"+LINE_SEPARATOR); this._rownum = rownum; } /** * 插入行結束標志 */ public void endRow() throws IOException { _out.write("</row>"+LINE_SEPARATOR); } /** * 插入新列 * @param columnIndex * @param value * @param styleIndex * @throws IOException */ public void createCell(int columnIndex, String value, int styleIndex) throws IOException { String ref = new 
CellReference(_rownum, columnIndex) .formatAsString(); _out.write("<c r=\"" + ref + "\" t=\"inlineStr\""); if (styleIndex != -1) _out.write(" s=\"" + styleIndex + "\""); _out.write(">"); _out.write("<is><t>"+XMLEncoder.encode(value)+"</t></is>"); _out.write("</c>"); } public void createCell(int columnIndex, String value) throws IOException { createCell(columnIndex, value, -1); } public void createCell(int columnIndex, double value, int styleIndex) throws IOException { String ref = new CellReference(_rownum, columnIndex) .formatAsString(); _out.write("<c r=\"" + ref + "\" t=\"n\""); if (styleIndex != -1) _out.write(" s=\"" + styleIndex + "\""); _out.write(">"); _out.write("<v>" + value + "</v>"); _out.write("</c>"); } public void createCell(int columnIndex, double value) throws IOException { createCell(columnIndex, value, -1); } public void createCell(int columnIndex, Calendar value, int styleIndex) throws IOException { createCell(columnIndex, DateUtil.getExcelDate(value, false), styleIndex); } } }
public class Excel2007WriterImpl extends AbstractExcel2007Writer{ /** * @param args * @throws Exception */ public static void main(String[] args) throws Exception { // TODO Auto-generated method stub System.out.println("............................"); long start = System.currentTimeMillis(); //構建excel2007寫入器 AbstractExcel2007Writer excel07Writer = new Excel2007WriterImpl(); //調用處理方法 excel07Writer.process("F://test07.xlsx"); long end = System.currentTimeMillis(); System.out.println("....................."+(end-start)/1000); } /* * 可根據需求重寫此方法,對於單元格的小數或者日期格式,會出現精度問題或者日期格式轉化問題,建議使用字符串插入方法 * @see com.excel.ver2.AbstractExcel2007Writer#generate() */ @Override public void generate()throws Exception { //電子表格開始 beginSheet(); for (int rownum = 0; rownum < 100; rownum++) { //插入新行 insertRow(rownum); //建立新單元格,索引值從0開始,表示第一列 createCell(0, "中國<" + rownum + "!"); createCell(1, 34343.123456789); createCell(2, "23.67%"); createCell(3, "12:12:23"); createCell(4, "2010-10-11 12:12:23"); createCell(5, "true"); createCell(6, "false"); //結束行 endRow(); } //電子表格結束 endSheet(); } }
/**
 * Escapes text so that it can be embedded in XML element content or attributes.
 */
public class XMLEncoder {

    /** Replacement for each character below 256; null means "copy unchanged". */
    private static final String[] xmlCode = new String[256];

    static {
        // The five predefined XML entities. The published listing had these
        // HTML-unescaped back to the raw characters, which does not even compile.
        xmlCode['\''] = "&apos;"; // apostrophe
        xmlCode['"'] = "&quot;"; // double quote
        xmlCode['&'] = "&amp;"; // ampersand
        xmlCode['<'] = "&lt;"; // lower than
        xmlCode['>'] = "&gt;"; // greater than
    }

    /**
     * Encodes the given text into XML by replacing the five special characters
     * with their entities. All other characters are copied unchanged.
     *
     * @param string the text to encode; may be null
     * @return the encoded string, or an empty string when the input is null
     */
    public static String encode(String string) {
        if (string == null) {
            return "";
        }
        int n = string.length();
        StringBuilder buffer = new StringBuilder(n + 16);
        for (int i = 0; i < n; i++) {
            char character = string.charAt(i);
            // Characters beyond the table have no replacement; check the bound
            // instead of catching ArrayIndexOutOfBoundsException.
            String xmlchar = character < xmlCode.length ? xmlCode[character] : null;
            if (xmlchar == null) {
                buffer.append(character);
            } else {
                buffer.append(xmlchar);
            }
        }
        return buffer.toString();
    }
}