工作中遇到了需要獲取上傳的Excel文件的列名、最大行數、大小等元數據信息的需求。通常做法是通過Apache的POI工具加載文件,然後再讀取行列進行處理。這種方法很大的弊端就是需要把整個excel文件加載到內存,如果遇到大的文件,內存暴增,很容易出現OOM。為了解決這個問題,我研究了excel文件的格式,寫了一個工具類來自己解析和獲取這些信息。
一、excel文件格式解析
其實xlsx格式的文件就是一個zip壓縮包(注意:舊版的.xls是二進制格式,不是壓縮包,不能用下面的方法解析)。我們找一個xlsx文件,把後綴改成.zip,然後解壓,你會發現文件夾里面大概是這樣的:
其中關鍵的是xl這個文件夾,看第二張圖:
1、workbook.xml 里面包含了sheet的信息,比如有幾個sheet,每一個的名稱是什麼
2、sharedStrings.xml 非常重要,里面包含了整個excel文件中所有單元格的字符串內容,各個sheet中的單元格是通過索引來引用這些內容的。
3、worksheets 文件夾里面包含了sheet內容的定義
看第三張圖,sheet1.xml表示第一個sheet的定義,其內容是這樣的:
看到那些數字了嗎?它們其實表示這個單元格的內容在sharedStrings.xml中的索引。
二、示例代碼實現
接下來我將展示一個獲取excel文件中列名稱、行數、sheet名稱的java代碼。
import java.io.File; import java.io.RandomAccessFile; import java.io.UnsupportedEncodingException; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.channels.FileChannel; import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * excel文件元數據讀取工具 * * @author yuananyun * @date 2017/11/16 14:20 **/ public class ExcelXmlUtil { //獲取第一個sheet的名稱的表達式 private static Pattern firstSheetPattern = Pattern.compile("sheet name=\"(.*?)\" sheetId=\"1\""); //抽取一行的表達式,如 //<row r="200001" spans="1:2" s="1" customFormat="1" x14ac:dyDescent="0.15">/row> private static Pattern rowPattern = Pattern.compile("<row(.*?)></row>"); //求解一行行號的表達式 private static Pattern rowNumPattern = Pattern.compile("r=\"(\\d+)\""); //求解標題列個數的表達式 private static Pattern columnCountPattern = Pattern.compile("</v>"); //求解列標題索引的表達式 private static Pattern columnIndexPattern = Pattern.compile("<v>(\\d*)</v>"); //求解列標題名稱的表達式 private static Pattern titleValuePattern = Pattern.compile("(?:(?:<t>)|(?:<t xml:space=\".*\">))([\\s\\S]*?)</t>"); static class ExcelRowColumnInfo { private long maxRowNum; private int coluntCount; private List<String> titleList; private String firstSheetName; public ExcelRowColumnInfo(String firstSheetName, int maxRowNum, int coluntCount, List<String> titleList) { this.firstSheetName = firstSheetName; this.maxRowNum = maxRowNum; this.coluntCount = coluntCount; this.titleList = titleList; } public long getMaxRowNum() { return maxRowNum; } public void setMaxRowNum(int maxRowNum) { this.maxRowNum = maxRowNum; } public int getColuntCount() { return coluntCount; } public void setColuntCount(int coluntCount) { this.coluntCount = coluntCount; } public List<String> getTitleList() { return titleList == null ? 
new ArrayList<>() : titleList; } public void setTitleList(List<String> titleList) { this.titleList = titleList; } public String getFirstSheetName() { return firstSheetName; } public void setFirstSheetName(String firstSheetName) { this.firstSheetName = firstSheetName; } @Override public String toString() { return "ExcelRowColumnInfo{" + "maxRowNum=" + maxRowNum + ", coluntCount=" + coluntCount + ", titleList=" + titleList.toString() + '}'; } } /** * 獲取excel文件的行列個數 * * @param excelFilePath * @param isOverwrite 是否覆蓋源excel文件 * @return ExcelRowColumnInfo */ public static ExcelRowColumnInfo getRowAndColumnInfo(String excelFilePath, boolean isOverwrite) { try { File excelFile = new File(excelFilePath); if (!excelFile.exists()) return null; String zipFilePath = excelFilePath.replace(".xlsx", ".zip").replace(".xls", ".zip"); File zipFile = new File(zipFilePath); if (zipFile.exists()) zipFile.delete(); if (isOverwrite) { //直接重命名 excelFile.renameTo(zipFile); } else { // 復制文件 FileUtil.copyFile(excelFilePath, zipFilePath); } //解壓的臨時目錄 String tmpDir = zipFilePath.replace(".zip", ""); List<File> fileList = ZipUtils.upzipFile(zipFile, tmpDir); File sheet1File = null; File sharedStringsFile = null; File workbookFile = null; for (File file : fileList) { if (file.getPath().contains("sheet1.xml")) sheet1File = file; if (file.getPath().contains("sharedStrings.xml")) sharedStringsFile = file; if (file.getPath().contains("workbook.xml")) workbookFile = file; } if (sheet1File == null || sharedStringsFile == null) return null; //抽取sheet名稱 String sheetName = parseFirstSheetName(workbookFile); int[] rcArray = parseMaxRowNumAndColCount(sheet1File); int maxRowNum = rcArray[0]; // int columCount = rcArray[1]; int[] titleIndexArray = parseTitleIndexArray(sheet1File); List<String> titleList = parseTitleList(sharedStringsFile, titleIndexArray); deleteFileRecursively(zipFile); deleteFileRecursively(new File(tmpDir)); if (titleList == null || titleList.size() == 0 || maxRowNum == 0) return null; 
return new ExcelRowColumnInfo(sheetName, maxRowNum, titleList.size(), titleList); } catch (Exception ex) { ex.printStackTrace(); } return null; } /** * 解析第一個sheet的名稱 * * @param workbookFile * @return */ private static String parseFirstSheetName(File workbookFile) { String content = getFileSegment(workbookFile, 0, Integer.MAX_VALUE); Matcher matcher = firstSheetPattern.matcher(content); if (matcher.find()) return matcher.group(1); return null; } /** * 求解標題列關鍵字所在的索引 * * @param sheet1File * @return */ private static int[] parseTitleIndexArray(File sheet1File) { int realColCount = 0; String startSegment = getFileSegment(sheet1File, 2000); if (startSegment != null) { //求解真實的列數 Matcher matcher = rowPattern.matcher(startSegment); if (matcher.find()) { String firstRow = matcher.group(1); if (firstRow != null) { matcher = columnCountPattern.matcher(firstRow); while (matcher.find()) realColCount++; } } if (realColCount > 0) { //求解標題 int[] titleIndexArray = new int[realColCount]; matcher = columnIndexPattern.matcher(startSegment); int i = 0; while (matcher.find() && i < realColCount) { titleIndexArray[i++] = Integer.parseInt(matcher.group(1)); } return titleIndexArray; } } return null; } /** * 解析excel文件的標題列名稱 * * @param sharedStringsFile * @param titleIndexArray * @return */ private static List<String> parseTitleList(File sharedStringsFile, int[] titleIndexArray) { List<String> titleList = new ArrayList<>(); int count = titleIndexArray.length; if (count > 0) { int minIndex = Integer.MAX_VALUE; int maxIndex = Integer.MIN_VALUE; for (int i = 0; i < count; i++) { int index = titleIndexArray[i]; if (index > maxIndex) maxIndex = index; if (index < minIndex) minIndex = index; } //885是頭部的長度,限制每個row長度為200字符 // int length = (885 + (maxIndex - minIndex + 1) * 200); //標題真的是到處都在, String[] titleArray = new String[count]; // if (minIndex > 10000) { // //這是一個大文檔,整篇加載 // length = Integer.MAX_VALUE; // } String segment = getFileSegment(sharedStringsFile, 0, Integer.MAX_VALUE); Matcher matcher 
= titleValuePattern.matcher(segment); int i = 0; while (matcher.find() && count > 0) { String value = matcher.group(1); // System.out.println(i + " ------> " + value); for (int j = 0; j < titleIndexArray.length; j++) { if (i == titleIndexArray[j]) { titleArray[j] = value; count--; break; } } i++; } if (titleArray.length > 0) { Collections.addAll(titleList, titleArray); //去掉空格單元格 Collections.reverse(titleList); for (int i1 = 0; i1 < titleList.size(); i1++) { String title = String.valueOf(titleList.get(i1)); if ("".equals(title.trim())) titleList.remove(i1); } Collections.reverse(titleList); } } return titleList; } /** * 解析文件的最大行號和列數 * * @param sheet1File * @return */ private static int[] parseMaxRowNumAndColCount(File sheet1File) { int rowNum = 0, colCount = 0; String endSegment = getFileSegment(sheet1File, -1000); if (endSegment != null) { Matcher matcher = rowPattern.matcher(endSegment); String lastRow = ""; while (matcher.find()) { lastRow = matcher.group(1); } if (lastRow.length() > 0) { matcher = rowNumPattern.matcher(lastRow); if (matcher.find()) rowNum = Integer.parseInt(matcher.group(1)); matcher = columnCountPattern.matcher(lastRow); while (matcher.find()) colCount++; } } return new int[]{rowNum, colCount}; } /** * 遞歸刪除文件及文件夾 * * @param file */ private static void deleteFileRecursively(File file) { if (file.exists()) { if (file.isFile()) { file.delete(); } else if (file.isDirectory()) { File[] files = file.listFiles(); for (int i = 0; i < files.length; i++) { deleteFileRecursively(files[i]); } file.delete(); } } } private static String getFileSegment(File file, int length) { return getFileSegment(file, 0, length); } /** * 從一個文件中截取一段字符串 * * @param file * @param offset * @param length length<0時,offset將失效 * @return */ private static String getFileSegment(File file, long offset, int length) { if (file == null || !file.exists()) return null; try { Charset charset = Charset.forName("UTF-8"); CharsetDecoder decoder = charset.newDecoder(); StringBuilder builder = 
new StringBuilder(); RandomAccessFile aFile = new RandomAccessFile(file, "r"); FileChannel inChannel = aFile.getChannel(); if (inChannel != null) { if (Integer.MAX_VALUE == length) length = (int) inChannel.size(); ByteBuffer buf = ByteBuffer.allocate(Math.abs(length)); if (length < 0) offset = inChannel.size() + length; int size = Math.abs(length); inChannel.position(offset < 0 ? 0 : offset); int bytesRead = inChannel.read(buf); while (bytesRead != -1 && size > 0) { buf.flip(); CharBuffer charBuffer = decoder.decode(buf); builder.append(charBuffer); buf.clear(); bytesRead = inChannel.read(buf); size = size - bytesRead; } inChannel.close(); } aFile.close(); return builder.toString(); } catch (Exception ex) { ex.printStackTrace(); } return null; } /** * 測試 * @param args * @throws UnsupportedEncodingException */ public static void main(String[] args) throws UnsupportedEncodingException { ExcelRowColumnInfo result; result = getRowAndColumnInfo("D:\\元數據求解.xls", false); System.out.println(result); } }
用到的幾個工具類:
/**
 * Copies a regular file to the given destination path.
 *
 * <p>NOTE(review): {@code StringUtils} and {@code FileUtils} are not declared in this
 * snippet; presumably they are the Apache Commons classes — confirm against the imports.
 *
 * @param srcFilePath  path of the file to copy
 * @param destFilePath path to copy to; the file is created first when it does not exist
 * @return {@code destFilePath} on success; {@code null} when either path is empty, the
 *         source is missing or is a directory, or the copy fails
 */
public static String copyFile(String srcFilePath, String destFilePath){
    if (StringUtils.isEmpty(srcFilePath) || StringUtils.isEmpty(destFilePath)){
        return null;
    }
    File srcFile = new File(srcFilePath);
    File destFile = new File(destFilePath);
    if (!srcFile.exists() || srcFile.isDirectory()){
        return null;
    }
    try {
        if (!destFile.exists()) {
            destFile.createNewFile();
        }
        FileUtils.copyFile(srcFile, destFile);
        return destFilePath;
    } catch (IOException e){
        e.printStackTrace();
    }
    return null;
}

/**
 * Extracts a .zip archive into a directory.
 *
 * <p>Entry names are decoded as GBK. Extraction stops at the first I/O error, and at the
 * first entry whose resolved path would fall outside {@code descDir}; the error is printed
 * and the files extracted up to that point are returned.
 *
 * @param zipFile the archive to extract
 * @param descDir target directory, e.g. {@code D:\\測試} or {@code /mnt/d/測試}
 * @return the extracted regular files (directories are created but not listed)
 */
@SuppressWarnings("rawtypes")
public static List<File> upzipFile(File zipFile, String descDir) {
    List<File> _list = new ArrayList<File>();
    ZipFile _zipFile = null;
    try {
        _zipFile = new ZipFile(zipFile, "GBK");
        String _rootPath = new File(descDir).getCanonicalPath();
        // Copy buffer; the original referenced _byte without ever declaring it.
        byte[] _byte = new byte[4096];
        for (Enumeration entries = _zipFile.getEntries(); entries.hasMoreElements(); ) {
            ZipEntry entry = (ZipEntry) entries.nextElement();
            File _file = new File(descDir + File.separator + entry.getName());

            // The archive may be an uploaded file: reject names such as "../../x" that
            // would resolve to a path outside the target directory.
            String _targetPath = _file.getCanonicalPath();
            if (!_targetPath.equals(_rootPath)
                    && !_targetPath.startsWith(_rootPath + File.separator)) {
                throw new IOException("Zip entry is outside the target directory: "
                        + entry.getName());
            }

            if (entry.isDirectory()) {
                _file.mkdirs();
                continue;
            }
            File _parent = _file.getParentFile();
            if (!_parent.exists()) {
                _parent.mkdirs();
            }
            // try-with-resources closes both streams even when the copy fails.
            try (InputStream _in = _zipFile.getInputStream(entry);
                 OutputStream _out = new FileOutputStream(_file)) {
                int len;
                while ((len = _in.read(_byte)) > 0) {
                    _out.write(_byte, 0, len);
                }
                _out.flush();
            }
            _list.add(_file);
        }
    } catch (IOException e) {
        // Previously swallowed silently; report it and return what was extracted so far.
        e.printStackTrace();
    } finally {
        if (_zipFile != null) {
            try {
                _zipFile.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing the archive fails.
            }
        }
    }
    return _list;
}
其中zip相關的類用的是Apache Ant提供的實現:
org.apache.tools.zip.ZipEntry; org.apache.tools.zip.ZipFile;