手動解析Excel獲取文件元數據

本文轉載自查看原文 2017-12-20 15:46 1067 Java學習/ 原創

工作中有遇到需要獲取上傳的Excel文件的列明、最大行數、大小等元數據信息。通常做法是通過Apache的POI工具加載文件然后再讀取行列進行處理。這種方法很大的弊端就是需要把excel文件加載到內存，如果遇到大的文件，內存暴增，很容易出現OOM。為了解決這個問題，我研究了excel文件的格式，寫了一工具類來自己解析和獲取這些信息。

一、excel文件格式解析

其實xls、xlsx格式的文件其實就是一個壓縮包，我們找一個excel文件，把后綴改成.rar,然后解壓，你會發現文件夾里面大概是這樣的：

其中關鍵的是xl這個文件夾，看第二張圖：

1、workbook.xml 里面包含了sheet的信息，比如有幾個sheet，每一個的名稱是什么

2、sharedString.xml 老重要了，里面就是包含了整個excel文件中單元格中的內容，excel是通過索引來引用內容的。

3、worksheets 文件夾里面包含了sheet內容的定義

看第三張圖，sheet1.xml表示第一個sheet的定義，其內容是這樣的：

看到那些數字了嗎，其實表示這個單元格的內容在sharedString.xml中的索引。

二、示例代碼實現

接下來我將展示一個獲取excel文件中列名稱、行數、sheet名稱的java代碼。

import java.io.File;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * excel文件元數據讀取工具
 *
 * @author yuananyun
 * @date 2017/11/16 14:20
 **/
public class ExcelXmlUtil {
    //獲取第一個sheet的名稱的表達式
    private static Pattern firstSheetPattern = Pattern.compile("sheet name=\"(.*?)\" sheetId=\"1\"");
    //抽取一行的表達式，如
    //<row r="200001" spans="1:2" s="1" customFormat="1" x14ac:dyDescent="0.15">/row>
    private static Pattern rowPattern = Pattern.compile("<row(.*?)></row>");
    //求解一行行號的表達式
    private static Pattern rowNumPattern = Pattern.compile("r=\"(\\d+)\"");
    //求解標題列個數的表達式
    private static Pattern columnCountPattern = Pattern.compile("</v>");
    //求解列標題索引的表達式
    private static Pattern columnIndexPattern = Pattern.compile("<v>(\\d*)</v>");
    //求解列標題名稱的表達式
    private static Pattern titleValuePattern = Pattern.compile("(?:(?:<t>)|(?:<t xml:space=\".*\">))([\\s\\S]*?)</t>");

    static class ExcelRowColumnInfo {
        private long maxRowNum;
        private int coluntCount;
        private List<String> titleList;
        private String firstSheetName;

        public ExcelRowColumnInfo(String firstSheetName, int maxRowNum, int coluntCount, List<String> titleList) {
            this.firstSheetName = firstSheetName;
            this.maxRowNum = maxRowNum;
            this.coluntCount = coluntCount;
            this.titleList = titleList;
        }

        public long getMaxRowNum() {
            return maxRowNum;
        }

        public void setMaxRowNum(int maxRowNum) {
            this.maxRowNum = maxRowNum;
        }

        public int getColuntCount() {
            return coluntCount;
        }

        public void setColuntCount(int coluntCount) {
            this.coluntCount = coluntCount;
        }

        public List<String> getTitleList() {
            return titleList == null ? new ArrayList<>() : titleList;
        }

        public void setTitleList(List<String> titleList) {
            this.titleList = titleList;
        }

        public String getFirstSheetName() {
            return firstSheetName;
        }

        public void setFirstSheetName(String firstSheetName) {
            this.firstSheetName = firstSheetName;
        }

        @Override
        public String toString() {
            return "ExcelRowColumnInfo{" +
                    "maxRowNum=" + maxRowNum +
                    ", coluntCount=" + coluntCount +
                    ", titleList=" + titleList.toString() +
                    '}';
        }
    }

    /**
     * 獲取excel文件的行列個數
     *
     * @param excelFilePath
     * @param isOverwrite   是否覆蓋源excel文件
     * @return ExcelRowColumnInfo
     */
    public static ExcelRowColumnInfo getRowAndColumnInfo(String excelFilePath, boolean isOverwrite) {
        try {
            File excelFile = new File(excelFilePath);
            if (!excelFile.exists()) return null;
            String zipFilePath = excelFilePath.replace(".xlsx", ".zip").replace(".xls", ".zip");
            File zipFile = new File(zipFilePath);
            if (zipFile.exists()) zipFile.delete();
            if (isOverwrite) {
                //直接重命名
                excelFile.renameTo(zipFile);
            } else {
                // 復制文件
                FileUtil.copyFile(excelFilePath, zipFilePath);
            }
            //解壓的臨時目錄
            String tmpDir = zipFilePath.replace(".zip", "");
            List<File> fileList = ZipUtils.upzipFile(zipFile, tmpDir);
            File sheet1File = null;
            File sharedStringsFile = null;
            File workbookFile = null;
            for (File file : fileList) {
                if (file.getPath().contains("sheet1.xml"))
                    sheet1File = file;
                if (file.getPath().contains("sharedStrings.xml"))
                    sharedStringsFile = file;
                if (file.getPath().contains("workbook.xml"))
                    workbookFile = file;
            }
            if (sheet1File == null || sharedStringsFile == null) return null;

            //抽取sheet名稱
            String sheetName = parseFirstSheetName(workbookFile);

            int[] rcArray = parseMaxRowNumAndColCount(sheet1File);
            int maxRowNum = rcArray[0];
//            int columCount = rcArray[1];

            int[] titleIndexArray = parseTitleIndexArray(sheet1File);
            List<String> titleList = parseTitleList(sharedStringsFile, titleIndexArray);

            deleteFileRecursively(zipFile);
            deleteFileRecursively(new File(tmpDir));

            if (titleList == null || titleList.size() == 0 || maxRowNum == 0) return null;

            return new ExcelRowColumnInfo(sheetName, maxRowNum, titleList.size(), titleList);
        } catch (Exception ex) {
            ex.printStackTrace();
        }
        return null;
    }

    /**
     * 解析第一個sheet的名稱
     *
     * @param workbookFile
     * @return
     */
    private static String parseFirstSheetName(File workbookFile) {
        String content = getFileSegment(workbookFile, 0, Integer.MAX_VALUE);
        Matcher matcher = firstSheetPattern.matcher(content);
        if (matcher.find())
            return matcher.group(1);
        return null;
    }

    /**
     * 求解標題列關鍵字所在的索引
     *
     * @param sheet1File
     * @return
     */
    private static int[] parseTitleIndexArray(File sheet1File) {
        int realColCount = 0;
        String startSegment = getFileSegment(sheet1File, 2000);
        if (startSegment != null) {
            //求解真實的列數
            Matcher matcher = rowPattern.matcher(startSegment);
            if (matcher.find()) {
                String firstRow = matcher.group(1);
                if (firstRow != null) {
                    matcher = columnCountPattern.matcher(firstRow);
                    while (matcher.find())
                        realColCount++;
                }
            }
            if (realColCount > 0) {
                //求解標題
                int[] titleIndexArray = new int[realColCount];
                matcher = columnIndexPattern.matcher(startSegment);
                int i = 0;
                while (matcher.find() && i < realColCount) {
                    titleIndexArray[i++] = Integer.parseInt(matcher.group(1));
                }
                return titleIndexArray;
            }
        }
        return null;
    }


    /**
     * 解析excel文件的標題列名稱
     *
     * @param sharedStringsFile
     * @param titleIndexArray
     * @return
     */
    private static List<String> parseTitleList(File sharedStringsFile, int[] titleIndexArray) {
        List<String> titleList = new ArrayList<>();
        int count = titleIndexArray.length;
        if (count > 0) {
            int minIndex = Integer.MAX_VALUE;
            int maxIndex = Integer.MIN_VALUE;
            for (int i = 0; i < count; i++) {
                int index = titleIndexArray[i];
                if (index > maxIndex) maxIndex = index;
                if (index < minIndex) minIndex = index;
            }
            //885是頭部的長度，限制每個row長度為200字符
//            int length = (885 + (maxIndex - minIndex + 1) * 200);
            //標題真的是到處都在，
            String[] titleArray = new String[count];
//            if (minIndex > 10000) {
//                //這是一個大文檔,整篇加載
//                length = Integer.MAX_VALUE;
//            }
            String segment = getFileSegment(sharedStringsFile, 0, Integer.MAX_VALUE);
            Matcher matcher = titleValuePattern.matcher(segment);
            int i = 0;
            while (matcher.find() && count > 0) {
                String value = matcher.group(1);
//                System.out.println(i + " ------> " + value);
                for (int j = 0; j < titleIndexArray.length; j++) {
                    if (i == titleIndexArray[j]) {
                        titleArray[j] = value;
                        count--;
                        break;
                    }
                }
                i++;
            }
            if (titleArray.length > 0) {
                Collections.addAll(titleList, titleArray);
                //去掉空格單元格
                Collections.reverse(titleList);
                for (int i1 = 0; i1 < titleList.size(); i1++) {
                    String title = String.valueOf(titleList.get(i1));
                    if ("".equals(title.trim()))
                        titleList.remove(i1);
                }
                Collections.reverse(titleList);
            }
        }
        return titleList;
    }

    /**
     * 解析文件的最大行號和列數
     *
     * @param sheet1File
     * @return
     */
    private static int[] parseMaxRowNumAndColCount(File sheet1File) {
        int rowNum = 0, colCount = 0;
        String endSegment = getFileSegment(sheet1File, -1000);
        if (endSegment != null) {
            Matcher matcher = rowPattern.matcher(endSegment);
            String lastRow = "";
            while (matcher.find()) {
                lastRow = matcher.group(1);
            }
            if (lastRow.length() > 0) {
                matcher = rowNumPattern.matcher(lastRow);
                if (matcher.find())
                    rowNum = Integer.parseInt(matcher.group(1));
                matcher = columnCountPattern.matcher(lastRow);
                while (matcher.find())
                    colCount++;
            }
        }
        return new int[]{rowNum, colCount};
    }


    /**
     * 遞歸刪除文件及文件夾
     *
     * @param file
     */
    private static void deleteFileRecursively(File file) {
        if (file.exists()) {
            if (file.isFile()) {
                file.delete();
            } else if (file.isDirectory()) {
                File[] files = file.listFiles();
                for (int i = 0; i < files.length; i++) {
                    deleteFileRecursively(files[i]);
                }
                file.delete();
            }
        }
    }


    private static String getFileSegment(File file, int length) {
        return getFileSegment(file, 0, length);
    }

    /**
     * 從一個文件中截取一段字符串
     *
     * @param file
     * @param offset
     * @param length length<0時，offset將失效
     * @return
     */
    private static String getFileSegment(File file, long offset, int length) {
        if (file == null || !file.exists()) return null;
        try {
            Charset charset = Charset.forName("UTF-8");
            CharsetDecoder decoder = charset.newDecoder();

            StringBuilder builder = new StringBuilder();
            RandomAccessFile aFile = new RandomAccessFile(file, "r");
            FileChannel inChannel = aFile.getChannel();
            if (inChannel != null) {
                if (Integer.MAX_VALUE == length)
                    length = (int) inChannel.size();
                ByteBuffer buf = ByteBuffer.allocate(Math.abs(length));
                if (length < 0)
                    offset = inChannel.size() + length;
                int size = Math.abs(length);
                inChannel.position(offset < 0 ? 0 : offset);
                int bytesRead = inChannel.read(buf);
                while (bytesRead != -1 && size > 0) {
                    buf.flip();
                    CharBuffer charBuffer = decoder.decode(buf);
                    builder.append(charBuffer);
                    buf.clear();
                    bytesRead = inChannel.read(buf);
                    size = size - bytesRead;
                }
                inChannel.close();
            }
            aFile.close();
            return builder.toString();
        } catch (Exception ex) {
            ex.printStackTrace();
        }
        return null;
    }


    /**
     * 測試
     * @param args
     * @throws UnsupportedEncodingException
     */
    public static void main(String[] args) throws UnsupportedEncodingException {
        ExcelRowColumnInfo result;
        result = getRowAndColumnInfo("D:\\元數據求解.xls", false);
        System.out.println(result);
    }

}

用到的幾個工具類：

/**
     * 文件復制
     * @param srcFilePath
     * @param destFilePath
     * @return
     */
    public static String copyFile(String srcFilePath, String destFilePath){

        if (StringUtils.isEmpty(srcFilePath) || StringUtils.isEmpty(destFilePath)){
            return null;
        }
        File srcFile = new File(srcFilePath);
        File destFile = new File(destFilePath);
        if (!srcFile.exists() || srcFile.isDirectory()){
            return null;
        }
        try {
            if (!destFile.exists()) {
                destFile.createNewFile();
            }
            FileUtils.copyFile(srcFile, destFile);
            return destFilePath;
        } catch (IOException e){
            e.printStackTrace();
        }
        return null;
    }



    /**
     * 對.zip文件進行解壓縮
     *
     * @param zipFile 解壓縮文件
     * @param descDir 解壓縮的目標地址，如：D:\\測試 或 /mnt/d/測試
     * @return
     */
    @SuppressWarnings("rawtypes")
    public static List<File> upzipFile(File zipFile, String descDir) {
        List<File> _list = new ArrayList<File>();
        try {
            ZipFile _zipFile = new ZipFile(zipFile, "GBK");
            for (Enumeration entries = _zipFile.getEntries(); entries.hasMoreElements(); ) {
                ZipEntry entry = (ZipEntry) entries.nextElement();
                File _file = new File(descDir + File.separator + entry.getName());
                if (entry.isDirectory()) {
                    _file.mkdirs();
                } else {
                    File _parent = _file.getParentFile();
                    if (!_parent.exists()) {
                        _parent.mkdirs();
                    }
                    InputStream _in = _zipFile.getInputStream(entry);
                    OutputStream _out = new FileOutputStream(_file);
                    int len = 0;
                    while ((len = _in.read(_byte)) > 0) {
                        _out.write(_byte, 0, len);
                    }
                    _in.close();
                    _out.flush();
                    _out.close();
                    _list.add(_file);
                }
            }
            _zipFile.close();
        } catch (IOException e) {
        }
        return _list;
    }

其中zip用的是

org.apache.tools.zip.ZipEntry;

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 SQL Server Management Studio 手動導入Excel文件 [轉]AMF手動解析教程 SQL手動注入解析打開excel表格文件為空白，還需再手動選擇文件才可以打開的解決辦法。 springmvc手動獲取bean Springboot手動獲取bean SpringBoot手動獲取Bean 手動獲取判斷處理權限 java中使用poi導出excel表格數據並且可以手動修改導出路徑 springboot手動加載.properties文件