Java POI 讀取大數據量 Excel 時處理空單元格的問題


import lombok.Data;
import org.apache.poi.ss.usermodel.BuiltinFormats;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFCellStyle;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import java.util.ArrayList;
import java.util.List;

/**
 * SAX content handler that reads one worksheet XML part of an .xlsx workbook
 * and appends every row to {@code container} as a list of cell strings.
 * Gaps between populated cells, cells without a {@code <v>} child, trailing
 * columns up to the sheet's {@code <dimension>} width, and gaps between row
 * numbers are filled with empty entries so that list indexes line up with
 * column positions.
 *
 * <p>Original header: fast Excel reading; created 15:30 2020/10/13;
 * Copyright (c) Supermap All Rights Reserved.
 *
 * <p>NOTE(review): only {@code <v>} elements contribute cell text, so a cell
 * that stores its text elsewhere (presumably inline strings under
 * {@code <is>}) comes through as "" — confirm whether that matters for the
 * workbooks being read.
 */
@Data
public class Excel07Parser extends DefaultHandler {

    // Shared strings table; resolves cells whose <v> text is an SST index.
    private SharedStringsTable sst;

    // Receives the parsed rows.
    private List<List<String>> container;

    // Exclusive lower bound of the row window used when "is" is true.
    private Integer startRow=0;

    // Exclusive upper bound of the row window used when "is" is true.
    private Integer endRow=0;

    // Number of the row currently being parsed (the row's "r" attribute).
    private Integer row;

    // When true, only rows 1, 2 and those strictly inside (startRow, endRow) are kept.
    private Boolean is=false;

    private Excel07Parser.CellDataType nextDataType = Excel07Parser.CellDataType.SSTINDEX;
    private final DataFormatter formatter = new DataFormatter();
    private short formatIndex;
    private String formatString;

    // May be null: the five-argument constructor does not receive a styles table.
    private StylesTable stylesTable;

    // Possible data types of a cell.
    enum CellDataType{
        BOOL, ERROR, FORMULA, INLINESTR, SSTINDEX, NUMBER, DATE, NULL
    }

    /**
     * Creates a handler that keeps every row and formats values using the
     * workbook styles.
     *
     * @param sst         shared strings table of the workbook
     * @param stylesTable styles table of the workbook
     * @param container   list that receives the parsed rows
     */
    public Excel07Parser(SharedStringsTable sst,StylesTable stylesTable, List<List<String>> container) {
        this.sst = sst;
        this.container = container;
        this.stylesTable = stylesTable;
    }

    /**
     * Creates a handler that can restrict the kept rows to a window. No styles
     * table is supplied, so cell styles are not consulted unless one is set
     * afterwards.
     *
     * @param sst       shared strings table of the workbook
     * @param container list that receives the parsed rows
     * @param startRow  exclusive lower bound of the row window
     * @param endRow    exclusive upper bound of the row window
     * @param is        whether the row window is applied
     */
    public Excel07Parser(SharedStringsTable sst, List<List<String>> container, Integer startRow, Integer endRow, Boolean is) {
        this.sst = sst;
        this.container = container;
        this.startRow = startRow;
        this.endRow = endRow;
        this.is = is;
    }

    /**
     * Text collected by characters() since the most recent element start.
     * For a cell whose "t" attribute is "s" this is an index into the shared
     * strings table and is resolved to the real text when the v element ends.
     */
    private String lastContents;

    // Used range of the sheet as given by <dimension ref="...">, e.g. A1:Y2.
    private String dimension;

    // Column count per row, derived from the end cell of the dimension.
    private int longest;

    // Reference of the previous cell in the current row; used to detect skipped cells.
    private String lastCellid;

    // Number of the previous row; used to detect skipped rows.
    private String lastRowid;

    // Whether the current c element contained a v element; without one a blank is added.
    private boolean hasV = false;

    // Cells of the row currently being parsed.
    private List<String> currentRow;

    // Whether the current cell's value is a shared-strings index.
    private boolean isSSTIndex = false;


    /**
     * Tracks sheet dimension, row starts and cell starts, inserting empty rows
     * and empty cells for any gaps in the row/cell references.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {

        // Every element start begins a fresh text buffer.
        lastContents = "";

        if ("dimension".equals(qName)) {
            dimension = attributes.getValue("ref");
            // Guard against a missing ref attribute instead of failing with an NPE.
            if (dimension != null) {
                longest = covertRowIdtoInt(dimension.substring(dimension.indexOf(':') + 1));
            }
        }

        if ("row".equals(qName)) {
            String rowNum = attributes.getValue("r");
            row = Integer.parseInt(rowNum);
            if (lastRowid != null) {
                // A jump of more than one row number means rows were omitted; add empty rows.
                int missingRows = row - Integer.parseInt(lastRowid) - 1;
                for (int i = 0; i < missingRows; i++) {
                    container.add(new ArrayList<>());
                }
            }
            lastRowid = rowNum;
            currentRow = new ArrayList<>();
        }

        if ("c".equals(qName)) {
            this.setNextDataType(attributes);

            String cellRef = attributes.getValue("r");
            int column = covertRowIdtoInt(cellRef);

            // Columns skipped since the previous cell, or before the first cell
            // of the row when it does not start in column A.
            int skipped = lastCellid != null
                    ? column - covertRowIdtoInt(lastCellid) - 1
                    : column - 1;
            for (int i = 0; i < skipped; i++) {
                currentRow.add("");
            }
            lastCellid = cellRef;

            // Shared-string cells hold an index, not the text itself.
            isSSTIndex = "s".equals(attributes.getValue("t"));
        }
    }

    /**
     * Finishes rows (padding them to the sheet width and storing them),
     * cells (adding a blank when no value was present) and values (resolving
     * and formatting the collected text).
     */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {

        if ("row".equals(qName)) {
            // lastCellid is null for a row without cells; covertRowIdtoInt maps that to 0.
            int lastColumn = covertRowIdtoInt(lastCellid);
            if (lastColumn < longest) {
                int filled = Math.min(currentRow.size(), lastColumn);
                for (int i = 0; i < longest - filled; i++) {
                    currentRow.add("");
                }
            }
            // Boolean.TRUE.equals avoids an NPE when "is" was set to null.
            if (Boolean.TRUE.equals(is)) {
                if (row == 1 || row == 2 || (row > startRow && row < endRow)) {
                    container.add(currentRow);
                }
            } else {
                container.add(currentRow);
            }
            lastCellid = null;
        }

        if ("c".equals(qName)) {
            // A cell without a value still occupies a column.
            if (!hasV) {
                currentRow.add("");
            }
            hasV = false;
        }

        if ("v".equals(qName)) {
            hasV = true;
            if (isSSTIndex) {
                try {
                    int idx = Integer.parseInt(lastContents.trim());
                    XSSFRichTextString rtss = new XSSFRichTextString(
                            sst.getEntryAt(idx));
                    lastContents = rtss.toString();
                } catch (NumberFormatException ex) {
                    // Not a usable index: keep the raw text so later columns do not shift left.
                }
                currentRow.add(lastContents);
            } else {
                lastContents = this.getDataValue(lastContents.trim(), "");
                currentRow.add(lastContents);
            }
        }
    }


    /**
     * Appends the received characters to the current text buffer.
     *
     * @see org.xml.sax.ContentHandler#characters
     */
    @Override
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        lastContents += new String(ch, start, length);
    }

    /**
     * Converts the column letters of a cell reference to a 1-based column
     * number, e.g. {@code AB7 -> 28}.
     *
     * @param cellId cell reference such as {@code AB7}; a bare column such as
     *               {@code AB} is accepted too
     * @return the 1-based column number, or 0 when {@code cellId} is null or
     *         contains no ASCII letters
     */
    public static int covertRowIdtoInt(String cellId) {
        if (cellId == null) {
            return 0;
        }
        int result = 0;
        for (int i = 0; i < cellId.length(); i++) {
            char c = Character.toUpperCase(cellId.charAt(i));
            // Digits and any other non A-Z characters do not belong to the column part.
            if (c < 'A' || c > 'Z') {
                continue;
            }
            result = result * 26 + (c - 'A') + 1;
        }
        return result;
    }

    /**
     * Derives the data type and number format of the next cell value from the
     * cell element's "t" (type) and "s" (style index) attributes.
     *
     * @param attributes attributes of the c element
     */
    public void setNextDataType(Attributes attributes){

        nextDataType = Excel07Parser.CellDataType.NUMBER;
        formatIndex = -1;
        formatString = null;
        String cellType = attributes.getValue("t");
        String cellStyleStr = attributes.getValue("s");
        if ("b".equals(cellType)) {
            nextDataType = Excel07Parser.CellDataType.BOOL;
        } else if ("e".equals(cellType)) {
            nextDataType = Excel07Parser.CellDataType.ERROR;
        } else if ("inlineStr".equals(cellType)) {
            nextDataType = Excel07Parser.CellDataType.INLINESTR;
        } else if ("s".equals(cellType)) {
            nextDataType = Excel07Parser.CellDataType.SSTINDEX;
        } else if ("str".equals(cellType)) {
            nextDataType = Excel07Parser.CellDataType.FORMULA;
        }

        // stylesTable is null when the handler was built without one.
        if (cellStyleStr == null || stylesTable == null) {
            return;
        }
        XSSFCellStyle style = stylesTable.getStyleAt(Integer.parseInt(cellStyleStr));
        if (style == null) {
            return;
        }
        formatIndex = style.getDataFormat();
        formatString = style.getDataFormatString();
        // equals, not ==: the format string is not guaranteed to be the same instance.
        if ("m/d/yy".equals(formatString)) {
            nextDataType = Excel07Parser.CellDataType.DATE;
            // full format is "yyyy-MM-dd hh:mm:ss.SSS";
            formatString = "yyyy-MM-dd";
        }
        if (formatString == null) {
            nextDataType = Excel07Parser.CellDataType.NULL;
            formatString = BuiltinFormats.getBuiltinFormat(formatIndex);
        }
    }

    /**
     * Converts the raw text of a v element to its display string according to
     * the data type set by {@link #setNextDataType(Attributes)}.
     *
     * @param value   raw cell text
     * @param thisStr ignored on input; kept for signature compatibility
     * @return the formatted value; "" for the NULL type and for an empty
     *         boolean value
     */
    public String getDataValue(String value, String thisStr)
    {
        switch (nextDataType)
        {
            case BOOL:
                if (value.isEmpty()) {
                    thisStr = "";
                } else {
                    thisStr = value.charAt(0) == '0' ? "FALSE" : "TRUE";
                }
                break;
            case ERROR:
                thisStr = "\"ERROR:" + value + '"';
                break;
            case FORMULA:
                thisStr = '"' + value + '"';
                break;
            case INLINESTR:
                thisStr = new XSSFRichTextString(value).toString();
                break;
            case SSTINDEX:
                thisStr = value;
                break;
            case NUMBER:
                thisStr = value;
                if (formatString != null) {
                    try {
                        thisStr = formatter.formatRawCellContents(Double.parseDouble(value), formatIndex, formatString).trim();
                    } catch (NumberFormatException ex) {
                        // Non-numeric text in a numeric cell: fall back to the raw text,
                        // as the DATE branch does, instead of aborting the parse.
                        thisStr = value;
                    }
                }
                thisStr = thisStr.replace("_", "").trim();
                break;
            case DATE:
                try {
                    thisStr = formatter.formatRawCellContents(Double.parseDouble(value), formatIndex, formatString);
                } catch (NumberFormatException ex) {
                    thisStr = value;
                }
                thisStr = thisStr.replace(" ", "");
                break;
            default:
                thisStr = "";
                break;
        }
        return thisStr;
    }

}
如何調用

  

import com.sgis.common.testutils.Excel07Parser;
import org.apache.commons.io.IOUtils;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.model.StylesTable;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.junit4.SpringRunner;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;


@Test
    public void method_19() throws OpenXML4JException, IOException, SAXException {
        String filePath = "H:\\Project\\test\\測試.xlsx";
        List<List<String>> container = new ArrayList<>();

        // The workbook is only read, so open it read-only; try-with-resources
        // releases the file handle even when parsing fails.
        try (OPCPackage pkg = OPCPackage.open(filePath, org.apache.poi.openxml4j.opc.PackageAccess.READ)) {
            XSSFReader r = new XSSFReader(pkg);

            // Needed to resolve shared-string indexes and cell number formats.
            SharedStringsTable sst = r.getSharedStringsTable();
            StylesTable stylesTable = r.getStylesTable();

            // Buffering the sheet XML lets it be inspected in a debugger, which
            // helps when following the parsing logic.
            byte[] sheetBytes;
            try (InputStream sheetStream = r.getSheet("rId1")) {
                sheetBytes = IOUtils.toByteArray(sheetStream);
            }

            XMLReader parser = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
            parser.setContentHandler(new Excel07Parser(sst, stylesTable, container));
            parser.parse(new InputSource(new ByteArrayInputStream(sheetBytes)));
        }

        System.out.println(container);

    }

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM