【解決方案】Java獲取文件字符集格式

本文轉載自查看原文 2020-09-24 15:46 1047 excel/ java

背景：

excel格式，不管是 .xls 還是 .xlsx，每個單元格cell都有容量限制，最大容量是32767個字符，不滿足我們的需求，所以我們使用逗號分隔值文件，即.csv格式，本質是以純文本形式存儲表格數據。
但是在使用csv文件進行數據的導入導出過程中發現，如果將下載下來的csv文件進行修改保存，再導入，會有亂碼問題。原因是對文件修改的同時，也修改了它的編碼格式。
以下是為了解決編碼格式問題引入的字符集工具類，通過獲取導入文件的字符集類型，再在解析過程前指定字符集就能解析出非亂碼的數據。

package com.example.test;

import lombok.Cleanup;
import lombok.NonNull;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;

import java.io.*;
import java.util.BitSet;

/**
 * Charset detection and conversion helpers for text files (for example imported CSV files).
 *
 * <p>Detection first looks for a UTF-16 or UTF-8 byte-order mark (BOM). Content without a BOM
 * is reported as UTF-8 when every byte sequence in the stream has a valid UTF-8 shape, and as
 * GBK otherwise; no other encodings are considered.
 *
 * <p>Created 2020/9/24.
 *
 * @author miaoying
 */
@Slf4j
public class EncodeUtil {
    /** Longest BOM that is inspected (the UTF-8 BOM, {@code EF BB BF}). */
    private static final int BOM_MAX_LENGTH = 3;

    public static final String CODE_UTF8 = "UTF-8";
    /** Label returned for UTF-8 content that starts with a BOM when the BOM is not ignored. */
    public static final String CODE_UTF8_BOM = "UTF-8_BOM";
    public static final String CODE_GBK = "GBK";
    /** Label returned when the stream starts with {@code FE FF}. */
    public static final String CODE_UNICODE = "Unicode";
    /** Label returned when the stream starts with {@code FF FE}. */
    public static final String CODE_UTF16 = "UTF-16";

    /**
     * Detects the encoding of a file. The file is opened, inspected and closed again.
     *
     * @param file      file to inspect
     * @param ignoreBom when {@code true}, UTF-8 with a BOM is reported as {@link #CODE_UTF8}
     *                  instead of {@link #CODE_UTF8_BOM}
     * @return one of the {@code CODE_*} constants of this class
     * @throws Exception if the file cannot be opened or read
     */
    public static String getEncode(File file, boolean ignoreBom) throws Exception {
        // try-with-resources: the stream opened here must not outlive this call.
        try (BufferedInputStream bis = new BufferedInputStream(new FileInputStream(file))) {
            return getEncode(bis, ignoreBom);
        }
    }

    /**
     * Detects the encoding of the data in a buffered stream.
     *
     * <p>The stream is consumed by this call and is not closed: when a BOM is found, the first
     * three bytes have been read; otherwise the stream has been read from its start up to the
     * first invalid UTF-8 sequence or to the end of the stream.
     *
     * @param bis       stream positioned at the start of the data
     * @param ignoreBom when {@code true}, UTF-8 with a BOM is reported as {@link #CODE_UTF8}
     *                  instead of {@link #CODE_UTF8_BOM}
     * @return one of the {@code CODE_*} constants of this class
     * @throws Exception if reading from the stream fails
     */
    public static String getEncode(@NonNull BufferedInputStream bis, boolean ignoreBom) throws Exception {
        // The read limit must cover the bytes read before reset(), otherwise the mark may be dropped.
        bis.mark(BOM_MAX_LENGTH);
        byte[] head = new byte[BOM_MAX_LENGTH];
        int headLength = readFully(bis, head);

        String encodeType;
        if (headLength >= 2 && head[0] == (byte) 0xFF && head[1] == (byte) 0xFE) {
            encodeType = CODE_UTF16;
        } else if (headLength >= 2 && head[0] == (byte) 0xFE && head[1] == (byte) 0xFF) {
            encodeType = CODE_UNICODE;
        } else if (headLength == 3
                && head[0] == (byte) 0xEF && head[1] == (byte) 0xBB && head[2] == (byte) 0xBF) {
            encodeType = ignoreBom ? CODE_UTF8 : CODE_UTF8_BOM;
        } else {
            // No BOM: rewind and validate the whole content, including the bytes just read.
            bis.reset();
            encodeType = isUTF8(bis) ? CODE_UTF8 : CODE_GBK;
        }
        log.info("encodeType : {}", encodeType);
        return encodeType;
    }

    /**
     * Checks whether the remaining bytes of the stream form valid UTF-8 sequences. This is
     * only meant to tell BOM-less UTF-8 apart from GBK, not to identify arbitrary encodings.
     * An empty stream and pure ASCII content both count as UTF-8.
     *
     * @param in stream to scan from its current position
     * @return {@code true} if no malformed or truncated sequence was found before end of stream
     * @throws IOException if reading from the stream fails
     */
    private static boolean isUTF8(@NonNull InputStream in) throws IOException {
        int lead;
        while ((lead = in.read()) != -1) {
            if ((lead & 0x80) == 0) {
                // Single-byte (ASCII) character, nothing more to check.
                continue;
            }
            int continuationCount = continuationCountFor(lead);
            if (continuationCount < 0) {
                return false;
            }
            for (int i = 0; i < continuationCount; i++) {
                int next = in.read();
                // Every continuation byte must look like 10xxxxxx; EOF here means truncation.
                if (next == -1 || (next & 0xC0) != 0x80) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Returns how many continuation bytes must follow a non-ASCII UTF-8 lead byte.
     *
     * @param lead unsigned byte value (0x80-0xFF)
     * @return 1, 2 or 3 for a valid lead byte; -1 when the byte cannot start a sequence
     *         (stray continuation byte, overlong lead 0xC0/0xC1, or anything above 0xF4)
     */
    private static int continuationCountFor(int lead) {
        if (lead >= 0xC2 && lead <= 0xDF) {
            return 1;
        }
        if (lead >= 0xE0 && lead <= 0xEF) {
            return 2;
        }
        if (lead >= 0xF0 && lead <= 0xF4) {
            return 3;
        }
        return -1;
    }

    /**
     * Reads until the buffer is full or the stream ends, because a single
     * {@code read(byte[])} call is allowed to return fewer bytes than requested.
     *
     * @param in     stream to read from
     * @param buffer destination buffer
     * @return number of bytes actually stored in {@code buffer}
     * @throws IOException if reading from the stream fails
     */
    private static int readFully(@NonNull InputStream in, @NonNull byte[] buffer) throws IOException {
        int total = 0;
        while (total < buffer.length) {
            int read = in.read(buffer, total, buffer.length - total);
            if (read == -1) {
                break;
            }
            total += read;
        }
        return total;
    }

    /**
     * Re-encodes a text file from one charset into another.
     *
     * <p>The source is read line by line and every line, including the last one, is written
     * followed by the platform line separator. The whole content is held in memory and the
     * source is closed before the target is opened, so source and target may be the same path.
     * Missing parent directories of the target are created.
     *
     * @param oldFullFileName path of the file to read
     * @param oldCharsetName  charset name used to decode the source file
     * @param newFullFileName path of the file to write; backslashes are treated as separators
     * @param newCharsetName  charset name used to encode the target file
     * @throws Exception if a charset name is unsupported or a file cannot be read or written
     */
    public static void convert(String oldFullFileName, String oldCharsetName, String newFullFileName, String newCharsetName) throws Exception {
        log.info("the old file name is : {}, The oldCharsetName is : {}", oldFullFileName, oldCharsetName);
        log.info("the new file name is : {}, The newCharsetName is : {}", newFullFileName, newCharsetName);

        String lineSeparator = System.lineSeparator();
        StringBuilder content = new StringBuilder();

        // The raw stream is its own resource so it is closed even if the charset name is rejected.
        try (InputStream rawIn = new FileInputStream(oldFullFileName);
             BufferedReader reader = new BufferedReader(new InputStreamReader(rawIn, oldCharsetName))) {
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append(lineSeparator);
            }
        }

        newFullFileName = newFullFileName.replace("\\", "/");
        // getParentFile() is null for a bare file name, which simply needs no directory.
        File parentDir = new File(newFullFileName).getParentFile();
        if (parentDir != null && !parentDir.exists()) {
            // A failure here surfaces as FileNotFoundException when the target is opened below.
            parentDir.mkdirs();
        }

        try (FileOutputStream rawOut = new FileOutputStream(newFullFileName);
             Writer out = new OutputStreamWriter(rawOut, newCharsetName)) {
            out.write(content.toString());
        }
    }

}

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 Oracle 字符集常見字符集及解決方案 mysql uuid重復？你是mysql字符集的受害者，解決方案如下 mysql 1366的錯誤字符集錯誤解決方案用java轉換文件的字符集 java更改文件字符集編碼 java字符集 python讀取us7ascii字符集Oracle數據庫中文亂碼問題的解決方案 Java中的字符集 Java 字符集編碼 Java自動檢測文件編碼（字符集）