背景:
Excel 格式,不管是 .xls 還是 .xlsx,每個單元格(cell)都有容量限制,最多只能容納 32767 個字符,無法滿足我們的需求,所以我們改用逗號分隔值文件,即 .csv 格式,其本質是以純文本形式存儲表格數據。
但是在使用csv文件進行數據的導入導出過程中發現,如果將下載下來的csv文件進行修改保存,再導入,會有亂碼問題。原因是對文件修改的同時,也修改了它的編碼格式。
以下是為了解決編碼格式問題引入的字符集工具類,通過獲取導入文件的字符集類型,再在解析過程前指定字符集就能解析出非亂碼的數據。
package com.example.test; import lombok.Cleanup; import lombok.NonNull; import lombok.extern.slf4j.Slf4j; import org.apache.commons.lang3.StringUtils; import java.io.*; import java.util.BitSet; /** * @Description: 編碼集工具類 * @author miaoying * @date 2020/9/24 */ @Slf4j public class EncodeUtil { private static int BYTE_SIZE = 8; public static String CODE_UTF8 = "UTF-8"; public static String CODE_UTF8_BOM = "UTF-8_BOM"; public static String CODE_GBK = "GBK"; public static String CODE_UNICODE = "Unicode"; public static String CODE_UTF16 = "UTF-16"; /** * 通過文件獲取編碼集名稱 * * @param file * @param ignoreBom * @return * @throws Exception */ public static String getEncode(File file, boolean ignoreBom) throws Exception { BufferedInputStream bis = new BufferedInputStream(new FileInputStream(file)); return getEncode(bis, ignoreBom); } /** * 通過文件緩存流獲取編碼集名稱 * * @param bis * @return * @throws Exception */ public static String getEncode(@NonNull BufferedInputStream bis, boolean ignoreBom) throws Exception { bis.mark(0); String encodeType = StringUtils.EMPTY; byte[] head = new byte[3]; bis.read(head); if (head[0] == -1 && head[1] == -2) { encodeType = CODE_UTF16; } else if (head[0] == -2 && head[1] == -1) { encodeType = CODE_UNICODE; } //帶BOM else if (head[0] == -17 && head[1] == -69 && head[2] == -65) { if (ignoreBom) { encodeType = CODE_UTF8; } else { encodeType = CODE_UTF8_BOM; } } else if (CODE_UNICODE.equals(encodeType)) { encodeType = CODE_UTF16; } else if (isUTF8(bis)) { encodeType = CODE_UTF8; } else { encodeType = CODE_GBK; } log.info("encodeType : " + encodeType); return encodeType; } /** * 是否是無BOM的UTF8格式,不判斷常規場景,只區分無BOM UTF8和GBK * * @param bis * @return */ private static boolean isUTF8(@NonNull BufferedInputStream bis) throws Exception { bis.reset(); int code = bis.read(); do { BitSet bitSet = convert2BitSet(code); //判斷是否為單字節 if (bitSet.get(0)) { //多字節時,再讀取N個字節 if (!checkMultiByte(bis, bitSet)) { return false; } } else { //單字節時什么都不用做,再次讀取字節 } code = bis.read(); } while (code 
!= -1); return true; } /** * 檢測多字節,判斷是否為utf8,已經讀取了一個字節 * * @param bis * @param bitSet * @return */ private static boolean checkMultiByte(@NonNull BufferedInputStream bis, @NonNull BitSet bitSet) throws Exception { int count = getCountOfSequential(bitSet); //已經讀取了一個字節,不能再讀取 byte[] bytes = new byte[count - 1]; bis.read(bytes); for (byte b : bytes) { if (!checkUtf8Byte(b)) { return false; } } return true; } /** * 檢測單字節,判斷是否為utf8 * * @param b * @return */ private static boolean checkUtf8Byte(byte b) { BitSet bitSet = convert2BitSet(b); return bitSet.get(0) && !bitSet.get(1); } /** * 檢測bitSet中從開始有多少個連續的1 * * @param bitSet * @return */ private static int getCountOfSequential(@NonNull BitSet bitSet) { int count = 0; for (int i = 0; i < BYTE_SIZE; i++) { if (bitSet.get(i)) { count++; } else { break; } } return count; } /** * 將整形轉為BitSet * * @param code * @return */ private static BitSet convert2BitSet(int code) { BitSet bitSet = new BitSet(BYTE_SIZE); for (int i = 0; i < BYTE_SIZE; i++) { int tmp3 = code >> (BYTE_SIZE - i - 1); int tmp2 = 0x1 & tmp3; if (tmp2 == 1) { bitSet.set(i); } } return bitSet; } /** * 將一指定編碼的文件轉換為另一編碼的文件 * * @param oldFullFileName * @param oldCharsetName * @param newFullFileName * @param newCharsetName */ public static void convert(String oldFullFileName, String oldCharsetName, String newFullFileName, String newCharsetName) throws Exception { log.info("the old file name is : {}, The oldCharsetName is : {}", oldFullFileName, oldCharsetName); log.info("the new file name is : {}, The newCharsetName is : {}", newFullFileName, newCharsetName); StringBuffer content = new StringBuffer(); @Cleanup BufferedReader bin = new BufferedReader(new InputStreamReader(new FileInputStream(oldFullFileName), oldCharsetName)); String line; while ((line = bin.readLine()) != null) { content.append(line); content.append(System.getProperty("line.separator")); } newFullFileName = newFullFileName.replace("\\", "/"); File dir = new File(newFullFileName.substring(0, 
newFullFileName.lastIndexOf("/"))); if (!dir.exists()) { dir.mkdirs(); } @Cleanup Writer out = new OutputStreamWriter(new FileOutputStream(newFullFileName), newCharsetName); out.write(content.toString()); } }