gbk轉utf-8,奇數中文亂碼。
一、亂碼的原因
gbk的中文編碼是一個漢字用【2】個字節表示,例如漢字“内部”的gbk編碼16進制的顯示為c4 da b2 bf
utf-8的中文編碼是一個漢字用【3】個字節表示,例如漢字“内部”的utf-8編碼16進制的顯示為e5 86 85 e9 83 a8
很顯然,gbk的字節無法直接當作utf-8使用:兩者的字節數不同(少字節變為多字節),必須先解碼成Unicode字符,再按utf-8的規則重新編碼
二、轉換的辦法
1.首先取得字符串中每個字符的Unicode編碼值(Java中由charAt()或toCharArray()得到的16位char值,而不是getBytes()得到的GBK原始字節),轉換成二進制字符串,不足16位時在前面補零,共16位。
2.根據UTF-8的漢字編碼規則,首字節以1110開頭,次字節以10開頭,第3字節以10開頭。在原始的2進制字符串中插入標誌位。最終的長度從16--->16+4+2+2=24。
3.轉換完成
通過以下方法將GBK字符轉成UTF-8編碼格式的byte[]數組
- package test;
- import java.io.UnsupportedEncodingException;
/**
 * Demonstrates hand-rolled UTF-8 encoding of a Java {@code String}.
 *
 * <p>A Java {@code String} is already a sequence of UTF-16 code units, regardless of
 * the charset its bytes were originally decoded from. The methods below therefore work
 * on Unicode code points, not on GBK bytes, and apply the UTF-8 bit layout:
 *
 * <pre>
 * U+0000  .. U+007F    0xxxxxxx
 * U+0080  .. U+07FF    110xxxxx 10xxxxxx
 * U+0800  .. U+FFFF    1110xxxx 10xxxxxx 10xxxxxx
 * U+10000 .. U+10FFFF  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * </pre>
 */
public class TestEncoder {

    // Charset constants avoid the checked UnsupportedEncodingException that the
    // String-named overloads ("UTF-8") force callers to handle.
    private static final java.nio.charset.Charset UTF_8 =
            java.nio.charset.StandardCharsets.UTF_8;
    private static final java.nio.charset.Charset ISO_8859_1 =
            java.nio.charset.StandardCharsets.ISO_8859_1;

    /**
     * Prints three lines: the sample text's UTF-8 bytes misread as ISO-8859-1 (mojibake),
     * the same bytes recovered by reversing that step, and the result of the hand-rolled
     * encoder round trip.
     *
     * @param args unused
     */
    public static void main(String[] args) throws Exception {
        String gbk = "iteye問答頻道編碼轉換問題";
        // ISO-8859-1 maps every byte to exactly one char, so this step is lossless.
        String iso = new String(gbk.getBytes(UTF_8), ISO_8859_1);
        System.out.println(iso);
        String utf8 = new String(iso.getBytes(ISO_8859_1), UTF_8);
        System.out.println(utf8);
        System.out.println(getUTF8StringFromGBKString(gbk));
    }

    /**
     * Encodes {@code gbkStr} to UTF-8 with {@link #getUTF8BytesFromGBKString(String)} and
     * decodes the bytes back into a {@code String}.
     *
     * @param gbkStr the text to round-trip; must not be {@code null}
     * @return the text decoded from the generated UTF-8 bytes
     * @throws NullPointerException if {@code gbkStr} is {@code null}
     */
    public static String getUTF8StringFromGBKString(String gbkStr) {
        return new String(getUTF8BytesFromGBKString(gbkStr), UTF_8);
    }

    /**
     * Encodes {@code gbkStr} as UTF-8 by inserting the UTF-8 marker bits by hand.
     *
     * <p>Each code point is written with the shortest valid form (1 to 4 bytes). An
     * unpaired surrogate cannot be represented in UTF-8 and is written as {@code '?'}.
     *
     * @param gbkStr the text to encode; must not be {@code null}
     * @return a new array holding exactly the encoded bytes (empty for an empty string)
     * @throws NullPointerException if {@code gbkStr} is {@code null}
     */
    public static byte[] getUTF8BytesFromGBKString(String gbkStr) {
        int n = gbkStr.length();
        // Worst case is 3 bytes per UTF-16 code unit; a surrogate pair (2 units)
        // needs only 4 bytes, so 3 * n is always enough.
        byte[] utfBytes = new byte[3 * n];
        int k = 0;
        int i = 0;
        while (i < n) {
            int cp = gbkStr.codePointAt(i);
            i += Character.charCount(cp);
            if (cp < 0x80) {
                utfBytes[k++] = (byte) cp;
            } else if (cp < 0x800) {
                // Two-byte form; using three bytes here would be an overlong encoding.
                utfBytes[k++] = (byte) (0xc0 | (cp >> 6));
                utfBytes[k++] = (byte) (0x80 | (cp & 0x3f));
            } else if (cp >= Character.MIN_SURROGATE && cp <= Character.MAX_SURROGATE) {
                // codePointAt returns a bare surrogate only when it is unpaired.
                utfBytes[k++] = (byte) '?';
            } else if (cp < 0x10000) {
                utfBytes[k++] = (byte) (0xe0 | (cp >> 12));
                utfBytes[k++] = (byte) (0x80 | ((cp >> 6) & 0x3f));
                utfBytes[k++] = (byte) (0x80 | (cp & 0x3f));
            } else {
                utfBytes[k++] = (byte) (0xf0 | (cp >> 18));
                utfBytes[k++] = (byte) (0x80 | ((cp >> 12) & 0x3f));
                utfBytes[k++] = (byte) (0x80 | ((cp >> 6) & 0x3f));
                utfBytes[k++] = (byte) (0x80 | (cp & 0x3f));
            }
        }
        if (k == utfBytes.length) {
            return utfBytes;
        }
        return java.util.Arrays.copyOf(utfBytes, k);
    }
}
或者:
- public static void gbk2Utf() throws UnsupportedEncodingException {
- String gbk = "我來了";
- char[] c = gbk.toCharArray();
- byte[] fullByte = new byte[3*c.length];
- for (int i=0; i<c.length; i++) {
- String binary = Integer.toBinaryString(c[i]);
- StringBuffer sb = new StringBuffer();
- int len = 16 - binary.length();
- //前面補零
- for(int j=0; j<len; j++){
- sb.append("0");
- }
- sb.append(binary);
- //增加位,達到到24位3個字節
- sb.insert(0, "1110");
- sb.insert(8, "10");
- sb.insert(16, "10");
- fullByte[i*3] = Integer.valueOf(sb.substring(0, 8), 2).byteValue();//二進制字符串創建整型
- fullByte[i*3+1] = Integer.valueOf(sb.substring(8, 16), 2).byteValue();
- fullByte[i*3+2] = Integer.valueOf(sb.substring(16, 24), 2).byteValue();
- }
- //模擬UTF-8編碼的網站顯示
- System.out.println(new String(fullByte,"UTF-8"));
- }