UTF-8/UNICODE/簡體中文/繁體中文之間的轉換


簡介

這幾天一直在研究中文的簡體和繁體之間的轉換問題,網上查了一下資料,在此進行整理和備份。

繁體中文常見的編碼有GBK碼和BIG5碼兩種(其中GBK是GB2312的擴充,同時涵蓋簡體字與繁體字),簡體中文一般使用的是GB2312編碼。

這些編碼之間的轉換基本都是使用下列3個函數:LCMapString、WideCharToMultiByte和MultiByteToWideChar,其中還會牽涉到UNICODE碼和UTF-8碼這兩種編碼。

GB2312編碼與GBK編碼可以直接使用LCMapString轉換,GB2312編碼/GBK編碼與BIG5編碼則無法直接轉換,必須使用UNICODE作為中間編碼進行中轉。

另外UTF-8編碼是網絡常用編碼,如XML文件和網頁基本都是使用這種編碼,所以在此也一併研究了一下。

下面是我將GB2312/GBK/BIG5/UNICODE/UTF-8這5種編碼之間的轉換編寫到一個函數的代碼。

代碼

  1 int Convert(void *sstr, int scp, void **dstr, int dcp)  2 {  3 #define CP_GBK 936  4 #define CP_BIG5 950  5 #define CP_UTF8 65001  6  7 enum { _unicode, _utf8, _gb2312, _gbk, _big5 };  8 enum { _wc2mb, _mb2wc, _sc2tc, _tc2sc };  9  10  LCID lcid;  11 void *src;  12 void *dest;  13 int cch;  14 int scp0;  15 int act;  16  UINT cp;  17  18 if (((scp < _unicode) || (scp > _big5)) ||  19 ((dcp < _unicode) || (dcp > _big5)))  20 return -1;  21  22 src = NULL;  23 dest = sstr;  24 cch = 0;  25 scp0 = scp;  26  27 while (scp != dcp)  28  {  29 src = dest;  30 switch (scp)  31  {  32 case _unicode:  33 switch (dcp)  34  {  35 case _utf8:  36 scp = _utf8;  37 act = _wc2mb;  38 cp = CP_UTF8;  39 break;  40 case _gb2312:  41 scp = ((scp0 == _big5) ? _gbk : _gb2312);  42 act = _wc2mb;  43 cp = CP_GBK;  44 break;  45 case _gbk:  46 scp = _gbk;  47 act = _wc2mb;  48 cp = CP_GBK;  49 break;  50 case _big5:  51 scp = _big5;  52 act = _wc2mb;  53 cp = CP_BIG5;  54 break;  55  }  56 break;  57 case _utf8:  58 switch (dcp)  59  {  60 case _unicode:  61 case _gb2312:  62 case _gbk:  63 case _big5:  64 scp = _unicode;  65 act = _mb2wc;  66 cp = CP_UTF8;  67 break;  68  }  69 break;  70 case _gb2312:  71 switch (dcp)  72  {  73 case _unicode:  74 case _utf8:  75 scp = _unicode;  76 act = _mb2wc;  77 cp = CP_GBK;  78 break;  79 case _gbk:  80 case _big5:  81 scp = _gbk;  82 act = _sc2tc;  83 break;  84  }  85 break;  86 case _gbk:  87 switch (dcp)  88  {  89 case _unicode:  90 case _utf8:  91 case _big5:  92 scp = _unicode;  93 act = _mb2wc;  94 cp = CP_GBK;  95 break;  96 case _gb2312:  97 scp = _gb2312;  98 act = _tc2sc;  99 break; 100  } 101 break; 102 case _big5: 103 switch (dcp) 104  { 105 case _unicode: 106 case _utf8: 107 case _gb2312: 108 case _gbk: 109 scp = _unicode; 110 act = _mb2wc; 111 cp = CP_BIG5; 112 break; 113  } 114 break; 115  } 116 117 switch (act) 118  { 119 case _wc2mb: 120 cch = WideCharToMultiByte(cp, 0, (wchar_t *)src, -1, NULL, 0, NULL, NULL); 121 dest = 
malloc(cch * sizeof(char)); 122 WideCharToMultiByte(cp, 0, (wchar_t *)src, -1, (char *)dest, cch, NULL, NULL); 123 break; 124 case _mb2wc: 125 cch = MultiByteToWideChar(cp, 0, (char *)src, -1, NULL, 0); 126 dest = malloc(cch * sizeof(wchar_t)); 127 MultiByteToWideChar(cp, 0, (char *)src, -1, (wchar_t *)dest, cch); 128 break; 129 case _sc2tc: 130 lcid = GetSystemDefaultLCID(); 131 cch = LCMapString(lcid, LCMAP_TRADITIONAL_CHINESE, (char *)src, -1, NULL, 0); 132 dest = malloc(cch * sizeof(char)); 133 LCMapString(lcid, LCMAP_TRADITIONAL_CHINESE, (char *)src, -1, (char *)dest, cch); 134 break; 135 case _tc2sc: 136 lcid = GetSystemDefaultLCID(); 137 cch = LCMapString(lcid, LCMAP_SIMPLIFIED_CHINESE, (char *)src, -1, NULL, 0); 138 dest = malloc(cch * sizeof(char)); 139 LCMapString(lcid, LCMAP_SIMPLIFIED_CHINESE, (char *)src, -1, (char *)dest, cch); 140 break; 141  } 142 143 if (src && (src != sstr)) 144  { 145 free(src); 146  } 147  } 148 149 if (dstr) 150  { 151 *dstr = dest; 152  } 153 else 154  { 155 free(dest); 156  } 157 158 return cch; 159 }

參數說明

sstr:[in]源字符串的首地址,由於可能是char *和wchar_t *兩種數據類型,所以這裡我設置為了void *類型

scp:[in]源字符串的編碼方式,0:UNICODE編碼、1:UTF-8編碼、2:GB2312編碼、3:GBK編碼、4:BIG5編碼

dstr:[out]目標字符串地址的指針,由於可能是char **和wchar_t **兩種數據類型,所以這裡我設置為了void **類型;轉換結果由函數內部以malloc分配,使用完畢後需由調用者以free釋放

dcp:[in]目標字符串的編碼方式,取值範圍與scp相同

函數使用

由於編碼方式的數值比較難記憶,所以我將任意兩種編碼之間的轉換進行了如下定義

 /*
 * Convenience wrappers around Convert(), one per (source, destination)
 * encoding pair. Encoding codes: 0 UNICODE, 1 UTF-8, 2 GB2312, 3 GBK, 4 BIG5.
 * src is the source string, dest is the address of the pointer that receives
 * the result. Each macro must sit on its own line to be a valid directive.
 */

/* From UNICODE */
#define UnicodeToUtf8(src, dest)    Convert((void *)(src), 0, (void **)(dest), 1)
#define UnicodeToGb2312(src, dest)  Convert((void *)(src), 0, (void **)(dest), 2)
#define UnicodeToGbk(src, dest)     Convert((void *)(src), 0, (void **)(dest), 3)
#define UnicodeToBig5(src, dest)    Convert((void *)(src), 0, (void **)(dest), 4)

/* From UTF-8 */
#define Utf8ToUnicode(src, dest)    Convert((void *)(src), 1, (void **)(dest), 0)
#define Utf8ToGb2312(src, dest)     Convert((void *)(src), 1, (void **)(dest), 2)
#define Utf8ToGbk(src, dest)        Convert((void *)(src), 1, (void **)(dest), 3)
#define Utf8ToBig5(src, dest)       Convert((void *)(src), 1, (void **)(dest), 4)

/* From GB2312 */
#define Gb2312ToUnicode(src, dest)  Convert((void *)(src), 2, (void **)(dest), 0)
#define Gb2312ToUtf8(src, dest)     Convert((void *)(src), 2, (void **)(dest), 1)
#define Gb2312ToGbk(src, dest)      Convert((void *)(src), 2, (void **)(dest), 3)
#define Gb2312ToBig5(src, dest)     Convert((void *)(src), 2, (void **)(dest), 4)

/* From GBK */
#define GbkToUnicode(src, dest)     Convert((void *)(src), 3, (void **)(dest), 0)
#define GbkToUtf8(src, dest)        Convert((void *)(src), 3, (void **)(dest), 1)
#define GbkToGb2312(src, dest)      Convert((void *)(src), 3, (void **)(dest), 2)
#define GbkToBig5(src, dest)        Convert((void *)(src), 3, (void **)(dest), 4)

/* From BIG5 */
#define Big5ToUnicode(src, dest)    Convert((void *)(src), 4, (void **)(dest), 0)
#define Big5ToUtf8(src, dest)       Convert((void *)(src), 4, (void **)(dest), 1)
#define Big5ToGb2312(src, dest)     Convert((void *)(src), 4, (void **)(dest), 2)
#define Big5ToGbk(src, dest)        Convert((void *)(src), 4, (void **)(dest), 3)

測試代碼如下:

 /*
 * Round-trip demo: convert a GB2312 string to BIG5 and back, printing both.
 * NOTE(review): the literal is passed as GB2312/GBK bytes, so this presumably
 * requires the source file to be saved in that encoding - confirm for your
 * compiler's execution character set.
 */
int main(void)
{
    /* Start as NULL so the free() calls below are safe on every path. */
    char *p0 = NULL;
    char *p1 = NULL;
    int ok = 0;

    /* Only use a result when the conversion reported a positive length. */
    if (Gb2312ToBig5("中華人民共和國", &p0) > 0) {
        printf("%s\n", p0);
        if (Big5ToGb2312(p0, &p1) > 0) {
            printf("%s\n", p1);
            ok = 1;
        }
    }

    free(p0);
    free(p1);

    return ok ? 0 : 1;
}

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM