撥開字符編碼的迷霧--字符編碼轉換

本文轉載自查看原文 2017-09-04 14:26 685 Unicode/ BOM/ C++/ UTF-8/ UTF-16/ 字符編碼/ ASCII/ ANSI

撥開字符編碼迷霧系列文章鏈接：

撥開字符編碼的迷霧--字符編碼概述

撥開字符編碼的迷霧--編譯器如何處理文件編碼

撥開字符編碼的迷霧--字符編碼轉換

撥開字符編碼的迷霧--MySQL數據庫字符編碼

1. Windows API介紹

本文介紹使用Windows API進行字符編碼的轉換，涉及WideCharToMultiByte和MultiByteToWideChar這2個API。
API接口名中的MultiByte對應着多字節編碼，如ASCII、UTF-8等都是多字節編碼；而WideChar字面意思是寬字符，在Windows內部寬字符特指UTF-16編碼。原型如下：

// Windows API prototype, reproduced here for reference.
// Converts a wide-character string (UTF-16 inside Windows) into a multibyte
// string encoded in the code page selected by CodePage.
// Usage pattern relied on by the wrappers in this file:
//   - a first call with lpMultiByteStr == NULL and cbMultiByte == 0 returns
//     the number of bytes the result needs;
//   - a second call with a buffer of that size performs the conversion;
//   - a return value of 0 is treated as failure.
// cchWideChar is the input length in wide characters; the wrappers pass -1
// together with a NUL-terminated string.
// NOTE(review): lpDefaultChar / lpUsedDefaultChar are always passed as NULL
// in this file; see the official API documentation for their exact meaning.
int WideCharToMultiByte(
  UINT CodePage, 
  DWORD dwFlags, 
  LPCWSTR lpWideCharStr, 
  int cchWideChar, 
  LPSTR lpMultiByteStr, 
  int cbMultiByte, 
  LPCSTR lpDefaultChar, 
  LPBOOL lpUsedDefaultChar 
);

// Windows API prototype, reproduced here for reference.
// Converts a multibyte string encoded in the code page selected by CodePage
// into a wide-character string (UTF-16 inside Windows).
// Usage pattern relied on by the wrappers in this file:
//   - a first call with lpWideCharStr == NULL and cchWideChar == 0 returns
//     the number of wide characters the result needs;
//   - a second call with a buffer of that size performs the conversion;
//   - a return value of 0 is treated as failure.
// cbMultiByte is the input length in bytes; the wrappers pass -1 together
// with a NUL-terminated string.
int MultiByteToWideChar(
  UINT CodePage, 
  DWORD dwFlags, 
  LPCSTR lpMultiByteStr, 
  int cbMultiByte, 
  LPWSTR lpWideCharStr, 
  int cchWideChar 
);

2. 接口封裝

/// Converts a UTF-16 wide string into a multibyte string in the given code page.
///
/// The input is passed to the API with its explicit length, so embedded NUL
/// characters are converted like any other character instead of cutting the
/// text short.
///
/// @param str        Wide (UTF-16) text to convert.
/// @param iCodePage  Target code page handed to WideCharToMultiByte
///                   (defaults to CP_ACP).
/// @return The converted bytes; an empty string when the input is empty, is
///         longer than the API's int length can express, or the conversion
///         fails.
/// @throws std::bad_alloc if the result string cannot be allocated.
std::string UnicodeToANSI(const std::wstring &str, UINT iCodePage = CP_ACP) {
	std::string strRes;

	// The API takes the length as int: bail out on empty input and on sizes
	// that do not survive the round trip through int.
	const int iLen = static_cast<int>(str.size());
	if (iLen <= 0 || static_cast<std::wstring::size_type>(iLen) != str.size())
		return strRes;

	// Pass 1: NULL buffer + size 0 asks only for the required byte count.
	const int iSize = ::WideCharToMultiByte(iCodePage, 0, str.data(), iLen, NULL, 0, NULL, NULL);
	if (iSize <= 0)
		return strRes;

	// Pass 2: convert straight into the string's own storage (no scratch
	// buffer, no extra copy).
	strRes.resize(static_cast<std::string::size_type>(iSize));
	const int iWritten = ::WideCharToMultiByte(iCodePage, 0, str.data(), iLen, &strRes[0], iSize, NULL, NULL);

	if (iWritten <= 0)
		strRes.clear();  // never hand back a half-converted buffer
	else
		strRes.resize(static_cast<std::string::size_type>(iWritten));

	return strRes;
}

/// Converts a multibyte string in the given code page into a UTF-16 wide string.
///
/// The input is passed to the API with its explicit byte length, so embedded
/// NUL bytes do not cut the text short.
///
/// @param str        Multibyte text encoded in iCodePage.
/// @param iCodePage  Source code page handed to MultiByteToWideChar
///                   (defaults to CP_ACP).
/// @return The converted wide string; an empty string when the input is
///         empty, is longer than the API's int length can express, or the
///         conversion fails.
/// @throws std::bad_alloc if the result string cannot be allocated.
std::wstring ANSIToUnicode(const std::string &str, UINT iCodePage = CP_ACP) {
	std::wstring strRes;

	// The API takes the length as int: bail out on empty input and on sizes
	// that do not survive the round trip through int.
	const int iLen = static_cast<int>(str.size());
	if (iLen <= 0 || static_cast<std::string::size_type>(iLen) != str.size())
		return strRes;

	// Pass 1: NULL buffer + size 0 asks only for the required character count.
	const int iSize = ::MultiByteToWideChar(iCodePage, 0, str.data(), iLen, NULL, 0);
	if (iSize <= 0)
		return strRes;

	// Pass 2: convert straight into the string's own storage.
	strRes.resize(static_cast<std::wstring::size_type>(iSize));
	const int iWritten = ::MultiByteToWideChar(iCodePage, 0, str.data(), iLen, &strRes[0], iSize);

	if (iWritten <= 0)
		strRes.clear();  // never hand back a half-converted buffer
	else
		strRes.resize(static_cast<std::wstring::size_type>(iWritten));

	return strRes;
}

/// Converts a UTF-16 wide string into UTF-8 (without a byte order mark).
///
/// The input is passed to the API with its explicit length, so embedded NUL
/// characters are converted like any other character.
///
/// @param str  Wide (UTF-16) text to convert.
/// @return The UTF-8 bytes; an empty string when the input is empty, is
///         longer than the API's int length can express, or the conversion
///         fails.
/// @throws std::bad_alloc if the result string cannot be allocated.
std::string UnicodeToUTF8(const std::wstring &str) {
	std::string strRes;

	// The API takes the length as int: bail out on empty input and on sizes
	// that do not survive the round trip through int.
	const int iLen = static_cast<int>(str.size());
	if (iLen <= 0 || static_cast<std::wstring::size_type>(iLen) != str.size())
		return strRes;

	// Pass 1: NULL buffer + size 0 asks only for the required byte count.
	const int iSize = ::WideCharToMultiByte(CP_UTF8, 0, str.data(), iLen, NULL, 0, NULL, NULL);
	if (iSize <= 0)
		return strRes;

	// Pass 2: convert straight into the string's own storage.
	strRes.resize(static_cast<std::string::size_type>(iSize));
	const int iWritten = ::WideCharToMultiByte(CP_UTF8, 0, str.data(), iLen, &strRes[0], iSize, NULL, NULL);

	if (iWritten <= 0)
		strRes.clear();  // never hand back a half-converted buffer
	else
		strRes.resize(static_cast<std::string::size_type>(iWritten));

	return strRes;
}

/// Converts a UTF-16 wide string into UTF-8 prefixed with the UTF-8 byte
/// order mark (EF BB BF).
///
/// The input is passed to the API with its explicit length, so embedded NUL
/// characters are converted like any other character.
///
/// @param str  Wide (UTF-16) text to convert.
/// @return BOM followed by the UTF-8 bytes. Empty input yields the BOM alone.
///         An empty string (no BOM) is returned when the input is longer than
///         the API's int length can express or the conversion fails.
/// @throws std::bad_alloc if the result string cannot be allocated.
std::string UnicodeToUTF8BOM(const std::wstring &str) {
	static const char szBom[] = "\xEF\xBB\xBF";
	const std::string::size_type nBomLen = sizeof(szBom) - 1;  // drop the literal's NUL

	// Nothing to convert: the result is just the signature.
	if (str.empty())
		return std::string(szBom, nBomLen);

	std::string strRes;

	// The API takes the length as int: reject sizes that do not survive the
	// round trip through int.
	const int iLen = static_cast<int>(str.size());
	if (iLen <= 0 || static_cast<std::wstring::size_type>(iLen) != str.size())
		return strRes;

	// Pass 1: NULL buffer + size 0 asks only for the required byte count.
	const int iSize = ::WideCharToMultiByte(CP_UTF8, 0, str.data(), iLen, NULL, 0, NULL, NULL);
	if (iSize <= 0)
		return strRes;

	// Pass 2: lay down the BOM, then convert directly behind it.
	strRes.assign(szBom, nBomLen);
	strRes.resize(nBomLen + static_cast<std::string::size_type>(iSize));
	const int iWritten = ::WideCharToMultiByte(CP_UTF8, 0, str.data(), iLen, &strRes[nBomLen], iSize, NULL, NULL);

	if (iWritten <= 0)
		strRes.clear();  // failed conversion: return nothing, not a bare BOM
	else
		strRes.resize(nBomLen + static_cast<std::string::size_type>(iWritten));

	return strRes;
}

/// Converts UTF-8 text into a UTF-16 wide string.
///
/// The input is handed to the API unmodified (a leading byte order mark is
/// not stripped here) and with its explicit byte length, so embedded NUL
/// bytes do not cut the text short.
///
/// @param str  UTF-8 encoded text.
/// @return The converted wide string; an empty string when the input is
///         empty, is longer than the API's int length can express, or the
///         conversion fails.
/// @throws std::bad_alloc if the result string cannot be allocated.
std::wstring UTF8ToUnicode(const std::string &str) {
	std::wstring strRes;

	// The API takes the length as int: bail out on empty input and on sizes
	// that do not survive the round trip through int.
	const int iLen = static_cast<int>(str.size());
	if (iLen <= 0 || static_cast<std::string::size_type>(iLen) != str.size())
		return strRes;

	// Pass 1: NULL buffer + size 0 asks only for the required character count.
	const int iSize = ::MultiByteToWideChar(CP_UTF8, 0, str.data(), iLen, NULL, 0);
	if (iSize <= 0)
		return strRes;

	// Pass 2: convert straight into the string's own storage.
	strRes.resize(static_cast<std::wstring::size_type>(iSize));
	const int iWritten = ::MultiByteToWideChar(CP_UTF8, 0, str.data(), iLen, &strRes[0], iSize);

	if (iWritten <= 0)
		strRes.clear();  // never hand back a half-converted buffer
	else
		strRes.resize(static_cast<std::wstring::size_type>(iWritten));

	return strRes;
}

// Re-encodes multibyte text from iCodePage (default CP_ACP) as UTF-8,
// using UTF-16 as the intermediate representation.
std::string ANSIToUTF8(const std::string &str, UINT iCodePage = CP_ACP) {
	const std::wstring wide = ANSIToUnicode(str, iCodePage);
	return UnicodeToUTF8(wide);
}

// Re-encodes multibyte text from iCodePage (default CP_ACP) as UTF-8 with a
// byte order mark, using UTF-16 as the intermediate representation.
std::string ANSIToUTF8BOM(const std::string &str, UINT iCodePage = CP_ACP) {
	const std::wstring wide = ANSIToUnicode(str, iCodePage);
	return UnicodeToUTF8BOM(wide);
}

// Re-encodes UTF-8 text into the multibyte encoding of iCodePage
// (default CP_ACP), using UTF-16 as the intermediate representation.
std::string UTF8ToANSI(const std::string &str, UINT iCodePage = CP_ACP) {
	const std::wstring wide = UTF8ToUnicode(str);
	return UnicodeToANSI(wide, iCodePage);
}

對於只支持簡體中文（部分韓文、日文）的系統，iCodePage可以使用CP_ACP，這時API會使用系統當前的代碼頁（簡體中文系統為CP936，即GBK字符集）來進行編碼轉換。但遇到如下情況就需要手動指定代碼頁了：

需要轉換的字符串中的文字是系統當前代碼頁不支持的。如字符串中含有中文，而當前系統代碼頁卻是英文的；
GBK字符集中只包含了一部分韓文和日文，這部分韓文和日文可以正常轉換；若遇到不能轉換的情況，也需要將iCodePage指定為特定的支持韓文或日文的代碼頁，特別是中文和韓文、日文等混合的情況下。如韓文“탉”不包含在GBK中，若這時仍然使用CP_ACP就會得到錯誤的轉換結果“?”（十六進制3F）。但GB18030（代碼頁為54936）支持“탉”，可以手動指定iCodePage為54936。

如果代碼中含有GBK不支持的字符，如“탉”、“𤭢”等，Visual Studio會彈出如下提示：

選擇“以其他編碼保存”，選擇“Unicode（UTF-8帶簽名）- 代碼頁65001”保存。
雖然“簡體中文（GB18030）- 代碼頁54936”也支持這些字符，但不能選擇該選項進行保存，具體原因在撥開字符編碼的迷霧--編譯器如何處理文件編碼中有詳細的介紹。

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找 撥開字符編碼的迷霧--字符編碼概述撥開字符編碼的迷霧--編譯器如何處理文件編碼字符編碼轉換 LoadRunner字符編碼轉換 iconv字符編碼轉換 python 2 or 3 的字符編碼轉換 PHP轉換數組的字符編碼 c++字符編碼轉換字符編碼轉換筆記字符編碼