C++ UTF8和UTF16互轉代碼

本文轉載自查看原文 2017-05-03 11:41 6847 C/C++/ C++/ linux/unix編程/ utf8/ unicode/ 數據結構與算法/ windows系統編程

簡介

1、這段代碼只考慮在小端序（little-endian）機器上的轉換（一般的機器都是小端序）。
2、這段代碼需要C++11的支持(只是用到了u16string)，如果不支持，可以添加下面代碼

typedef uint16_t char16_t;
typedef std::basic_string<char16_t> u16string;

utfconvert.h

#ifndef __UTFCONVERT_H__
#define __UTFCONVERT_H__
#include <cstddef>	// NULL, used by the default arguments below
#include <string>


// Converts a UTF-16 string that starts with a BOM to UTF-8.
// The first code unit selects the byte order: 0xFEFF is handled as
// little-endian, 0xFFFE as big-endian. An empty input, or an input whose
// first unit is neither value, yields an empty string.
std::string utf16_to_utf8(const std::u16string& u16str);

// Converts UTF-16 LE code units to UTF-8.
// A leading 0xFEFF BOM unit is skipped.
std::string utf16le_to_utf8(const std::u16string& u16str);

// Converts UTF-16 BE code units to UTF-8.
// Every unit is byte-swapped before it is decoded.
std::string utf16be_to_utf8(const std::u16string& u16str);

// Converts UTF-8 text to UTF-16 LE code units.
//   addbom - when true the result starts with a BOM unit.
//   ok     - optional; when not NULL it receives whether the whole input
//            could be decoded.
std::u16string utf8_to_utf16le(const std::string& u8str, bool addbom = false, bool* ok = NULL);

// Converts UTF-8 text to UTF-16 BE code units (the UTF-16 LE result with
// every unit byte-swapped). Parameters are the same as utf8_to_utf16le.
std::u16string utf8_to_utf16be(const std::string& u8str, bool addbom = false, bool* ok = NULL);

#endif //! __UTFCONVERT_H__

utfconvert.cpp

#include "utfconvert.h"

#include <stdint.h>
#ifdef __GNUC__
#include <endian.h>
#endif // __GNUC__

// Returns `number` with its two bytes exchanged (0x1234 -> 0x3412).
//
// Written with plain shifts instead of compiler intrinsics so that it does not
// depend on a particular compiler version or on headers this file does not
// include.
static inline uint16_t byteswap_ushort(uint16_t number)
{
	// The operands are promoted to int before shifting, so the left shift can
	// carry bits above bit 15; the cast back to uint16_t discards them.
	return static_cast<uint16_t>((number >> 8) | (number << 8));
}


////////////////////////////////////////
//     以下轉換都是在小端序下進行     //
////////////////////////////////////////

// Converts a BOM-prefixed UTF-16 string to UTF-8.
// The first code unit decides which converter runs:
//   0xFEFF -> utf16le_to_utf8
//   0xFFFE -> utf16be_to_utf8
// An empty input, or one whose first unit is neither value, yields an empty
// string.
std::string utf16_to_utf8(const std::u16string& u16str)
{
	if (u16str.empty()) {
		return std::string();
	}

	// Byte Order Mark as it appears in the first code unit.
	const char16_t mark = u16str[0];
	if (mark == 0xFEFF) {
		// Little Endian
		return utf16le_to_utf8(u16str);
	}
	if (mark == 0xFFFE) {
		// Big Endian
		return utf16be_to_utf8(u16str);
	}
	return std::string();
}


// Converts UTF-16 LE code units to UTF-8.
//
// The code units are read as they are, which assumes a little-endian host.
// A leading BOM unit (0xFEFF) is skipped and not written to the output.
// A high surrogate followed by a low surrogate is combined into one code point
// and written as a 4-byte sequence. An unpaired surrogate (a high surrogate
// without a following low surrogate, or a low surrogate on its own) has no
// UTF-8 representation and is written as U+FFFD (REPLACEMENT CHARACTER).
//
// Returns the UTF-8 bytes; an empty input yields an empty string.
std::string utf16le_to_utf8(const std::u16string& u16str)
{
	if (u16str.empty()) { return std::string(); }

	const char16_t* p = u16str.data();
	std::u16string::size_type len = u16str.length();
	if (p[0] == 0xFEFF) {
		p += 1;	// skip the BOM
		len -= 1;
	}

	std::string u8str;
	// Worst case is 3 bytes per code unit (a surrogate pair is 2 units -> 4 bytes).
	u8str.reserve(len * 3);

	for (std::u16string::size_type i = 0; i < len; ++i) {
		uint32_t codePoint = p[i];

		if (codePoint >= 0xD800 && codePoint <= 0xDBFF) {
			// High surrogate: only valid when a low surrogate follows.
			const bool hasLow = (i + 1 < len) && p[i + 1] >= 0xDC00 && p[i + 1] <= 0xDFFF;
			if (hasLow) {
				// Each surrogate carries 10 bits; the pair encodes (code point - 0x10000).
				const uint32_t lowSur = p[++i];
				codePoint = 0x10000 + ((codePoint - 0xD800) << 10) + (lowSur - 0xDC00);
			} else {
				codePoint = 0xFFFD;
			}
		} else if (codePoint >= 0xDC00 && codePoint <= 0xDFFF) {
			// Low surrogate without a preceding high surrogate.
			codePoint = 0xFFFD;
		}

		if (codePoint < 0x80) {
			// U+0000 - U+007F: 0xxxxxxx
			u8str.push_back(static_cast<char>(codePoint));
		} else if (codePoint < 0x800) {
			// U+0080 - U+07FF: 110xxxxx 10xxxxxx
			u8str.push_back(static_cast<char>((codePoint >> 6) | 0xC0));
			u8str.push_back(static_cast<char>((codePoint & 0x3F) | 0x80));
		} else if (codePoint < 0x10000) {
			// U+0800 - U+FFFF: 1110xxxx 10xxxxxx 10xxxxxx
			u8str.push_back(static_cast<char>((codePoint >> 12) | 0xE0));
			u8str.push_back(static_cast<char>(((codePoint >> 6) & 0x3F) | 0x80));
			u8str.push_back(static_cast<char>((codePoint & 0x3F) | 0x80));
		} else {
			// U+10000 - U+10FFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			u8str.push_back(static_cast<char>((codePoint >> 18) | 0xF0));
			u8str.push_back(static_cast<char>(((codePoint >> 12) & 0x3F) | 0x80));
			u8str.push_back(static_cast<char>(((codePoint >> 6) & 0x3F) | 0x80));
			u8str.push_back(static_cast<char>((codePoint & 0x3F) | 0x80));
		}
	}

	return u8str;
}


// Converts UTF-16 BE code units to UTF-8.
//
// Every code unit is byte-swapped with byteswap_ushort before it is decoded,
// which assumes a little-endian host. Because the units are still unswapped
// when the BOM is looked for, a big-endian BOM shows up as 0xFFFE in the first
// unit; it is skipped and not written to the output.
// A high surrogate followed by a low surrogate is combined into one code point
// and written as a 4-byte sequence. An unpaired surrogate has no UTF-8
// representation and is written as U+FFFD (REPLACEMENT CHARACTER).
//
// Returns the UTF-8 bytes; an empty input yields an empty string.
std::string utf16be_to_utf8(const std::u16string& u16str)
{
	if (u16str.empty()) { return std::string(); }

	const char16_t* p = u16str.data();
	std::u16string::size_type len = u16str.length();
	if (p[0] == 0xFFFE) {
		p += 1;	// skip the (byte-swapped) BOM
		len -= 1;
	}

	std::string u8str;
	// Worst case is 3 bytes per code unit (a surrogate pair is 2 units -> 4 bytes).
	u8str.reserve(len * 3);

	for (std::u16string::size_type i = 0; i < len; ++i) {
		// Bring the big-endian unit into host order before inspecting it.
		uint32_t codePoint = byteswap_ushort(p[i]);

		if (codePoint >= 0xD800 && codePoint <= 0xDBFF) {
			// High surrogate: only valid when a low surrogate follows.
			const uint32_t next = (i + 1 < len) ? byteswap_ushort(p[i + 1]) : 0;
			if (next >= 0xDC00 && next <= 0xDFFF) {
				// Each surrogate carries 10 bits; the pair encodes (code point - 0x10000).
				codePoint = 0x10000 + ((codePoint - 0xD800) << 10) + (next - 0xDC00);
				++i;
			} else {
				codePoint = 0xFFFD;
			}
		} else if (codePoint >= 0xDC00 && codePoint <= 0xDFFF) {
			// Low surrogate without a preceding high surrogate.
			codePoint = 0xFFFD;
		}

		if (codePoint < 0x80) {
			// U+0000 - U+007F: 0xxxxxxx
			u8str.push_back(static_cast<char>(codePoint));
		} else if (codePoint < 0x800) {
			// U+0080 - U+07FF: 110xxxxx 10xxxxxx
			u8str.push_back(static_cast<char>((codePoint >> 6) | 0xC0));
			u8str.push_back(static_cast<char>((codePoint & 0x3F) | 0x80));
		} else if (codePoint < 0x10000) {
			// U+0800 - U+FFFF: 1110xxxx 10xxxxxx 10xxxxxx
			u8str.push_back(static_cast<char>((codePoint >> 12) | 0xE0));
			u8str.push_back(static_cast<char>(((codePoint >> 6) & 0x3F) | 0x80));
			u8str.push_back(static_cast<char>((codePoint & 0x3F) | 0x80));
		} else {
			// U+10000 - U+10FFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			u8str.push_back(static_cast<char>((codePoint >> 18) | 0xF0));
			u8str.push_back(static_cast<char>(((codePoint >> 12) & 0x3F) | 0x80));
			u8str.push_back(static_cast<char>(((codePoint >> 6) & 0x3F) | 0x80));
			u8str.push_back(static_cast<char>((codePoint & 0x3F) | 0x80));
		}
	}
	return u8str;
}






// Converts UTF-8 text to UTF-16 LE code units.
//
//   u8str  - UTF-8 bytes. A leading UTF-8 BOM (EF BB BF) is skipped.
//   addbom - when true the result starts with the BOM unit 0xFEFF.
//   ok     - optional; when not NULL it receives true if every byte of the
//            input was decoded, false if anything had to be skipped.
//
// Code points U+10000 - U+10FFFF are written as a surrogate pair, everything
// else as a single unit. Malformed input is not converted: a byte that cannot
// start a sequence, or a lead byte whose continuation bytes are missing or
// invalid, is skipped on its own; a complete sequence that is overlong, that
// encodes a surrogate (U+D800 - U+DFFF) or that exceeds U+10FFFF is skipped
// as a whole. In all of these cases *ok is set to false.
std::u16string utf8_to_utf16le(const std::string& u8str, bool addbom, bool* ok)
{
	std::u16string u16str;
	u16str.reserve(u8str.size() + (addbom ? 1 : 0));
	if (addbom) {
		u16str.push_back(0xFEFF);	// BOM (stored as bytes FF FE)
	}

	const unsigned char* p = reinterpret_cast<const unsigned char*>(u8str.data());
	std::string::size_type len = u8str.length();
	// Skip a UTF-8 BOM, including the case where the input is only the BOM.
	if (len >= 3 && p[0] == 0xEF && p[1] == 0xBB && p[2] == 0xBF) {
		p += 3;
		len -= 3;
	}

	bool is_ok = true;
	std::string::size_type i = 0;
	while (i < len) {
		const uint32_t lead = p[i];
		if (lead < 0x80) {
			// 0xxxxxxx: single byte
			u16str.push_back(static_cast<char16_t>(lead));
			++i;
			continue;
		}

		// Work out the sequence length from the lead byte, the payload bits it
		// carries, and the smallest code point that needs this length.
		std::string::size_type extra = 0;
		uint32_t codePoint = 0;
		uint32_t minValue = 0;
		if ((lead & 0xE0) == 0xC0) {
			// 110xxxxx 10xxxxxx
			extra = 1; codePoint = lead & 0x1F; minValue = 0x80;
		} else if ((lead & 0xF0) == 0xE0) {
			// 1110xxxx 10xxxxxx 10xxxxxx
			extra = 2; codePoint = lead & 0x0F; minValue = 0x800;
		} else if ((lead & 0xF8) == 0xF0) {
			// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			extra = 3; codePoint = lead & 0x07; minValue = 0x10000;
		} else {
			// Stray continuation byte or a byte that never occurs in UTF-8.
			is_ok = false;
			++i;
			continue;
		}

		// All continuation bytes must be present and of the form 10xxxxxx.
		bool complete = extra < len - i;
		for (std::string::size_type k = 1; complete && k <= extra; ++k) {
			const uint32_t next = p[i + k];
			if ((next & 0xC0) != 0x80) {
				complete = false;
			} else {
				codePoint = (codePoint << 6) | (next & 0x3F);
			}
		}
		if (!complete) {
			// Drop only the lead byte so the following bytes are examined again.
			is_ok = false;
			++i;
			continue;
		}
		i += extra + 1;

		const bool isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
		if (codePoint < minValue || codePoint > 0x10FFFF || isSurrogate) {
			is_ok = false;
			continue;
		}

		if (codePoint >= 0x10000) {
			// Subtract 0x10000 to get a 20-bit value; the high 10 bits go into
			// the high surrogate (0xD800 + ...), the low 10 bits into the low
			// surrogate (0xDC00 + ...).
			codePoint -= 0x10000;
			u16str.push_back(static_cast<char16_t>((codePoint >> 10) | 0xD800U));
			u16str.push_back(static_cast<char16_t>((codePoint & 0x03FFU) | 0xDC00U));
		} else {
			u16str.push_back(static_cast<char16_t>(codePoint));
		}
	}
	if (ok != NULL) { *ok = is_ok; }

	return u16str;
}


// Converts UTF-8 text to UTF-16 BE code units.
// The conversion itself is done by utf8_to_utf16le (which also receives
// addbom and ok unchanged); every resulting unit is then byte-swapped.
std::u16string utf8_to_utf16be(const std::string& u8str, bool addbom, bool* ok)
{
	std::u16string swapped = utf8_to_utf16le(u8str, addbom, ok);
	// Little-endian units -> big-endian units.
	for (char16_t& unit : swapped) {
		unit = byteswap_ushort(unit);
	}
	return swapped;
}

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 utf8、utf16、utf32之間的格式從字節理解Unicode（UTF8/UTF16) utf16編碼格式 C++ UTF8 UrlEncode（寬字符） UTF8與std:string互轉 C++的標准庫函數默認都是操作字節，而不是字符，非常痛苦，所以引入了u16string和u32string（Linux上的wchar_t是32位的原因，utf16對unicode的支持是有缺陷的）good Unicode和UTF8/16/32詳細介紹 C++ UTF-8和GBK相互轉化 C++實現utf8和gbk編碼字符串互相轉換 C++ URLDecode和URLEncode實現——僅限gb2312，非utf8