C++ UTF8字符串与UNICODE转换


这是 zen 库中的一小段源码,源文件名为“zen_utf8.h”,其中只有两个函数:

std::u32string UTF8ToUnicode(std::string const & utf8);
std::string UnicodeToUTF8(std::u32string const & strRes);

用法:

#include <iostream>
#include <string>
#include "zen_utf8.h"

using namespace std;

// Demo: decode a UTF-8 string into code points, keep the first four of them,
// encode those back to UTF-8 and print both the full text and the prefix.
int main(int argc, const char * argv[]) {
	const std::string text = "你好,ABCD";

	// Work on code points so that substr() counts characters, not bytes.
	const std::u32string codePoints = Zen::UTF8ToUnicode(text);
	const std::u32string leading = codePoints.substr(0, 4);
	const std::string leadingUtf8 = Zen::UnicodeToUTF8(leading);

	std::cout << text << " 的前4个字是:" << leadingUtf8 << std::endl;
	return 0;
}

附 zen_utf8.h 源文件内容

/*
 Copyright (c) 2013 MeherTJ G. All rights reserved.
 License: Everybody can use these code freely.
 */

#pragma once
#include <string>
#include <cstdint>

namespace Zen {

	/// @brief Decode a UTF-8 byte string into a sequence of Unicode code points.
	///
	/// Only well-formed UTF-8 as defined by RFC 3629 (1 to 4 bytes per code
	/// point, U+0000..U+10FFFF, no surrogates, no overlong forms) is decoded to
	/// its code point. Malformed input never desynchronises the decoder:
	///  - a stray continuation byte (0x80..0xBF) or an invalid lead byte
	///    (0xF8..0xFF) yields one U+FFFD and decoding resumes at the next byte;
	///  - a lead byte whose continuation bytes are interrupted by a
	///    non-continuation byte yields one U+FFFD for the bytes consumed so far
	///    and decoding resumes at the interrupting byte;
	///  - an overlong form, an encoded surrogate or a value above U+10FFFF
	///    yields one U+FFFD for the whole sequence;
	///  - a sequence that is cut off by the end of the input is dropped and
	///    decoding stops there.
	///
	/// The function is `inline` because it is defined in a header; without it,
	/// including the header from two translation units fails to link.
	///
	/// @param utf8 Bytes to decode; may be empty.
	/// @return The decoded code points, in input order.
	inline std::u32string UTF8ToUnicode(std::string const & utf8)
	{
		const char32_t replacement = 0xFFFD;
		const std::size_t size = utf8.size();

		std::u32string res;
		res.reserve(size);  // every code point consumes at least one byte

		std::size_t i = 0;
		while (i < size)
		{
			const auto lead = static_cast<unsigned char>(utf8[i]);

			if (lead < 0x80)  ///< 0xxx-xxxx
			{
				res.push_back(lead);
				++i;
				continue;
			}

			std::size_t length = 0;  // total bytes announced by the lead byte
			char32_t value = 0;      // payload bits gathered so far
			char32_t minimum = 0;    // smallest value that needs `length` bytes
			if ((lead & 0xE0) == 0xC0)       ///< 110x-xxxx 10xx-xxxx
			{
				length = 2;
				value = lead & 0x1F;
				minimum = 0x80;
			}
			else if ((lead & 0xF0) == 0xE0)  ///< 1110-xxxx 10xx-xxxx 10xx-xxxx
			{
				length = 3;
				value = lead & 0x0F;
				minimum = 0x800;
			}
			else if ((lead & 0xF8) == 0xF0)  ///< 1111-0xxx 10xx-xxxx 10xx-xxxx 10xx-xxxx
			{
				length = 4;
				value = lead & 0x07;
				minimum = 0x10000;
			}
			else
			{
				// Stray continuation byte or 0xF8..0xFF: not a valid lead byte.
				res.push_back(replacement);
				++i;
				continue;
			}

			// Gather continuation bytes, stopping at the first byte that is not
			// of the form 10xx-xxxx or at the end of the input.
			std::size_t consumed = 1;
			while (consumed < length && i + consumed < size)
			{
				const auto next = static_cast<unsigned char>(utf8[i + consumed]);
				if ((next & 0xC0) != 0x80)
				{
					break;
				}
				value = (value << 6) | (next & 0x3F);
				++consumed;
			}

			if (consumed < length)
			{
				if (i + consumed >= size)
				{
					// Input ends in the middle of a sequence: drop the tail.
					break;
				}
				// Interrupted sequence: replace what was consumed and resume
				// at the interrupting byte, so it is decoded on its own.
				res.push_back(replacement);
				i += consumed;
				continue;
			}

			const bool overlong = value < minimum;
			const bool surrogate = value >= 0xD800 && value <= 0xDFFF;
			const bool outOfRange = value > 0x10FFFF;
			if (overlong || surrogate || outOfRange)
			{
				res.push_back(replacement);
			}
			else
			{
				res.push_back(value);
			}
			i += length;
		}

		return res;
	}

	/// @brief Encode a sequence of Unicode code points as UTF-8.
	///
	/// Each code point becomes 1 to 4 bytes as defined by RFC 3629. Values that
	/// are not Unicode scalar values (surrogates U+D800..U+DFFF and anything
	/// above U+10FFFF) cannot be represented in UTF-8 and are encoded as
	/// U+FFFD (bytes EF BF BD) so that the output is always well-formed.
	///
	/// The function is `inline` because it is defined in a header; without it,
	/// including the header from two translation units fails to link.
	///
	/// @param strRes Code points to encode; may be empty.
	/// @return The UTF-8 encoded bytes.
	inline std::string UnicodeToUTF8(std::u32string const & strRes)
	{
		std::string utf8;
		utf8.reserve(strRes.size());  // lower bound: one byte per code point

		for (char32_t c : strRes)
		{
			auto cp = static_cast<uint32_t>(c);
			if (cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF))
			{
				cp = 0xFFFD;  // not a Unicode scalar value
			}

			if (cp < 0x80)
			{
				utf8.push_back(static_cast<char>(cp));
			}
			else if (cp < 0x800)
			{
				utf8.push_back(static_cast<char>(0xC0 | (cp >> 6)));
				utf8.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
			}
			else if (cp < 0x10000)
			{
				utf8.push_back(static_cast<char>(0xE0 | (cp >> 12)));
				utf8.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3F)));
				utf8.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
			}
			else
			{
				utf8.push_back(static_cast<char>(0xF0 | (cp >> 18)));
				utf8.push_back(static_cast<char>(0x80 | ((cp >> 12) & 0x3F)));
				utf8.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3F)));
				utf8.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
			}
		}
		return utf8;
	}
}


免责声明!

本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系本站邮箱yoyou2525@163.com删除。



 
粤ICP备18138465号  © 2018-2025 CODEPRJ.COM