这是 zen 库中的一小段源码,源文件名为“zen_utf8.h”,其中只有两个函数:
std::u32string UTF8ToUnicode(std::string const & utf8);
std::string UnicodeToUTF8(std::u32string const & strRes);
用法:
#include <iostream>
#include <string>
#include "zen_utf8.h"
using namespace std;
// Usage example: print the first four code points of a UTF-8 string.
int main(int argc, const char * argv[]) {
    // The command-line arguments are not used by this example.
    (void)argc;
    (void)argv;

    const std::string text = "你好,ABCD";

    // Decode to one char32_t per code point so that substr() below counts
    // code points rather than UTF-8 bytes.
    const std::u32string codePoints = Zen::UTF8ToUnicode(text);
    const std::u32string firstFour = codePoints.substr(0, 4);

    // Encode the slice back to UTF-8 for printing.
    const std::string firstFourUtf8 = Zen::UnicodeToUTF8(firstFour);

    std::cout << text << " 的前4个字是:" << firstFourUtf8 << std::endl;
    return 0;
}
附 zen_utf8.h 源文件内容
/*
Copyright (c) 2013 MeherTJ G. All rights reserved.
License: Everybody can use this code freely.
*/
#pragma once
#include <string>
#include <cstdint>
namespace Zen {

/**
 * @brief Decode a UTF-8 byte string into a sequence of Unicode code points.
 *
 * Well-formed sequences of one to four bytes are decoded to the scalar value
 * they encode. Every byte that is not part of a well-formed sequence -- a
 * stray continuation byte, a lead byte whose continuation bytes are missing
 * or invalid, an overlong form, an encoded surrogate (U+D800..U+DFFF), a
 * value above U+10FFFF, or one of the bytes 0xF8..0xFF -- is replaced by a
 * single U+FFFD, and decoding continues with the next byte. Malformed input
 * therefore never truncates the result or desynchronises the decoder.
 *
 * Declared inline because the definition lives in a header that may be
 * included from several translation units.
 *
 * @param utf8 Bytes to decode.
 * @return One char32_t per decoded code point or replaced byte.
 */
inline std::u32string UTF8ToUnicode(std::string const & utf8)
{
    const char32_t kReplacement = 0xFFFD;
    const std::size_t size = utf8.size();

    std::u32string res;
    res.reserve(size); // upper bound: at most one code point per input byte

    std::size_t i = 0;
    while (i < size)
    {
        const unsigned char lead = static_cast<unsigned char>(utf8[i]);

        // 0xxx-xxxx: plain ASCII.
        if (lead < 0x80)
        {
            res.push_back(static_cast<char32_t>(lead));
            ++i;
            continue;
        }

        // Classify the lead byte. `length` stays 0 for bytes that can never
        // start a sequence (continuation bytes 0x80..0xBF and 0xF8..0xFF).
        std::size_t length = 0; // total bytes in the sequence
        char32_t value = 0;     // payload bits collected so far
        char32_t minimum = 0;   // smallest value this length may encode
        if ((lead & 0xE0) == 0xC0)      ///< 110x-xxxx 10xx-xxxx
        {
            length = 2;
            value = static_cast<char32_t>(lead & 0x1F);
            minimum = 0x80;
        }
        else if ((lead & 0xF0) == 0xE0) ///< 1110-xxxx 10xx-xxxx 10xx-xxxx
        {
            length = 3;
            value = static_cast<char32_t>(lead & 0x0F);
            minimum = 0x800;
        }
        else if ((lead & 0xF8) == 0xF0) ///< 1111-0xxx 10xx-xxxx 10xx-xxxx 10xx-xxxx
        {
            length = 4;
            value = static_cast<char32_t>(lead & 0x07);
            minimum = 0x10000;
        }

        // The whole sequence must be present before any trailing byte is read.
        bool ok = (length != 0) && (size - i >= length);
        for (std::size_t k = 1; ok && k < length; ++k)
        {
            const unsigned char cont = static_cast<unsigned char>(utf8[i + k]);
            if ((cont & 0xC0) != 0x80)
            {
                ok = false; // not a 10xx-xxxx continuation byte
            }
            else
            {
                value = (value << 6) | static_cast<char32_t>(cont & 0x3F);
            }
        }

        // Reject overlong forms, encoded surrogates and out-of-range values.
        if (ok && (value < minimum
                   || value > 0x10FFFF
                   || (value >= 0xD800 && value <= 0xDFFF)))
        {
            ok = false;
        }

        if (ok)
        {
            res.push_back(value);
            i += length;
        }
        else
        {
            // Replace only the offending byte; the following bytes are
            // re-examined on their own so valid data after it is kept.
            res.push_back(kReplacement);
            ++i;
        }
    }
    return res;
}

/**
 * @brief Encode a sequence of Unicode code points as UTF-8.
 *
 * Each Unicode scalar value (U+0000..U+D7FF and U+E000..U+10FFFF) is written
 * as its one- to four-byte UTF-8 form. Values that are not scalar values --
 * surrogates (U+D800..U+DFFF) and anything above U+10FFFF -- have no UTF-8
 * encoding and are written as U+FFFD (bytes EF BF BD), so the output is
 * always well-formed UTF-8.
 *
 * Declared inline because the definition lives in a header that may be
 * included from several translation units.
 *
 * @param strRes Code points to encode.
 * @return The UTF-8 encoded bytes.
 */
inline std::string UnicodeToUTF8(std::u32string const & strRes)
{
    std::string utf8;
    utf8.reserve(strRes.size()); // lower bound: at least one byte per code point

    for (char32_t c : strRes)
    {
        std::uint32_t cp = static_cast<std::uint32_t>(c);

        // Substitute values that cannot be represented in UTF-8.
        if (cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF))
        {
            cp = 0xFFFD;
        }

        if (cp < 0x80)          ///< 0xxx-xxxx
        {
            utf8.push_back(static_cast<char>(cp));
        }
        else if (cp < 0x800)    ///< 110x-xxxx 10xx-xxxx
        {
            utf8.push_back(static_cast<char>(0xC0 | (cp >> 6)));
            utf8.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
        }
        else if (cp < 0x10000)  ///< 1110-xxxx 10xx-xxxx 10xx-xxxx
        {
            utf8.push_back(static_cast<char>(0xE0 | (cp >> 12)));
            utf8.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3F)));
            utf8.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
        }
        else                    ///< 1111-0xxx 10xx-xxxx 10xx-xxxx 10xx-xxxx
        {
            utf8.push_back(static_cast<char>(0xF0 | (cp >> 18)));
            utf8.push_back(static_cast<char>(0x80 | ((cp >> 12) & 0x3F)));
            utf8.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3F)));
            utf8.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
        }
    }
    return utf8;
}

} // namespace Zen