#include <cstddef>
#include <cstdio>
#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>

// Retained from the original file so any code relying on unqualified std names keeps compiling.
using namespace std;

// Encodings that check_text_encode() can report.
enum TEXT_TYPE
{
    TEXT_ANSI = 0,      // no BOM, and the content is not multi-byte UTF-8 (pure ASCII or a legacy code page)
    TEXT_UTF8 = 1,      // no BOM, content is well-formed UTF-8 with at least one multi-byte sequence
    TEXT_UTF8_BOM = 2,  // starts with EF BB BF
    TEXT_UTF16_LE = 3,  // starts with FF FE
    TEXT_UTF16_BE = 4,  // starts with FE FF
    TEXT_UNKNOW = 5,    // the file could not be opened
};

// Returns true when `text` is well-formed UTF-8 (per the RFC 3629 byte ranges)
// AND contains at least one multi-byte sequence. Pure ASCII yields false because
// it is indistinguishable from an ANSI file.
static bool is_multibyte_utf8(const std::string &text)
{
    const std::size_t len = text.size();
    bool saw_multibyte = false;
    std::size_t i = 0;

    while (i < len)
    {
        const unsigned char lead = static_cast<unsigned char>(text[i]);
        if (lead < 0x80)
        {
            // 0x00-0x7F is plain ASCII: a complete one-byte character.
            ++i;
            continue;
        }

        // Number of continuation bytes, plus the allowed range of the FIRST
        // continuation byte (it is narrower for some lead bytes).
        std::size_t trailing = 0;
        unsigned char lo = 0x80;
        unsigned char hi = 0xBF;

        if (lead >= 0xC2 && lead <= 0xDF)
        {
            trailing = 1;
        }
        else if (lead >= 0xE0 && lead <= 0xEF)
        {
            trailing = 2;
            if (lead == 0xE0)
                lo = 0xA0;  // anything lower would be an overlong encoding
            else if (lead == 0xED)
                hi = 0x9F;  // anything higher would encode a UTF-16 surrogate
        }
        else if (lead >= 0xF0 && lead <= 0xF4)
        {
            trailing = 3;
            if (lead == 0xF0)
                lo = 0x90;  // anything lower would be an overlong encoding
            else if (lead == 0xF4)
                hi = 0x8F;  // anything higher would exceed U+10FFFF
        }
        else
        {
            // Stray continuation byte (0x80-0xBF), overlong lead (0xC0/0xC1),
            // or a byte that never appears in UTF-8 (0xF5-0xFF).
            return false;
        }

        // The sequence must not be truncated by the end of the data.
        if (len - i <= trailing)
            return false;

        const unsigned char first = static_cast<unsigned char>(text[i + 1]);
        if (first < lo || first > hi)
            return false;

        // Remaining continuation bytes must all carry the bit pattern 10xxxxxx.
        for (std::size_t k = 2; k <= trailing; ++k)
        {
            const unsigned char cont = static_cast<unsigned char>(text[i + k]);
            if ((cont & 0xC0) != 0x80)
                return false;
        }

        saw_multibyte = true;
        i += trailing + 1;
    }

    return saw_multibyte;
}

// Checks whether the file is UTF-8 without a BOM.
//
// file_name: path of the file to inspect.
// Returns true when the whole file is well-formed UTF-8 and contains at least
// one multi-byte sequence; false for pure-ASCII content, invalid UTF-8, or when
// the file cannot be opened (an error message is printed in that case).
bool check_utf8_without_bom(const string &file_name)
{
    // Binary mode: text mode would translate line endings (and stop at 0x1A on
    // Windows), so the bytes validated would not be the bytes on disk.
    std::ifstream file_in(file_name, std::ios::in | std::ios::binary);
    if (!file_in.is_open())
    {
        std::cout << "打开文件失败" << std::endl;
        return false;
    }

    const std::string text((std::istreambuf_iterator<char>(file_in)),
                           std::istreambuf_iterator<char>());
    return is_multibyte_utf8(text);
}

// Detects the text encoding of a file.
//
// Detection rules:
//   - first bytes EF BB BF -> TEXT_UTF8_BOM
//   - first bytes FF FE    -> TEXT_UTF16_LE
//   - first bytes FE FF    -> TEXT_UTF16_BE
//   - otherwise the content is scanned: well-formed multi-byte UTF-8 gives
//     TEXT_UTF8, anything else (including an empty file) gives TEXT_ANSI.
//
// file_name: path of the file to inspect.
// Returns TEXT_UNKNOW (after printing an error message) if the file cannot be opened.
TEXT_TYPE check_text_encode(const string &file_name)
{
    std::ifstream file_in(file_name, std::ios::binary);
    if (!file_in.is_open())
    {
        std::cout << "打开文件失败" << std::endl;
        return TEXT_UNKNOW;
    }

    // Read up to three bytes; gcount() tells how many were really available so
    // that short or empty files are never compared against uninitialised data.
    unsigned char head[3] = {0, 0, 0};
    file_in.read(reinterpret_cast<char *>(head), sizeof(head));
    const std::streamsize got = file_in.gcount();
    file_in.close();

    if (got >= 3 && head[0] == 0xEF && head[1] == 0xBB && head[2] == 0xBF)
        return TEXT_UTF8_BOM;
    if (got >= 2 && head[0] == 0xFF && head[1] == 0xFE)
        return TEXT_UTF16_LE;
    if (got >= 2 && head[0] == 0xFE && head[1] == 0xFF)
        return TEXT_UTF16_BE;

    return check_utf8_without_bom(file_name) ? TEXT_UTF8 : TEXT_ANSI;
}

// Demo driver: runs the detector on three sample files and waits for a key press.
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    const string str1 = "E:\\Book\\ANSI.txt";
    const string str2 = "E:\\Book\\UTF8.txt";
    const string str3 = "E:\\Book\\UTF8_BOM.txt";

    TEXT_TYPE txttype1 = check_text_encode(str1);
    TEXT_TYPE txttype2 = check_text_encode(str2);
    TEXT_TYPE txttype3 = check_text_encode(str3);

    // The results are not printed; presumably they are inspected in a debugger.
    (void)txttype1;
    (void)txttype2;
    (void)txttype3;

    getchar();  // keep the console window open until a key is pressed
    return 0;
}
// References:
// https://www.jb51.net/article/128576.htm
// https://www.cnblogs.com/Toya/p/11433441.html