#include <cstdio>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

using namespace std;

// Encodings that check_text_encode() can report.
enum TEXT_TYPE
{
    TEXT_ANSI = 0,      // no BOM and not multi-byte UTF-8 (pure ASCII lands here too)
    TEXT_UTF8 = 1,      // valid UTF-8 with at least one multi-byte character, no BOM
    TEXT_UTF8_BOM = 2,  // starts with EF BB BF
    TEXT_UTF16_LE = 3,  // starts with FF FE
    TEXT_UTF16_BE = 4,  // starts with FE FF
    TEXT_UNKNOW = 5,    // the file could not be opened
};

// Returns true when `text` is well-formed UTF-8 (per RFC 3629) AND contains at
// least one multi-byte sequence. Pure ASCII and empty input return false,
// because such data is indistinguishable from ANSI text.
static bool is_valid_multibyte_utf8(const string &text)
{
    const size_t len = text.size();
    bool saw_multibyte = false;

    size_t i = 0;
    while (i < len)
    {
        const unsigned char lead = static_cast<unsigned char>(text[i]);
        if (lead < 0x80)
        {
            // 0x00-0x7F is the ASCII range: a complete one-byte character.
            ++i;
            continue;
        }

        // Number of continuation bytes that must follow the lead byte, and the
        // allowed range of the FIRST continuation byte. The narrowed ranges
        // reject overlong encodings, UTF-16 surrogates and values > U+10FFFF.
        size_t trailing = 0;
        unsigned char first_min = 0x80;
        unsigned char first_max = 0xBF;

        if (lead >= 0xC2 && lead <= 0xDF)
        {
            trailing = 1;
        }
        else if (lead >= 0xE0 && lead <= 0xEF)
        {
            trailing = 2;
            if (lead == 0xE0)
                first_min = 0xA0;  // below this would be an overlong encoding
            else if (lead == 0xED)
                first_max = 0x9F;  // above this would be a surrogate (U+D800-U+DFFF)
        }
        else if (lead >= 0xF0 && lead <= 0xF4)
        {
            trailing = 3;
            if (lead == 0xF0)
                first_min = 0x90;  // below this would be an overlong encoding
            else if (lead == 0xF4)
                first_max = 0x8F;  // above this would exceed U+10FFFF
        }
        else
        {
            // 0x80-0xBF: stray continuation byte; 0xC0/0xC1: overlong lead;
            // 0xF5-0xFF: never valid in UTF-8 (includes the old 5/6-byte forms).
            return false;
        }

        // The sequence must not run past the end of the data.
        if (len - i <= trailing)
        {
            return false;
        }

        const unsigned char first = static_cast<unsigned char>(text[i + 1]);
        if (first < first_min || first > first_max)
        {
            return false;
        }
        for (size_t k = 2; k <= trailing; ++k)
        {
            // Every further continuation byte must match the bit pattern 10xxxxxx.
            if ((static_cast<unsigned char>(text[i + k]) & 0xC0) != 0x80)
            {
                return false;
            }
        }

        saw_multibyte = true;
        i += trailing + 1;
    }

    return saw_multibyte;
}

// Checks whether the file is UTF-8 text without a BOM.
//
// Returns true only if the whole file is well-formed UTF-8 and contains at
// least one non-ASCII character. Returns false for pure ASCII or empty files,
// for invalid UTF-8, and (after printing a message to stdout) when the file
// cannot be opened.
bool check_utf8_without_bom(const string &file_name)
{
    // Binary mode: text mode on Windows rewrites CRLF and stops at 0x1A, so the
    // validator would not see the file's real bytes.
    ifstream file_in(file_name, ios::in | ios::binary);
    if (!file_in.is_open())
    {
        cout << "打開文件失敗" << endl;
        return false;
    }

    stringstream buffer;
    buffer << file_in.rdbuf();
    file_in.close();

    return is_valid_multibyte_utf8(buffer.str());
}

// Detects the text encoding of a file.
//
//   FF FE     at the start -> TEXT_UTF16_LE
//   FE FF     at the start -> TEXT_UTF16_BE
//   EF BB BF  at the start -> TEXT_UTF8_BOM
//   otherwise the content is scanned: valid multi-byte UTF-8 -> TEXT_UTF8,
//   anything else (including pure ASCII and empty files)     -> TEXT_ANSI
//
// Returns TEXT_UNKNOW (after printing a message to stdout) when the file
// cannot be opened.
TEXT_TYPE check_text_encode(const string &file_name)
{
    ifstream file_in(file_name, ios::binary);
    if (!file_in.is_open())
    {
        cout << "打開文件失敗" << endl;
        return TEXT_UNKNOW;
    }

    // Read up to three bytes and remember how many were actually available, so
    // short or empty files never cause indeterminate bytes to be compared.
    unsigned char head[3] = {0, 0, 0};
    file_in.read(reinterpret_cast<char *>(head), sizeof(head));
    const streamsize got = file_in.gcount();
    file_in.close();

    if (got >= 2 && head[0] == 0xFF && head[1] == 0xFE)
    {
        return TEXT_UTF16_LE;
    }
    if (got >= 2 && head[0] == 0xFE && head[1] == 0xFF)
    {
        return TEXT_UTF16_BE;
    }
    // All three BOM bytes are required: EF BB alone is also the start of
    // ordinary UTF-8 characters in the U+FEC0-U+FEFF range.
    if (got >= 3 && head[0] == 0xEF && head[1] == 0xBB && head[2] == 0xBF)
    {
        return TEXT_UTF8_BOM;
    }

    return check_utf8_without_bom(file_name) ? TEXT_UTF8 : TEXT_ANSI;
}

// Define CHECK_TEXT_ENCODE_NO_MAIN to compile only the detection functions
// (for example when embedding this file in a test harness).
#ifndef CHECK_TEXT_ENCODE_NO_MAIN

// Maps a TEXT_TYPE value to its enumerator name for printing.
static const char *text_type_name(TEXT_TYPE type)
{
    switch (type)
    {
    case TEXT_ANSI:     return "TEXT_ANSI";
    case TEXT_UTF8:     return "TEXT_UTF8";
    case TEXT_UTF8_BOM: return "TEXT_UTF8_BOM";
    case TEXT_UTF16_LE: return "TEXT_UTF16_LE";
    case TEXT_UTF16_BE: return "TEXT_UTF16_BE";
    default:            return "TEXT_UNKNOW";
    }
}

// Prints the detected encoding of every path given on the command line.
// With no arguments it falls back to the three sample files and waits for a
// key press before exiting, so a double-clicked console window stays open.
int main(int argc, char *argv[])
{
    vector<string> paths;
    if (argc > 1)
    {
        paths.assign(argv + 1, argv + argc);
    }

    const bool demo_mode = paths.empty();
    if (demo_mode)
    {
        paths.push_back("E:\\Book\\ANSI.txt");
        paths.push_back("E:\\Book\\UTF8.txt");
        paths.push_back("E:\\Book\\UTF8_BOM.txt");
    }

    for (const string &path : paths)
    {
        cout << path << ": " << text_type_name(check_text_encode(path)) << endl;
    }

    if (demo_mode)
    {
        getchar();
    }
    return 0;
}

#endif // CHECK_TEXT_ENCODE_NO_MAIN
// 參考鏈接:
// https://www.jb51.net/article/128576.htm
// https://www.cnblogs.com/Toya/p/11433441.html