Ubuntu下C++使用icu庫檢測字符編碼。需先安裝libicu-dev庫:
sudo apt install libicu-dev
C++代碼如下:
//g++ -o x x.cpp -licuuc -licui18n
#include <stdio.h>
#include <string.h>
#include <unicode/ucnv.h>
#include <unicode/utypes.h>
#include <unicode/ucsdet.h>
#define BUF_MAX 4096
/*
* data, 傳入參數, 需要探測的字符串
* len, 傳入參數, 探測字符串長度
* detected 傳出參數, 探測的最有可能的字符編碼名稱, 調用者需要釋放該字段
**/
bool detectTextEncoding(const char *data, int32_t len, char **detected) {
UCharsetDetector *csd;
const UCharsetMatch **csm;
int32_t match, matchCount = 0;
UErrorCode status = U_ZERO_ERROR;
csd = ucsdet_open(&status);
if (status != U_ZERO_ERROR)
return false;
ucsdet_setText(csd, data, len, &status);
if (status != U_ZERO_ERROR)
return false;
csm = ucsdet_detectAll(csd, &matchCount, &status);
if (status != U_ZERO_ERROR)
return false;
#if 0 //打印出探測的可能的編碼
for(match = 0; match < matchCount; match += 1)
{
const char *name = ucsdet_getName(csm[match], &status);
const char *lang = ucsdet_getLanguage(csm[match], &status);
int32_t confidence = ucsdet_getConfidence(csm[match], &status);
if (lang == NULL || strlen(lang) == 0)
lang = "**";
printf("%s (%s) %d\n", name, lang, confidence);
}
#endif
if (matchCount > 0) {
*detected = strdup(ucsdet_getName(csm[0], &status)); //分配了內存, 需要釋放
if (status != U_ZERO_ERROR)
return false;
}
printf("charset = %s\n", *detected);
ucsdet_close(csd);
return true;
}
/*
* toConverterName, 轉換后的字符編碼
* fromConverterName, 轉換前的字符編碼
* target, 存儲轉換后的字符串, 傳出參數
* targetCapacity, 存儲容量,target的大小
* source, 需要轉換的字符串
* sourceLength, source的大小
**/
int convert(const char *toConverterName, const char *fromConverterName,
char *target, int32_t targetCapacity, const char *source, int32_t sourceLength) {
UErrorCode error = U_ZERO_ERROR;
ucnv_convert(toConverterName, fromConverterName, target, targetCapacity, source, sourceLength, &error);
return error;
}
int main(int argc, char **argv) {
if (argc <= 1) {
printf("Usage: %s [filename]...\n", argv[0]);
return -1;
}
FILE *file;
char *filename = argv[1];
file = fopen(filename, "rb");
if (file == NULL) {
printf("Cannot open file \"%s\"\n\n", filename);
return -1;
}
int len = 0;
char *detected = NULL;
char *buffer = new char[BUF_MAX];
char *target = new char[BUF_MAX * 2];
while (true) {
memset(buffer, 0, BUF_MAX);
memset(target, 0, BUF_MAX * 2);
len = (int32_t) fread(buffer, sizeof(char), BUF_MAX, file);
if (detected == NULL) {
if (!detectTextEncoding(buffer, len, &detected)) //編碼探測
break;
}
//轉換為utf8字符編碼
if (convert("UTF-8", detected, target, BUF_MAX * 2, (const char *) buffer, len) != U_ZERO_ERROR) {
printf("ucnv_convert error");
break;
}
printf("%s", target); //打印出轉換的文件的字符串
if (len < BUF_MAX)
break;
}
delete[] buffer;
delete[] target;
delete[] detected;
fclose(file);
return 0;
}
測試一下,正常檢測出了當前文件編碼:

