Ubuntu下C++使用icu庫檢測字符編碼


Ubuntu下C++使用icu庫檢測字符編碼。需先安裝libicu-dev庫:

sudo apt install libicu-dev

  

C++代碼如下:

//g++ -o x x.cpp -licuuc -licui18n
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <unicode/ucnv.h>
#include <unicode/utypes.h>
#include <unicode/ucsdet.h>

#define BUF_MAX     4096

/*
 * data,    傳入參數, 需要探測的字符串
 * len,     傳入參數, 探測字符串長度
 * detected  傳出參數, 探測的最有可能的字符編碼名稱, 調用者需要釋放該字段
**/
bool detectTextEncoding(const char *data, int32_t len, char **detected) {
    UCharsetDetector *csd;
    const UCharsetMatch **csm;
    int32_t match, matchCount = 0;

    UErrorCode status = U_ZERO_ERROR;

    csd = ucsdet_open(&status);
    if (status != U_ZERO_ERROR)
        return false;

    ucsdet_setText(csd, data, len, &status);
    if (status != U_ZERO_ERROR)
        return false;

    csm = ucsdet_detectAll(csd, &matchCount, &status);
    if (status != U_ZERO_ERROR)
        return false;

#if 0 //打印出探測的可能的編碼
    for(match = 0; match < matchCount; match += 1)
    {
        const char *name = ucsdet_getName(csm[match], &status);
        const char *lang = ucsdet_getLanguage(csm[match], &status);
        int32_t confidence = ucsdet_getConfidence(csm[match], &status);

        if (lang == NULL || strlen(lang) == 0)
                lang = "**";

        printf("%s (%s) %d\n", name, lang, confidence);
    }
#endif

    if (matchCount > 0) {
        *detected = strdup(ucsdet_getName(csm[0], &status)); //分配了內存, 需要釋放
        if (status != U_ZERO_ERROR)
            return false;
    }

    printf("charset = %s\n", *detected);

    ucsdet_close(csd);
    return true;
}


/*
 * toConverterName,      轉換后的字符編碼
 * fromConverterName,    轉換前的字符編碼
 * target,               存儲轉換后的字符串, 傳出參數
 * targetCapacity,       存儲容量,target的大小
 * source,              需要轉換的字符串
 * sourceLength,         source的大小
**/
int convert(const char *toConverterName, const char *fromConverterName,
            char *target, int32_t targetCapacity, const char *source, int32_t sourceLength) {
    UErrorCode error = U_ZERO_ERROR;
    ucnv_convert(toConverterName, fromConverterName, target, targetCapacity, source, sourceLength, &error);

    return error;
}

/*
 * Reads the file named by argv[1] in BUF_MAX-byte chunks, detects its
 * character encoding from the first chunk, and prints every chunk converted
 * to UTF-8 on stdout.
 *
 * Returns 0 on success, -1 on bad usage, an unreadable file, a read error,
 * or a detection/conversion failure.
 */
int main(int argc, char **argv) {
    if (argc <= 1) {
        printf("Usage: %s [filename]...\n", argv[0]);
        return -1;
    }

    const char *filename = argv[1];

    FILE *file = fopen(filename, "rb");
    if (file == NULL) {
        printf("Cannot open file \"%s\"\n\n", filename);
        return -1;
    }

    // One byte of a single-byte legacy charset can become three UTF-8 bytes,
    // so 2x the input size is not enough; 4x also leaves room for the
    // terminating NUL that printf("%s") below relies on.
    const int32_t targetCapacity = BUF_MAX * 4;

    int rc = 0;
    char *detected = NULL;  // allocated with strdup() by detectTextEncoding

    char *buffer = new char[BUF_MAX];
    char *target = new char[targetCapacity];

    // NOTE(review): each chunk is converted on its own, so a multi-byte
    // sequence that straddles a BUF_MAX boundary may be converted wrongly.
    // Handling that needs a streaming converter (ucnv_open + ucnv_convertEx).
    while (true) {
        int32_t len = (int32_t) fread(buffer, sizeof(char), BUF_MAX, file);

        if (len <= 0) {
            // Nothing (more) to process: empty file, exact multiple of
            // BUF_MAX, or a read error.
            if (ferror(file))
                rc = -1;
            break;
        }

        if (detected == NULL) {
            // Detect the encoding once, from the first chunk.
            if (!detectTextEncoding(buffer, len, &detected) || detected == NULL) {
                rc = -1;
                break;
            }
        }

        // Convert to UTF-8. Any status other than U_ZERO_ERROR (including
        // the "not terminated" warning) is treated as failure, so target is
        // known to be NUL-terminated when it is printed.
        if (convert("UTF-8", detected, target, targetCapacity, (const char *) buffer, len) != U_ZERO_ERROR) {
            printf("ucnv_convert error");
            rc = -1;
            break;
        }

        printf("%s", target); // emit the converted chunk

        if (len < BUF_MAX)
            break;
    }

    delete[] buffer;
    delete[] target;
    free(detected);  // strdup() memory must be released with free(), not delete[]
    fclose(file);

    return rc;
}

  測試一下,正常檢測出了當前文件編碼:

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM