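// Adapted from: https://blog.csdn.net/ac540101928/article/details/52786435
// The article linked above explains the minimum edit distance algorithm in detail, but its method cannot handle Chinese characters.
// Converting between Unicode and UTF-8: https://www.cnblogs.com/cthon/p/9297232.html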
#include "EditDistance.h"
#include <string>
#include <vector>
using std::cout;
using std::endl;
using std::string;
//判斷字符的字節長,以便區分編碼規則,實現utf-8編碼
/// 獲取一個字節高位開頭為1的個數
size_t nBytesCode(const char ch)
{
if(ch & (1 << 7))//如果ch是多字節的,下面循環,判斷utf-8編碼的字節長
{
int nBytes = 1;
for(int idx = 0; idx != 6; ++idx)
{
if(ch & (1 << (6 - idx)))
{
++nBytes;
}
else
break;
}
return nBytes;//返回字節長
}
return 1;
}
#if 0
// This variant is more verbose than the one above and is not concise enough, so it is compiled out.
size_t nBytesCode(const char ch)
{
size_t nBytes = 0;
if(ch &(1 << 7))
{//對中文進行處理-utf8編碼
if((ch & 0xF0) == 0xC0 || (ch & 0xF0) == 0xD0) // 1111 0000
{ // &11xx xxxx
nBytes += 2; // 1100 0000
} // 1101 0000
else if((ch & 0xF0) == 0xE0)
{
nBytes += 3;
}
else if((ch & 0xFF) == 0xF0 ||
(ch & 0xFF) == 0xF1 ||
(ch & 0xFF) == 0xF2 ||
(ch & 0xFF) == 0xF3 ||
(ch & 0xFF) == 0xF4 ||
(ch & 0xFF) == 0xF5 ||
(ch & 0xFF) == 0xF6 ||
(ch & 0xFF) == 0xF7 )
{
nBytes += 4;
}
else if((ch & 0xFF) == 0xF8 ||
(ch & 0xFF) == 0xF9 ||
(ch & 0xFF) == 0xFA ||
(ch & 0xFF) == 0xFB)
{
nBytes += 5;
}
else if((ch & 0xFF) == 0xFC)
{
nBytes += 6;
}
}
else
{//1字節編碼或英文
nBytes += 1;
}
return nBytes;
}
#endif
std::size_t length(const std::string &str)
{
std::size_t ilen = 0;
for(std::size_t idx = 0; idx != str.size(); ++idx)
{
int nBytes = nBytesCode(str[idx]);
idx += (nBytes - 1);
++ilen;
}
return ilen;
}
int triple_min(const int &a, const int &b, const int &c)
{
return a < b ? (a < c ? a : c) : (b < c ? b : c);
}
int editDistance(const std::string & lhs, const std::string &rhs)
{//計算最小編輯距離-包括處理中英文
size_t lhs_len = length(lhs);//字符長
size_t rhs_len = length(rhs);
size_t blhs_len = length(lhs);//字節長
size_t brhs_len = length(rhs);
int editDist[lhs_len + 1][rhs_len + 1];
for(size_t idx = 0; idx <= lhs_len; ++idx)
{
editDist[idx][0] = idx;
}
for(size_t idx = 0; idx <= rhs_len; ++idx)
{
editDist[0][idx] = idx;
}
std::string sublhs, subrhs;
for(std::size_t dist_i = 1, lhs_idx = 0; dist_i <= lhs_len && lhs_idx <= blhs_len; ++dist_i, ++lhs_idx)//lhs_idx<=blhs_len一定要加上,防止substr處理越界,自己調試幾下就清楚了
{
size_t nBytes = nBytesCode(lhs[lhs_idx]);
sublhs = lhs.substr(lhs_idx, nBytes);
lhs_idx += (nBytes - 1);
for(std::size_t dist_j = 1, rhs_idx = 0; dist_j <= rhs_len && rhs_idx <= brhs_len; ++dist_j, ++rhs_idx)
{
nBytes = nBytesCode(rhs[rhs_idx]);
subrhs = rhs.substr(rhs_idx, nBytes);
rhs_idx += (nBytes - 1);
if(sublhs == subrhs)
{
editDist[dist_i][dist_j] = editDist[dist_i - 1][dist_j - 1];
}
else
{
editDist[dist_i][dist_j] = triple_min(
editDist[dist_i][dist_j - 1] + 1,
editDist[dist_i - 1][dist_j] + 1,
editDist[dist_i - 1][dist_j - 1] + 1);
}
}
}
return editDist[lhs_len][rhs_len];
}
