一個在字符串中查找多個關鍵字的函數strstrs（三種不同算法實現及效率分析）

本文轉載自查看原文 2016-10-26 23:22 2538 c++

20190529更新

1 增加測試用例
2 修復中文查找可能導致越界的bug
3 strstrs改為不使用二分（效率會慢一些，但匹配結果相對可控），推薦使用strstrs_ext

==================================================================================

20190529：windows上建議使用strstrs_ext，linux上在數據不匹配的場景好像strstrs_normal更快一點。我把測試效率代碼附上，有需要的可以自己驗證。

從我自己測試的效率對比猜測，linux上gcc的strstr應該不是普通的暴力匹配法，網上的說法不正確。

==================================================================================

平時項目中有時需要用到在字符串中搜索兩個或更多的關鍵字的情景。例如：將字符串"ab|cd#ef|"按豎線或者井號做分隔

如果是大項目，一般會采用正則表達式做處理。但有時寫個小程序，不想因此引進一個正則庫，所以我自己寫了一個支持多關鍵字版本的字符串查找函數strstrs

函數說明：

 1 #include <stdio.h>
 2 #include <windows.h>
 3 
 4 #ifndef IN
 5 #define IN 
 6 #endif
 7 
 8 //函數說明：在字符串中搜索指定的關鍵字，支持1-nCnt個關鍵字
 9 //strToFind 待查找字符串 不允許為空
10 //strKeywords 搜索關鍵字字符串數組 不允許為空 數組元素不允許為空(NULL)，但可以是空串("")
11 //nCnt 關鍵字個數
12 //pFound 查找到的關鍵字在字符串數組的位置 不允許為空
13 //返回值：
14 //1 如果關鍵字存在空串，則返回strToFind 
15 //2 如果找不到關鍵字則返回NULL
16 //3 如果找到關鍵字，則返回其在strToFind中的匹配位置，並通過pFound返回該關鍵字在strKeywords中的下標（下標從0開始）
17 
18 //使用首字符表加順序查找實現（20190529起不再使用二分查找）
19 const char *strstrs(const char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound);
20 //使用哈希加鏈接實現 推薦使用
21 const char *strstrs_ext(const char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound);
22 //依次查找關鍵字的實現
23 const char *strstrs_normal(const char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound);
24 
25 //以下是為了使用方便而增加的一些重載，沒多大意義
26 char *strstrs(IN char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound);
27 char *strstrs_ext(IN char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound);
28 char *strstrs_normal(IN char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound);
29 
30 char *strstrs(IN char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound);
31 char *strstrs_ext(IN char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound);
32 char *strstrs_normal(IN char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound);
33 
34 const char *strstrs(const char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound);
35 const char *strstrs_ext(const char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound);
36 const char *strstrs_normal(const char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound);
37 void tets_strstrs(int nStep); // nStep為待查字符串長度的遞增步長（必須大於0）；輸出結果中的函數類型：0 strstrs 1 strstrs_ext 2 strstrs_normal

函數實現及相應測試代碼：

// stdafx.cpp : source file that includes just the standard includes
// sqlite_test.pch will be the pre-compiled header
// stdafx.obj will contain the pre-compiled type information

#include "stdafx.h"
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <vector>


// TODO: reference any additional headers you need in STDAFX.H
// and not in this file


// Read-only convenience overload of strstrs. The text pointer has its
// constness stripped only so the call can be forwarded to the overload that
// takes a mutable text pointer; the hit is handed back as a const pointer.
const char *strstrs(const char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound)
{
    char *const mutableText = const_cast<char *>(strToFind);
    return strstrs(mutableText, strKeywords, nCnt, pFound);
}

// Read-only convenience overload of strstrs_ext. The text pointer has its
// constness stripped only so the call can be forwarded to the overload that
// takes a mutable text pointer; the hit is handed back as a const pointer.
const char *strstrs_ext(const char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound)
{
    char *const mutableText = const_cast<char *>(strToFind);
    return strstrs_ext(mutableText, strKeywords, nCnt, pFound);
}

// Read-only convenience overload of strstrs_normal. The text pointer has its
// constness stripped only so the call can be forwarded to the overload that
// takes a mutable text pointer; the hit is handed back as a const pointer.
const char *strstrs_normal(const char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound)
{
    char *const mutableText = const_cast<char *>(strToFind);
    return strstrs_normal(mutableText, strKeywords, nCnt, pFound);
}

// Convenience overload of strstrs for a read-only text and a keyword table
// declared as char *[]. Both arguments are re-qualified and the call is
// forwarded to the (char *, const char *[]) overload.
const char *strstrs(const char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound)
{
    char *const mutableText = const_cast<char *>(strToFind);
    const char **const keywordTable = const_cast<const char **>(strKeywords);
    return strstrs(mutableText, keywordTable, nCnt, pFound);
}

// Convenience overload of strstrs_ext for a read-only text and a keyword
// table declared as char *[]. Both arguments are re-qualified and the call is
// forwarded to the (char *, const char *[]) overload.
const char *strstrs_ext(const char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound)
{
    char *const mutableText = const_cast<char *>(strToFind);
    const char **const keywordTable = const_cast<const char **>(strKeywords);
    return strstrs_ext(mutableText, keywordTable, nCnt, pFound);
}

// Convenience overload of strstrs_normal for a read-only text and a keyword
// table declared as char *[]. Both arguments are re-qualified and the call is
// forwarded to the (char *, const char *[]) overload.
const char *strstrs_normal(const char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound)
{
    char *const mutableText = const_cast<char *>(strToFind);
    const char **const keywordTable = const_cast<const char **>(strKeywords);
    return strstrs_normal(mutableText, keywordTable, nCnt, pFound);
}


// Convenience overload of strstrs for a keyword table declared as char *[].
// Only the keyword table needs re-qualifying before forwarding to the
// (char *, const char *[]) overload; the text pointer is already mutable.
char *strstrs(IN char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound)
{
    const char **const keywordTable = const_cast<const char **>(strKeywords);
    return strstrs(strToFind, keywordTable, nCnt, pFound);
}

// Convenience overload of strstrs_ext for a keyword table declared as
// char *[]. Only the keyword table needs re-qualifying before forwarding to
// the (char *, const char *[]) overload; the text pointer is already mutable.
char *strstrs_ext(IN char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound)
{
    const char **const keywordTable = const_cast<const char **>(strKeywords);
    return strstrs_ext(strToFind, keywordTable, nCnt, pFound);
}

// Convenience overload of strstrs_normal for a keyword table declared as
// char *[]. Only the keyword table needs re-qualifying before forwarding to
// the (char *, const char *[]) overload; the text pointer is already mutable.
char *strstrs_normal(IN char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound)
{
    const char **const keywordTable = const_cast<const char **>(strKeywords);
    return strstrs_normal(strToFind, keywordTable, nCnt, pFound);
}

// A keyword together with its index and cached length. __strstrs_find_first
// also receives one of these as a probe key, reading only m_str from it.
struct tagKeyPos
{
    const char *m_str;  // keyword text (for a probe key: the text position under test)
    size_t m_nIdx;      // index of the keyword in the caller's keyword array
    size_t m_strLen;    // length of m_str in bytes, terminator excluded
};
typedef struct tagKeyPos KeyPos;

// qsort-style comparator for KeyPos entries.
// Orders primarily by keyword text (strcmp) and breaks ties by the keyword's
// original index, so equal keywords keep their caller-supplied order.
// Returns a negative value, zero, or a positive value.
int __strstrs_cmp(const void *p1, const void *p2)
{
    const KeyPos *pLeft = static_cast<const KeyPos *>(p1);
    const KeyPos *pRight = static_cast<const KeyPos *>(p2);

    const int nCmp = strcmp(pLeft->m_str, pRight->m_str);
    if (nCmp != 0)
    {
        return nCmp;
    }

    // Compare the indices explicitly: subtracting two size_t values and
    // narrowing the result to int can produce the wrong sign.
    if (pLeft->m_nIdx < pRight->m_nIdx)
    {
        return -1;
    }
    return (pLeft->m_nIdx > pRight->m_nIdx) ? 1 : 0;
}

/*
//lower_bound
KeyPos *__strstrs_find_first(KeyPos *pRealBeg, KeyPos *pRealEnd, size_t *pKeyLenArr, KeyPos *pKey)
{
    KeyPos *pBeg = pRealBeg;
    KeyPos *pEnd = pRealEnd;

    KeyPos *pEqal = NULL;
    while (pBeg != pEnd)
    {
        pEqal = pBeg +  (pEnd - pBeg) / 2;
        int nCmp = memcmp( pEqal->m_str, pKey->m_str, pEqal->m_strLen );
        if (nCmp == 0)
        {
            //若相等，則往前找，直至找到最后一個相等的元素
            while (pEqal != pBeg)
            {
                pEqal--;
                if (memcmp( pEqal->m_str, pKey->m_str, pEqal->m_strLen ))
                {
                    return pEqal + 1;
                }
            }

            return pBeg;
        }
        else if (nCmp > 0)
        {
            //中值比目標值大
            pEnd = pEqal;
        }
        else
        {
            //中值比目標值小
            pBeg = pEqal + 1;
        }

    }

    return pRealEnd;
}
*/

// Linear scan for the first keyword in [pRealBeg, pRealEnd) that is a prefix
// of the NUL-terminated text pKey->m_str.
//   pRealBeg / pRealEnd  half-open range of keyword entries, scanned in order
//   pKeyLenArr           unused; kept so the signature stays unchanged
//   pKey                 probe key; only m_str (the text position) is read
// Returns the matching entry, or pRealEnd when no keyword matches.
KeyPos *__strstrs_find_first(KeyPos *pRealBeg, KeyPos *pRealEnd, size_t *pKeyLenArr, KeyPos *pKey)
{
    (void)pKeyLenArr;

    for (KeyPos *pCur = pRealBeg; pCur != pRealEnd; ++pCur)
    {
        // strncmp stops at the text's terminator. memcmp would be asked to
        // compare a full keyword length even when fewer bytes remain in the
        // text, which is an out-of-bounds read.
        if (strncmp(pCur->m_str, pKey->m_str, pCur->m_strLen) == 0)
        {
            return pCur;
        }
    }

    return pRealEnd;
}

// Reports whether the NUL-terminated keyword occurs at the very start of text.
// The comparison walks the keyword and stops at the first mismatch, so it
// never reads past the terminator of text.
static bool strstrs_keyword_at(const unsigned char *text, const char *keyword)
{
    for (size_t i = 0; keyword[i] != '\0'; ++i)
    {
        // A terminator in text is just another mismatch against a keyword byte.
        if (text[i] != static_cast<unsigned char>(keyword[i]))
        {
            return false;
        }
    }
    return true;
}

// Finds the earliest position in strToFind at which any keyword starts.
//   strToFind    NUL-terminated text to search; must not be NULL
//   strKeywords  array of nCnt NUL-terminated keywords; no element may be NULL
//   nCnt         number of keywords; must be greater than 0
//   pFound       receives the index of the matched keyword; must not be NULL
// Returns:
//   - strToFind itself when a keyword is the empty string (*pFound is the
//     index of the first empty keyword);
//   - otherwise a pointer to the earliest match; when several keywords match
//     at that position, *pFound is the lowest keyword index;
//   - NULL when nothing matches (*pFound is left untouched).
char *strstrs(char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound)
{
    // Author: 皇家救星, created 2016-10-19.
    // Report bugs by mail to 89475049@qq.com with the subject "strstrs問題反饋".
    assert(strToFind != NULL);
    assert(strKeywords != NULL);
    assert(pFound != NULL);
    assert(nCnt > 0);

    // Set of first bytes of all keywords, used to skip text positions at which
    // no keyword can start. Indexed through unsigned char so that bytes >= 0x80
    // (e.g. multi-byte encoded Chinese) cannot produce a negative index.
    bool firstByteSeen[256] = {false};
    for (size_t i = 0; i < nCnt; ++i)
    {
        assert(strKeywords[i] != NULL);
        if (strKeywords[i][0] == '\0')
        {
            *pFound = static_cast<int>(i);
            return strToFind;
        }
        firstByteSeen[static_cast<unsigned char>(strKeywords[i][0])] = true;
    }

    // The keywords are compared in place, in caller order, instead of being
    // copied into a heap-allocated KeyPos array for __strstrs_find_first on
    // every call. Keeping caller order keeps the reported index predictable.
    for (unsigned char *p = reinterpret_cast<unsigned char *>(strToFind); *p != '\0'; ++p)
    {
        if (!firstByteSeen[*p])
        {
            continue;
        }

        for (size_t i = 0; i < nCnt; ++i)
        {
            if (strstrs_keyword_at(p, strKeywords[i]))
            {
                *pFound = static_cast<int>(i);
                return reinterpret_cast<char *>(p);
            }
        }
    }

    return NULL;
}

// Node of the per-first-byte keyword chains built by strstrs_ext.
typedef struct tagKeyPosExt
{
    size_t m_strLen;              // keyword length in bytes, terminator excluded
    size_t m_strIdx;              // index of the keyword in the caller's array
    struct tagKeyPosExt *m_next;  // next keyword with the same first byte, or NULL
}KeyPosExt;

// Finds the earliest position in strToFind at which any keyword starts, using
// one chain of keywords per possible first byte.
//   strToFind    NUL-terminated text to search; must not be NULL
//   strKeywords  array of nCnt NUL-terminated keywords; no element may be NULL
//   nCnt         number of keywords; must be greater than 0
//   pFound       receives the index of the matched keyword; must not be NULL
// Returns:
//   - strToFind itself when a keyword is the empty string (*pFound is the
//     index of the first empty keyword, as in strstrs and strstrs_normal);
//   - otherwise a pointer to the earliest match; when several keywords match
//     at that position, *pFound is the lowest keyword index;
//   - NULL when nothing matches (*pFound is left untouched).
// Allocating the node pool may throw std::bad_alloc.
char *strstrs_ext(char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound)
{
    // Author: 皇家救星, created 2016-10-19.
    // Report bugs by mail to 89475049@qq.com with the subject "strstrs問題反饋".
    // 2019-05-22: fixed an invalid memory access on text containing Chinese.
    assert(strToFind != NULL);
    assert(strKeywords != NULL);
    assert(pFound != NULL);
    assert(nCnt > 0);

    // Handle empty keywords first, scanning in ascending order so that the
    // FIRST empty keyword is reported and nothing is allocated for this case.
    for (size_t i = 0; i < nCnt; ++i)
    {
        assert(strKeywords[i] != NULL);
        if (strKeywords[i][0] == '\0')
        {
            *pFound = static_cast<int>(i);
            return strToFind;
        }
    }

    // One node per keyword, allocated in a single block and released
    // automatically on every return path.
    std::vector<KeyPosExt> nodePool(nCnt);

    // chainHeads[b] is the list of keywords whose first byte is b. Indexed
    // through unsigned char so bytes >= 0x80 cannot produce a negative index.
    KeyPosExt *chainHeads[256] = {NULL};

    // Insert from the last keyword to the first, always at the head, so each
    // chain ends up ordered by ascending keyword index.
    for (size_t i = nCnt; i-- > 0; )
    {
        KeyPosExt &node = nodePool[i];
        node.m_strIdx = i;
        node.m_strLen = strlen(strKeywords[i]);

        KeyPosExt *&head = chainHeads[static_cast<unsigned char>(strKeywords[i][0])];
        node.m_next = head;
        head = &node;
    }

    for (unsigned char *p = reinterpret_cast<unsigned char *>(strToFind); *p != '\0'; ++p)
    {
        // Only keywords starting with the current byte can match here.
        for (const KeyPosExt *node = chainHeads[*p]; node != NULL; node = node->m_next)
        {
            // strncmp stops at the text's terminator. memcmp would be asked
            // to compare a full keyword length even when fewer bytes remain.
            if (strncmp(reinterpret_cast<const char *>(p), strKeywords[node->m_strIdx], node->m_strLen) == 0)
            {
                *pFound = static_cast<int>(node->m_strIdx);
                return reinterpret_cast<char *>(p);
            }
        }
    }

    return NULL;
}

// Searches strToFind for each keyword in turn with strstr and reports the
// first keyword (by array index) that occurs anywhere in the text.
//   strToFind    NUL-terminated text to search; must not be NULL
//   strKeywords  array of nCnt NUL-terminated keywords; no element may be NULL
//   nCnt         number of keywords; must be greater than 0
//   pFound       receives the index of the matched keyword; must not be NULL
// Returns strToFind when any keyword is the empty string (*pFound is the index
// of the first empty one), otherwise the strstr hit of the first keyword that
// occurs, or NULL when none occurs (*pFound is then left untouched).
char *strstrs_normal(char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound)
{
    // Author: 皇家救星, created 2016-10-19.
    // Report bugs by mail to 89475049@qq.com with the subject "strstrs問題反饋".
    // 2019-05-22: fixed an invalid memory access on text containing Chinese.
    assert(strToFind != NULL);
    assert(strKeywords != NULL);
    assert(pFound != NULL);
    assert(nCnt > 0);

    // Pass 1: an empty keyword takes precedence over every real match.
    size_t idx = 0;
    while (idx < nCnt)
    {
        const char *keyword = strKeywords[idx];
        assert(keyword != NULL);
        if (*keyword == '\0')
        {
            *pFound = static_cast<int>(idx);
            return strToFind;
        }
        ++idx;
    }

    // Pass 2: keywords are tried in array order; the first one present wins,
    // even if another keyword occurs earlier in the text.
    for (idx = 0; idx != nCnt; ++idx)
    {
        assert(strKeywords[idx] != NULL);
        char *hit = strstr(strToFind, strKeywords[idx]);
        if (hit == NULL)
        {
            continue;
        }
        *pFound = static_cast<int>(idx);
        return hit;
    }

    return NULL;
}

//准確性測試
// Accuracy test: runs strstrs, strstrs_ext and strstrs_normal over a fixed set
// of texts and keywords and cross-checks the results.
// All three must agree on whether a keyword was found. strstrs and strstrs_ext
// must also agree on the reported keyword index. strstrs_normal is exempt from
// the index check because it reports the first keyword by array order rather
// than the earliest match in the text.
// Returns 0 on success, or -1 - 10 * (index of the first failing text).
int tets_strstrs1()
{
    const char *strKeywords[] = {"123", "select", "union", "or", "customer", "subsid",
        "2455", "group_id", "test", "from", "truncate", "s", "english1", "2222222222222222888888888888833300", "皇家"};
    const char *strSqls[] = {
        "select * from dual",
        "drop table",
        "truncate",
        "english",
        "goodby",
        "get 123",
        "123 get",
        " from",  // the comma was missing here, fusing this case with "D" into " fromD"
        "D",
        "s",
        "89sfs89",
        "or",
        "sor",
        "orunion",
        "unionor",
        "83eejr3r9r9r33302002013345331224312343",
        "去9999給",
        "去皇家救星給"
    };
    const size_t nKeywordCnt = sizeof(strKeywords) / sizeof(strKeywords[0]);
    const size_t nSqlCnt = sizeof(strSqls) / sizeof(strSqls[0]);

    for (size_t i = 0; i < nSqlCnt; ++i)
    {
        int nFoundNormal = 0;
        const bool bFoundNormal =
            (NULL != strstrs_normal(strSqls[i], strKeywords, nKeywordCnt, &nFoundNormal));

        int nFoundExt = 0;
        const bool bFoundExt =
            (NULL != strstrs_ext(strSqls[i], strKeywords, nKeywordCnt, &nFoundExt));

        int nFound = 0;
        const bool bFound =
            (NULL != strstrs(strSqls[i], strKeywords, nKeywordCnt, &nFound));

        const bool bSameVerdict = (bFound == bFoundExt) && (bFound == bFoundNormal);
        const bool bSameIndex = (nFound == nFoundExt); // nFoundNormal intentionally not compared
        if (!bSameVerdict || !bSameIndex)
        {
            printf("error! strSqls[i] = [%s]\n", strSqls[i]);
            printf("bFound = %d nFound = %d\n", bFound, nFound);
            printf("bFoundNormal = %d nFoundNormal = %d\n", bFoundNormal, nFoundNormal);
            printf("bFoundExt = %d nFoundExt = %d\n", bFoundExt, nFoundExt);
            return -1 - static_cast<int>(i) * 10;
        }
    }

    return 0;
}

//效率比較及准確性測試函數
// Benchmark and sanity driver for strstrs (type 0), strstrs_ext (type 1) and
// strstrs_normal (type 2).
// Generates one random base64-alphabet text and max_keyword random keywords.
// For every function type and every keyword count 1..max_keyword it then
// searches prefixes of the text whose length grows by nStep, printing one
// "RESULT:" line per keyword count to stdout.
//   nStep  increment of the tested prefix length; must be positive
// The generated text and keywords are dumped to stderr. Timing uses
// GetTickCount. The function returns early if the accuracy test
// tets_strstrs1() fails.
void tets_strstrs(int nStep)
{
    const int max_length = 10000; // must stay greater than 1024
    const int max_keyword = 1000;
    const char strBase64[65] = {"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"};
    const size_t nKeyLen = max_length / 4; // capacity of a keyword, terminator excluded
    const size_t nKeyLenMin = 50;          // shortest keyword generated

    // A non-positive step would never advance the prefix-length loop below
    // (or would index in front of the buffer), so reject it up front.
    if (nStep <= 0)
    {
        fprintf(stderr, "nStep must be positive\n");
        return;
    }

    // Run the accuracy test before anything is allocated, so that a failure
    // does not leak the buffers.
    if (tets_strstrs1() != 0)
    {
        printf("函數功能驗證失敗\n");
        return;
    }

    // Loop-invariant sanity check, hoisted out of the keyword loop. "<=" also
    // rejects the case where the random-length modulus below would be zero.
    if (nKeyLen <= nKeyLenMin)
    {
        fprintf(stderr, "max_length is too small\n");
        exit(1);
    }

    char *strToFound = new char[max_length + 1]; // text to be searched
    char *strBackup = new char[max_length + 1];  // bytes overwritten by a planted keyword
    char *strKeywords[max_keyword];              // keyword array

    // To avoid a run in which nothing is ever found, a keyword can be planted
    // into strToFound. The planting is currently switched off (see the
    // commented-out block in the keyword loop), so all flags stay false.
    bool arrayFoundFlags[max_keyword] = {0}; // whether a keyword is planted for this keyword count
    int arrayFoundIdxs[max_keyword] = {0};   // index of the keyword to plant
    int arrayFoundBeg[max_keyword] = {0};    // offset in strToFound at which it is planted

    srand((int)time(NULL));

    // Generate the text to search.
    for (int i = 0; i < max_length; i++)
    {
        strToFound[i] = strBase64[rand() % 64];
    }
    strToFound[max_length] = '\0';
    fprintf(stderr, "strToFound = [%s]\n", strToFound);

    // Generate the keywords.
    for (int i = 0; i < max_keyword; i++)
    {
        strKeywords[i] = new char[nKeyLen + 1];

        const int nLen = static_cast<int>(rand() % (nKeyLen - nKeyLenMin) + nKeyLenMin);
        for (int j = 0; j < nLen; j++)
        {
            strKeywords[i][j] = strBase64[rand() % 64];
        }
        strKeywords[i][nLen] = '\0';

        // Optional intervention so that not every search comes back empty:
        //if (0 != (rand() % 10))
//         {
//             // plant roughly 9 out of 10 keywords into the text
//             arrayFoundFlags[i] = true;
//             arrayFoundIdxs[i] = rand() % (i + 1);
//             arrayFoundBeg[i] = 0;
//         }

        fprintf(stderr, "strKeywords[%d] = [%s]\n", i, strKeywords[i]);
        fprintf(stderr, "%d: %d %d %d\n", i, arrayFoundFlags[i], arrayFoundIdxs[i], arrayFoundBeg[i]);
    }
    fflush(stderr);

    printf("RESULT: 函數類型 關鍵字總數 總耗時 總共找到次數\n");
    for (int cmpType = 0; cmpType < 3; cmpType++)
    {
        int nSn = 0;
        double total_start = GetTickCount();
        for (size_t nCnt = 0; nCnt < max_keyword; nCnt++)
        {
            bool bSetFound = arrayFoundFlags[nCnt];
            int nBeg = 0;
            int nChange = 0;
            int idxKeyword = 0;
            if (bSetFound)
            {
                // Plant the keyword into the text so that it is guaranteed to
                // be present; the overwritten bytes are saved in strBackup.
                idxKeyword = arrayFoundIdxs[nCnt];
                nChange = static_cast<int>(strlen(strKeywords[idxKeyword]));
                nBeg = arrayFoundBeg[nCnt];
                memcpy(strBackup, strToFound + nBeg, nChange);
                strBackup[nChange] = '\0';
                memcpy(strToFound + nBeg, strKeywords[idxKeyword], nChange);
            }

            double start = GetTickCount();
            int nFoundCnt = 0;

            // Search prefixes of the text, from short to long.
            for (int nStrlen = 0; nStrlen < max_length; nStrlen += nStep)
            {
                // The prefix needs a terminator: save the byte at the cut,
                // overwrite it with '\0', and restore it after the search.
                char cBak = strToFound[nStrlen];
                strToFound[nStrlen] = '\0';
                int nFound = -1;
                const char *p;
                switch (cmpType)
                {
                case 0:
                    p = strstrs(strToFound, strKeywords, nCnt + 1, &nFound);
                    break;
                case 1:
                    p = strstrs_ext(strToFound, strKeywords, nCnt + 1, &nFound);
                    break;
                default:
                    p = strstrs_normal(strToFound, strKeywords, nCnt + 1, &nFound);
                    break;
                }

                //fprintf(stderr, "cmpType %d %d %d\n", cmpType, nSn, nFound);
                nSn++;
                if (p != NULL)
                {
                    nFoundCnt++;
                }
                else if (bSetFound && ((nBeg + nChange) <= nStrlen))
                {
                    // A keyword was planted inside this prefix but the search
                    // reported nothing: the function under test is wrong.
                    // nCnt is a size_t, so convert it for the %d conversions.
                    printf("cmpType = %d ###############################error!\n", cmpType);
                    printf("strToFound = [%s], nStrlen = %d, nCnt = %d\n", strToFound, nStrlen, static_cast<int>(nCnt));
                    printf("strKeywords[arrayFoundIdxs[nCnt]] = [%s], nBeg = %d, nChange = %d\n",
                        strKeywords[arrayFoundIdxs[nCnt]], nBeg, nChange);
                    exit(10);
                }

                strToFound[nStrlen] = cBak;
            }
            double end = GetTickCount();

            // Columns: function type, keyword count, elapsed ticks, hits.
            printf("RESULT: %d %d %f %d\n",
                cmpType, static_cast<int>(nCnt + 1), end - start, nFoundCnt);
            fflush(stdout);
            fflush(stderr);

            if (bSetFound)
            {
                // Undo the planting so the next round starts from the same text.
                memcpy(strToFound + nBeg, strBackup, nChange);
            }
        }

        double total_end = GetTickCount();
        fprintf(stderr, "總共耗時[%f]\n", total_end - total_start);
    }

    // Release everything allocated above.
    delete []strToFound;
    delete []strBackup;
    for (int i = 0; i < max_keyword; i++)
    {
        delete []strKeywords[i];
    }
}

函數效率比較圖：

0 代表strstrs

1 代表strstrs_ext

2 代表strstrs_normal

可以看出，strstrs_ext比較穩定，而且效率也比較高。

在關鍵字列表都與查找字符串不匹配的情況下，strstrs_normal表現好過strstrs

在關鍵字列表與查找字符串基本都存在匹配項的情況下，strstrs表現好過strstrs_normal

在任何情況下strstrs_ext都表現最好

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 linux 用 grep 查找單個或多個字符串（關鍵字）用lucene實現在一個(或者多個)字段中查找多個關鍵字查找jar包中.class文件關鍵字(變量名，字符串) 將Java的關鍵字保存在文本文檔中。判斷一個字符串是否為Java中的關鍵字 Linux - Shell - 在多個文件中查找關鍵字 js截取關鍵字之后的字符串 React字符串關鍵字替換樣式 ABAP中SPLIT關鍵字當分隔符位於字符串首尾時 Oracle 實現一個關鍵字匹配多個字段 super關鍵字的三種用法