C++回顧統計詞頻問題 -- vector、map、hash_map（三種方式時間比較）

本文轉載自查看原文 2014-10-03 10:41 3666 C/C++

本博文我們通過三個程序比較統計詞頻問題的時間復雜度問題（末尾有用時及其分析）；

問題描述：

1）、找一篇文章，將所有單詞輸入至程序；（The Bible Holy為例）

2）、統計出每個單詞的數量，即詞頻問題；

3）、增加停用詞功能；（遇到此類詞，直接略過）（網上搜）

4）、分別統計出讀取文件並計算詞頻時間、排序所用時間；

5）、用類實現各函數（除統計時間的函數外）。

vector、map、hash_map 三種實現都要對字符串做同樣的預處理：去除標點符號、將大寫字母轉換成小寫字母、跳過純數字的單詞。因此，我們可以將這些處理函數封裝起來。

StringUtil.h

 1 #ifndef STRING_UTIL_H_
 2 #define STRING_UTIL_H_
 3 
 4 #include <string>
 5 
 6 namespace stringutil
 7 {
 8 void erasePunct(std::string &s);
 9 void stringToLower(std::string &s);
10 bool isAllDigit(const std::string &s);
11 }
12 
13 #endif  /* STRING_UTIL_H_ */

StringUtil.cpp

 1 #include "StringUtil.h"
 2 #include <ctype.h>
 3 
 4 using namespace std;
 5 
 6 namespace stringutil
 7 {
 8 
 9 void erasePunct(string &s)
10 {
11     string::iterator it = s.begin();
12     while(it != s.end())
13     {
14         if(ispunct(*it))
15             it = s.erase(it);
16         else
17             ++it;
18     }
19 }
20 void stringToLower(string &s)
21 {
22     for(string::iterator it = s.begin();
23         it != s.end();
24         ++it)
25     {
26         if(isupper(*it))
27             *it = tolower(*it);
28     }
29 }
30 
31 bool isAllDigit(const std::string &s)
32 {
33     for(string::const_iterator it = s.begin();
34         it != s.end();
35         ++it)
36     {
37         if(!isdigit(*it))
38             return false;
39     }
40 
41     return true;
42 }
43 
44 }

一、采用vector、pair；

采用的數據結構：

1）、vector<string> stopList用於存放停用詞；

2）、vector<pair<string, int> > words 用於存放除停用詞之外的其他The Bible Holy 的單詞；

思路：

1）、用ReadStopFile函數讀取停用詞到 stopList中；

2）、用ReadWordFile函數讀取The Bible Holy到 words中；在我們讀取的過程中首先要去除標點符號、將word轉化為小寫、跳過數字；

3）、然后判斷該單詞是否在 stoplist 中，若不在，最后添加單詞至 words中。

4）、對詞頻排序，這里我們采用algorithm庫中的sort函數；

5）、對words vector進行遍歷，打印之；

這里注意一點：除了在 main.cpp中顯式調用的函數，我們將其他函數聲明為類的私有函數。

函數聲明如下：

 1 #ifndef WORDFREQUENCY_H_
 2 #define WORDFREQUENCY_H_
 3 
 4 #include <vector>
 5 #include <string>
 6 #include <utility>
 7 
 8 class WordFrequency
 9 {
10     public:
11         WordFrequency(const std::string &filename, const std::string &stopFile);//初始化
12         void ReadStopFile();
13         void ReadWordFile();
14         void sortWordByFrequency();
15         void printWordFrequency()const;
16     private:
17         void addWordToDict(const std::string &word);
18         bool isStopWord(const std::string &word)const;
19 
20         typedef std::vector<std::pair<std::string, int> >::iterator Wordit;//為了方便
21         typedef std::vector<std::pair<std::string, int> >::const_iterator Wordkit;
22 
23         std::string filename_;
24         std::string stopFile_;
25         
26         std::vector<std::string> stopList_;
27         std::vector<std::pair<std::string, int> > words_;
28 
29 };
30 
31 #endif

函數實現如下：

 1 //WordFrequency.cpp
#include "WordFrequency.h"
#include "StringUtil.h"

#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <fstream>
#include <stdexcept>
 8 
 9 using namespace std;
10 using namespace stringutil;
11 
12 WordFrequency::WordFrequency(const std::string &filename, const std::string &stopFile)
13     :filename_(filename),stopFile_(stopFile)
14 { }
15 
16 void WordFrequency::ReadStopFile()
17 {
18     ifstream in(stopFile_.c_str());
19     if( !in )
20         throw runtime_error("open file failure");
21     string word;
22     while(in >> word)
23     {
24         stopList_.push_back(word);
25     }
26     in.close();
27 }
28 
29 void WordFrequency::ReadWordFile()
30 {
31     ifstream infile(filename_.c_str());
32     if( !infile )
33         throw runtime_error("open file failure!");
34     string word;
35     while(infile >> word)
36     {
37         erasePunct(word);
38         if( isAllDigit(word))
39             continue;
40         stringToLower(word);
41         if( !isStopWord(word))
42             addWordToDict(word);
43     }
44     infile.close();
45 }
46 
47 bool WordFrequency::isStopWord(const string &word)const
48 {
49     vector<string>::const_iterator it = stopList_.begin();
50     while( it  != stopList_.end())
51     {
52         if( *it == word) 
53             break;
54         it ++;
55     }
56     return (it != stopList_.end());
57 }
58 
59 void WordFrequency::addWordToDict(const string &word)
60 {
61     Wordit it = words_.begin();
62     while( it != words_.end())
63     {
64         if( it->first == word)
65         {
66             ++ it->second ;
67             break;
68         }
69         it ++;
70     }
71     if(it == words_.end())
72         words_.push_back(make_pair(word, 1)) ;
73 }
74 
// Ordering used to sort (word, count) entries: higher counts come first.
// Entries with equal counts are ordered alphabetically by word, so the result
// of the (unstable) std::sort is fully determined instead of leaving the
// order of equal-frequency words unspecified.
bool cmp(const std::pair<std::string, int> &a, const std::pair<std::string, int> &b)
{
    if (a.second != b.second)
        return a.second > b.second;
    return a.first < b.first;
}
79 
80 void WordFrequency::sortWordByFrequency()
81 {
82     sort(words_.begin(), words_.end(), cmp);
83 }
84 
85 void WordFrequency::printWordFrequency()const
86 {
87     Wordkit it = words_.begin();
88     while(it != words_.end())
89     {
90         printf("words: %s, frequency: %d\n",it->first.c_str(),it->second);
91         it ++;
92     }
93 }

main.cpp

 1 //main.cpp
 2 #ifndef _STDC_FORMAT_MACROS
 3 #define _STDC_FORMAT_MACROS
 4 #endif /* _STDC_FORMAT_MACROS */
 5 #include <inttypes.h>
 6 
 7 #include "WordFrequency.h"
 8 #include <iostream>
 9 #include <vector>
10 #include <stdexcept>
11 #include <sys/time.h>
12 #include <stdio.h>
13 #include <string.h>
14 using namespace std;
15 
16 int64_t gettime();
17 
18 int main(int argc, const char *argv[])
19 {
20     if(argc < 3)
21         throw runtime_error("exe,filename, stopfile");
22     int64_t starttime =gettime();
23     WordFrequency wf(argv[1], argv[2]);
24     wf.ReadStopFile();
25     wf.ReadWordFile();
26     
27     int64_t readtime = gettime();
28     wf.sortWordByFrequency();
29 
30     int64_t sorttime = gettime();
31 
32     printf("readfile time: %lld ms\n",(readtime-starttime)/1000); 
33     printf("sortfile time: %lld ms\n",(sorttime-readtime)/1000);  
34     
35     wf.printWordFrequency();
36     return 0;
37 }
38 
// Returns the current wall-clock time from gettimeofday() in microseconds.
// Throws std::runtime_error when gettimeofday() reports failure.
int64_t gettime()
{
    struct timeval tv;
    ::memset(&tv, 0, sizeof tv);
    if (::gettimeofday(&tv, NULL) == -1)
    {
        throw std::runtime_error("gettimeofday");
    }
    // Widen tv_sec before multiplying: evaluated in time_t, the product
    // seconds * 1000 * 1000 overflows where time_t is only 32 bits wide.
    return static_cast<int64_t>(tv.tv_sec) * 1000 * 1000
           + static_cast<int64_t>(tv.tv_usec);
}

二、map、set；

相較於第一種，這種方式用到的數據結構有

1、std::set<std::string> StopList_ 用於存放停用詞；
2、std::map<std::string, int> words_ 用於存放The Bible Holy 中除了停用詞之外的詞；

3、std::vector< std::pair<std::string,int> > copyWords_ 由於map只按鍵（單詞）有序，無法按值（詞頻）排序，故需要將其轉存到vector中再排序。

WordFrequency.h

 1 #ifndef WORDFREQUENCY_H_
 2 #define WORDFREQUENCY_H_
 3 
 4 #include <vector>
 5 #include <map>
 6 #include <set>
 7 #include <string>
 8 class WordFrequency
 9 {
10     public:
11         WordFrequency(const std::string &filename, const std::string &stopfile);
12         void ReadStopFile();
13         void ReadWordFile();
14 
15         void copyWordToVector();
16         void sortWordByFrequency();
17         void printFrequency()const;
18         
19     private:
20         std::string filename_;
21         std::string stopfile_;
22 
23         std::set<std::string> StopList_;
24         std::map<std::string, int> words_;
25             
26         std::vector< std::pair<std::string,int> > copyWords_;
27 };
28 
29 #endif

WordFrequency.cpp

 1 //WordFrequency.cpp
 2 #include "WordFrequency.h"
 3 #include "StringUtil.h"
 4 #include <fstream>
 5 #include <algorithm>
 6 #include <stdio.h>
 7 #include <stdexcept>
 8 using namespace std;
 9 using namespace stringutil;
10 
11 WordFrequency::WordFrequency(const string &filename, const string &stopfile)
12     :filename_(filename),stopfile_(stopfile)
13 { }
14 
15 void WordFrequency::ReadStopFile()
16 {
17     ifstream infile(stopfile_.c_str());
18     if( !infile )
19         throw runtime_error("open file failure");
20     string word;
21     while(infile >> word)
22         StopList_.insert(word);
23 
24     infile.close();
25 }
26 
27 void WordFrequency::ReadWordFile()
28 {
29     ifstream infile(filename_.c_str());
30     if( !infile )
31         throw runtime_error("open file failure");
32     words_.clear();
33     string word;
34     while(infile >> word) //讀取單詞
35     {
36         erasePunct(word); //去除標點
37         stringToLower(word);//轉為小寫
38         if(isAllDigit(word))//去除數字
39             continue;
40         if( StopList_.count(word) == 0) //set中 count 標識 word這個單詞是否存在
41             words_[word]++; //如果存在，int++， 如果不存在，增添至 map中 
42             //words_[word]= cnt（詞頻數）
43     }
44     
45     infile.close();
46 }
47 
48 void WordFrequency::copyWordToVector()
49 {
50     copyWords_.clear();
51     //back_inserter(copyWords_)
52     //front_inserter
53     copy(words_.begin(),words_.end(),back_inserter(copyWords_));
54 }
55 
// Ordering used to sort (word, count) entries: higher counts come first.
// Entries with equal counts are ordered alphabetically by word, so the result
// of the (unstable) std::sort is fully determined instead of leaving the
// order of equal-frequency words unspecified.
bool cmp(const std::pair<std::string, int> &a, const std::pair<std::string, int> &b)
{
    if (a.second != b.second)
        return a.second > b.second;
    return a.first < b.first;
}
60     
61 void WordFrequency::sortWordByFrequency()
62 {
63     sort(copyWords_.begin(), copyWords_.end(),cmp); //排序
64 }
65 
66 void WordFrequency::printFrequency() const
67 {
68     for(vector<pair<string, int> >::const_iterator it = copyWords_.begin(); 
69          it != copyWords_.end(); 
70          ++it)
71     {
72         printf("word :%s, frequency: %d\n", it->first.c_str(), it->second); //打印
73     }
74 }

main.cpp：

比上一程序的main函數多了一行

1 // The one extra line compared with the previous main(): the call to copyWordToVector()
2 int64_t readtime = gettime();
3 wf.copyWordToVector();
4 wf.sortWordByFrequency();

三、哈希表（C++11特性）：

程序三與程序二非常相似，只是其頭文件WordFrequency.h不同，因此我們只給出 .h文件；

注意編譯時，g++ *.cpp -std=c++0x

WordFrequency.h

 1 //WordFrequency.h
 2 #ifndef WORDFREQUENCY_H_
 3 #define WORDFREQUENCY_H_
 4 
 5 #include <vector>
 6 #include <unordered_map>
 7 #include <unordered_set>
 8 #include <string>
 9 class WordFrequency
10 {
11     public:
12         WordFrequency(const std::string &filename, const std::string &stopfile);
13         void ReadStopFile();
14         void ReadWordFile();
15 
16         void copyWordToVector();
17         void sortWordByFrequency();
18         void printFrequency()const;
19         
20     private:
21         std::string filename_;
22         std::string stopfile_;
23 
24         std::unordered_set<std::string> StopList_;
25         std::unordered_map<std::string, int> words_;
26             
27         std::vector< std::pair<std::string,int> > copyWords_;
28 };
29 
30 #endif

以上三個程序（所采用的數據結構不同）性能比較：

程序一：讀取文件時間：42997ms、排序時間 7ms；

程序二：讀取文件時間：1294ms、排序時間 10ms；

程序三：讀取文件時間： 616ms、排序時間 9ms；

分析：

讀取時間：

程序一采用的是vector，所以每一次從 txt 讀取一個單詞都要先順序遍歷一遍停用詞的vector，然后再順序遍歷一次 words vector，以查找該單詞是否已經存在。單次查找的時間復雜度為 O（N），因此處理全部 N 個單詞的總時間復雜度為 O（N*N）；

程序二采用的是 map與set。map的實現是根據平衡二叉樹（具體是紅黑樹），其單次查找的時間復雜度是 O（lgN），處理全部 n 個單詞的總時間是 O（n*lgN）；而且如果單詞不存在，words_[word] 會自動添加該元素至map中（初值為0），隨后的 ++ 將詞頻置為1。

程序三采用的是哈希表， unordered_map，與unordered_set ，其單次查找的平均時間復雜度為常量O(1)；

排序時間：

程序二、三所用時間比程序一多了幾毫秒，這是因為map不能按詞頻排序，因此要先轉存至vector中。而多出來的這幾毫秒就是轉存時間。

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 C++中的hash_map和map的區別 c++ hash_map/unordered_map 使用 c++之STL模板，hash_set與hash_map C++ STL之 #include 頭文件 map，hash_map和unordered_map 實現比較 C++遍歷中刪除std::hash_map元素問題模板—Hash_map linux hash_map（轉） C++ STL中哈希表 hash_map從頭到尾詳細介紹[轉] hash_map與unordered_map區別

C++回顧 統計詞頻問題 -- vector、map、hash_map（三種方式時間比較）

免責聲明！

C++回顧統計詞頻問題 -- vector、map、hash_map（三種方式時間比較）