linux上配置boost手記

本文轉載自查看原文 2013-05-23 09:33 2700 C++/ boost unicode/ boost wregx/ boost segmentation fault

題記：一些應用中需要用到正則表達式，有的時候正則表達式中含有漢字。這時候就要用到支持unicode的正則，python是個不錯的選擇。但是如果代碼是用C++寫的，僅為了正則功能就system調用python得不償失，而且會影響到效率。考慮到之前有在windows上成功調用boost正則的經歷，所以想遷移到linux上，沒有想到的是一波三折。最后成功。

1. boost的安裝：

請參考

http://blog.csdn.net/wcjy07220114/article/details/7088005

重點摘錄

安裝boost之前最好先安裝python-dev、icu、bzip2（機子上沒有的話再安裝）

現在可以安裝boost了，首先要編譯生成boost安裝工具bjam
進入boost目錄執行：
./bootstrap.sh
然后執行剛生成的
./bjam -sHAVE_ICU=1
編譯開始，大約半小時，全部編譯結束。
./bjam install --prefix=/usr
將當前目錄下編譯好的頭文件安裝到相應位置：在/usr/include下有頭文件夾boost，在/usr/lib下有boost的庫

與其不同的是，我所有的庫都安裝在了/usr/local下面。

2. 測試代碼

  1 // please add your code here!
  2 #include <iostream>
  3 #include <stdlib.h>
  4 #include <math.h>
  5 #include<time.h>
  6 #include <set>
  7 #include <string>
  8 #include <sys/time.h>
  9 #include<locale.h>
 10 #include<boost/regex.hpp>
 11 #include </usr/local/include/iconv.h> 
 12 #include <errno.h>
 13 using namespace std;
 14 
 15 /*
 16    funcname:
 17    spec:
 18    parms:[IN]
 19          [IN]
 20          [OUT]
 21    returnValue:
 22    author liuyu, 20120528
 23 */
 24 void PrintUsage()
 25 {
 26     fprintf( stderr, "prog [IN]hzpylist_file [IN]input_file [OUT]output_file [OUT]errdmp_file\n" );
 27 }
 28 /*
 29 void PrintError(char error_text[])
 30 {
 31     fprintf(stderr,"liuyusi0121 lib run-time error...\n");     
 32     fprintf(stderr,"%s\n",error_text);
 33     fprintf(stderr,"...now exiting to system...\n");
 34     exit(1);                     
 35 
 36 }
 37 */
 38 //add your code here
 39 
 40 wstring String2Wstring(string sToMatch)  {     
 41     setlocale( LC_CTYPE, "" ); // 很重要，沒有這一句，轉換會失敗。   
 42     int iWLen = mbstowcs( NULL, sToMatch.c_str(), sToMatch.length() ); // 計算轉換后寬字符串的長度。（不包含字符串結束符）   
 43     wchar_t *lpwsz = new wchar_t[iWLen + 1];  
 44     int i = mbstowcs( lpwsz, sToMatch.c_str(), sToMatch.length() ); // 轉換。（轉換后的字符串有結束符）   
 45     wstring wsToMatch(lpwsz);  
 46     delete []lpwsz;  
 47     return wsToMatch;  
 48 }  
 49 //把寬字符串轉換成字符串，輸出使用   
 50 string Wstring2String(wstring sToMatch)  
 51 {     
 52     int iLen = wcstombs( NULL, sToMatch.c_str(), 0 ); // 計算轉換后字符串的長度。（不包含字符串結束符）   
 53     char *lpsz = new char[iLen + 1];  
 54     int i = wcstombs( lpsz, sToMatch.c_str(), iLen ); // 轉換。（沒有結束符）   
 55     lpsz[iLen] = '\0';  
 56     string sResult(lpsz);  
 57     delete []lpsz;  
 58     return sResult;  
 59 }
 60 int toUnicode(const char *toCode,const char *fromCode,char *srcstr, char *deststr, size_t srclen,size_t &destlen)
 61 {
 62     iconv_t convertor=iconv_open(toCode,fromCode);
 63     size_t inputsize;
 64     size_t outputsize;
 65     size_t oldoutputsize;
 66     char *input, *inputold;
 67     char *output=NULL;
 68     char *outputold=NULL;
 69     int flag=0;
 70     if(convertor==iconv_t(-1))
 71     {
 72         fprintf(stderr,"convertor device initailization failed!\n");
 73         return 1;
 74     }
 75     else
 76     {
 77         inputsize=srclen;
 78         input=new char[inputsize+1];
 79         memcpy(input,srcstr,inputsize);
 80         input[inputsize]='\0';
 81         inputold=input;
 82         outputsize=inputsize*10;
 83         oldoutputsize=outputsize;
 84         output=new char[outputsize];
 85         memset(output,0,outputsize);
 86         outputold=output;
 87         size_t rc = iconv(convertor,&input,&inputsize,&output,&outputsize);
 88         if(rc==0)
 89         {
 90            memcpy(deststr,outputold,oldoutputsize-outputsize);
 91            destlen=oldoutputsize-outputsize;
 92            flag=1;
 93         }
 94         delete []inputold;
 95         delete []outputold;
 96 
 97     }
 98     iconv_close(convertor);
 99     if(flag==1)
100     {
101         return 0;
102     }
103     else
104     {
105         return 1;
106     }
107 
108 }
109 int main( int argc, char *argv[] )
110 {
111     timeval tv1, tv2;
112     gettimeofday(&tv1, NULL); 
113     if ( 1!= argc )
114     {
115         PrintUsage();
116 
117         return 1;
118     }
119 /*    
120     char *s1="劉.*?";
121     char *s2="劉禹，劉德華，劉佳佳。。。王大虎。。。" ;
122     char buf[5000]={0};
123     wchar_t *wchartemp[5000]={0};
124     size_t s1_len=strlen(s1);
125     size_t s2_len=strlen(s2);
126     size_t s3_len=0;
127     char d1[5000]={0};
128     char d2[5000]={0};
129     char d3[5000]={0};
130     size_t d1_len;
131     size_t d2_len;
132     size_t d3_len;
133     toUnicode("UCS-4//IGNORE","GBK",s1,d1,s1_len,d1_len);
134     toUnicode("UCS-4//IGNORE","GBK",s2,d2,s2_len,d2_len);
135     wchar_t *m1=(wchar_t *)d1;
136     fprintf(stdout,"%d\n",wcslen(m1));
137     wstring p1=wstring(m1);
138     fprintf(stdout,"%d\n",p1.size());
139     wchar_t *m2=(wchar_t *)d2;
140     fprintf(stdout,"%d\n",wcslen(m2));
141     wstring wtext=wstring(m2);
142     
143     std::wstring::const_iterator  it=wtext.begin();
144     std::wstring::const_iterator  end=wtext.end();
145     std::wstring::const_iterator  it1=p1.begin();
146     std::wstring::const_iterator  end1=p1.end();
147     fprintf(stdout,"%d\n",wtext.size());
148     wcout<<p1<<endl;
149     wcout<<wtext<<endl;
150 
151     boost::wregex wreg(p1,boost::regbase::icase);
152     boost::wsmatch wm;
153     vector<string> results;
154     while(boost::regex_search(it,end,wm,wreg))
155     {
156         wstring wtemp=wm[0];
157         memcpy(wchartemp,wtemp.c_str(),wtemp.size());
158         char *temp=(char *) wchartemp;
159         toUnicode("GBK//IGNORE","UCS-4",temp,buf,wtemp.size()*4,d3_len);
160         string str(temp);
161         results.push_back(str);
162         it=wm[0].second;
163     }
164     for(vector<string>::iterator it=results.begin();it!=results.end();it++)
165     {
166         printf("%s\n",(*it).c_str());
167     }
168     
169     for(int i=0;i<d1_len;i++)
170     {
171         fprintf(stdout,"%2x\t",d1[i]);
172     }
173     putchar('\n');
174     for(int i=0;i<d2_len;i++)
175     {
176         fprintf(stdout,"%2x\t",d2[i]);
177     }
178     putchar('\n');
179     toUnicode("GBK//IGNORE","UCS-4",d1,d3,d1_len,s1_len);
180     for(int i=0;i<s1_len;i++)
181     {
182         fprintf(stdout,"%2x\t",d3[i]);
183     }
184     fprintf(stdout,"%d\n",s1_len);
185     putchar('\n');
186 */    
187   
188     
189     string s="劉禹,劉德華,劉佳佳。。。王大虎。。。劉長春,xixi";
190     string t="劉[^劉]*?,";
191     wstring p=String2Wstring(t);
192     wstring ws=String2Wstring(s);
193     cout<<p.size()<<endl;
194     cout<<ws.size()<<endl;
195     boost::wregex wreg(p,boost::regbase::icase|boost::regex::perl);
196     boost::wsmatch wm;
197     vector<string> results;
198     wstring::const_iterator  it=ws.begin();
199     wstring::const_iterator  end=ws.end();
200     while(boost::regex_search(it,end,wm,wreg))
201     {
202         wstring wtemp=wm[0];
203         string temp=Wstring2String(wtemp);
204         results.push_back(temp);
205         it=wm[0].second;
206     }
207     fprintf(stdout,"輸出正則匹配結果\n");
208     for(vector<string>::iterator it=results.begin();it!=results.end();it++)
209     {
210         printf("%s\n",(*it).c_str());
211     }
212     
213     gettimeofday(&tv2, NULL);
214     fprintf(stderr,"%s has finished congratulations!\n",argv[0]);
215     fprintf( stderr,"time elapsed: %.2f ms\n", (float)((tv2.tv_sec - tv1.tv_sec)*1000000+(tv2.tv_usec-tv1.tv_usec))/1000);
216     return 0;
217     
218 }

View Code

編譯 g++ -g GetsSource.cpp /home/liuyu/MyTars/boost_1_53_0/libs/regex/src/*.cpp -o m -liconv

注意：編譯的時候要鏈接boost源碼，否則boost正則的功能函數像regex_search,regex_match等有段錯誤。

運行 ./m

結果liuyu: ~/Weibo/Corpora$ ./m
8
27
輸出正則匹配結果
劉禹,
劉德華,
劉長春,

注意：我機器上的編碼是GBK的，g++ 編譯的時候源碼文件默認是utf-8的，所以寫正則模式串的時候如果直接寫wstring p=L"劉.*?"等會編譯不通過。建議采用我代碼中的寫法。

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 PXE配置手記(Linux) Linux.NET學習手記（6） Linux 命令行手記 Linux.NET學習手記（1） eclipse 中配置 jboss 7 手記 Linux.NET學習手記（3） Linux.NET學習手記（7） Linux.NET學習手記（2） Linux.NET學習手記（4） Linux.NET學習手記（5）