手頭的語料庫依然是msr_training.utf8和msr_test.utf8,它來自於SIGHAN Bakeoff 2005的 icwb2-data.rar
1.rmspace.cpp:研究院的訓練文檔是已經分好詞的,但我們並不需要這個結果,我們要使用計算所的分詞系統重新進行分詞並進行詞性標注,所以第一步要把訓練文檔中行內的空格去掉。
// rmspace: copies a text file line by line, removing every run of whitespace
// inside each line so that previously segmented words are joined back together.
#include<iostream>
#include<fstream>
#include<sstream>
#include<string>
using namespace std;

/*
 * Usage: rmspace inputfile outputfile
 * Returns 0 on success, 1 when the arguments are missing or a file cannot be opened.
 */
int main(int argc,char *argv[]){
    // Both an input path and an output path are required.
    if(argc<3){
        cerr<<"Usage:"<<argv[0]<<" inputfile outputfile"<<endl;
        return 1;
    }

    ifstream ifs(argv[1]);
    ofstream ofs(argv[2]);
    if(!ifs || !ofs){
        cerr<<"open file failed."<<endl;
        return 1;
    }

    string line;
    while(getline(ifs,line)){
        // Whitespace-delimited extraction drops the separators; gluing the
        // pieces back together yields the line without any blanks.
        istringstream pieces(line);
        string joined;
        for(string piece;pieces>>piece;){
            joined.append(piece);
        }
        ofs<<joined<<endl;
    }

    ifs.close();
    ofs.close();
    return 0;
}
2.對第1步得到的輸出文件還需要稍作修改,即把每行句首和句尾的雙引號去掉。這個可以用vim的兩條命令來完成: :1,$s/^“//g 和 :1,$s/”$//g
3.wordseg.cpp對第2步得到的輸出文件進行分詞。g++ wordseg.cpp -o wordseg -I/home/orisun/master/ICTCLAS50_Linux_RHAS_32_C/API -lICTCLAS50運行命令時注意要把libICTCLAS50.so拷貝到當前目錄下。
#include <string> #include <iostream> #define OS_LINUX #include "ICTCLAS50.h" using namespace std; int main(int argc, char *argv[]) { if (argc < 2) { //命令行中需要給定要處理的文件名 cout << "Usage:command filename" << endl; return 1; } string filename = argv[1]; string outfile = filename + ".ws"; string initPath = "/home/orisun/master/ICTCLAS50_Linux_RHAS_32_C/API"; if (!ICTCLAS_Init(initPath.c_str())) { cout << "Init fails" << endl; return -1; } ICTCLAS_FileProcess(filename.c_str(), outfile.c_str(), CODE_TYPE_UTF8,1); ICTCLAS_Exit(); return 0; }
4.由於我們要做的是詞性標注,所以先要對測試文檔進行分詞。仍然使用wordseg.cpp。
5.rmpos.cpp計算所的分詞系統在分詞的同時也做了詞性標注(修改配置文件Configure.xml是不起作用的),所以現在還得把測試文本中標注好的詞性去掉。
// rmpos: strips the "/tag" suffix from every whitespace-separated "word/tag"
// token of the input file and writes the bare words, each followed by a space.
#include<iostream>
#include<fstream>
#include<sstream>
#include<string>
using namespace std;

/*
 * Usage: rmpos inputfile outputfile
 * Returns 0 on success, 1 when the arguments are missing or a file cannot be opened.
 */
int main(int argc,char *argv[]){
    if(argc<3){
        cerr<<"Usage:"<<argv[0]<<" inputfile outputfile"<<endl;
        return 1;
    }
    ifstream ifs(argv[1]);
    ofstream ofs(argv[2]);
    if(!(ifs && ofs)){
        cerr<<"open file failed."<<endl;
        return 1;
    }

    string line,word,line_out;
    while(getline(ifs,line)){
        line_out.clear();
        istringstream strstm(line);
        while(strstm>>word){
            // The tag follows the LAST slash. Searching from the front would
            // truncate words that themselves contain '/', e.g. "//w" would
            // produce an empty word instead of "/".
            const string::size_type pos=word.rfind('/');
            // When no slash is present pos is npos and the whole token is kept.
            line_out.append(word,0,pos);
            line_out+=' ';
        }
        ofs<<line_out<<'\n';
    }

    ifs.close();
    ofs.close();
    return 0;
}
6.對訓練文本(即第3步的輸出)也實行rmpos.cpp。
7.createdict.cpp第5步和第6步生成了訓練集和測試集中出現的所有詞語和標點符號,現在要把它們都存入GDBM數據庫。
#include<sys/stat.h> #include<gdbm.h> #include<iostream> #include<string> #include<fstream> #include<sstream> using namespace std; int main(int argc,char *argv[]){ if(argc<2){ cerr<<"Usage: "<<argv[0]<<" inputfile"; return 1; } ifstream ifs(argv[1]); if(!ifs){ cerr<<"open file failed."<<endl; return 1; } GDBM_FILE dbm_ptr; dbm_ptr = gdbm_open("dict_db",0,GDBM_WRCREAT,S_IRUSR | S_IWUSR,NULL); char v='w'; datum key,value; value.dptr=&v; value.dsize=1; string line,word; while(getline(ifs,line)){ istringstream strstm(line); while(strstm>>word){ char *chinese=const_cast<char*>(word.c_str()); key.dptr=chinese; key.dsize=word.size(); //cout<<chinese<<"\t"<<word.size()<<endl; gdbm_store(dbm_ptr,key,value,GDBM_REPLACE); } } ifs.close(); gdbm_close(dbm_ptr); return 0; }
8.indexword.cpp對數據庫中所有的詞語(包含標點)進行序號的標記。
#include<stdio.h> #include<string.h> #include<stdlib.h> #include<sys/stat.h> #include<gdbm.h> #include<ctype.h> #define DB_FILE_BLOCK "dict_db" int main(int argc,char* argv[]){ GDBM_FILE dbm_ptr; dbm_ptr = gdbm_open(DB_FILE_BLOCK,0,GDBM_WRCREAT,S_IRUSR | S_IWUSR,NULL); datum key,data; long index=0; //從0開始編號 char index_str[10]={0}; for(key=gdbm_firstkey(dbm_ptr);key.dptr;key=gdbm_nextkey(dbm_ptr,key)){ data=gdbm_fetch(dbm_ptr,key); bzero(index_str,sizeof(index_str)); sprintf(index_str,"%ld",index++); data.dptr=index_str; data.dsize=sizeof(index_str); gdbm_store(dbm_ptr,key,data,GDBM_REPLACE); } gdbm_close(dbm_ptr); return 0; }
9.query.c和lookup.c(可選輔助):前者打印輸出數據庫中的所有數據,後者根據用戶輸入的key去GDBM中查詢對應的value。
#include<stdio.h> #include<string.h> #include<stdlib.h> #include<sys/stat.h> #include<gdbm.h> #include<ctype.h> #define DB_FILE_BLOCK "dict_db" int main(int argc,char* argv[]){ GDBM_FILE dbm_ptr; dbm_ptr = gdbm_open(DB_FILE_BLOCK,0,GDBM_READER,S_IRUSR | S_IWUSR,NULL); datum key,data; for(key=gdbm_firstkey(dbm_ptr);key.dptr;key=gdbm_nextkey(dbm_ptr,key)){ data=gdbm_fetch(dbm_ptr,key); printf("%s--%s\t",key.dptr,data.dptr); } printf("\n"); gdbm_close(dbm_ptr); return 0; }
#include<sys/stat.h> #include<gdbm.h> #include<stdio.h> #include<string.h> #include<stdlib.h> int main(int argc,char *argv[]){ char *word=(char*)malloc(50); GDBM_FILE dbm_ptr; dbm_ptr=gdbm_open("dict_db",0,GDBM_WRCREAT,S_IRUSR | S_IWUSR,NULL); datum key,value; while(1){ printf("please input a word.\n"); bzero(word,50); scanf("%s",word); if(strcmp(word,"exit")==0) break; key.dptr=word; key.dsize=strlen(word); value=gdbm_fetch(dbm_ptr,key); if(value.dsize==0){ printf("%s not exist in dict.\n",word); } else{ printf("%s--%s\n",key.dptr,value.dptr); } } gdbm_close(dbm_ptr); return 0; }
10.AMatrix.cpp統計訓練文本(當然是第3步的輸出)生成狀態轉移矩陣和初始狀態概率矩陣,分別寫入A.mat和PI.mat。
header.h頭文件中主要包含ICTCLAS的詞性標注集和Good-Turing平滑算法。
#ifndef _HEADER_H
#define _HEADER_H

#include<string>
#include<vector>
#include<list>
#include<map>

// NOTE: the programs that include this header use unqualified std names
// (string, cout, ...) without a using-directive of their own, so this one
// has to stay even though it is unusual in a header.
using namespace std;

// Number of entries in posarr below, i.e. the number of HMM states.
const int POS_NUM=97;

/* The POS_NUM part-of-speech tags (ICTCLAS tag set, including the w*
 * punctuation tags). The position of a tag in this array is its state index. */
string posarr[POS_NUM]={"n","nr","nr1","nr2","nrj","nrf","ns","nsf","nt","nz",
    "nl","ng","t","tg","s","f","v","vd","vn","vshi",
    "vyou","vf","vx","vi","vl","vg","a","ad","an","ag",
    "al","b","bl","z","r","rr","rz","rzt","rzs","rzv",
    "ry","ryt","rys","ryv","Rg","m","mq","Mg","q","qv",
    "qt","d","dl","dg","p","pba","pbei","c","cc","u",
    "uzhe","ule","uguo","ude1","ude2","ude3","usuo","udeng","uyy","udh",
    "uls","uzhi","ulian","e","y","o","h","k","x","xx",
    "xu","w","wkz","wky","wyz","wyy","wj","ww","wd","wf",
    "wn","wm","ws","wp","wb","wh","wt"};

/*
 * Good-Turing smoothing of a row of counts.
 *
 *   count  observed counts, len entries
 *   prob   output, len entries; receives the smoothed probabilities,
 *          normalised so that they sum to 1
 *   len    number of entries in count and prob
 *
 * Let N be the sum of all counts and n_r the number of entries whose count is
 * r. An entry with count r gets (r+1)*n_{r+1}/(N*n_r) when some entry has
 * count r+1, and the plain relative frequency r/N otherwise. The values are
 * then renormalised. When all counts are zero there is nothing to estimate
 * from and a uniform distribution is returned (instead of 0/0 = NaN).
 */
inline void goodturing(const int count[],double prob[],int len){
    if(len<=0)
        return;

    // count value -> indices having that count; map keeps the keys sorted.
    typedef map<int, vector<int> > CountMap;
    CountMap count_map;
    double N=0.0;       // double: the products below cannot overflow an int
    for(int i=0;i<len;++i){
        N+=count[i];
        count_map[count[i]].push_back(i);
    }

    if(N<=0.0){
        for(int i=0;i<len;++i)
            prob[i]=1.0/len;
        return;
    }

    for(CountMap::const_iterator iter=count_map.begin();iter!=count_map.end();++iter){
        const int r=iter->first;
        const double nr=static_cast<double>(iter->second.size());

        CountMap::const_iterator next=iter;
        ++next;

        double pr;
        // The adjusted estimate is only defined when the next larger count is
        // exactly r+1 (this must be a comparison, not an assignment).
        if(next!=count_map.end() && next->first==r+1){
            const double nr_1=static_cast<double>(next->second.size());
            pr=(1.0+r)*nr_1/(N*nr);
        }
        else{
            pr=r/N;
        }

        const vector<int> &members=iter->second;
        for(vector<int>::size_type m=0;m<members.size();++m)
            prob[members[m]]=pr;
    }

    // Normalise. The largest count always falls back to r/N > 0, so the sum
    // is strictly positive here.
    double sum=0.0;
    for(int i=0;i<len;++i)
        sum+=prob[i];
    for(int i=0;i<len;++i)
        prob[i]/=sum;
}

#endif
#include<iostream> #include<string> #include<fstream> #include<sstream> #include<vector> #include<algorithm> #include<iomanip> #include<iterator> #include<cassert> #include"header.h" int A[POS_NUM][POS_NUM]; //記錄狀態間轉移的次數 int PI[POS_NUM]; //記錄各種狀態出現的次數 inline int indexof(string search){ for(int i=0;i<POS_NUM;++i){ if(search==posarr[i]){ return i; } } return -1; } int main(int argc,char *argv[]){ if(argc<2){ cout<<"Usage:"<<argv[0]<<" pos_tagged_file"<<endl; return 1; } //打開輸入文件 ifstream ifs(argv[1]); if(!ifs){ cerr<<"open file "<<argv[1]<<" failed."<<endl; return 1; } string line,word; while(getline(ifs,line)){ istringstream strstm(line); string pre,post; //pre是前一個狀態,post是后一個狀態 strstm>>word; string::size_type pos=word.find("/"); post=word.substr(pos+1); int index1,index2; index2=indexof(post); if(index2<0){ cout<<post<<" not exist"<<endl; return 1; } PI[index2]++; while(strstm>>word){ pre=post; pos=word.find("/"); post=word.substr(pos+1); //cout<<"pre="<<pre<<"\tpost="<<post<<endl; index1=indexof(pre); //if(index1<0){ // cout<<pre<<" not exist"<<endl; // return 1; //} index2=indexof(post); //if(index2<0){ // cout<<post<<" not exist"<<endl; // return 1; //} A[index1][index2]++; PI[index2]++; } } ifs.close(); ofstream ofs1("A.mat"); ofstream ofs2("PI.mat"); if(!(ofs1 && ofs2)){ cerr<<"create file failed."<<endl; return 1; } ofs1<<setprecision(8); ofs2<<setprecision(8); double arr_out[POS_NUM]={0.0}; for(int i=0;i<POS_NUM;++i){ goodturing(A[i],arr_out,POS_NUM); for(int j=0;j<POS_NUM;++j){ ofs1<<arr_out[j]<<"\t"; } ofs1<<endl; } goodturing(PI,arr_out,POS_NUM); for(int j=0;j<POS_NUM;++j){ ofs2<<arr_out[j]<<"\t"; } ofs2<<endl; ofs1.close(); ofs2.close(); return 0; }
11.BMatrix.cpp統計訓練文本(當然是第3步的輸出)生成發射矩陣,寫入B.mat。
#include<iostream> #include<fstream> #include<sstream> #include<string> #include<iomanip> #include<cassert> #include<cstdlib> #include<gdbm.h> #include<sys/stat.h> #include"header.h" const int TERM_NUM=70000; int matrix[POS_NUM][TERM_NUM]={0.0}; //混淆矩陣(或稱發射矩陣) inline int indexof(string search){ for(int i=0;i<POS_NUM;++i){ if(search==posarr[i]){ return i; } } return -1; } int main(int argc,char *argv[]){ if(argc<2){ cout<<"Usage: "<<argv[0]<<" pos_tagged_file"<<endl; return 1; } ifstream ifs(argv[1]); if(!ifs){ cerr<<"open file "<<argv[1]<<" failed."<<endl; return 1; } GDBM_FILE dbm_ptr; dbm_ptr=gdbm_open("dict_db",0,GDBM_READER,S_IRUSR|S_IWUSR,NULL); datum key,value; string line,word,term,pos; string slash="/"; while(getline(ifs,line)){ istringstream strstm(line); while(strstm>>word){ string::size_type loc=word.find(slash); assert(loc!=string::npos); term=word.substr(0,loc); //詞語 pos=word.substr(loc+1); //詞性 //cout<<term<<"\t"<<pos<<endl; int rowindex=indexof(pos); assert(rowindex>=0); key.dsize=term.size(); key.dptr=const_cast<char*>(term.c_str()); value=gdbm_fetch(dbm_ptr,key); int colindex=atoi(value.dptr); //cout<<rowindex<<"\t"<<colindex<<endl; matrix[rowindex][colindex]++; } } ifs.close(); gdbm_close(dbm_ptr); //將發射矩陣寫入文件 ofstream ofs("B.mat"); if(!ofs){ cerr<<"create file B.mat failed."<<endl; return 1; } ofs<<setprecision(8); double arr_out[TERM_NUM]={0.0}; for(int i=0;i<POS_NUM;++i){ goodturing(matrix[i],arr_out,TERM_NUM); for(int j=0;j<TERM_NUM;++j){ ofs<<arr_out[j]<<"\t"; } ofs<<endl; } ofs.close(); return 0; }
12.postag.cpp對測試文本(第5步的輸出)進行詞性標注。
#include<sys/stat.h> #include<ctype.h> #include<gdbm.h> #include<iostream> #include<sstream> #include<fstream> #include<string> #include<cstring> #include<cstdlib> #include<stack> #include<vector> #include"header.h" const string DB_FILE_BLOCK="dict_db"; const int TERM_NUM=70000; const int TERM_MAXLEN=100; GDBM_FILE dbm_ptr; double PI[POS_NUM]; //初始狀態概率矩陣 double A[POS_NUM][POS_NUM]; //狀態轉移矩陣 double B[POS_NUM][TERM_NUM]; //發射矩陣 /*從文件中讀出HMM模型參數*/ void initHMM(string f1,string f2,string f3){ ifstream ifs1(f1.c_str()); ifstream ifs2(f2.c_str()); ifstream ifs3(f3.c_str()); if(!(ifs1 && ifs2 && ifs3)){ cerr<<"Open file failed!"<<endl; exit(1); } //讀取PI string line; if(getline(ifs1,line)){ istringstream strstm(line); string word; for(int i=0;i<POS_NUM;++i){ strstm>>word; PI[i]=atof(word.c_str()); } }else{ cerr<<"Read PI failed!"<<endl; exit(1); } //讀取A for(int i=0;i<POS_NUM;++i){ getline(ifs2,line); istringstream strstm(line); string word; for(int j=0;j<POS_NUM;++j){ strstm>>word; A[i][j]=atof(word.c_str()); } } //讀取B for(int i=0;i<POS_NUM;++i){ getline(ifs3,line); istringstream strstm(line); string word; for(int j=0;j<TERM_NUM;++j){ strstm>>word; B[i][j]=atof(word.c_str()); } } ifs1.close(); ifs2.close(); ifs3.close(); } /*Viterbi算法進行詞性標注*/ void viterbi(vector<string> terms,string &result){ if(terms.size()==0) return; result.clear(); int row=terms.size(); //觀察序列的長度 double **Q=new double*[row]; //初始化Q矩陣 for(int i=0;i<row;++i) Q[i]=new double[POS_NUM](); int **Path=new int*[row]; //初始化Path矩陣 for(int i=0;i<row;++i) Path[i]=new int[POS_NUM](); //給Q和Path矩陣的第1行賦值 datum key,data; char chinese[TERM_MAXLEN]={0}; char *bp=const_cast<char*>(terms[0].c_str()); strncpy(chinese,bp,terms[0].size()); //讀取句子中的第1個詞 key.dptr=chinese; key.dsize=terms[0].size(); data=gdbm_fetch(dbm_ptr,key); //從數據庫中獲取漢字對應的index,該index對應發射矩陣的列 int colindex=atoi(data.dptr); for(int i=0;i<POS_NUM;++i){ Path[0][i]=-1; Q[0][i]=PI[i]*B[i][colindex]; } //給Q和Path矩陣的后續行賦值 for(int i=1;i<row;++i){ 
bp=const_cast<char*>(terms[i].c_str()); strncpy(chinese,bp,terms[i].size()); //讀取句子中的下一個漢字 key.dptr=chinese; key.dsize=terms[i].size(); data=gdbm_fetch(dbm_ptr,key); colindex=atoi(data.dptr); for(int j=0;j<POS_NUM;++j){ double max=-1.0; int maxindex=-1; for(int k=0;k<POS_NUM;++k){ double product=Q[i-1][k]*A[k][j]; if(product>max){ max=product; maxindex=k; } } Q[i][j]=max*B[j][colindex]; Path[i][j]=maxindex; } } //找Q矩陣最后一行的最大值 double max=-1.0; int maxindex=-1; for(int i=0;i<POS_NUM;++i){ if(Q[row-1][i]>max){ max=Q[row-1][i]; maxindex=i; } } //從maxindex出發,根據Path矩陣找出最可能的狀態序列 stack<int> st; st.push(maxindex); for(int i=row-1;i>0;--i){ maxindex=Path[i][maxindex]; st.push(maxindex); } //釋放二維數組 for(int i=0;i<row;++i){ delete []Q[i]; delete []Path[i]; } delete []Q; delete []Path; //根據標記好的狀態序列分詞 int mark=-1; for(int i=0;i<terms.size();++i){ mark=st.top(); st.pop(); result+=terms[i]+"/"+posarr[mark]+"\t"; } } int main(int argc,char *argv[]){ if(argc<3){ cout<<"Usage: "<<argv[0]<<" inputfile outputfile"<<endl; return 1; } dbm_ptr = gdbm_open(DB_FILE_BLOCK.c_str(),0,GDBM_READER,S_IRUSR | S_IWUSR,NULL); initHMM("PI.mat","A.mat","B.mat"); ifstream ifs(argv[1]); ofstream ofs(argv[2]); if(!(ifs&&ofs)){ cerr<<"Open file failed!"<<endl; return 1; } string line; //循環讀取每一行 while(getline(ifs,line)){ istringstream strstm(line); string term; vector<string> term_vec; string result; while(strstm>>term){ term_vec.push_back(term); } viterbi(term_vec,result); ofs<<result<<endl; } ifs.close(); ofs.close(); gdbm_close(dbm_ptr); return 0; }
看一下效果吧,左邊是ICTCLAS的pos-tagging結果,作為標準答案,右邊是我用一階HMM詞性標注的結果。
使用簡單的加1平滑:
可以看到詞性標注准確度還很低,並且"mq"貢獻了大部分的錯誤率。
使用Good-Turing平滑后的效果,大體上已經看不出有什么錯誤: