一階HMM詞性標注


手頭的語料庫依然是msr_training.utf8和msr_test.utf8,它來自於SIGHAN Bakeoff 2005的 icwb2-data.rar

1.rmspace.cpp:微軟研究院(MSR)的訓練文檔已經分好詞,但我們並不需要這個結果,我們要使用計算所的分詞系統(ICTCLAS)重新進行分詞並進行詞性標注,所以第一步要把訓練文檔中行內的空格去掉。

#include<iostream>
#include<fstream>
#include<sstream>
#include<string>

using namespace std;

/*
 * rmspace: copies inputfile to outputfile line by line, dropping all
 * whitespace inside each line so that a pre-segmented corpus becomes
 * unsegmented text again.
 *
 * Usage: rmspace inputfile outputfile
 * Returns 0 on success, 1 on a usage error or when a file cannot be opened.
 */
int main(int argc,char *argv[]){
    if(argc<3){
        std::cerr<<"Usage:"<<argv[0]<<" inputfile outputfile"<<std::endl;
        return 1;
    }

    std::ifstream source(argv[1]);
    std::ofstream sink(argv[2]);
    if(!source || !sink){
        std::cerr<<"open file failed."<<std::endl;
        return 1;
    }

    std::string row;
    while(std::getline(source,row)){
        // operator>> skips whitespace, so concatenating the extracted
        // pieces yields the line without any blanks.
        std::istringstream pieces(row);
        std::string joined;
        for(std::string piece;pieces>>piece;)
            joined.append(piece);
        sink<<joined<<std::endl;
    }

    source.close();
    sink.close();
    return 0;
}

2.對第1步得到的輸出文件還需要稍作修改,即把每行句首和句尾的雙引號去掉。這個可以用vim的兩條命令來完成::1,$s/^“//g 和 :1,$s/”$//g

3.wordseg.cpp:對第2步得到的輸出文件進行分詞。編譯命令:g++ wordseg.cpp -o wordseg -I/home/orisun/master/ICTCLAS50_Linux_RHAS_32_C/API -lICTCLAS50。運行命令時注意要把libICTCLAS50.so拷貝到當前目錄下。

#include <string>
#include <iostream>
#define OS_LINUX
#include "ICTCLAS50.h"
using namespace std;
 
/*
 * wordseg: runs the ICTCLAS file processor on the file named on the command
 * line and writes the result next to it as "<filename>.ws".
 *
 * Returns 0 on success, 1 on a usage error, -1 when ICTCLAS_Init fails.
 */
int main(int argc, char *argv[])
{
    // The file to process has to be given on the command line.
    if (argc < 2) {
        std::cout << "Usage:command filename" << std::endl;
        return 1;
    }

    const std::string source(argv[1]);
    const std::string target = source + ".ws";
    const std::string initPath("/home/orisun/master/ICTCLAS50_Linux_RHAS_32_C/API");

    const bool ready = ICTCLAS_Init(initPath.c_str());
    if (!ready) {
        std::cout << "Init fails" << std::endl;
        return -1;
    }

    // Input is treated as UTF-8. The trailing 1 presumably switches POS
    // tagging on (the surrounding article says the output carries tags) —
    // confirm against the ICTCLAS50 header.
    ICTCLAS_FileProcess(source.c_str(), target.c_str(), CODE_TYPE_UTF8, 1);
    ICTCLAS_Exit();
    return 0;
}

4.由於我們要做的是詞性標注,所以先要對測試文檔進行分詞。仍然使用wordseg.cpp。

5.rmpos.cpp計算所的分詞系統在分詞的同時也做了詞性標注(修改配置文件Configure.xml是不起作用的),所以現在還得把測試文本中標注好的詞性去掉。

#include<iostream>
#include<fstream>
#include<sstream>
#include<string>

using namespace std;

/*
 * rmpos: strips the "/tag" suffix from every whitespace-separated token of
 * inputfile and writes the bare words, each followed by one space, to
 * outputfile (one output line per input line).
 *
 * Usage: rmpos inputfile outputfile
 * Returns 0 on success, 1 on a usage error or when a file cannot be opened.
 */
int main(int argc,char *argv[]){
    if(argc<3){
        std::cerr<<"Usage:"<<argv[0]<<" inputfile outputfile"<<std::endl;
        return 1;
    }

    std::ifstream ifs(argv[1]);
    std::ofstream ofs(argv[2]);
    if(!(ifs && ofs)){
        std::cerr<<"open file failed."<<std::endl;
        return 1;
    }

    std::string line,word,line_out;
    while(std::getline(ifs,line)){
        line_out.clear();
        std::istringstream strstm(line);
        while(strstm>>word){
            // Split at the LAST slash: the word itself may contain '/'
            // (e.g. "1/2/m" or "//w"), and splitting at the first one
            // would truncate it. A token without any slash is kept whole
            // because substr(0,npos) returns the entire string.
            const std::string::size_type pos=word.rfind('/');
            line_out+=word.substr(0,pos);
            line_out+=' ';
        }
        ofs<<line_out<<std::endl;
    }

    ifs.close();
    ofs.close();
    return 0;
}

6.對訓練文本(即第3步的輸出)也實行rmpos.cpp。

7.createdict.cpp第5步和第6步生成了訓練集和測試集中出現的所有詞語和標點符號,現在要把它們都存入GDBM數據庫。

#include<sys/stat.h>
#include<gdbm.h>
#include<iostream>
#include<string>
#include<fstream>
#include<sstream>

using namespace std;

/*
 * createdict: inserts every whitespace-separated token of inputfile as a key
 * into the GDBM database "dict_db". Each key gets the one-byte placeholder
 * value 'w'; existing keys are overwritten with that placeholder.
 *
 * Usage: createdict inputfile
 * Returns 0 on success, 1 on a usage error, when the input file or the
 * database cannot be opened, or when a store operation fails.
 */
int main(int argc,char *argv[]){
	if(argc<2){
		std::cerr<<"Usage: "<<argv[0]<<" inputfile"<<std::endl;
		return 1;
	}

	std::ifstream ifs(argv[1]);
	if(!ifs){
		std::cerr<<"open file failed."<<std::endl;
		return 1;
	}

	GDBM_FILE dbm_ptr=gdbm_open("dict_db",0,GDBM_WRCREAT,S_IRUSR | S_IWUSR,NULL);
	if(dbm_ptr==NULL){
		// Without this check every later gdbm call would dereference NULL.
		std::cerr<<"open database dict_db failed."<<std::endl;
		return 1;
	}

	char placeholder='w';
	datum key,value;
	value.dptr=&placeholder;
	value.dsize=1;

	int status=0;
	std::string line,word;
	while(status==0 && std::getline(ifs,line)){
		std::istringstream strstm(line);
		while(strstm>>word){
			// Keys are stored WITHOUT a terminating NUL (dsize is the
			// string length); gdbm_store copies the bytes and does not
			// modify them, so the const_cast is safe.
			key.dptr=const_cast<char*>(word.c_str());
			key.dsize=static_cast<int>(word.size());
			if(gdbm_store(dbm_ptr,key,value,GDBM_REPLACE)!=0){
				std::cerr<<"store "<<word<<" failed."<<std::endl;
				status=1;
				break;
			}
		}
	}

	ifs.close();
	gdbm_close(dbm_ptr);
	return status;
}

8.indexword.cpp對數據庫中所有的詞語(包含標點)進行序號的標記。

#include<stdio.h>
#include<string.h>
#include<stdlib.h>
#include<sys/stat.h>
#include<gdbm.h>
#include<ctype.h>

#define DB_FILE_BLOCK "dict_db"

int main(int argc,char* argv[]){
	GDBM_FILE dbm_ptr;
    dbm_ptr = gdbm_open(DB_FILE_BLOCK,0,GDBM_WRCREAT,S_IRUSR | S_IWUSR,NULL);
    datum key,data;
    long index=0;		//從0開始編號
    char index_str[10]={0};
    for(key=gdbm_firstkey(dbm_ptr);key.dptr;key=gdbm_nextkey(dbm_ptr,key)){
    	data=gdbm_fetch(dbm_ptr,key);
    	bzero(index_str,sizeof(index_str));
    	sprintf(index_str,"%ld",index++);
    	data.dptr=index_str;
    	data.dsize=sizeof(index_str);
    	gdbm_store(dbm_ptr,key,data,GDBM_REPLACE);
    }
    gdbm_close(dbm_ptr);
    return 0;
}

9.query.c和lookup.c(可選輔助):前者打印輸出數據庫中的所有數據,后者根據用戶輸入的key去GDBM中查詢對應的value。

#include<stdio.h>
#include<string.h>
#include<stdlib.h>
#include<sys/stat.h>
#include<gdbm.h>
#include<ctype.h>

#define DB_FILE_BLOCK "dict_db"

int main(int argc,char* argv[]){
	GDBM_FILE dbm_ptr;
    dbm_ptr = gdbm_open(DB_FILE_BLOCK,0,GDBM_READER,S_IRUSR | S_IWUSR,NULL);
    datum key,data;
    for(key=gdbm_firstkey(dbm_ptr);key.dptr;key=gdbm_nextkey(dbm_ptr,key)){
    	data=gdbm_fetch(dbm_ptr,key);
    	printf("%s--%s\t",key.dptr,data.dptr);
    }
    printf("\n");
    gdbm_close(dbm_ptr);
    return 0;
}

  

#include<sys/stat.h>
#include<gdbm.h>
#include<stdio.h>
#include<string.h>
#include<stdlib.h>

/*
 * lookup: interactive dictionary probe. Repeatedly reads one
 * whitespace-delimited word from stdin and prints the value stored for it in
 * "dict_db", until the word "exit" or end of input is reached.
 *
 * The database is opened read-only because this tool never writes to it.
 * Words longer than 49 bytes are read in 49-byte pieces.
 *
 * Returns 0 on normal termination, 1 when the database cannot be opened.
 */
int main(int argc,char *argv[]){
	char word[50];
	GDBM_FILE dbm_ptr=gdbm_open("dict_db",0,GDBM_READER,S_IRUSR | S_IWUSR,NULL);
	if(dbm_ptr==NULL){
		fprintf(stderr,"open database dict_db failed.\n");
		return 1;
	}
	datum key,value;

	while(1){
		printf("please input a word.\n");
		memset(word,0,sizeof(word));
		/* The field width keeps scanf inside the buffer; a failed read
		 * (EOF) ends the loop instead of spinning forever. */
		if(scanf("%49s",word)!=1)
			break;
		if(strcmp(word,"exit")==0)
			break;
		key.dptr=word;
		key.dsize=static_cast<int>(strlen(word));
		value=gdbm_fetch(dbm_ptr,key);
		if(value.dptr==NULL){
			/* A NULL dptr is how gdbm_fetch reports a missing key. */
			printf("%s not exist in dict.\n",word);
		}
		else{
			/* The stored value need not be NUL-terminated. */
			printf("%s--%.*s\n",word,value.dsize,value.dptr);
			free(value.dptr);
		}
	}
	gdbm_close(dbm_ptr);
	return 0;
}

10.AMatrix.cpp統計訓練文本(當然是第3步的輸出)生成狀態轉移矩陣和初始狀態概率矩陣,分別寫入A.mat和PI.mat。

header.h頭文件中主要包含ICTCLAS的詞性標注集和Good-Turing平滑算法。

#ifndef _HEADER_H
#define _HEADER_H

#include<vector>
#include<list>
#include<map>

using namespace std;

// Number of entries in posarr, i.e. the number of HMM states.
// Original remark: the ICT (Institute of Computing Technology) Chinese POS tag
// set has POS_NUM elements once punctuation is removed.
// NOTE(review): the table below does contain "w*" entries, which look like
// punctuation tags — confirm what the original remark meant.
const int POS_NUM=97;		// size of the POS tag table
/* POS_NUM part-of-speech tags, i.e. POS_NUM states; the position of a tag in
 * this array is the state index used by the matrices. */
string posarr[POS_NUM]={"n","nr","nr1","nr2","nrj","nrf","ns","nsf","nt","nz",
						"nl","ng","t","tg","s","f","v","vd","vn","vshi",
						"vyou","vf","vx","vi","vl","vg","a","ad","an","ag",
						"al","b","bl","z","r","rr","rz","rzt","rzs","rzv",
						"ry","ryt","rys","ryv","Rg","m","mq","Mg","q","qv",
						"qt","d","dl","dg","p","pba","pbei","c","cc","u",
						"uzhe","ule","uguo","ude1","ude2","ude3","usuo","udeng","uyy","udh",
						"uls","uzhi","ulian","e","y","o","h","k","x","xx",
						"xu","w","wkz","wky","wyz","wyy","wj","ww","wd","wf",
						"wn","wm","ws","wp","wb","wh","wt"};

/**
 * Good-Turing smoothing of a vector of event counts.
 *
 * For an event seen r times, with N_r the number of events seen exactly r
 * times and N the sum of all counts:
 *   - if some event was seen exactly r+1 times: p = (r+1) * N_{r+1} / (N * N_r)
 *   - otherwise the plain relative frequency:   p = r / N
 * The resulting values are then normalised to sum to 1.
 *
 * @param count  observed counts, `len` entries
 * @param prob   output, `len` entries; receives the smoothed probabilities
 * @param len    number of entries; nothing is written when len <= 0
 *
 * If the counts sum to zero (nothing observed) a uniform distribution is
 * written instead of dividing by zero.
 */
void goodturing(const int count[],double prob[],int len){
	if(len<=0)
		return;

	// N_r: how many events share each count value. std::map keeps the
	// distinct counts sorted, so the successor of r is easy to inspect.
	std::map<int,int> events_with_count;
	long long total=0;		// 64-bit: the sum of a large row may exceed int
	for(int i=0;i<len;++i){
		++events_with_count[count[i]];
		total+=count[i];
	}

	if(total<=0){
		for(int i=0;i<len;++i)
			prob[i]=1.0/len;
		return;
	}
	const double n=static_cast<double>(total);

	// One probability per distinct count value.
	std::map<int,double> prob_of_count;
	for(std::map<int,int>::const_iterator cur=events_with_count.begin();
		cur!=events_with_count.end();++cur){
		const int r=cur->first;
		std::map<int,int>::const_iterator next=cur;
		++next;
		double pr;
		// Comparison, not assignment: the Good-Turing estimate only applies
		// when the next larger count is exactly r+1.
		if(next!=events_with_count.end() && next->first==r+1){
			// All-double arithmetic: N*N_r overflows int on real corpora.
			pr=(1.0+r)*next->second/(n*cur->second);
		}
		else{
			pr=r/n;
		}
		prob_of_count[r]=pr;
	}

	double sum=0;
	for(int i=0;i<len;++i){
		prob[i]=prob_of_count[count[i]];
		sum+=prob[i];
	}

	// Normalise so the probabilities sum to 1.
	if(sum>0){
		for(int i=0;i<len;++i)
			prob[i]/=sum;
	}
	else{
		for(int i=0;i<len;++i)
			prob[i]=1.0/len;
	}
}

#endif

 

#include<iostream>
#include<string>
#include<fstream>
#include<sstream>
#include<vector>
#include<algorithm>
#include<iomanip>
#include<iterator>
#include<cassert>
#include"header.h"

int A[POS_NUM][POS_NUM];	// A[i][j]: number of observed transitions from state i to state j
int PI[POS_NUM];			// PI[i]: number of times state i was observed

					
/* Returns the position of `search` in posarr, or -1 when it is not a known tag. */
inline int indexof(string search){
	int found=-1;
	for(int idx=0;found<0 && idx<POS_NUM;++idx){
		if(posarr[idx]==search)
			found=idx;
	}
	return found;
}
				
int main(int argc,char *argv[]){
	if(argc<2){
		cout<<"Usage:"<<argv[0]<<" pos_tagged_file"<<endl;
		return 1;
	}
	//打開輸入文件
	ifstream ifs(argv[1]);
	if(!ifs){
		cerr<<"open file "<<argv[1]<<" failed."<<endl;
		return 1;
	}

	string line,word;
	while(getline(ifs,line)){
		istringstream strstm(line);
		string pre,post;		//pre是前一個狀態,post是后一個狀態
		strstm>>word;
		string::size_type pos=word.find("/");
		post=word.substr(pos+1);
		int index1,index2;
		index2=indexof(post);
		if(index2<0){
			cout<<post<<" not exist"<<endl;
			return 1;
		}
		PI[index2]++;
		while(strstm>>word){
			pre=post;
			pos=word.find("/");
			post=word.substr(pos+1);
			//cout<<"pre="<<pre<<"\tpost="<<post<<endl;
			index1=indexof(pre);
			//if(index1<0){
			//	cout<<pre<<" not exist"<<endl;
			//	return 1;
			//}
			index2=indexof(post);
			//if(index2<0){
			//	cout<<post<<" not exist"<<endl;
			//	return 1;
			//}
			A[index1][index2]++;
			PI[index2]++;
		}
	}
	ifs.close();
	
	ofstream ofs1("A.mat");
	ofstream ofs2("PI.mat");
	if(!(ofs1 && ofs2)){
		cerr<<"create file failed."<<endl;
		return 1;
	}
	ofs1<<setprecision(8);
	ofs2<<setprecision(8);
	double arr_out[POS_NUM]={0.0};
	for(int i=0;i<POS_NUM;++i){
		goodturing(A[i],arr_out,POS_NUM);
		for(int j=0;j<POS_NUM;++j){
			ofs1<<arr_out[j]<<"\t";
		}
		ofs1<<endl;
	}
	goodturing(PI,arr_out,POS_NUM);
	for(int j=0;j<POS_NUM;++j){
		ofs2<<arr_out[j]<<"\t";
	}
	ofs2<<endl;
	
	ofs1.close();
	ofs2.close();
	return 0;
}

11.BMatrix.cpp統計訓練文本(當然是第3步的輸出)生成發射矩陣,寫入B.mat。

#include<iostream>
#include<fstream>
#include<sstream>
#include<string>
#include<iomanip>
#include<cassert>
#include<cstdlib>
#include<gdbm.h>
#include<sys/stat.h>
#include"header.h"

// Number of columns of the emission matrix; every term index read from the
// dictionary database is used as a column and must therefore be < TERM_NUM.
const int TERM_NUM=70000;

// Emission (confusion) count matrix: matrix[state][term index] is the number
// of times the term was seen with that tag. Initialised with an integer
// zero — "{0.0}" is a double-to-int narrowing conversion, which is ill-formed
// in list-initialisation since C++11.
int matrix[POS_NUM][TERM_NUM]={0};

/* Looks `search` up in posarr; yields its index, or -1 if it is absent. */
inline int indexof(string search){
	int idx=0;
	while(idx<POS_NUM){
		if(posarr[idx]==search)
			return idx;
		++idx;
	}
	return -1;
}

int main(int argc,char *argv[]){
	if(argc<2){
		cout<<"Usage: "<<argv[0]<<" pos_tagged_file"<<endl;
		return 1;
	}
	
	ifstream ifs(argv[1]);
	if(!ifs){
		cerr<<"open file "<<argv[1]<<" failed."<<endl;
		return 1;
	}
	
	GDBM_FILE dbm_ptr;
	dbm_ptr=gdbm_open("dict_db",0,GDBM_READER,S_IRUSR|S_IWUSR,NULL);
	datum key,value;
	
	string line,word,term,pos;
	string slash="/";
	while(getline(ifs,line)){
		istringstream strstm(line);
		while(strstm>>word){
			string::size_type loc=word.find(slash);
			assert(loc!=string::npos);
			term=word.substr(0,loc);		//詞語
			pos=word.substr(loc+1);			//詞性
			//cout<<term<<"\t"<<pos<<endl;
			int rowindex=indexof(pos);
			assert(rowindex>=0);
			key.dsize=term.size();
			key.dptr=const_cast<char*>(term.c_str());
			value=gdbm_fetch(dbm_ptr,key);
			int colindex=atoi(value.dptr);
			//cout<<rowindex<<"\t"<<colindex<<endl;
			matrix[rowindex][colindex]++;
		}
	}
	
	ifs.close();
	gdbm_close(dbm_ptr);
	
	//將發射矩陣寫入文件
	ofstream ofs("B.mat");
	if(!ofs){
		cerr<<"create file B.mat failed."<<endl;
		return 1;
	}
	ofs<<setprecision(8);
	double arr_out[TERM_NUM]={0.0};
	for(int i=0;i<POS_NUM;++i){
		goodturing(matrix[i],arr_out,TERM_NUM);
		for(int j=0;j<TERM_NUM;++j){
			ofs<<arr_out[j]<<"\t";
		}
		ofs<<endl;
	}
	ofs.close();
	
	return 0;
}

12.postag.cpp對測試文本(第5步的輸出)進行詞性標注。

#include<sys/stat.h>
#include<ctype.h>
#include<gdbm.h>
#include<iostream>
#include<sstream>
#include<fstream>
#include<string>
#include<cstring>
#include<cstdlib>
#include<stack>
#include<vector>
#include"header.h"

// File name of the GDBM dictionary database opened in main().
const string DB_FILE_BLOCK="dict_db";
// Number of columns of the emission matrix B.
const int TERM_NUM=70000;
// Presumably the maximum byte length assumed for a single term — verify
// against the code that uses it.
const int TERM_MAXLEN=100;
// Handle of the dictionary database; assigned in main().
GDBM_FILE dbm_ptr;

double PI[POS_NUM];				// initial state probability vector
double A[POS_NUM][POS_NUM];		// state transition matrix
double B[POS_NUM][TERM_NUM];	// emission matrix

/*從文件中讀出HMM模型參數*/
/* Reads one text line from `in` and parses exactly `n` whitespace-separated
 * numbers from it into dest[0..n-1]. Returns false if the line is missing or
 * holds fewer than `n` fields. */
static bool readMatrixRow(std::istream &in,double *dest,int n){
	std::string line;
	if(!std::getline(in,line))
		return false;
	std::istringstream fields(line);
	std::string field;
	for(int j=0;j<n;++j){
		if(!(fields>>field))
			return false;
		dest[j]=atof(field.c_str());
	}
	return true;
}

/*
 * Loads the HMM parameters from text files into the globals PI, A and B.
 *
 * f1: file holding PI — one line with POS_NUM numbers
 * f2: file holding A  — POS_NUM lines with POS_NUM numbers each
 * f3: file holding B  — POS_NUM lines with TERM_NUM numbers each
 *
 * Prints a message to stderr and calls exit(1) when a file cannot be opened
 * or is shorter than expected. A short file used to go unnoticed and left
 * the remaining entries filled with stale values.
 */
void initHMM(string f1,string f2,string f3){
	std::ifstream ifs1(f1.c_str());
	std::ifstream ifs2(f2.c_str());
	std::ifstream ifs3(f3.c_str());
	if(!(ifs1 && ifs2 && ifs3)){
		std::cerr<<"Open file failed!"<<std::endl;
		exit(1);
	}

	// Read PI.
	if(!readMatrixRow(ifs1,PI,POS_NUM)){
		std::cerr<<"Read PI failed!"<<std::endl;
		exit(1);
	}

	// Read A.
	for(int i=0;i<POS_NUM;++i){
		if(!readMatrixRow(ifs2,A[i],POS_NUM)){
			std::cerr<<"Read A failed!"<<std::endl;
			exit(1);
		}
	}

	// Read B.
	for(int i=0;i<POS_NUM;++i){
		if(!readMatrixRow(ifs3,B[i],TERM_NUM)){
			std::cerr<<"Read B failed!"<<std::endl;
			exit(1);
		}
	}

	ifs1.close();
	ifs2.close();
	ifs3.close();
}

/*Viterbi算法進行詞性標注*/
void viterbi(vector<string> terms,string &result){
	if(terms.size()==0)
		return;
	result.clear();
	int row=terms.size();		//觀察序列的長度
	double **Q=new double*[row];	//初始化Q矩陣
	for(int i=0;i<row;++i)
		Q[i]=new double[POS_NUM]();
	int **Path=new int*[row];	//初始化Path矩陣
	for(int i=0;i<row;++i)
		Path[i]=new int[POS_NUM]();
	
	//給Q和Path矩陣的第1行賦值
	datum key,data;
	char chinese[TERM_MAXLEN]={0};
	char *bp=const_cast<char*>(terms[0].c_str());
	strncpy(chinese,bp,terms[0].size());		//讀取句子中的第1個詞	
	key.dptr=chinese;
	key.dsize=terms[0].size();
	data=gdbm_fetch(dbm_ptr,key);		//從數據庫中獲取漢字對應的index,該index對應發射矩陣的列
	int colindex=atoi(data.dptr);
	for(int i=0;i<POS_NUM;++i){
		Path[0][i]=-1;
		Q[0][i]=PI[i]*B[i][colindex];
	}
	
	//給Q和Path矩陣的后續行賦值
	for(int i=1;i<row;++i){
		bp=const_cast<char*>(terms[i].c_str());
		strncpy(chinese,bp,terms[i].size());	//讀取句子中的下一個漢字
		key.dptr=chinese;
		key.dsize=terms[i].size();
		data=gdbm_fetch(dbm_ptr,key);
		colindex=atoi(data.dptr);
		for(int j=0;j<POS_NUM;++j){
			double max=-1.0;
			int maxindex=-1;
			for(int k=0;k<POS_NUM;++k){
				double product=Q[i-1][k]*A[k][j];
				if(product>max){
					max=product;
					maxindex=k;
				}
			}
			Q[i][j]=max*B[j][colindex];
			Path[i][j]=maxindex;
		}
	}
	
	//找Q矩陣最后一行的最大值
	double max=-1.0;
	int maxindex=-1;
	for(int i=0;i<POS_NUM;++i){
		if(Q[row-1][i]>max){
			max=Q[row-1][i];
			maxindex=i;
		}
	}
	//從maxindex出發,根據Path矩陣找出最可能的狀態序列
	stack<int> st;
	st.push(maxindex);
	for(int i=row-1;i>0;--i){
		maxindex=Path[i][maxindex];
		st.push(maxindex);
	}
	//釋放二維數組
	for(int i=0;i<row;++i){
		delete []Q[i];
		delete []Path[i];
	}
	delete []Q;
	delete []Path;
	
	//根據標記好的狀態序列分詞
	int mark=-1;
	for(int i=0;i<terms.size();++i){
		mark=st.top();
		st.pop();
		result+=terms[i]+"/"+posarr[mark]+"\t";
	}
}

int main(int argc,char *argv[]){
	if(argc<3){
		cout<<"Usage: "<<argv[0]<<" inputfile outputfile"<<endl;
		return 1;
	}
	
	dbm_ptr = gdbm_open(DB_FILE_BLOCK.c_str(),0,GDBM_READER,S_IRUSR | S_IWUSR,NULL);
	initHMM("PI.mat","A.mat","B.mat");
	
	ifstream ifs(argv[1]);
	ofstream ofs(argv[2]);
	if(!(ifs&&ofs)){
		cerr<<"Open file failed!"<<endl;
		return 1;
	}
	
	string line;
	//循環讀取每一行
	while(getline(ifs,line)){
		istringstream strstm(line);
		string term;
		vector<string> term_vec;
		string result;
		while(strstm>>term){
			term_vec.push_back(term);
		}
		viterbi(term_vec,result);
		ofs<<result<<endl;
	}
	ifs.close();
	ofs.close();
	gdbm_close(dbm_ptr);
	return 0;
}

看一下效果吧,左邊是ICTCLAS的pos-tagging結果,作為標准答案,右邊是我用一階HMM詞性標注的結果。

使用簡單的加1平滑:

可以看到詞性標注准確度還很低,並且"mq"貢獻了大部分的錯誤率。

使用Good-Turing平滑后的效果,大體上已經看不出有什么錯誤:


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM