版权声明:作者:金良山庄(金良,golden1314521@gmail.com)。如需联系,请在博客下评论或发私信。
个人主页:http://www.jinliangxu.com/ ;CSDN 博客:http://blog.csdn.net/u012176591
对数域操作函数
class Logspace:
    """Extended logarithm arithmetic in which NaN represents log(0).

    Values are kept as natural logarithms so that products of many small
    probabilities do not underflow.  Because log(0) has no finite value,
    ``self.LOGZERO`` (NaN) is used as a sentinel for it, and every method
    checks for that sentinel with ``np.isnan`` before doing arithmetic.
    """

    def __init__(self):
        # Sentinel for log(0).
        self.LOGZERO = np.nan

    def eexp(self, x):
        """Return exp(x); the LOGZERO sentinel (NaN) maps back to 0."""
        if np.isnan(x):
            return 0
        return np.exp(x)

    def eln(self, x):
        """Return log(x), or LOGZERO when x is 0.

        Any other input (negative, or NaN) prints a warning and yields NaN.
        """
        if x == 0:
            return self.LOGZERO
        if x > 0:
            return np.log(x)
        # Parenthesised single-argument form prints the same text on
        # Python 2 and Python 3.
        print('Wrong!!!\n\t negative input error')
        return np.nan

    def elnsum(self, elnx, elny):
        """Return log(x + y) given elnx = log(x) and elny = log(y)."""
        if np.isnan(elnx):
            return elny
        if np.isnan(elny):
            return elnx
        # Factor out the larger term so the exponent is never positive.
        if elnx > elny:
            return elnx + self.eln(1 + np.exp(elny - elnx))
        return elny + self.eln(1 + np.exp(elnx - elny))

    def elnproduct(self, elnx, elny):
        """Return log(x * y) given elnx = log(x) and elny = log(y)."""
        if np.isnan(elnx) or np.isnan(elny):
            return self.LOGZERO
        return elnx + elny

    def elnmatprod(self, elnx, elny):
        """Log-space analogue of ``np.dot`` for 1-D / 2-D operands.

        Both operands hold logarithms.  The result holds the logarithms of
        the ordinary matrix product of their exponentials: a scalar for
        vector x vector, a 1-D array when exactly one operand is 1-D, and a
        2-D array otherwise.
        """
        # Only a 1-D operand counts as a vector; np.array([[0.]]) has two
        # dimensions and therefore takes a matrix branch.
        x_is_vector = np.ndim(elnx) == 1
        y_is_vector = np.ndim(elny) == 1

        if x_is_vector and y_is_vector:
            r = self.LOGZERO
            for i in range(np.shape(elnx)[0]):
                r = self.elnsum(r, self.elnproduct(elnx[i], elny[i]))
            return r

        if x_is_vector:
            n = np.shape(elny)[1]
            r = np.zeros(n)
            for i in range(n):
                r[i] = self.elnmatprod(elnx, elny[:, i])
            return r

        if y_is_vector:
            n = np.shape(elnx)[0]
            r = np.zeros(n)
            for i in range(n):
                r[i] = self.elnmatprod(elnx[i, :], elny)
            return r

        m, _ = np.shape(elnx)
        p = np.shape(elny)[1]
        r = np.zeros((m, p))
        for i in range(m):
            for j in range(p):
                r[i, j] = self.elnmatprod(elnx[i, :], elny[:, j])
        return r

    def eexpmat(self, elny):
        """Apply :meth:`eexp` to every element and return a new float array.

        The result is always floating point, so integer input is not
        truncated when the exponentials are stored.  The input is not
        modified.
        """
        logs = np.asarray(elny, dtype=float)
        expy = np.empty(logs.shape, dtype=float)
        for idx, value in np.ndenumerate(logs):
            expy[idx] = self.eexp(value)
        return expy

    def elnmat(self, x):
        """Apply :meth:`eln` to every element and return a new float array.

        The result is always floating point so the NaN sentinel for log(0)
        can be stored even when ``x`` has an integer dtype.  The input is
        not modified.
        """
        values = np.asarray(x, dtype=float)
        elnx = np.empty(values.shape, dtype=float)
        for idx, value in np.ndenumerate(values):
            elnx[idx] = self.eln(value)
        return elnx
测试举例
# Demo: a log-space matrix product followed by exponentiation should match
# the ordinary product of the exponentiated operands.
# `logspace` is a module-level instance that later code in this file uses.
logspace = Logspace()

M1 = np.array([1, 0.5])
M2 = np.array([[1.3, 1.5], [1.8, 0.5]])
M3 = np.array([[0.8, 1.5], [1.8, 0.7]])
M4 = np.array([0, 0])

# The two printed vectors are expected to be identical.
print(logspace.eexpmat(logspace.elnmatprod(M1, M2)))
print(np.dot(logspace.eexpmat(M1), logspace.eexpmat(M2)))
输出:
[ 19.94836491 14.90077579]
[ 19.94836491 14.90077579]
条件随机场的函数
def read_corps(corpsfile='testchunk.data'):
    """Read a chunking corpus and return ``(corps, tagids)``.

    Each non-empty line is expected to hold space-separated columns with the
    word first and its tag second.  A line equal to ``". . O"`` ends the
    current sentence (that line itself is not added to the sentence).

    Args:
        corpsfile: Path of the corpus file.

    Returns:
        corps: list of ``(words, tags)`` pairs, one per non-empty sentence.
            ``words`` is lower-cased, numeric tokens are replaced by
            ``'#CD#'``, and both lists are wrapped in a ``"<S>"`` / ``0``
            boundary marker at each end.
        tagids: ``defaultdict`` mapping tag string -> integer id, assigned in
            order of first appearance; ``"<S>"`` is id 0.
    """
    # Data sets: http://www.chokkan.org/software/crfsuite/tutorial.html links
    # to two download locations (the data set is large).  Tag reference:
    # http://blog.dpdearing.com/2011/12/opennlp-part-of-speech-pos-tags-penn-english-treebank/
    tagids = defaultdict(lambda: len(tagids))
    tagids["<S>"] = 0
    corps = []
    onesentence = []
    with open(corpsfile, 'r') as f:
        for line in f:
            if len(line) <= 1:
                continue
            # Compare without the line terminator so that a final sentence
            # whose terminator line has no trailing newline is still closed.
            if line.rstrip('\r\n') != '. . O':
                # Sentence not finished yet: collect the raw line.
                onesentence.append(line)
                continue

            # End of sentence: convert the collected lines.
            words = ["<S>"]
            tags = [0]
            for texts in onesentence:
                w_t = texts.strip().split(" ")
                # Numeric tokens vary a lot; collapse them into one symbol
                # to reduce their influence.
                try:
                    float(w_t[0].strip().replace(',', ''))
                except ValueError:
                    words.append(w_t[0].lower())
                else:
                    words.append('#CD#')
                tags.append(tagids[w_t[1]])
            words.append("<S>")
            tags.append(0)
            if len(words) > 2:  # skip empty sentences
                corps.append((words, tags))
            onesentence = []
    return corps, tagids


def getfeatureTS(corps):
    """Collect the distinct transition and state features of a corpus.

    Args:
        corps: list of ``(words, tags)`` pairs as produced by ``read_corps``.

    Returns:
        featureTS: transition features followed by state features, each in
            order of first appearance.  A transition feature is
            ``('T', previous_tag, tag)``, e.g. ``('T', 2, 3)``; a state
            feature is ``('S', word, tag)``, e.g. ``('S', 'Confidence', 1)``.
        words2tagids: dict mapping each word to the list of tags it was
            seen with (see ``words2tagidfromfeatureS``).
    """
    featuresets = set()  # every feature seen so far, for O(1) membership
    featureT = []
    featureS = []
    for corp in corps:
        words, tags = corp[0], corp[1]
        for i in range(len(words)):
            if words[i] == '<S>':
                continue
            state = ('S', words[i], tags[i])
            if state not in featuresets:
                featuresets.add(state)
                featureS.append(state)
            # No transition feature is recorded out of the boundary marker.
            if words[i - 1] != '<S>':
                transition = ('T', tags[i - 1], tags[i])
                if transition not in featuresets:
                    featuresets.add(transition)
                    featureT.append(transition)
    featureTS = featureT + featureS
    words2tagids = words2tagidfromfeatureS(featureS)
    return featureTS, words2tagids


def getpriorfeatureE(corps, featureTS):
    """Return the empirical (prior) expectation of every feature.

    For each non-boundary position the state feature and the transition
    feature ending at that position are counted if they occur in
    ``featureTS``; the counts are then divided by the number of sentences.

    Args:
        corps: list of ``(words, tags)`` pairs.
        featureTS: list of hashable feature tuples.

    Returns:
        1-D float array with one entry per element of ``featureTS``.
    """
    # len() rather than np.shape(): sentences differ in length, and NumPy
    # cannot turn such a ragged nested list into an array.
    N = len(corps)       # number of training sentences
    K = len(featureTS)   # number of features
    priorfeatureE = np.zeros(K)

    # feature -> first position in featureTS (what list.index would return).
    feature_index = {}
    for position, feature in enumerate(featureTS):
        feature_index.setdefault(feature, position)

    for corp in corps:
        words, tags = corp[0], corp[1]
        for i in range(len(words)):
            if words[i] == '<S>':
                continue
            candidates = (('S', words[i], tags[i]),
                          ('T', tags[i - 1], tags[i]))
            for feature in candidates:
                idex = feature_index.get(feature)
                if idex is not None:
                    priorfeatureE[idex] += 1.0
    priorfeatureE /= N
    return priorfeatureE


def words2tagidfromfeatureS(featureS):
    """Map every word to the list of tags found in its state features.

    Args:
        featureS: iterable of ``('S', word, tag)`` tuples.

    Returns:
        dict ``word -> [tag, ...]`` in the order the features are given.
    """
    words2tagids = {}
    for feature in featureS:
        words2tagids.setdefault(feature[1], []).append(feature[2])
    return words2tagids


def getpostfeatureE(weights,corps,featureTS,words2tagids):
    # NOTE(review): only the head of this function is visible in this block.
    # Its logic is unchanged from the original; it has only been re-indented
    # and its comments translated.
    K = np.shape(featureTS)[0]  # number of features
    postfeatureE = np.zeros(K)  # posterior expectation of the features
    N = np.shape(corps)[0]
    for corpidx in range(N):
        corp = corps[corpidx][0][1:-1]
        lencorp = np.size(corp)  # number of words in the sentence
        Mlist = {}
        Mlist['mat'] = ['']*(lencorp+1)
        Mlist['dim'] = [words2tagids[corp[i]] for i in range(lencorp)]
        Mlist['len'] = [np.size(words2tagids[corp[i]]) for i in range(lencorp)]
        for i in range(lencorp+1):
            if i == 0:  # first matrix: a row vector holding state features only
                d = Mlist['len'][0]
                Mlist['mat'][i] = np.zeros((1,d))
                for j in range(d):
                    Mlist['mat'][i][0,j] = weights[featureTS.index(('S', corp[0], words2tagids[corp[0]][j]))]
                continue
            if i == lencorp:  # last matrix: a column vector of zeros
                Mlist['mat'][i] = np.zeros((Mlist['len'][-1],1))
                continue
            # Neither the first nor the last matrix (the original comment
            # says "second"): every element combines a state feature and a
            # transition feature.
            Mlist['mat'][i] = np.zeros((Mlist['len'][i-1],Mlist['len'][i]))
            for d1 in range(Mlist['len'][i-1]):
                for d2 in range(Mlist['len'][i]):
                    id1 = words2tagids[corp[i-1]][d1]
                    id2 = words2tagids[corp[i]][d2]
                    try:
                        Sweight = weights[featureTS.index(('S', corp[i], id2))]
                    except:
                        Sweight = 0
                    try:
                        Tweight = weights[featureTS.index(('T', id1, id2))]
                    except:
                        Tweight = 0
                    # NOTE(review): the function continues on the next
                    # original line, which is outside this block.
Mlist['mat'][i][d1,d2] = Sweight + Tweight #return Mlist,corps[0] #return 0 z = np.array([[0]]) for i in range(lencorp+1): z = logspace.elnmatprod(z,Mlist['mat'][i]) Alphalist = ['']*(lencorp+2) Betalist = ['']*(lencorp+2) Alphalist[0] = np.zeros((1,1)) # 第一个前向向量:1*1的矩阵 Betalist[-