參考:http://blog.csdn.net/zhangzhenhu/article/details/7014271
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Parse Sogou pinyin IME ``.scel`` dictionary files and dump the word list.

File layout (byte offsets inside a .scel file, per the reference write-up):
    0x130 - 0x338   dictionary name        (UTF-16LE)
    0x338 - 0x540   dictionary category    (UTF-16LE)
    0x540 - 0xd40   description            (UTF-16LE)
    0x1540          pinyin syllable table
    0x2628          Chinese word-group table

Reference: http://blog.csdn.net/zhangzhenhu/article/details/7014271
"""
import os
import struct
import sys

# Offset of the pinyin table inside the .scel file.
startPy = 0x1540
# Offset of the Chinese word-group table.
startChinese = 0x2628

# Global pinyin table: 16-bit index -> pinyin syllable string.
GPy_Table = {}

# Parse results: list of extracted Chinese words
# (the original (count, pinyin, word) tuple was reduced to just the word).
GTable = []


def byte2str(data):
    """Decode raw UTF-16LE bytes into a string.

    Quirks preserved from the .scel text fields: carriage returns become
    newlines and space characters are dropped entirely.
    """
    out = []
    for off in range(0, len(data), 2):
        # Each code unit is a little-endian 16-bit value.
        ch = chr(struct.unpack_from('<H', data, off)[0])
        if ch == '\r':
            out.append('\n')
        elif ch != ' ':
            out.append(ch)
    return ''.join(out)


def getPyTable(data):
    """Populate GPy_Table from the pinyin-table section of the file.

    Returns None without touching GPy_Table when the section does not
    start with the expected magic bytes.
    """
    if data[0:4] != b'\x9D\x01\x00\x00':
        return None
    data = data[4:]
    pos = 0
    length = len(data)
    while pos < length:
        # Entry layout: index (H), byte length (H), UTF-16LE syllable text.
        index = struct.unpack_from('<H', data, pos)[0]
        pos += 2
        text_len = struct.unpack_from('<H', data, pos)[0]
        pos += 2
        GPy_Table[index] = byte2str(data[pos:pos + text_len])
        pos += text_len


def getWordPy(data):
    """Return the pinyin of one word.

    ``data`` is a sequence of 16-bit little-endian indexes into GPy_Table;
    the corresponding syllables are concatenated.
    """
    out = []
    for pos in range(0, len(data), 2):
        index = struct.unpack_from('<H', data, pos)[0]
        out.append(GPy_Table[index])
    return ''.join(out)


def getWord(data):
    """Alias of getWordPy.

    NOTE(review): in the original source this function's body was an exact
    copy of getWordPy; it is kept only for interface compatibility.
    """
    return getWordPy(data)


def getChinese(data):
    """Walk the Chinese word-group section, appending each word to GTable.

    Group layout: same-pinyin word count (H), pinyin index-table length (H),
    pinyin index table, then per word: word byte length (H), UTF-16LE word,
    extension length (H), extension data (whose first 2 bytes hold the
    word frequency, currently unused).
    """
    pos = 0
    length = len(data)
    while pos < length:
        # Number of words sharing this pinyin.
        same = struct.unpack_from('<H', data, pos)[0]
        pos += 2
        # Length of the pinyin index table.
        py_table_len = struct.unpack_from('<H', data, pos)[0]
        pos += 2
        # Pinyin of the group -- not needed for the word-only output:
        # py = getWordPy(data[pos:pos + py_table_len])
        pos += py_table_len
        for _ in range(same):
            # Chinese word length in bytes.
            c_len = struct.unpack_from('<H', data, pos)[0]
            pos += 2
            word = byte2str(data[pos:pos + c_len])
            pos += c_len
            # Extension-data length; its first 2 bytes are the frequency:
            # count = struct.unpack_from('<H', data, pos + 2)[0]
            ext_len = struct.unpack_from('<H', data, pos)[0]
            pos += 2
            GTable.append(word)
            # Skip to the next word's offset.
            pos += ext_len


def deal(file_name):
    """Parse one .scel file and accumulate its words into GTable.

    ``file_name`` is resolved relative to the current working directory,
    so pass a full path (join the directory with the bare filename first).
    Exits the process when the file lacks the .scel magic header.
    """
    print('-' * 60)
    with open(file_name, 'rb') as f:
        data = f.read()
    if data[0:12] != b'\x40\x15\x00\x00\x44\x43\x53\x01\x01\x00\x00\x00':
        print("確認你選擇的是搜狗(.scel)詞庫?")
        sys.exit(0)
    print("詞庫名:", byte2str(data[0x130:0x338]))
    print("詞庫類型:", byte2str(data[0x338:0x540]))
    print("描述信息:", byte2str(data[0x540:0xd40]))
    # print("詞庫示例:", byte2str(data[0xd40:startPy]))
    getPyTable(data[startPy:startChinese])
    getChinese(data[startChinese:])


if __name__ == '__main__':
    path1 = r'D:\myProject\textClassifier\inputfile1/'.replace('\\', '/')
    path2 = r'D:\myProject\textClassifier\outputfile/'.replace('\\', '/')
    for filename in os.listdir(path1):
        print(filename)
        # os.listdir() returns bare names; open() resolves them against the
        # CWD, which caused the reported IOError when calling deal(filename)
        # directly -- always join with the source directory.
        deal(os.path.join(path1, filename))
    # Save the results, dropping duplicates while preserving first-seen
    # order (a set + one pass replaces the original O(n^2)
    # sorted(set(GTable), key=GTable.index)).
    with open(os.path.join(path2, 'sougou.txt'), 'w', encoding='GB18030') as f:
        seen = set()
        for word in GTable:
            if word not in seen:
                seen.add(word)
                f.write(word)
                f.write('\n')
有個問題原先沒解決:調用 deal 函數時,如果直接寫 deal(filename),會報 IOError: [Errno 2] No such file or directory: 'IT\xbc\xc6\xcb\xe3\xbb\xfa.scel'。原因是 os.listdir(path1) 返回的只是不含目錄的純文件名,而 open() 會相對「當前工作目錄」解析路徑,所以除非工作目錄恰好就是 path1,否則找不到文件;傳入完整路徑(例如 deal(os.path.join(path1, filename)))即可解決。