"""Naive-Bayes spam classifier (adapted from "Machine Learning in Action", ch. 4).

Reads up to 25 spam and 25 ham emails from email/spam/ and email/ham/,
trains a multinomial naive-Bayes model on a bag-of-words representation,
and evaluates it with hold-out validation on 10 random documents.
"""
import random
import re

from numpy import array, log, ones


def createVocabList(dataSet):
    """Return the list of unique words across all documents in dataSet."""
    vocabSet = set()
    for document in dataSet:
        vocabSet |= set(document)  # union with this document's words
    return list(vocabSet)


def trainNB0(trainMatrix, trainCategory):
    """Train a naive-Bayes model.

    Parameters
    ----------
    trainMatrix : 2-D array, one bag-of-words row per document
    trainCategory : 1-D array of 0/1 class labels (1 = spam/abusive)

    Returns
    -------
    p0Vect, p1Vect : per-word log-probabilities given class 0 / class 1
    pAbusive : prior probability of class 1
    """
    numTrainDocs = len(trainMatrix)       # number of training documents
    numWords = len(trainMatrix[0])        # vocabulary size (matrix columns)
    pAbusive = sum(trainCategory) / float(numTrainDocs)  # class-1 prior
    # Laplace smoothing: start word counts at 1 and denominators at 2 so
    # an unseen word never yields a zero probability.
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Work in log space to avoid floating-point underflow when many small
    # probabilities are multiplied during classification.
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive


def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify a bag-of-words vector; return 1 for class 1, else 0.

    p0Vec/p1Vec hold log probabilities, so the product of per-word
    probabilities becomes a sum, and the class prior is added as a log term
    (maximum a-posteriori decision).
    """
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if p1 > p0 else 0


def bagOfWords2VecMN(vocabList, inputSet):
    """Bag-of-words model: count how many times each vocab word occurs."""
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec


def textParse(bigString):
    """Split a raw string into lowercase tokens longer than 2 characters."""
    # Fixed: the original r'\W*' matches the empty string, and in
    # Python >= 3.7 re.split splits on empty matches, yielding single
    # characters that all fail the len > 2 filter. r'\W+' is correct.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]


def spamTest():
    """Load the email corpus, train, and report the hold-out error rate.

    Unreadable files are skipped (best effort), so the corpus may hold
    fewer than 50 documents; all sizes are derived from len(docList)
    instead of the hard-coded 49 the original used.
    """
    docList = []
    classList = []
    fullText = []
    # (path pattern, class label): spam -> 1, ham -> 0
    corpora = [("email/spam/{}.txt", 1), ("email/ham/{}.txt", 0)]
    for pattern, label in corpora:
        for i in range(1, 26):
            try:
                # errors='ignore' tolerates the known badly-encoded file
                with open(pattern.format(i), errors='ignore') as f:
                    wordList = textParse(f.read())
                docList.append(wordList)
                classList.append(label)
                fullText.extend(wordList)  # fixed: was extend(docList)
            except Exception as e:
                print(e)  # best effort: skip files that cannot be read
    vocabList = createVocabList(docList)
    # Hold-out validation: move 10 random documents into the test set.
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        trainingSet.pop(randIndex)
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # classify the held-out documents
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print("classification error", docList[docIndex])
    print('the error rate is: ', float(errorCount) / len(testSet))


if __name__ == '__main__':
    spamTest()
待處理的數據為放在兩個文件夾中的各25個txt文本,文本信息為電子郵件內容,文件夾spam中的25個郵件是垃圾郵件;ham中的25個郵件是正常郵件;
利用朴素貝葉斯算法訓練分類器,采取留存(hold-out)交叉驗證的方式評估,結果證明,分類器能夠很好地識別垃圾郵件;
代碼主要參考【機器學習實戰】,但是有的代碼已經不能用了,而且有的有問題,做了一點修改。希望對看到文章的童鞋有點參考。朴素貝葉斯的思想不再過度敘述,參考互聯網;
整個示意流程如下:
(1)文本處理(讀取文本,分詞)——>
(2)根據分好詞的文本數據建立詞匯表(函數createVocabList【參數為文本數據】),矩陣化(函數bagOfWords2VecMN,【參數為詞匯表、待處理的“文本數據”】)——>
(3)拆分數據為訓練集、測試集——>
(4)訓練分類器——>
(5)測試分類器——>end
學習朴素貝葉斯的其他文章,建議看看github上的這幾個項目,鏈接如下
https://github.com/search?l=Python&q=%E6%9C%B4%E7%B4%A0%E8%B4%9D%E5%8F%B6%E6%96%AF&type=Repositories&utf8=%E2%9C%93