"""k-nearest-neighbours (kNN) classifier with dating-site and digit demos.

The module provides the core classifier (``classify0``), helpers that load and
normalise data (``file2matrix``, ``autoNorm``, ``img2vector``) and several
print-based demo/test drivers.
"""
import importlib
import operator  # operator module
import os
import sys
from collections import Counter

import numpy as np
from numpy import *  # scientific computing package
from numpy import tile
from numpy import zeros

# NOTE(review): reloading ``sys`` looks like a leftover of the Python 2
# "reload(sys); sys.setdefaultencoding(...)" idiom. It is kept so that importing
# this module has the same side effects as before -- confirm before removing.
importlib.reload(sys)


def createDataSet():
    """Return a tiny hard-coded data set for experimenting with the classifier.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 array of points and
        ``labels`` is the list of the four matching class labels.
    """
    group = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest samples.

    Args:
        inX: Feature vector to classify (sequence or array; a 1xN array also
            works because it broadcasts against ``dataSet``).
        dataSet: 2-D array with one training sample per row.
        labels: Sequence of labels, ``labels[i]`` belonging to ``dataSet[i]``.
        k: Number of nearest neighbours that vote. Values larger than the
            number of samples use every sample.

    Returns:
        The label with the most votes. On a tie the label whose first vote came
        from the closer neighbour wins.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    # Euclidean distance from inX to every row; broadcasting replaces tile().
    diffMat = np.asarray(inX) - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)  # sum of squared differences
    distances = sqDistances ** 0.5            # square root
    # Indices of the k closest samples, nearest first.
    nearest = distances.argsort()[:k]
    # Count the labels of those samples and return the most frequent one.
    votes = Counter(labels[i] for i in nearest)
    return votes.most_common(1)[0][0]


# To predict the class of a data point: kNN.classify0([0, 0], group, labels, 3)
#
# mat() converts an array into a matrix:
#   randMat = mat(random.rand(4, 4))
# Inverse matrix: randMat.I
# Store the inverse: invRandMat = randMat.I
# Matrix multiplication: randMat * invRandMat
# Compute the error:
#   myEye = randMat * invRandMat
#   myEye - eye(4)
# eye(4) creates a 4x4 identity matrix.
# Use createDataSet() to create the data set and the labels:
#   group, labels = kNN.createDataSet()
# The number of elements in labels equals the number of rows of group.
# Type the variable names (group and labels) to check their contents.
#
# Preparing data: parsing data from a text file.
# file2matrix handles the input format: it takes a file name string and returns
# the training sample matrix and the class label vector.
def file2matrix(filename):
    """Parse a tab-separated text file into a feature matrix and label list.

    Every non-blank line must hold at least three numeric fields followed by an
    integer class label in the last field.

    Args:
        filename: Path of the text file to read.

    Returns:
        tuple: ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        ``(n, 3)`` float array built from the first three fields of each line
        and ``classLabelVector`` is the list of ``n`` integer labels.

    Raises:
        ValueError: If a field cannot be converted to a number.
    """
    # The context manager closes the file even when parsing fails.
    with open(filename) as fr:
        # strip() removes the line break; blank lines carry no record.
        rows = [line.strip().split('\t') for line in fr if line.strip()]
    returnMat = np.zeros((len(rows), 3))
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        # The first three fields are the features ...
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        # ... and the last field is the class label.
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector


# Preparing data: normalising numeric values.
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` to the interval [0, 1].

    Uses ``newValue = (oldValue - min) / (max - min)`` column by column. A
    column whose values are all equal (range 0) is mapped to 0 instead of
    producing NaN.

    Args:
        dataSet: 2-D array with one sample per row.

    Returns:
        tuple: ``(normDataSet, ranges, minVals)`` -- the normalised array, the
        per-column ``max - min`` and the per-column minimum.
    """
    minVals = dataSet.min(0)  # axis 0: minimum of every column
    maxVals = dataSet.max(0)  # axis 0: maximum of every column
    ranges = maxVals - minVals
    # Divide by 1 where the range is 0 so constant columns do not yield NaN.
    safeRanges = np.where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column values to every row (element-wise).
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals


# Testing the algorithm: validate the classifier as a complete program.
def datingClassTest():
    """Print the hold-out error rate of the classifier on the dating data.

    The first 10% of the samples in ``datingTestSet.txt`` are classified with
    k = 3 against the remaining 90%; every prediction and the final error rate
    are printed.
    """
    hoRatio = 0.10  # share of the data used as test set
    # NOTE(review): file2matrix parses the label column with int(); confirm
    # that this file stores numeric labels.
    datingDataMat, datingLabels = file2matrix('datingTestSet.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]            # number of samples
    numTestVecs = int(m * hoRatio)  # index one past the last test sample
    errorCount = 0.0
    for i in range(numTestVecs):
        # Classify each test sample against the training part.
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], 3)
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    # Error rate = number of errors / number of test samples.
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))


# Dating-site prediction.
def classifyPersion():
    """Ask for three feature values on stdin and print the predicted category.

    The answers are normalised with the statistics of ``datingTestSet2.txt``
    and classified with k = 3 against that data set.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    # input() returns the line of text typed by the user.
    percentTats = float(input("percentage of time spent playing video games?"))
    ffMiles = float(input("frequent year?"))
    iceCream = float(input("liters years?"))
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    inArr = np.array([ffMiles, percentTats, iceCream])
    # Same guard as autoNorm: avoid dividing by a zero range.
    safeRanges = np.where(ranges == 0, 1, ranges)
    classifierResult = classify0((inArr - minVals) / safeRanges, normMat,
                                 datingLabels, 3)
    # Labels are 1-based, resultList is 0-based.
    print("you like person:", resultList[classifierResult - 1])


# Preparing data: converting an image into a test vector.
def img2vector(filename):
    """Read a 32x32 text image of digit characters into a 1x1024 array.

    The first 32 characters of each of the first 32 lines are converted with
    ``int()`` and stored row after row.

    Args:
        filename: Path of the text image.

    Returns:
        numpy.ndarray: Array of shape ``(1, 1024)``.
    """
    returnVect = np.zeros((1, 1024))
    # The context manager closes the file once the 32 rows are read.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect


def _digitLabel(fileNameStr):
    """Return the integer left of the underscore in a name like ``3_12.txt``."""
    fileStr = fileNameStr.split('.')[0]  # take off .txt
    return int(fileStr.split('_')[0])


# Testing the algorithm: recognising handwritten digits.
def handwritingClassTest(trainingDir='trainingDigits',
                         testDir='digits/testDigits'):
    """Classify every test digit image and print predictions and error rate.

    File names are expected to look like ``<digit>_<index>.txt``; the number
    left of the underscore is the label.

    Args:
        trainingDir: Directory holding the training images.
        testDir: Directory holding the test images. The file list and the
            images are both read from this directory.
    """
    hwLabels = []
    trainingFileList = os.listdir(trainingDir)
    m = len(trainingFileList)
    trainingMat = np.zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        hwLabels.append(_digitLabel(fileNameStr))
        trainingMat[i, :] = img2vector(os.path.join(trainingDir, fileNameStr))
    # List the same directory the test images are loaded from.
    testFileList = os.listdir(testDir)
    errorCount = 0.0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        classNumStr = _digitLabel(fileNameStr)
        vectorUnderTest = img2vector(os.path.join(testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print("the total number of errors is: %d" % errorCount)
    print("the total error rate is: %f" % (errorCount / float(mTest)))