Logistic回歸python實現


2017-08-12

Logistic 回歸,作為分類器:

分別用了梯度上升,牛頓法來最優化損失函數:

 

 1 # -*- coding: utf-8 -*-
 2 
 3 '''
 4 function: 實現Logistic回歸,擬合直線,對數據進行分類;  5  利用梯度上升,隨機梯度上升,改進的隨機梯度上升,牛頓法分別對損失函數優化;  6  這里沒有給出最后測試分類的函數;  7 date: 2017.8.12  8 '''
 9 
 10 from numpy import *
 11 
 12 #從文件加載處理數據
 13 def loadDataSet():  14     dataMat = []  15     labelMat = []  16     fr = open('testSet.txt')  17     for line in fr.readlines():  18         lineArr = line.strip().split()  19         dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])  20         labelMat.append(int(lineArr[2]))  21     return dataMat, labelMat  22 
 23 #sigmoid function X: w1*x1+w2*x2+...+wn*xn
 24 def sigmoid(x):  25     return 1 / (1 + exp(-x))  26 
 27 #梯度上升求函數的最大值時取得的權重
 28 def gradAscent(dataMatIn, classLabels):  29     dataMatrix = mat(dataMatIn)  30     labelMat = mat(classLabels).transpose()   #將m維行向量轉制為m維列向量
 31     m,n = shape(dataMatrix)  32     alpha = 0.001 #設置梯度上升的步長
 33     maxCycles = 500 #最大迭代次數 
 34     weights = ones((n,1))  #weights就是theta,n維列向量,二維數組
 35     for i in range(maxCycles):  36         h = sigmoid(dataMatrix*weights)  #計算所有數據的分類概率,h是m維向量,這里實際上進行了300次乘法運算
 37         error = labelMat - h #計算(y-h(x)):誤差
 38         weights = weights + alpha * dataMatrix.transpose()*error #對所有weights同時更新
 39     print('shape of weights', shape(weights))  40     return weights  41 
 42 #隨機上升上升
 43 def stocGradAscent0(dataMatrix, classLabels):  44     m,n = shape(dataMatrix)  45     alpha = 0.01
 46     weights = ones(n, float) #n 維行向量
 47     for i in range(m):  48         h = sigmoid(sum(dataMatrix[i] * weights))  49         error = classLabels[i] - h  50         weights = weights + alpha * error * dataMatrix[int(i)]  51     return weights  52 
 53 #改進的隨機上升上升
 54 def stocGradAscent1(dataMatrix, classLabels, numIter = 550):  55     m,n = shape(dataMatrix)  56     weights = ones(n)  57     dataIndex = []  58     for j in range(numIter):  59         for k in range(m):  60  dataIndex.append(k)  61         for i in range(m):  62             alpha = 4 / (i + j + 1.0) + 0.01 #aloha隨着迭代次數增多不斷減小但是不為0,為了解決局部波動
 63             randIndex = int(random.uniform(0, len(dataIndex))) #隨機選取一個數據進行更新參數
 64             h = sigmoid(dataMatrix[randIndex] * weights)  65             error = classLabels[randIndex] - h  66             weights = weights + alpha * error * dataMatrix[randIndex]  67             del(dataIndex[randIndex]) #這里會不會對本來的dataMatrix有影響,不會,因為沒有操作矩陣
 68     return weights  69 
 70 '''
 71 functuion: 牛頓法更新theta,計算海森矩陣  72 note: 這里一定要對 XT*X 結果求逆,不然分類效果無法直視  73 '''
 74 def computeHessianMatrix(dataMatrix):  75     hessianMatrix = mat(dataMatrix).transpose().dot(mat(dataMatrix))  76     return hessianMatrix.I #矩陣求逆
 77 
 78 '''
 79 function: 牛頓法更新theta  80 '''
 81 def newtonMethod(dataMat, labelMat, numIter = 10):  82     m,n = shape(dataMat)  83     dataMatrix = mat(dataMat)  84     labelMat = mat(labelMat).transpose()  85     weights = ones((n,1)) #參數向量
 86     hessianMatrix = computeHessianMatrix(dataMatrix)  87     print('shape of hessian', shape(hessianMatrix))  88     for k in range(numIter):  89         h = sigmoid(dataMatrix * weights)  90         error = (labelMat - h)  91         weights = weights - (dataMatrix*hessianMatrix).transpose() * error  92     return weights  93 
 94 #畫出決策邊界
 95 def plotBestFit(weights):  96     import matplotlib.pyplot as plt  97     dataMat, labelMat = loadDataSet() #或取數據和標簽
 98     dataArr = array(dataMat) #將二維列表轉換為數組
 99     n = shape(dataArr)[0] #行數,代表數據的個數
100 
101     xcord1 = []; ycord1 = [] 102     xcord2 = []; ycord2 = [] 103     for i in range(n): 104         if int(labelMat[i]) == 1: 105             xcord1.append(dataArr[i,1]); ycord1.append(dataArr[i,2]) 106         else: 107             xcord2.append(dataArr[i,1]); ycord2.append(dataArr[i,2]) 108 
109     fig = plt.figure() 110     ax = fig.add_subplot(111) 111     ax.scatter(xcord1, ycord1, s = 30, c = 'red', marker = 's') 112     ax.scatter(xcord2, ycord2, s = 30, c = 'green') 113     x = arange(-3.0, 3.0, 0.1) 114     print(len(x)) 115     print(len(weights)) 116     print(shape(weights)) 117     y = (-weights[0] -  weights[1]*x) / weights[2] #把x2堪稱y,x0=1,所以就是x1,x2之間的函數
118     print(len(y)) 119  ax.plot(x,y) 120     plt.xlabel('X1'); plt.ylabel('X2') 121     plt.show() #顯示圖像
122 
123 '''
124 function: 對測試數據進行測試 125 input: testDate 是一個向量,或者多個向量 126 '''
127 def logisticTest(weights, testData): 128     m,n = shape(testData) 129     typeLabel = [] 130     for i in range(m): 131         result = sigmoid(sum(testData[i] * weights)) #得到一個冊數數據的概率
132         if result > 0.5: 133             typeLabel.append(1) 134         else: 135  typeLabel.append(0) 136     return typeLabel 137 
138 '''
139 function: 計算分類的正確率 140 input: calssLables 是測試數據的類別向量 141 '''
142 def getCorrectRate(classLabels, testLabel): 143     correctRate = 0.0
144     numOfRight = 0 145     for i in range(len(testLabel)): 146         if classLabels[i] == testLabel[i]: 147             numOfRight += 1
148     correctRate = numOfRight / float(len(testLabel)) 149     return correctRate 150 
151 #測試算法
152 dataMat, labelMat = loadDataSet() 153 print('shape of datamet, ', shape(dataMat)) 154 weights1 = gradAscent(dataMat, labelMat) 155 weights2 = stocGradAscent0(array(dataMat), labelMat) 156 weights3 = stocGradAscent1(dataMat, labelMat) 157 
158 weights4 = newtonMethod(dataMat, labelMat, 2000) #牛頓法
159 
160 
161 #測試正確率
162 typeLabel = logisticTest(weights4, [[1,1,0],[1,1,1]]) 163 print(typeLabel) 164 correctRate = getCorrectRate([1,0],typeLabel) 165 print('正確率: ' ,str(correctRate)) 166 
167 #畫圖
168 #plotBestFit(array(weights1)) #這里因為普通梯度上升返回的是一個矩陣,所以要轉換為向量
169 #plotBestFit(weights2)
170 #plotBestFit(weights3)
171 plotBestFit(array(weights4))

 

# -*- coding: utf-8 -*-
'''function: 實現Logistic回歸,擬合直線,對數據進行分類;利用梯度上升,隨機梯度上升,改進的隨機梯度上升,牛頓法分別對損失函數優化;這里沒有給出最后測試分類的函數;date: 2017.8.12'''
from numpy import *
#從文件加載處理數據def loadDataSet():dataMat = []labelMat = []fr = open('testSet.txt')for line in fr.readlines():lineArr = line.strip().split()dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])labelMat.append(int(lineArr[2]))return dataMat, labelMat
#sigmoid function X: w1*x1+w2*x2+...+wn*xndef sigmoid(x):return 1 / (1 + exp(-x))
#梯度上升求函數的最大值時取得的權重def gradAscent(dataMatIn, classLabels):dataMatrix = mat(dataMatIn)labelMat = mat(classLabels).transpose()   #將m維行向量轉制為m維列向量m,n = shape(dataMatrix)alpha = 0.001 #設置梯度上升的步長maxCycles = 500 #最大迭代次數 weights = ones((n,1))  #weights就是theta,n維列向量,二維數組for i in range(maxCycles):h = sigmoid(dataMatrix*weights)  #計算所有數據的分類概率,h是m維向量,這里實際上進行了300次乘法運算error = labelMat - h #計算(y-h(x)):誤差weights = weights + alpha * dataMatrix.transpose()*error #對所有weights同時更新print('shape of weights', shape(weights))return weights
#隨機上升上升def stocGradAscent0(dataMatrix, classLabels):m,n = shape(dataMatrix)alpha = 0.01weights = ones(n, float) #n 維行向量for i in range(m):h = sigmoid(sum(dataMatrix[i] * weights))error = classLabels[i] - hweights = weights + alpha * error * dataMatrix[int(i)]return weights
#改進的隨機上升上升def stocGradAscent1(dataMatrix, classLabels, numIter = 550):m,n = shape(dataMatrix)weights = ones(n)dataIndex = []for j in range(numIter):for k in range(m):dataIndex.append(k)for i in range(m):alpha = 4 / (i + j + 1.0) + 0.01 #aloha隨着迭代次數增多不斷減小但是不為0,為了解決局部波動randIndex = int(random.uniform(0, len(dataIndex))) #隨機選取一個數據進行更新參數h = sigmoid(dataMatrix[randIndex] * weights)error = classLabels[randIndex] - hweights = weights + alpha * error * dataMatrix[randIndex]del(dataIndex[randIndex]) #這里會不會對本來的dataMatrix有影響,不會,因為沒有操作矩陣return weights
'''functuion: 牛頓法更新theta,計算海森矩陣note: 這里一定要對 XT*X 結果求逆,不然分類效果無法直視'''def computeHessianMatrix(dataMatrix):hessianMatrix = mat(dataMatrix).transpose().dot(mat(dataMatrix))return hessianMatrix.I #矩陣求逆
'''function: 牛頓法更新theta'''def newtonMethod(dataMat, labelMat, numIter = 10):m,n = shape(dataMat)dataMatrix = mat(dataMat)labelMat = mat(labelMat).transpose()weights = ones((n,1)) #參數向量hessianMatrix = computeHessianMatrix(dataMatrix)print('shape of hessian', shape(hessianMatrix))for k in range(numIter):h = sigmoid(dataMatrix * weights)error = (labelMat - h)weights = weights - (dataMatrix*hessianMatrix).transpose() * error return weights
#畫出決策邊界def plotBestFit(weights):import matplotlib.pyplot as pltdataMat, labelMat = loadDataSet() #或取數據和標簽dataArr = array(dataMat) #將二維列表轉換為數組n = shape(dataArr)[0] #行數,代表數據的個數
xcord1 = []; ycord1 = []xcord2 = []; ycord2 = []for i in range(n):if int(labelMat[i]) == 1:xcord1.append(dataArr[i,1]); ycord1.append(dataArr[i,2])else:xcord2.append(dataArr[i,1]); ycord2.append(dataArr[i,2])
fig = plt.figure()ax = fig.add_subplot(111)ax.scatter(xcord1, ycord1, s = 30, c = 'red', marker = 's')ax.scatter(xcord2, ycord2, s = 30, c = 'green')x = arange(-3.0, 3.0, 0.1)print(len(x))print(len(weights))print(shape(weights))y = (-weights[0] -  weights[1]*x) / weights[2] #把x2堪稱y,x0=1,所以就是x1,x2之間的函數print(len(y))ax.plot(x,y) plt.xlabel('X1'); plt.ylabel('X2')plt.show() #顯示圖像
'''function: 對測試數據進行測試input: testDate 是一個向量,或者多個向量'''def logisticTest(weights, testData):m,n = shape(testData)typeLabel = []for i in range(m): result = sigmoid(sum(testData[i] * weights)) #得到一個冊數數據的概率if result > 0.5:typeLabel.append(1)else:typeLabel.append(0)return typeLabel
'''function: 計算分類的正確率input: calssLables 是測試數據的類別向量'''def getCorrectRate(classLabels, testLabel):correctRate = 0.0numOfRight = 0for i in range(len(testLabel)):if classLabels[i] == testLabel[i]:numOfRight += 1correctRate = numOfRight / float(len(testLabel))return correctRate
#測試算法dataMat, labelMat = loadDataSet()print('shape of datamet, ', shape(dataMat))weights1 = gradAscent(dataMat, labelMat)weights2 = stocGradAscent0(array(dataMat), labelMat)weights3 = stocGradAscent1(dataMat, labelMat)
weights4 = newtonMethod(dataMat, labelMat, 2000) #牛頓法

#測試正確率typeLabel = logisticTest(weights4, [[1,1,0],[1,1,1]])print(typeLabel)correctRate = getCorrectRate([1,0],typeLabel)print('正確率: ' ,str(correctRate))
#畫圖#plotBestFit(array(weights1)) #這里因為普通梯度上升返回的是一個矩陣,所以要轉換為向量#plotBestFit(weights2)#plotBestFit(weights3)plotBestFit(array(weights4))


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM