# Logistic regression is a binary-classification algorithm, but with some adaptations it can also handle multi-class tasks.
# It maps a sample's features to a probability of the positive class and compares that probability with a threshold to decide between 0 and 1, which is why "regression" appears in its name.
import numpy as np
import matplotlib.pyplot as plt
# Define the sigmoid function, which maps a linear score to a probability in (0, 1)
def sigmoid(t):
    return 1/(1+np.exp(-t))
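# A numerically safer variant (an optional sketch, not used below): for large negative t,
# np.exp(-t) can overflow, so the two sign branches are handled separately.
def sigmoid_stable(t):
    t=np.asarray(t,dtype=float)
    out=np.empty_like(t)
    pos=t>=0
    out[pos]=1/(1+np.exp(-t[pos]))
    exp_t=np.exp(t[~pos])
    out[~pos]=exp_t/(1+exp_t)
    return out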
x=np.linspace(-10,10,100)
y=sigmoid(x)
plt.figure()
plt.plot(x,y,"r",label="Sigmoid")
plt.legend(loc=2)
plt.show()
# Define the loss function of logistic regression (cross-entropy); its structure is analogous to the mean squared error used in linear regression
# Low-level implementation of the maths behind logistic regression
def J1(theta,x_b,y): # cross-entropy loss (negative mean log-likelihood)
    y_hat=sigmoid(x_b.dot(theta))
    return -np.sum(y*np.log(y_hat)+(1-y)*np.log(1-y_hat))/len(x_b)
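# Optional sanity check (a minimal sketch on hand-made toy values): J1 should agree with
# sklearn's mean cross-entropy log_loss, up to its tiny internal probability clipping.
from sklearn.metrics import log_loss
_xb=np.array([[1.,0.5],[1.,-1.2],[1.,2.0]])  # toy design matrix with an intercept column
_y=np.array([1,0,1])
_t=np.array([0.1,0.3])
print(J1(_t,_xb,_y), log_loss(_y,sigmoid(_xb.dot(_t))))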
def DJ2(theta,x_b,y): # element-by-element version of the gradient (kept for reference)
    res=np.empty(len(theta))
    res[0]=np.sum(sigmoid(x_b.dot(theta))-y)
    for i in range(1,len(theta)):
        res[i]=np.sum((sigmoid(x_b.dot(theta))-y).dot(x_b[:,i]))
    return res/len(x_b)
def DJ1(theta, x_b, y): # vectorised gradient of the cross-entropy loss
    return x_b.T.dot(sigmoid(x_b.dot(theta))-y)/len(y)
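# Optional sanity check (a sketch): DJ1 should agree with a central-difference
# numerical gradient of J1, e.g. print(DJ1(_t,_xb,_y), numerical_gradient(_t,_xb,_y))
def numerical_gradient(theta,x_b,y,eps=1e-6):
    grad=np.empty(len(theta))
    for j in range(len(theta)):
        t_plus,t_minus=theta.copy(),theta.copy()
        t_plus[j]+=eps
        t_minus[j]-=eps
        grad[j]=(J1(t_plus,x_b,y)-J1(t_minus,x_b,y))/(2*eps)
    return grad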
def gradient_descent1(x_b,y,eta,theta_initial,erro=1e-8, n=1e5): # batch gradient descent to find the minimum of the loss function
    theta=theta_initial
    i=0
    while i<n:
        gradient = DJ1(theta,x_b,y)
        last_theta = theta
        theta = theta - gradient * eta
        if (abs(J1(theta,x_b,y) - J1(last_theta,x_b,y)))<erro:
            break
        i+=1
    return theta
# Validate the from-scratch implementation on the iris dataset
from sklearn import datasets
d=datasets.load_iris()
x=d.data
y=d.target
x=x[y<2,:2] # keep only the first two features so the data can be plotted in the plane
y=y[y<2] # basic logistic regression is a binary classifier, so keep only classes 0 and 1
print(x)
print(y)
plt.figure()
plt.scatter(x[y==0,0],x[y==0,1],color="r")
plt.scatter(x[y==1,0],x[y==1,1],color="g")
plt.show()
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=1)
x_b=np.hstack([np.ones((len(x_train),1)),x_train])
print(x_b)
theta0=np.zeros(x_b.shape[1])
eta=0.1
theta1=gradient_descent1(x_b,y_train,eta,theta0)
print(theta1)
from sklearn.metrics import accuracy_score
x_b=np.hstack([np.ones((len(x_test),1)),x_test])
y_hat=sigmoid(x_b.dot(theta1))
print(y_hat)
p=np.array(y_hat>0.5,dtype="int") # predict 1 when the probability exceeds 0.5, otherwise 0
print(p) # predicted labels
print(y_test) # true labels
print(accuracy_score(y_test,p)) # prediction accuracy
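# Equivalent by hand: accuracy is simply the fraction of matching labels.
print(np.mean(p==y_test))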
# Decision boundary for the two-feature data: sigmoid(theta.x)=0.5 exactly where
# theta0+theta1*x1+theta2*x2=0, so the boundary is the line x2=-(theta0+theta1*x1)/theta2
def f(x):
    return (-theta1[1]*x-theta1[0])/theta1[2]
x1=np.linspace(4,7.5,100)
plt.plot(x1,f(x1))
plt.scatter(x[y==0,0],x[y==0,1],color="r")
plt.scatter(x[y==1,0],x[y==1,1],color="g")
plt.show()
# Helper that plots the decision boundary of a fitted classifier over a rectangular region
def plot_decision_boundary(model,axis):
    x0,x1=np.meshgrid(
        np.linspace(axis[0],axis[1],int((axis[1]-axis[0])*100)).reshape(-1,1),
        np.linspace(axis[2],axis[3],int((axis[3]-axis[2])*100)).reshape(-1,1)
    )
    x_new=np.c_[x0.ravel(),x1.ravel()]
    y_pre=model.predict(x_new)
    zz=y_pre.reshape(x0.shape)
    from matplotlib.colors import ListedColormap
    cus=ListedColormap(["#EF9A9A","#FFF59D","#90CAF9"])
    plt.contourf(x0,x1,zz,cmap=cus)
# Show the decision boundary of a KNN classifier (two features, so it can be drawn in the plane)
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=666)
from sklearn.neighbors import KNeighborsClassifier
knn1=KNeighborsClassifier() # the default k (5) is fairly small, so the model is relatively complex
knn1.fit(x_train,y_train)
plot_decision_boundary(knn1,axis=[4,8,1,5])
plt.scatter(x[y==0,0],x[y==0,1],color="r")
plt.scatter(x[y==1,0],x[y==1,1],color="g")
plt.show()
knn2=KNeighborsClassifier(n_neighbors=3) # the larger k is, the simpler the model, the less it overfits and the smoother the decision boundary
knn2.fit(d.data[:,:2],d.target)
x=d.data
y=d.target
plot_decision_boundary(knn2,axis=[4,8,1,5])
plt.scatter(x[y==0,0],x[y==0,1],color="r")
plt.scatter(x[y==1,0],x[y==1,1],color="g")
plt.scatter(x[y==2,0],x[y==2,1],color="b")
plt.show()
knn2=KNeighborsClassifier(n_neighbors=50) # with a much larger k the model is simpler, overfits less and has a smoother decision boundary
knn2.fit(d.data[:,:2],d.target)
x=d.data
y=d.target
plot_decision_boundary(knn2,axis=[4,8,1,5])
plt.scatter(x[y==0,0],x[y==0,1],color="r")
plt.scatter(x[y==1,0],x[y==1,1],color="g")
plt.scatter(x[y==2,0],x[y==2,1],color="b")
plt.show()
# Using logistic regression from sklearn
# 1-1 Plain logistic regression
x=np.random.normal(0,1,size=(200,2)) # synthetic data
y=np.array(x[:,0]**2+x[:,1]**2<1.5,dtype="int")
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=666)
from sklearn.linear_model import LogisticRegression
log=LogisticRegression()
log.fit(x_train,y_train)
print(log.score(x_test,y_test))
knn3=KNeighborsClassifier()
knn3.fit(x_train,y_train)
print(knn3.score(x_test,y_test))
plot_decision_boundary(log,axis=[-4,4,-4,4])
plt.scatter(x[y==0,0],x[y==0,1],color="r")
plt.scatter(x[y==1,0],x[y==1,1],color="g")
plt.show()
plot_decision_boundary(knn3,axis=[-4,4,-4,4])
plt.scatter(x[y==0,0],x[y==0,1],color="r")
plt.scatter(x[y==1,0],x[y==1,1],color="g")
plt.show()
# 1-2 Logistic regression in sklearn with polynomial features (regularization left at the sklearn default); a Pipeline chains the preprocessing steps and the model
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
def polynomiallogisticregression(degree):
    return Pipeline([
        ("poly",PolynomialFeatures(degree=degree)),
        ("std_reg",StandardScaler()),
        ("log_reg",LogisticRegression())
    ])
x=np.random.normal(0,1,size=(200,2))
y=np.array(x[:,0]**2+x[:,1]<1.5,dtype="int")
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=666)
p1=polynomiallogisticregression(degree=2)
p1.fit(x_train,y_train)
print(p1.score(x_train,y_train))
print(p1.score(x_test,y_test))
plot_decision_boundary(p1,axis=[-4,4,-4,4])
plt.scatter(x[y==0,0],x[y==0,1],color="r")
plt.scatter(x[y==1,0],x[y==1,1],color="g")
plt.show()
p1=polynomiallogisticregression(degree=20) # with a high polynomial degree the model overfits and the decision boundary becomes very convoluted
p1.fit(x_train,y_train)
print(p1.score(x_train,y_train))
print(p1.score(x_test,y_test))
plot_decision_boundary(p1,axis=[-4,4,-4,4])
plt.scatter(x[y==0,0],x[y==0,1],color="r")
plt.scatter(x[y==1,0],x[y==1,1],color="g")
plt.show()
# 1-3 Regularised logistic regression: besides lowering the polynomial degree, tune the regularization strength C and the penalty type to improve generalisation
x=np.random.normal(0,1,size=(200,2)) # synthetic data
y=np.array(x[:,0]**2+x[:,1]<1.5,dtype="int")
# flip up to 20 randomly chosen labels to 1 so the data contains some noise
for _ in range(20):
    y[np.random.randint(200)]=1
def Polynomiallogisticregression(degree,C,penalty): # three key hyperparameters: polynomial degree, regularization strength C, penalty type l1/l2
    return Pipeline([
        ("poly",PolynomialFeatures(degree=degree)),
        ("std_reg",StandardScaler()),
        ("log_reg",LogisticRegression(C=C,penalty=penalty,solver="liblinear")) # liblinear supports both l1 and l2 penalties
    ])
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=666) # re-split so the model is trained on the noisy data
p1=Polynomiallogisticregression(degree=20,C=1,penalty="l1") # degree 20 alone would overfit; the l1 penalty keeps the model in check
p1.fit(x_train,y_train)
print(p1.score(x_train,y_train))
print(p1.score(x_test,y_test))
plot_decision_boundary(p1,axis=[-4,4,-4,4])
plt.scatter(x[y==0,0],x[y==0,1],color="r")
plt.scatter(x[y==1,0],x[y==1,1],color="g")
plt.show()
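# A small comparison sketch (the C values are chosen purely for illustration): sweep the
# regularization strength and the penalty type and watch how the train/test gap changes.
for pen in ["l1","l2"]:
    for C in [0.1,1,10]:
        p_tmp=Polynomiallogisticregression(degree=20,C=C,penalty=pen)
        p_tmp.fit(x_train,y_train)
        print(pen,C,p_tmp.score(x_train,y_train),p_tmp.score(x_test,y_test))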
# Multi-class strategies: OVR (n binary classifiers) and OVO (C(n,2) binary classifiers)
# sklearn's LogisticRegression handles multi-class problems directly; older versions default to the OVR scheme (newer versions pick the multinomial formulation automatically via multi_class="auto")
from sklearn import datasets
d=datasets.load_iris()
x=d.data
y=d.target
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=1)
# OVR-style multi-class classification: training is faster but accuracy tends to be slightly lower
log1=LogisticRegression()
log1.fit(x_train,y_train)
print(log1.score(x_test,y_test))
# multi_class="multinomial" with a compatible solver such as newton-cg uses the softmax (multinomial) formulation instead of OVR; it is usually a little more accurate but slower to train
log2=LogisticRegression(multi_class="multinomial",solver="newton-cg")
log2.fit(x_train,y_train)
print(log2.score(x_test,y_test))
# OVR and OVO wrappers in sklearn
# sklearn provides generic OneVsRestClassifier and OneVsOneClassifier wrappers that turn any binary classifier into a multi-class one
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
log_reg=LogisticRegression() # 1-1 define a binary classifier
ovr=OneVsRestClassifier(log_reg) # 1-2 wrap it with the OVR strategy
ovo=OneVsOneClassifier(log_reg) # 1-2 wrap it with the OVO strategy
ovr.fit(x_train,y_train) # 1-3 fit and evaluate
print(ovr.score(x_test,y_test))
ovo.fit(x_train,y_train)
print(ovo.score(x_test,y_test))
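# Illustrative sketch of the OVR idea itself (not sklearn's exact internals): train one
# "class k vs. rest" binary classifier per class and predict the class whose classifier
# reports the highest probability.
probas=[]
for k in np.unique(y_train):
    clf=LogisticRegression()
    clf.fit(x_train,(y_train==k).astype(int))
    probas.append(clf.predict_proba(x_test)[:,1])
manual_pred=np.argmax(np.vstack(probas),axis=0)
print(accuracy_score(y_test,manual_pred))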