Linear Regression
Dataset: housing-price data; the first column is the area, the second column is the number of rooms, and the third column is the price.
Link: https://pan.baidu.com/s/1VCtTde2vb3wOPG5dGfmucg&shfl=sharepset  Extraction code: 3ekz
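Each row of data.txt holds one example in comma-separated order: area, number of rooms, price. For illustration only (these rows are made up, not taken from the actual file), the file looks like:

2104,3,399900
1600,3,329900
2400,3,369000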
Load the libraries
import numpy as np
import pandas as pd                   # pandas can be used to inspect the data as a DataFrame
from matplotlib import pyplot as plt
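As a quick sanity check before training, pandas can be used to look at the raw data as a DataFrame. This is only a sketch, reusing the imports above; the file name data.txt and the column names are assumptions:

df = pd.read_csv("data.txt", header=None, names=["area", "rooms", "price"])
print(df.head())        # first few examples
print(df.describe())    # per-column statistics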
Load the data file
def loadtxtAndcsv(fileName, split, datatype):
    return np.loadtxt(fileName, delimiter=split, dtype=datatype)

def loadnpy(fileName):
    return np.load(fileName)
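For example (assuming data.txt sits in the working directory), the loader above is used as:

data = loadtxtAndcsv("data.txt", ",", np.float64)   # shape (m, 3): area, rooms, price
print(data.shape)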
Feature normalization: rescale the features so that columns of the same dataset with very different ranges do not dominate gradient descent.
def featureNormalize(X):
    X_norm = np.array(X)                  # convert X to a NumPy array
    mu = np.zeros((1, X.shape[1]))
    sigma = np.zeros((1, X.shape[1]))

    mu = np.mean(X_norm, 0)               # column-wise mean (axis 0 = columns, 1 = rows)
    sigma = np.std(X_norm, 0)             # column-wise standard deviation
    for i in range(X.shape[1]):
        X_norm[:, i] = (X_norm[:, i] - mu[i]) / sigma[i]

    return X_norm, mu, sigma
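Each column i is rescaled as (x_i - mu_i) / sigma_i, so after normalization every column should have mean close to 0 and standard deviation close to 1. A quick check, reusing the data loaded above:

X_norm, mu, sigma = featureNormalize(data[:, 0:-1])
print(np.mean(X_norm, 0))   # expected: roughly 0 for each column
print(np.std(X_norm, 0))    # expected: roughly 1 for each column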
Plotting
def plot_X(X):
    plt.scatter(X[:, 0], X[:, 1])
    plt.show()

def plotJ(J_history, num_iters):
    x = np.arange(1, num_iters + 1)
    plt.plot(x, J_history)
    plt.xlabel("Number of iterations")
    plt.ylabel("Value of costFunction")
    plt.title("The change of Value over the Number")
    plt.show()
Cost function and gradient descent
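In vector form, the code below implements the squared-error cost and the batch gradient-descent update:

J(\theta) = \frac{1}{2m}\,(X\theta - y)^{T}(X\theta - y)

\theta := \theta - \frac{\alpha}{m}\,X^{T}(X\theta - y)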
def computerCost(X, y, theta):
    m = len(y)
    error = np.dot(X, theta) - y                        # prediction error, shape (m, 1)
    J = np.dot(np.transpose(error), error) / (2 * m)    # squared-error cost
    return J

def gradientDescent(X, y, theta, alpha, num_iters):
    m = len(y)
    n = len(theta)
    temp = np.matrix(np.zeros((n, num_iters)))          # store theta from every iteration, in matrix form
    J_history = np.zeros((num_iters, 1))                # cost after every iteration

    for i in range(num_iters):
        h = np.dot(X, theta)                            # hypothesis h = X * theta
        temp[:, i] = theta - (alpha / m) * np.dot(np.transpose(X), h - y)   # gradient step
        theta = temp[:, i]
        J_history[i] = computerCost(X, y, theta)
        print('.', end=' ')
    return theta, J_history
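As a sanity check on gradient descent (a sketch, not part of the original code), the normal equation gives the exact least-squares solution and should roughly agree with the learned theta once the iterations have converged; X and y here are the design matrix and column-vector target prepared in linearRegression below:

def normalEquation(X, y):
    # theta = (X^T X)^(-1) X^T y; pinv copes with a singular X^T X
    return np.dot(np.linalg.pinv(np.dot(np.transpose(X), X)), np.dot(np.transpose(X), y))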
Main routine and prediction function
def linearRegression(alpha=0.01, num_iters=400):
    print(u"Loading data...\n")
    data = loadtxtAndcsv("data.txt", ",", np.float64)
    X = data[:, 0:-1]                    # features: area and number of rooms
    y = data[:, -1]                      # target: price
    plot_X(X)                            # scatter plot of the raw features
    m = len(y)                           # number of training examples
    col = data.shape[1]                  # number of columns in data

    X, mu, sigma = featureNormalize(X)   # normalize the features
    plot_X(X)                            # scatter plot of the normalized features

    '''
    np.vstack(): stack arrays vertically
    np.hstack(): stack arrays horizontally
    '''
    X = np.hstack((np.ones((m, 1)), X))  # prepend a column of ones for the intercept term

    print(u"\nRunning gradient descent...\n")

    theta = np.zeros((col, 1))
    y = y.reshape(-1, 1)                 # reshape y into a column vector
    theta, J_history = gradientDescent(X, y, theta, alpha, num_iters)

    plotJ(J_history, num_iters)

    return mu, sigma, theta

def predict(mu, sigma, theta):
    result = 0
    predict = np.array([2000, 3])        # a 2000-square-foot house with 3 rooms
    norm_predict = (predict - mu) / sigma
    final_predict = np.hstack((np.ones((1)), norm_predict))

    result = np.dot(final_predict, theta)
    return result

def testlinearRegression():
    mu, sigma, theta = linearRegression(0.01, 400)
    print(predict(mu, sigma, theta))

if __name__ == "__main__":
    testlinearRegression()
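predict() hard-codes a single 2000-square-foot, 3-room house. A slightly more general helper (hypothetical, not part of the original code) normalizes an arbitrary feature vector with the same mu and sigma before applying theta:

def predictHouse(features, mu, sigma, theta):
    # features: e.g. [2000, 3] -> [area, number of rooms]
    norm = (np.asarray(features, dtype=np.float64) - mu) / sigma
    return np.dot(np.hstack((np.ones(1), norm)), theta)

# example call:
# mu, sigma, theta = linearRegression(0.01, 400)
# print(predictHouse([1650, 3], mu, sigma, theta))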