Python implementation of the BGD, SGD, mini-batch GD, Newton, BFGS, and L-BFGS optimization algorithms
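For reference (standard textbook formulas restated here, not part of the original script): every method below minimizes the same least-squares cost, and the update rules they implement are

\[
J(\theta)=\tfrac12\lVert X\theta-y\rVert^2,\qquad \nabla J(\theta)=X^\top(X\theta-y),
\]
\[
\theta \leftarrow \theta-\alpha\,\nabla J(\theta)\ \text{(BGD/SGD/mini-batch)},\qquad
\theta \leftarrow \theta-\alpha\,(X^\top X)^{-1}\nabla J(\theta)\ \text{(Newton)},\qquad
d_k=-H_k\nabla J(\theta_k)\ \text{(DFP/BFGS)},
\]

where DFP and BFGS update \(H_k\) from \(s_k=\theta_{k+1}-\theta_k\) and \(y_k=\nabla J(\theta_{k+1})-\nabla J(\theta_k)\), and L-BFGS rebuilds the same direction from only the last few \((s_k, y_k)\) pairs via the two-loop recursion instead of storing \(H_k\).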
# coding=utf-8
import numpy as np
import os


def X3(a, b, c):
    # helper: product of three matrices a * b * c
    return np.dot(np.dot(a, b), c)


def X2(a, b):
    # helper: product of two matrices a * b
    return np.dot(a, b)


def get_data(obj_path_name):
    pro_path = os.path.abspath('.')
    data_path = str(pro_path + obj_path_name)
    print(data_path)
    data = np.loadtxt(data_path)
    # prepend a column of ones as the bias term; the last column is the target
    x = np.asarray(np.column_stack((np.ones((data.shape[0], 1)), data[:, 0:-1])), dtype='double')
    y = data[:, -1]
    y = np.reshape(y, (data.shape[0], 1))
    print(x.shape, y.shape)
    return x, y


def inverse_hessian_mat(x):
    # compute the Hessian matrix of the least-squares cost and invert it
    x_hessian = np.dot(x.T, x)
    inverse_hessian = np.linalg.inv(x_hessian)
    return inverse_hessian


def get_e(x, theta, y):
    # residual vector: predictions minus targets
    y_pre = np.dot(x, theta)
    e = y_pre - y
    return e


def compute_grad(x, e, stand_flag=0):
    # the batch methods require the gradient to be normalized (stand_flag=1)
    grad = np.dot(e.T, x)
    # grad = np.dot(e.T, x) / x.shape[0]
    if stand_flag == 1:
        grad = grad / (np.dot(grad, grad.T) ** 0.5)
    return np.reshape(grad, (x.shape[1], 1))


def get_cost(e):
    # cost at the current theta: sum of squared residuals / 2
    cost = np.dot(e.T, e) / 2
    # cost = np.dot(e.T, e) / (2 * e.shape[0])
    return cost


def get_cost_l12(e, theta, m, l=1, lmd=10e10):
    # cost with an optional L1 (l=1) or L2 (l=2) penalty
    if l == 1:
        cost = (np.dot(e.T, e) + lmd * sum(abs(theta))) / (2 * m)
    elif l == 2:
        cost = (np.dot(e.T, e) + lmd * np.dot(theta.T, theta)) / (2 * m)
    else:
        cost = np.dot(e.T, e) / (2 * m)
    return cost


def update_batch_theta(theta, grad, alpha):
    theta = theta - alpha * grad
    return theta


def update_batch_theta_l12(theta, grad, alpha, m, l=1, lmd=10e10):
    # gradient step with an optional L1 (l=1) or L2 (l=2) penalty term
    if l == 1:
        theta = theta - alpha * (grad + (lmd / m) * theta)
    elif l == 2:
        theta = theta - alpha * (grad + (lmd / m))
    else:
        theta = theta - alpha * grad
    return theta


def iter_batch(x, theta, y, out_n, out_e_reduce_rate, alpha):
    step = 1
    while step < out_n:
        """compute the initial cost"""
        if step == 1:
            e = get_e(x, theta, y)
            cost_0 = get_cost(e)
        """compute grad at the current theta"""
        grad = compute_grad(x, e, stand_flag=1)
        """update theta along grad"""
        theta = update_batch_theta(theta, grad, alpha)
        """compute the new cost"""
        e = get_e(x, theta, y)
        cost_1 = get_cost(e)
        e_reduce_rate = abs(cost_1 - cost_0) / cost_0
        print('Step: %-6s, cost: %s' % (step, cost_1[0, 0]))
        if e_reduce_rate < out_e_reduce_rate:
            print('Finish!\n')
            break
        else:
            cost_0 = cost_1
            step += 1
    return theta


def iter_random_batch(x, theta, y, out_n, out_e_reduce_rate, alpha):
    # SGD: update theta one sample at a time
    step = 0
    while step < out_n:
        step += 1
        for i in range(x.shape[0]):
            x_i = np.reshape(x[i, ], (1, x.shape[1]))
            y_i = np.reshape(y[i, ], (1, 1))
            """compute the initial cost"""
            e_0 = get_e(x, theta, y)
            cost_0 = get_cost(e_0)
            """compute grad at the current theta using a single sample"""
            e_i = get_e(x_i, theta, y_i)
            grad = compute_grad(x_i, e_i, stand_flag=1)
            """update theta along grad"""
            theta = update_batch_theta(theta, grad, alpha)
            """compute the new cost"""
            e_1 = get_e(x, theta, y)
            cost_1 = get_cost(e_1)
            e_reduce_rate = abs(cost_1 - cost_0) / cost_0
            if e_reduce_rate < out_e_reduce_rate:
                print('Finish!\n')
                step = out_n + 1
                break
        print('Step: %-6s, cost: %s' % (step, cost_1[0, 0]))
    return theta


def iter_mini_batch(x, theta, y, out_n, out_e_reduce_rate, alpha, batch):
    # mini-batch gradient descent
    batch_n = x.shape[0] // batch
    batch_left_n = x.shape[0] % batch
    step = 0
    while step < out_n:
        step += 1
        for i in range(batch_n + 1):
            """compute the initial cost"""
            e_0 = get_e(x, theta, y)
            cost_0 = get_cost(e_0)
            """pick the batch samples used to compute the gradient"""
            if i <= batch_n - 1:
                start = i * batch
                x_i = np.reshape(x[start:start + batch, ], (batch, x.shape[1]))
                y_i = np.reshape(y[start:start + batch, ], (batch, 1))
            else:
                if batch_left_n != 0:
                    x_i = np.reshape(x[-1 * batch_left_n:, ], (batch_left_n, x.shape[1]))
                    y_i = np.reshape(y[-1 * batch_left_n:, ], (batch_left_n, 1))
                else:
                    # no leftover samples: skip the extra pass
                    continue
            """compute grad at the current theta using the batch samples"""
            e_i = get_e(x_i, theta, y_i)
            grad = compute_grad(x_i, e_i, stand_flag=1)
            """update theta along grad"""
            theta = update_batch_theta(theta, grad, alpha)
            """compute the new cost"""
            e_1 = get_e(x, theta, y)
            cost_1 = get_cost(e_1)
            e_reduce_rate = abs(cost_1 - cost_0) / cost_0
            if e_reduce_rate < out_e_reduce_rate:
                print('Finish!\n')
                step = out_n
                break
        print('Step: %-6s, cost: %s' % (step, cost_1[0, 0]))
    return theta


def update_newton_theta(x_hessian_inv, theta, grad, alpha):
    theta = theta - alpha * np.dot(x_hessian_inv, grad)
    return theta


def iter_newton(x, theta, y, out_n, out_e_reduce_rate, alpha):
    e = get_e(x, theta, y)
    cost_0 = get_cost(e)
    x_hessian_inv = inverse_hessian_mat(x)
    step = 1
    while step < out_n:
        grad = compute_grad(x, e)
        theta = update_newton_theta(x_hessian_inv, theta, grad, alpha)
        e = get_e(x, theta, y)
        cost_1 = get_cost(e)
        print('Step: %-6s, cost: %s' % (step, cost_1[0, 0]))
        e_reduce_rate = abs(cost_1 - cost_0) / cost_0
        if e_reduce_rate < out_e_reduce_rate:
            print('Finish!\n')
            break
        else:
            cost_0 = cost_1
            step += 1
    return theta


def iter_batch_linesearch(x, theta_0, y, out_n, out_e_reduce_rate, alpha):
    e_0 = get_e(x, theta_0, y)
    cost_0 = get_cost(e_0)
    step = 1
    while step < out_n:
        grad = compute_grad(x, e_0, stand_flag=1)
        theta_1, cost_1, e_1, count = line_search(x, y, alpha, theta_0, grad)
        e_reduce_rate = abs(cost_1 - cost_0) / cost_0
        print('Step: %-6s, count: %-4s, cost: %s' % (step, count, cost_1[0, 0]))
        if e_reduce_rate < out_e_reduce_rate:
            print('Finish!\n')
            break
        else:
            cost_0 = cost_1
            theta_0 = theta_1
            e_0 = e_1
            step += 1
    return theta_1


def line_search(x, y, alpha, theta_0, grad):
    # keep the gradient fixed and search along the current gradient direction,
    # for up to max_iter trials, for the best next theta point
    e_0 = get_e(x, theta_0, y)
    cost_0 = get_cost(e_0)
    max_iter, count, a, b = 1000, 0, 0.8, 0.5
    while count < max_iter:
        # the step shrinks as count grows: alpha, 0.8*alpha, 0.8^2*alpha, ...
        alpha_count = pow(a, count) * alpha
        theta_1 = theta_0 - alpha_count * grad
        e_1 = get_e(x, theta_1, y)
        cost_1 = get_cost(e_1)
        # grad is the slope at the current theta, so the tangent-line change in y is grad * (change in theta)
        grad_dy_change = abs(np.dot(grad.T, theta_1 - theta_0))
        # the actual change in y is cost_0 - cost_1; do not take its absolute value, otherwise the search
        # no longer converges. The step is accepted only when cost_y_change > 0 and > b * grad_dy_change.
        cost_y_change = cost_0 - cost_1
        # b is the required ratio of the actual decrease to the tangent-line decrease along this direction;
        # larger is better, and it should be at least 0.5: the actual cost reduction must exceed half of dy.
        if cost_y_change > b * grad_dy_change:
            break
        else:
            count += 1
    return theta_1, cost_1, e_1, count


def iter_bfgs(x, theta_0, y, out_n, out_e_reduce_rate, dfp):
    # quasi-Newton: dfp=1 uses the DFP update of hk, dfp=0 uses the BFGS update
    e_0 = get_e(x, theta_0, y)
    cost_0 = get_cost(e_0)
    grad_0 = np.reshape(compute_grad(x, e_0), (x.shape[1], 1))
    hk = np.eye(x.shape[1])
    step = 1
    while step < out_n:
        dk = -1 * np.dot(hk, grad_0)
        theta_1, cost_1, e_1, l_count = armijo_line_search(x, y, theta_0, grad_0, dk, cost_0)
        grad_1 = compute_grad(x, e_1)
        yk = grad_1 - grad_0
        sk = theta_1 - theta_0
        condition = np.dot(sk.T, yk)
        # hk is updated once per theta update, and only while the curvature condition sk.T yk > 0 holds
        if dfp == 1:
            if condition > 0:
                hk = hk - X2(X3(hk, yk, yk.T), hk) / X3(yk.T, hk, yk) + X2(sk, sk.T) / condition
        else:
            if condition > 0:
                hk = hk + (1 + X3(yk.T, hk, yk) / condition) * (X2(sk, sk.T) / condition) \
                     - (X3(sk, yk.T, hk) + X3(hk, yk, sk.T)) / condition
        e_reduce_rate = abs(cost_1 - cost_0) / cost_0
        print('Step: %-6s, l_count: %-6s, cost: %s' % (step, l_count, cost_1[0, 0]))
        if e_reduce_rate < out_e_reduce_rate:
            print('Finish!\n')
            break
        cost_0, grad_0, theta_0 = cost_1, grad_1, theta_1
        step += 1
    return theta_1


def iter_lbfgs(x, theta_0, y, out_n, out_e_reduce_rate):
    # L-BFGS: keep only the last limit_n (sk, yk) pairs and rebuild the search direction
    # with the two-loop recursion instead of storing hk explicitly
    limit_n = 5
    ss, yy = [], []
    step = 1
    while step < out_n:
        if step == 1:
            e_0 = get_e(x, theta_0, y)
            cost_0 = get_cost(e_0)
            grad_0 = compute_grad(x, e_0)
            dk = -1 * grad_0
        theta_1, cost_1, e_1, l_count = armijo_line_search(x, y, theta_0, grad_0, dk, cost_0)
        grad_1 = compute_grad(x, e_1)
        if len(ss) > limit_n and len(yy) > limit_n:
            del ss[0]
            del yy[0]
        yk = grad_1 - grad_0
        sk = theta_1 - theta_0
        ss.append(sk)
        yy.append(yk)
        qk = grad_1
        k = len(ss)
        condition = X2(yk.T, sk)
        alpha = []
        for i in range(k):
            # first loop: t runs k-1 -> 0 (newest pair first); alpha is stored in that order
            t = k - i - 1
            pt = 1 / X2(yy[t].T, ss[t])
            alpha.append(pt * X2(ss[t].T, qk))
            qk = qk - alpha[i] * yy[t]
        z = qk
        for i in range(k):
            t = k - i - 1
            # second loop: i runs 0 -> k-1 (oldest pair first)
            pi = 1 / (X2(yy[i].T, ss[i])[0, 0])  # pi is a scalar
            beta = pi * X2(yy[i].T, z)           # beta is a scalar
            z = z + ss[i] * (alpha[t] - beta)
        if condition > 0:
            dk = -z
        e_reduce_rate = abs(cost_1 - cost_0) / cost_0
        print('Step: %-6s, l_count: %-6s, cost: %s' % (step, l_count, cost_1[0, 0]))
        if e_reduce_rate < out_e_reduce_rate:
            print('Finish!\n')
            break
        cost_0, grad_0, theta_0 = cost_1, grad_1, theta_1
        step += 1
    return theta_1


def armijo_line_search(x, y, theta_0, grad_0, dk_0, cost_0):
    # keep the direction fixed and search along it for an acceptable next theta point
    max_iter, count, countk, a, b = 100, 0, 0, 0.55, 0.4
    while count < max_iter:
        """
        The batch methods update theta along the gradient grad; Newton-type methods use a Newton
        direction dk instead: newton uses -(Hessian inverse)*grad, dfp uses the current dfp hk,
        bfgs uses the current bfgs hk. b is the required ratio of the actual cost decrease to the
        tangent-line decrease along this direction; larger is better.
        """
        """update theta"""
        theta_1 = theta_0 + pow(a, count) * dk_0
        """compute the cost"""
        e_1 = get_e(x, theta_1, y)
        cost_1 = get_cost(e_1)
        cost_y_change = cost_1 - cost_0
        dy_change = b * pow(a, count) * np.dot(grad_0.T, dk_0)
        # if cost_1 < cost_0 + b * pow(a, count) * np.dot(grad_0.T, dk_0):
        if cost_y_change < dy_change:
            """if the condition is never met, count is never assigned to countk and countk stays 0"""
            countk = count
            break
        count += 1
    theta_1 = theta_0 + pow(a, countk) * dk_0
    e_1 = get_e(x, theta_1, y)
    cost_1 = get_cost(e_1)
    return theta_1, cost_1, e_1, count


if __name__ == '__main__':
    x1, y1 = get_data('/optdata/line_sample_data.txt')
    # x1, y1 = get_data('/optdata/airfoil_self_noise.txt')
    theta1 = np.zeros(x1.shape[1]).reshape(x1.shape[1], 1)
    res_theta = iter_batch(x1, theta1, y1, out_n=1e5, out_e_reduce_rate=1e-6, alpha=0.02)
    # res_theta = iter_random_batch(x1, theta1, y1, out_n=1e5, out_e_reduce_rate=1e-6, alpha=0.02)
    # res_theta = iter_mini_batch(x1, theta1, y1, out_n=1e5, out_e_reduce_rate=1e-6, alpha=0.01, batch=10)
    # res_theta = iter_batch_linesearch(x1, theta1, y1, out_n=1e5, out_e_reduce_rate=1e-6, alpha=1)
    # res_theta = iter_newton(x1, theta1, y1, out_n=1e5, out_e_reduce_rate=1e-6, alpha=1)
    # res_theta = iter_bfgs(x1, theta1, y1, out_n=100, out_e_reduce_rate=1e-6, dfp=1)
    # res_theta = iter_bfgs(x1, theta1, y1, out_n=100, out_e_reduce_rate=1e-6, dfp=0)
    # res_theta = iter_lbfgs(x1, theta1, y1, out_n=100, out_e_reduce_rate=1e-6)
    print('Res_theta:', np.reshape(res_theta, (1, res_theta.shape[0]))[0])
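As a sanity check, here is a minimal sketch (not part of the original script) that compares the hand-rolled results against scipy.optimize.minimize on the same least-squares cost. It assumes scipy is installed and that the snippet is pasted below the __main__ block above, so it can reuse x1 and y1; np_cost and np_grad are hypothetical helper names.

    # Sanity-check sketch: minimize the same 0.5*||X theta - y||^2 cost with scipy
    # and compare against the Res_theta printed above.
    from scipy.optimize import minimize

    def np_cost(theta_flat):
        # same least-squares cost as get_cost, with theta as a flat vector as scipy expects
        e = x1.dot(theta_flat.reshape(-1, 1)) - y1
        return 0.5 * float(e.T.dot(e))

    def np_grad(theta_flat):
        # gradient X^T (X theta - y), flattened for scipy
        e = x1.dot(theta_flat.reshape(-1, 1)) - y1
        return x1.T.dot(e).ravel()

    for method in ('BFGS', 'L-BFGS-B'):
        res = minimize(np_cost, np.zeros(x1.shape[1]), jac=np_grad, method=method)
        print(method, res.x)  # should be close to Res_theta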
The sample data has three feature columns and one linear-regression target column:
x1 x2 x3 y
3.5 9 6.1 33.2
5.3 20 6.4 40.3
5.1 18 7.4 38.7
5.8 33 6.7 46.8
4.2 31 7.5 41.4
6 13 5.9 37.5
6.8 25 6 39
5.5 30 4 40.7
3.1 5 5.8 30.1
7.2 47 8.3 52.9
4.5 25 5 38.2
4.9 11 6.4 31.8
8 23 7.6 43.3
6.5 35 7 44.1
6.6 39 5 42.5
3.7 21 4.4 33.6
6.2 7 5.5 34.2
7 40 7 48
4 35 6 38
4.5 23 3.5 35.9
5.9 33 4.9 40.4
5.6 27 4.3 36.8
4.8 34 8 45.2
3.9 15 5.8 35.1
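If the file expected by get_data does not exist yet, the following minimal sketch (an assumption about how the rows above are meant to be used; the x1 x2 x3 y header line is omitted because np.loadtxt expects purely numeric rows) writes them to ./optdata/line_sample_data.txt:

    # Write the sample rows above to ./optdata/line_sample_data.txt
    # so that get_data('/optdata/line_sample_data.txt') can load them.
    import os
    import numpy as np

    rows = np.array([
        [3.5, 9, 6.1, 33.2],
        [5.3, 20, 6.4, 40.3],
        # ... remaining rows from the table above ...
        [3.9, 15, 5.8, 35.1],
    ])
    os.makedirs('optdata', exist_ok=True)
    np.savetxt(os.path.join('optdata', 'line_sample_data.txt'), rows)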