Assignment 3: Gradient Descent Using Minibatches

Item | Content
---- | ----
Course this assignment belongs to | 人工智能实战2019 (Beihang University)
Assignment requirements | Assignment 3: gradient descent using minibatches
My goal in this course | Learn algorithms, accumulate project experience, and sharpen my coding skills
How this assignment helps me reach that goal | Understand the concepts of batch, iteration, and epoch; learn to use batch operations
Assignment body | See below
Other references | Microsoft sample code

1. Assignment Requirements

  • Perform gradient descent using minibatches
  • Review the course material covered so far (link) and answer the questions about the 2D contour plot of the loss function:
    • Why are the contours ellipses rather than circles? How can the plot be turned into circles?
    • Why is the center an elliptical region rather than a single point?

2. Implementing Minibatch Gradient Descent with Randomly Selected Data

Sample code location: /B-教学案例与实践/B6-神经网络基本原理简明教程/微软-方案1/NeuralNetwork/ch04/level4-BatchGradientDescent.py

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import savefig
from pathlib import Path

x_data_name = "TemperatureControlXData.dat"
y_data_name = "TemperatureControlYData.dat"

class CData(object):
    def __init__(self, loss, w, b, epoch, iteration):
        self.loss = loss
        self.w = w
        self.b = b
        self.epoch = epoch
        self.iteration = iteration

def ReadData():
    Xfile = Path(x_data_name)
    Yfile = Path(y_data_name)
    if Xfile.exists() and Yfile.exists():
        X = np.load(Xfile)
        Y = np.load(Yfile)
        return X.reshape(1,-1), Y.reshape(1,-1)
    else:
        return None, None


def ForwardCalculationBatch(W, B, batch_x):
    Z = np.dot(W, batch_x) + B
    return Z

def BackPropagationBatch(batch_x, batch_y, batch_z):
    m = batch_x.shape[1]
    dZ = batch_z - batch_y
    dB = dZ.sum(axis=1, keepdims=True)/m
    dW = np.dot(dZ, batch_x.T)/m
    return dW, dB


def UpdateWeights(w, b, dW, dB, eta):
    w = w - eta*dW
    b = b - eta*dB
    return w, b

def InitialWeights(num_input, num_output, flag):
    if flag == 0:
        # zero initialization
        W = np.zeros((num_output, num_input))
    elif flag == 1:
        # normal distribution initialization
        W = np.random.normal(size=(num_output, num_input))
    elif flag == 2:
        # xavier initialization
        W = np.random.uniform(
            -np.sqrt(6/(num_input+num_output)),
            np.sqrt(6/(num_input+num_output)),
            size=(num_output, num_input))
    B = np.zeros((num_output, 1))
    return W, B

def CheckLoss(W, B, X, Y):
    m = X.shape[1]
    Z = np.dot(W, X) + B
    LOSS = (Z - Y)**2
    loss = LOSS.sum()/m/2
    return loss

  

def shuffle(X, Y):
    num_example = X.shape[1]
    rank = np.arange(0, num_example)
    np.random.shuffle(rank)
    X_shuffle = []
    Y_shuffle = []
    for i in rank:
        X_shuffle.append(X[:,i])
        Y_shuffle.append(Y[:,i])
    X_shuffle = np.transpose(X_shuffle)
    Y_shuffle = np.transpose(Y_shuffle)
    return X_shuffle, Y_shuffle

def GetBatchSamples(X, Y, batch_size, iteration):
    num_feature = X.shape[0]
    start = iteration * batch_size
    end = start + batch_size
    batch_x = X[0:num_feature, start:end].reshape(num_feature, batch_size)
    batch_y = Y[0, start:end].reshape(1, batch_size)
    return batch_x, batch_y


def GetMinimalLossData(dict_loss):
    key = sorted(dict_loss.keys())[0]
    w = dict_loss[key].w
    b = dict_loss[key].b
    return w, b, dict_loss[key]

def ShowIterLossHistory(dict_loss, batch_size):
    loss = []
    for key in dict_loss:
        loss.append(key)
    plt.title("batch size: " + str(batch_size))
    plt.xlabel("iteration")
    plt.plot(loss[30:800])
    plt.ylabel("loss")
    savefig("/Users/souchiguu/Desktop/" + str(batch_size) + ".png")
    plt.show()


def ShowEpochLossHistory(list_epoch, Batchsize):
    color = ['b', 'g', 'y']
    for num_batch in range(len(Batchsize)):
        loss = []
        for key in list_epoch[num_batch]:
            loss.append(key)
        plt.plot(loss, color[num_batch], label='batchsize=' + str(Batchsize[num_batch]))
    plt.title("learning rate = 0.01")
    plt.xlabel("epoch")
    plt.ylabel("loss")
    plt.legend()
    savefig("/Users/souchiguu/Desktop/" + "0.1" + ".png")
    plt.show()



if __name__ == '__main__':
    # method = "MiniBatch"
    eta, max_epoch = 0.01, 50
    Batchsize = [5, 10, 15]
    list_epoch = []

    # read data
    X_origin, Y_origin = ReadData()
    # count of samples and features
    num_example = X_origin.shape[1]
    num_feature = X_origin.shape[0]

    for batch_size in Batchsize:
        W, B = InitialWeights(1, 1, 0)
        # record the loss of every iteration and every epoch
        dict_epoch_loss = {}
        dict_iter_loss = {}
        for epoch in range(max_epoch):
            # random shuffle at the start of each epoch
            X, Y = shuffle(X_origin, Y_origin)
            # if num_example=200, batch_size=10, then max_iteration=200/10=20
            max_iteration = (int)(num_example / batch_size)
            sum_loss = 0
            for iteration in range(max_iteration):
                # get x and y values for one minibatch
                batch_x, batch_y = GetBatchSamples(X, Y, batch_size, iteration)
                # forward: get z from x
                batch_z = ForwardCalculationBatch(W, B, batch_x)
                # calculate gradients of w and b
                dW, dB = BackPropagationBatch(batch_x, batch_y, batch_z)
                # update w, b
                W, B = UpdateWeights(W, B, dW, dB, eta)
                # calculate loss on the whole training set
                loss = CheckLoss(W, B, X, Y)
                # print("batchsize=%d, epoch=%d, iteration=%d, loss=%f" % (batch_size, epoch, iteration, loss))
                dict_iter_loss[loss] = CData(loss, W, B, epoch, iteration)
                sum_loss += loss
            # end for iteration
            dict_epoch_loss[sum_loss] = CData(sum_loss, W, B, epoch, max_iteration)
        # end for epoch
        list_epoch.append(dict_epoch_loss)
        ShowIterLossHistory(dict_iter_loss, batch_size)
        w, b, cdata = GetMinimalLossData(dict_epoch_loss)
        print("w:", cdata.w, "b:", cdata.b)
        print("batchsize=%d, epoch=%d, iteration=%d, loss=%f" % (batch_size, cdata.epoch, cdata.iteration, cdata.loss))

    ShowEpochLossHistory(list_epoch, Batchsize)

With learning rate 0.1, the parameters at minimum loss and the trend of loss with epoch, iteration, and batch size:

Learning rate 0.1:
w: [[1.99854936]] b: [[3.00973446]]
batchsize=5, epoch=27, iteration=40, loss=0.196999
w: [[1.99322693]] b: [[3.00426234]]
batchsize=10, epoch=37, iteration=20, loss=0.098183
w: [[1.99197605]] b: [[3.0102479]]
batchsize=15, epoch=46, iteration=13, loss=0.063790




With learning rate 0.01, the parameters at minimum loss and the trend of loss with epoch, iteration, and batch size:

Learning rate 0.01:
w: [[1.90823943]] b: [[3.05213545]]
batchsize=5, epoch=49, iteration=40, loss=0.209842
w: [[1.82458827]] b: [[3.09514729]]
batchsize=10, epoch=49, iteration=20, loss=0.123555
w: [[1.78029477]] b: [[3.11700018]]
batchsize=15, epoch=49, iteration=13, loss=0.089676
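
As a quick cross-check of the fitted values reported above (w ≈ 2, b ≈ 3), the closed-form least-squares solution on the same data can be computed directly. This is a minimal sketch, not part of the original assignment; it assumes the two data files are in the working directory and load with np.load, as in ReadData above.

import numpy as np

# assumed file names, same as in the script above
x = np.load("TemperatureControlXData.dat").reshape(-1)   # shape (m,)
y = np.load("TemperatureControlYData.dat").reshape(-1)

# design matrix [x, 1] for the linear model y = w*x + b
A = np.stack([x, np.ones_like(x)], axis=1)
(w, b), *_ = np.linalg.lstsq(A, y, rcond=None)
print("closed-form least squares: w = %.5f, b = %.5f" % (w, b))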




3. Discussion Questions

(Figure: 2D contour plot of the loss function, loss_2d)

  1. Why are the contours ellipses rather than circles? How can the plot be turned into circles?
    Intuitively, it is because \(w\) and \(b\) have different coefficients in the loss function. The underlying reason is that they play different roles in the forward computation, much like two orthogonal eigenvectors associated with different eigenvalues. In the loss function \(J(w,b)=\frac{1}{2m}\sum_{i=1}^m(wx_i+b-y_i)^2\), the quadratic coefficient of \(w\) is \(\frac{1}{m}\sum_{i=1}^m x_i^2\) times that of \(b\), so whenever \(\frac{1}{m}\sum_{i=1}^m x_i^2 \neq 1\) the level sets satisfy an elliptical equation of the form \(\frac{x^2}{a^2}+\frac{y^2}{b^2}=1\). Forcing \(\sum_{i=1}^m x_i^2 = m\) while keeping \(x\) zero-mean, e.g. by standardizing \(x\), turns the plot into circles; see the derivation sketch after this list.

  2. Why is the center an elliptical region rather than a single point?
    The minimum of the loss is attained at a unique point. The center appears as an elliptical region rather than a point because \(w\) and \(b\) cannot be varied continuously when the plot is drawn; they are sampled on a discrete grid, and the grid points near the true minimum whose loss values are almost identical together form this elliptical region; see the grid sketch after this list.
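
A short derivation sketch for the first question, using the same \(\frac{1}{2m}\) scaling as CheckLoss; the symbols \(\bar{x}\), \(\overline{x^2}\), \(\overline{xy}\), \(\bar{y}\), \(\overline{y^2}\) denote sample means and are introduced here only for brevity:

\[
J(w,b)=\frac{1}{2m}\sum_{i=1}^{m}(w x_i + b - y_i)^2
      =\frac{1}{2}\left(\overline{x^2}\,w^2 + 2\bar{x}\,wb + b^2\right)
       -\left(\overline{xy}\,w + \bar{y}\,b\right) + \frac{1}{2}\overline{y^2}.
\]

The contour \(J(w,b)=c\) is therefore governed by the quadratic form with matrix \(\begin{pmatrix}\overline{x^2} & \bar{x}\\ \bar{x} & 1\end{pmatrix}\); its level sets are circles only when the two eigenvalues are equal, i.e. when \(\bar{x}=0\) and \(\overline{x^2}=1\) (equivalently \(\sum_{i=1}^{m}x_i^2=m\)). Otherwise the \(w\) and \(b\) axes are scaled differently and the contours are ellipses.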
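And a minimal grid sketch for the second question, not part of the original assignment: it assumes the two data files used above are in the working directory and load with np.load (as in ReadData), evaluates the loss on a coarse discrete grid of \((w,b)\) values, and counts how many grid cells fall below a level just above the minimum. In a contour plot all of those cells are rendered as one innermost region, which is why the center looks like a small ellipse rather than a point.

import numpy as np

# assumed file names, same as in the script above
X = np.load("TemperatureControlXData.dat").reshape(1, -1)
Y = np.load("TemperatureControlYData.dat").reshape(1, -1)
m = X.shape[1]

# coarse discrete grid of (w, b) values around the apparent optimum
ws = np.linspace(1.0, 3.0, 50)
bs = np.linspace(2.0, 4.0, 50)
W_grid, B_grid = np.meshgrid(ws, bs)

# loss J(w, b) = sum_i (w*x_i + b - y_i)^2 / (2m), evaluated for every grid cell
Z = W_grid[..., None] * X[0] + B_grid[..., None] - Y[0]    # shape (50, 50, m)
J = (Z ** 2).sum(axis=-1) / (2 * m)

# every cell whose loss lies below the innermost contour level is drawn
# as part of a single region, so the "center" appears as a small ellipse
level = J.min() + 0.01 * (J.max() - J.min())
print("grid cells inside the innermost contour:", int((J <= level).sum()))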

