1. 實驗要求

題目：計算最佳策略在下面例子基礎上，自行設計一個問題（例如:求解某兩點之間的最短路徑，或是在圖中加一些障礙物，計算最短路徑），給出該問題對應的 MDP 模型描述，然后分別使用 value iteration 和 policy iteration 算法計算出最佳策略。

2．實驗思路

（1）設計問題（MDP描述）

設計4*4的方格，即初始化的矩陣，每個空格都是一個狀態，存在收益情況，在每到達一個點時便可選擇上下左右四個方向移動，遇到邊緣時狀態不變，當移動一步則收益-1

（2）實驗思路

設計2個矩陣Matrix1和Matrix2，用來存放上次迭代的收益值，每次走完一步先更新Matrix2，再將值賦給Matrix1，收益值由上下左右操作后得到。

As for the Policy Iteration:

(1) Initialization the Matrix1

(2) Update the value according to the function:

(delta)

(3) Until the strategy unchanged, it’s restrained.

As for the Value Iteration:

(1) Initialize the Matrix1

(2) Update the value according to the function:

(delta)

(3) Output the value function, and obtain the best strategy.

3.實驗結果

Policy Iteration:

Value Iteration;

class State(object):
    
    def __init__(self,i,j):
    
    self.up = 0.25

    self.down = 0.25

    self.left = 0.25

    self.right = 0.25

    if i > 0:

    self.upState = (i - 1, j)

    else:

    self.upState = (i, j)

    if i < 3:


    self.downState = (i + 1, j)

    else:

    self.downState = (i, j)

    if j > 0:
            self.leftState = (i, j - 1)
        else:
            self.leftState = (i,j)
        if j < 3:
            self.rightState = (i, j + 1)
        else:
            self.rightState = (i, j)
        # Add barrier in position(1, 2) and (2, 1)

        if i == 2 and j == 2:
            self.upState = (i, j)
            self.leftState = (i, j)
        if i == 3 and j == 1:
            self.upState = (i, j)
        if i == 2 and j == 0:
            self.rightState = (i, j)
        if i == 1 and j == 1:
            self.rightState = (i, j)
            self.downState = (i, j)
        if i == 0 and j == 2:
            self.downState = (i, j)
        if i == 1 and j == 3:
            self.leftState = (i, j)

    def updatePolicy(self, rewardMatrix):

        oldUp = self.up
        oldDown = self.down
        oldLeft = self.left
        oldRight = self.right

        upValue = -1 + rewardMatirx[self.upState[0]][self.upState[1]]
        downValue = -1 + rewardMatirx[self.downState[0]][self.downState[1]]
        leftValue = -1 + rewardMatirx[self.leftState[0]][self.leftState[1]]
        rightValue = -1 + rewardMatirx[self.rightState[0]][self.rightState[1]]
        if(upValue >= downValue)and(upValue >= leftValue)and(upValue >= rightValue):
            self.up = 1.0
            self.down = 0
            self.left = 0
            self.right = 0
        elif(downValue >= upValue)and(downValue >= leftValue)and(downValue >= rightValue):
            self.up = 0
            self.down = 1.0
            self.left = 0
            self.right = 0
        elif(leftValue >= upValue)and(leftValue >= downValue)and(leftValue >= rightValue):
            self.up = 0
            self.down = 0
            self.left = 1.0
            self.right = 0
        else:
            self.up = 0
            self.down = 0
            self.left = 0
            self.right = 1.0
        if(oldUp == self.up)and(oldDown == self.down)and(oldLeft == self.left)and(oldRight == self.right):
            return True
        else:
            return False


################################################################################################################
# Update the reward matrix
def updateReward(i, j):
    tempMatrix[i][j] = -1 + stateMatirx[i][j].up * rewardMatirx[stateMatirx[i][j].upState[0]][stateMatirx[i][j].upState[1]] \
                          + stateMatirx[i][j].down * rewardMatirx[stateMatirx[i][j].downState[0]][stateMatirx[i][j].downState[1]] \
                          + stateMatirx[i][j].left * rewardMatirx[stateMatirx[i][j].leftState[0]][stateMatirx[i][j].leftState[1]] \
                          + stateMatirx[i][j].right * rewardMatirx[stateMatirx[i][j].rightState[0]][stateMatirx[i][j].rightState[1]]


#################################################################################################################
# Initialize the state matrix
stateMatirx = [[] for i in range(4)]
for i in range(4):
    for j in range(4):
        stateMatirx[i].append(State(i, j))


################################################################################################################
# Initialize the reward matrix
rewardMatirx = [
    [0,0,0,0],
    [0,0,0,0],
    [0,0,0,0],
    [0,0,0,0]
]
# The matrix used to backup reward matrix
tempMatrix = [
    [0,0,0,0],
    [0,0,0,0],
    [0,0,0,0],
    [0,0,0,0]
]
#################################################################################################################
thresh = 0   # set a threshold value here
#################################################################################################################
# Policy iteration
stableFlag = True
while stableFlag:
    # policy evaluation
    delta = 0
    for i in range(0, 4):
        for j in range(0, 4):
            if ((i == 0) and (j == 0)):
                continue
            if i == 1 and j == 2:
                continue
            if i == 2 and j == 1:
                continue
            else:
                v = tempMatrix[i][j]
                updateReward(i, j)
                delta = max(delta, abs(tempMatrix[i][j] - v))
    rewardMatirx = tempMatrix
    while delta > thresh:
        delta = 0
        for i in range(0,4):
            for j in range(0,4):
                if((i == 0)and(j == 0)):
                    continue
                if i == 1 and j == 2:
                    continue
                if i == 2 and j == 1:
                    continue
                else:
                    v = tempMatrix[i][j]
                    updateReward(i, j)
                    delta = max(delta, abs(tempMatrix[i][j] - v))
        rewardMatirx = tempMatrix
    # improve the policy
    for i in range(0,4):
        for j in range(0,4):
            if (i == 0) and (j == 0):
                continue
            if i == 1 and j == 2:
                continue
            if i == 2 and j == 1:
                continue
            else:
                stableFlag = (stableFlag and stateMatirx[i][j].updatePolicy(rewardMatirx))

    stableFlag = not stableFlag
################################################################################################################
for i in range(0, 4):
    for j in range(0, 4):
        print(rewardMatirx[i][j])
        print(" ")
    print("\n")

#設置名為Operation的類存放信息
class Operation(object):

    def __init__(self,i,j):

        self.up = 0.25#概率為0.25

        self.down = 0.25#概率為0.25

        self.left = 0.25#概率為0.25

        self.right = 0.25#概率為0.25

        if i > 0:

            self.upState = (i - 1, j)

        else:

            self.upState = (i, j)

        if i < 3:

            self.downState = (i + 1, j)

        else:

            self.downState = (i, j)

        if j > 0:

            self.leftState = (i, j - 1)

        else:

            self.leftState = (i,j)

        if j < 3:

            self.rightState = (i, j + 1)

        else:

            self.rightState = (i, j)

# 初始化收益矩陣

rewardMatrix = [

    [0,0,0,0],[0,0,0,0],[0,0,0,0],[0,0,0,0]

]

tempMatrix = [

    [0,0,0,0],[0,0,0,0],[0,0,0,0],[0,0,0,0]

]

# 初始化狀態矩陣

stateMatirx = [[] for i in range(4)]

for i in range(4):

    for j in range(4):

        stateMatirx[i].append(Operation(i, j))

def updateReward(i,j):

    tempMatrix[i][j] = max((-1 + rewardMatrix[stateMatirx[i][j].upState[0]][stateMatirx[i][j].upState[1]] ),

                        (-1 + rewardMatrix[stateMatirx[i][j].downState[0]][stateMatirx[i][j].downState[1]]) ,

                        (-1 + rewardMatrix[stateMatirx[i][j].leftState[0]][stateMatirx[i][j].leftState[1]]),

                        (-1 + rewardMatrix[stateMatirx[i][j].rightState[0]][stateMatirx[i][j].rightState[1]]))

#Value iteration

thresh = 0 # 設置閾值

delta = 0

for i in range(4):

    for j in range(4):

        if ((i == 0) and (j == 0)):

            continue

        if i == 1 and j == 2:

            continue

        if i == 2 and j == 1:

            continue

        else:

            v = rewardMatrix[i][j]

            updateReward(i,j)

            delta = max(delta, abs(v - tempMatrix[i][j]))

rewardMatrix = tempMatrix

while delta > thresh:

    delta = 0

    for i in range(4):

        for j in range(4):

            if i == 0 and j == 0:

                continue

            if i == 1 and j == 2:

                continue

            if i == 2 and j == 1:

                continue

            else:

                v = rewardMatrix[i][j]

                updateReward(i, j)

                delta = max(delta, abs(v - tempMatrix[i][j]))

    rewardMatrix = tempMatrix

#輸出結果矩陣

for i in range(0, 4):

    for j in range(0, 4):

        print(rewardMatrix[i][j])

        print(" ")

    print("\n")

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 強化學習 2—— 用動態規划求解 MDP (Policy Iteration and Value Iteration) 人工智能領域最佳5種編程語言十款最佳人工智能軟件《人工智能導論》第5章搜索求解策略《人工智能導論》第6章智能計算及其應用人工智能中的常用搜索策略人工智能-實驗一策略迭代和值迭代人工智能是什么關於人工智能和python 人工智能-搜索