- Create the environment with the OpenAI gym library.
```python
import torch
import time
import matplotlib.pyplot as plt
from gym.envs.registration import register
import gym

# Register a deterministic (non-slippery) variant of FrozenLake
register(
    id='FrozenLakeNotSlippery-v0',
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    kwargs={'map_name': '4x4', 'is_slippery': False},
)

plt.ion()  # matplotlib blocks by default; ion() switches to non-blocking (interactive) mode
env = gym.make('FrozenLakeNotSlippery-v0')  # create the environment
env.render()  # render the initial state of the environment
```
- Define the parameters to be used.
```python
number_of_states = env.observation_space.n
number_of_actions = env.action_space.n
print("States = ", number_of_states)
print("Actions = ", number_of_actions)

num_episodes = 1000
steps_total = []
rewards_total = []
egreedy_total = []

gamma = 0.95          # discount factor: weight given to expected future rewards
learning_rate = 0.9   # learning rate
egreedy = 0.7         # exploration rate (epsilon); the lower it is, the more the agent exploits learned experience
egreedy_final = 0.1
egreedy_decay = 0.999

Q = torch.zeros([number_of_states, number_of_actions])  # new Q-value table, initialized to zeros
```
- This block first chooses the next action according to the epsilon-greedy rate, then applies that action to the environment to obtain the new state and reward.
- The Q-value table is then updated (the textbook update with an explicit learning rate is sketched after the code below).
- Finally, the new state and action information are stored for the next step.
```python
for i_episode in range(num_episodes):
    # reset the environment at the start of each episode
    state = env.reset()
    step = 0

    while True:
        step += 1

        # epsilon-greedy action selection
        random_for_egreedy = torch.rand(1)[0]
        if random_for_egreedy > egreedy:
            # exploit: take the action with the highest Q-value for this state
            # (the tiny random noise breaks ties between equal Q-values)
            random_values = Q[state] + torch.rand(1, number_of_actions) / 1000
            # torch.max(tensor, dim): dim=0 takes the max over each column, dim=1 over each row;
            # it returns (values, indices), so [1][0] is the index of the best action
            action = torch.max(random_values, 1)[1][0]
            action = action.item()
        else:
            # explore: sample a random action
            action = env.action_space.sample()

        # decay epsilon towards its final value
        if egreedy > egreedy_final:
            egreedy *= egreedy_decay

        new_state, reward, done, info = env.step(action)

        # Filling the Q table (Bellman backup; exact here because the environment is deterministic)
        Q[state, action] = reward + gamma * torch.max(Q[new_state])

        # Setting new state for next action
        state = new_state

        # env.render()
        # time.sleep(0.4)

        if done:
            steps_total.append(step)
            rewards_total.append(reward)
            egreedy_total.append(egreedy)
            if i_episode % 10 == 0:
                print('Episode: {} Reward: {} Steps Taken: {}'.format(i_episode, reward, step))
            break
```
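The one-line update above is the exact Bellman backup, which is valid here because the non-slippery map is deterministic; note that `learning_rate` is declared but never used. For reference, below is a minimal sketch of the textbook incremental Q-learning update that would use it. It assumes the same variables as the loop above and is not part of the original code.

```python
# Sketch only: textbook Q-learning update with an explicit learning rate.
# Assumes Q, state, action, reward, new_state, gamma and learning_rate from above.
td_target = reward + gamma * torch.max(Q[new_state])            # bootstrapped target
td_error = td_target - Q[state, action]                         # temporal-difference error
Q[state, action] = Q[state, action] + learning_rate * td_error  # move Q a step towards the target
```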
- Plot the rewards, steps, and egreedy curves with matplotlib.
print("Percent of episodes finished successfully: {0}".format(sum(rewards_total)/num_episodes)) print("Percent of episodes finished successfully (last 100 episodes): {0}".format(sum(rewards_total[-100:])/100)) print("Average number of steps: %.2f" % (sum(steps_total)/num_episodes)) print("Average number of steps (last 100 episodes): %.2f" % (sum(steps_total[-100:])/100)) plt.figure(figsize=(6,3)) plt.title("Rewards") plt.bar(torch.arange(len(rewards_total)), rewards_total, alpha=0.6, color='green', width=5) plt.show() plt.figure(figsize=(6,3)) plt.title("Steps / Episode length") plt.bar(torch.arange(len(steps_total)), steps_total, alpha=0.6, color='red', width=5) plt.show() plt.figure(figsize=(6,3)) plt.title("Egreedy value") plt.bar(torch.arange(len(egreedy_total)), egreedy_total, alpha=0.6, color='blue', width=5) plt.ioff() plt.show()