Programming Experience March 01, 2021

Gym-Reinforcement-FrozenLake


  • Create the environment with OpenAI's gym library. (A registration-free alternative to the custom registration below is sketched after the code block.)
    import torch
    import time
    import matplotlib.pyplot as plt
    from gym.envs.registration import register
    import gym
    register(
      id='FrozenLakeNotSlippery-v0',
      entry_point='gym.envs.toy_text:FrozenLakeEnv',
      kwargs={'map_name' : '4x4', 'is_slippery': False},
    )
    plt.ion()  # matplotlib blocks on show() by default; ion() switches to interactive (non-blocking) mode
    env = gym.make('FrozenLakeNotSlippery-v0')  # create the environment
    env.render()  # render the current state of the environment
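  • Newer gym releases can build the same non-slippery environment without registering a custom id, because gym.make forwards keyword arguments to the environment constructor. This is only a minimal sketch, assuming the installed gym version supports passing these kwargs through:
    # A sketch (assumption: gym.make forwards map_name / is_slippery to FrozenLakeEnv in this gym version)
    env_alt = gym.make('FrozenLake-v0', map_name='4x4', is_slippery=False)
    env_alt.render()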
  • Define the hyperparameters. (A quick calculation of how fast egreedy decays to its floor follows the block.)
    number_of_states = env.observation_space.n
    number_of_actions = env.action_space.n
    print( "States = ", number_of_states)
    print( "Actions = ", number_of_actions)
    num_episodes = 1000
    steps_total = []
    rewards_total = []
    egreedy_total = []
    gamma = 0.95  # discount factor: how much future rewards count towards the return
    learning_rate = 0.9  # learning rate (defined here but not applied by the simplified update below)
    egreedy = 0.7  # exploration rate (epsilon): the lower it is, the more actions rely on the learned Q-values
    egreedy_final = 0.1
    egreedy_decay = 0.999
    Q = torch.zeros([number_of_states, number_of_actions])  # initialize the Q-value table with zeros
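  • With these values, egreedy is multiplied by 0.999 at every step, so it takes roughly ln(0.1 / 0.7) / ln(0.999) ≈ 1945 steps to fall from 0.7 to the floor of 0.1. A small sketch of that calculation (the helper name decay_steps is only for illustration):
    import math
    # number of multiplicative decay steps needed to shrink `start` down to `final`
    def decay_steps(start, final, decay):
        return math.ceil(math.log(final / start) / math.log(decay))
    print(decay_steps(egreedy, egreedy_final, egreedy_decay))  # about 1945 steps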
  • The code below first chooses the next action according to the exploration rate, then passes that action to the environment to obtain the new state and the reward.
  • Update the Q-value table. (A sketch of the textbook update that applies the learning rate follows the loop.)
  • Obtain the new state and action information for the next step.
    for i_episode in range(num_episodes):
      # reset the environment at the start of each episode
      state = env.reset()
      step = 0
      while True:
          step += 1
          random_for_egreedy = torch.rand(1)[0]
          if random_for_egreedy > egreedy:
              # exploit: small random noise breaks ties between equal Q-values
              random_values = Q[state] + torch.rand(1, number_of_actions) / 1000
              action = torch.max(random_values, 1)[1][0]
              # torch.max(tensor, dim) returns (values, indices) along dim;
              # dim=1 is the row-wise maximum, and [1][0] picks the index of the best action
              action = action.item()
          else:
              # explore: sample a random action from the action space
              action = env.action_space.sample()
          # decay epsilon until it reaches its final value
          if egreedy > egreedy_final:
              egreedy *= egreedy_decay
          new_state, reward, done, info = env.step(action)
          # Filling the Q table: the target overwrites the old value directly
          # (the learning_rate defined above is not applied in this deterministic setting)
          Q[state, action] = reward + gamma * torch.max(Q[new_state])
          # Setting new state for next action
          state = new_state
          # env.render()
          # time.sleep(0.4)
          if done:
              steps_total.append(step)
              rewards_total.append(reward)
              egreedy_total.append(egreedy)
              if i_episode % 10 == 0:
                  print('Episode: {} Reward: {} Steps Taken: {}'.format(i_episode,reward, step))
              break
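  • The assignment Q[state, action] = reward + gamma * max(Q[new_state]) replaces the old value with the full target, which is fine here because the non-slippery environment is deterministic. If the slippery (stochastic) map were enabled, the textbook Q-learning update that blends old and new values with the learning rate would be the safer choice. A hedged sketch (the helper name q_update is illustrative, not from the original code):
    # Standard Q-learning update: Q(s,a) <- Q(s,a) + lr * (r + gamma * max_a' Q(s',a') - Q(s,a))
    def q_update(Q, state, action, reward, new_state, gamma, learning_rate):
        target = reward + gamma * torch.max(Q[new_state])
        Q[state, action] = Q[state, action] + learning_rate * (target - Q[state, action])
    # Inside the loop, the single assignment line would become:
    # q_update(Q, state, action, reward, new_state, gamma, learning_rate)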
  • Plot how the rewards, steps, and egreedy values change over the episodes with matplotlib. (A sketch that prints the learned greedy policy follows the plotting code.)
    print("Percent of episodes finished successfully: {0}".format(sum(rewards_total)/num_episodes))
    print("Percent of episodes finished successfully (last 100 episodes): {0}".format(sum(rewards_total[-100:])/100))
    print("Average number of steps: %.2f" % (sum(steps_total)/num_episodes))
    print("Average number of steps (last 100 episodes): %.2f" % (sum(steps_total[-100:])/100))
    plt.figure(figsize=(6,3))
    plt.title("Rewards")
    plt.bar(torch.arange(len(rewards_total)), rewards_total, alpha=0.6, color='green', width=5)
    plt.show()
    plt.figure(figsize=(6,3))
    plt.title("Steps / Episode length")
    plt.bar(torch.arange(len(steps_total)), steps_total, alpha=0.6, color='red', width=5)
    plt.show()
    plt.figure(figsize=(6,3))
    plt.title("Egreedy value")
    plt.bar(torch.arange(len(egreedy_total)), egreedy_total, alpha=0.6, color='blue', width=5)
    plt.ioff()
    plt.show()
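  • After training, the learned policy can be read straight off the Q table by taking the greedy action in every state. A short sketch, assuming the standard FrozenLake action mapping (0=Left, 1=Down, 2=Right, 3=Up):
    # Print the greedy policy as a 4x4 grid of arrows
    arrows = ['<', 'v', '>', '^']  # assumed mapping: 0=Left, 1=Down, 2=Right, 3=Up
    greedy_actions = torch.max(Q, dim=1)[1]  # index of the best action in each state
    for row in range(4):
        print(' '.join(arrows[greedy_actions[row * 4 + col].item()] for col in range(4)))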