FrozenLake 4x4

[6]:
import gymnasium as gym
[7]:
from gymcts.gymcts_agent import GymctsAgent
from gymcts.gymcts_deepcopy_wrapper import DeepCopyMCTSGymEnvWrapper
[8]:
from gymcts.logger import log
[9]:
import logging

# Verbosity of the gymcts logger.
# logging.INFO (20) reports the selected action and the action list after each step;
# switch to logging.DEBUG (10) to see more detailed information.
log.setLevel(logging.INFO)
[10]:
if __name__ == '__main__':
    # 0. create the environment
    #    is_slippery=True makes the transitions stochastic, so results vary between runs.
    env = gym.make('FrozenLake-v1', desc=None, map_name="4x4", is_slippery=True, render_mode="ansi")
    env.reset()

    # 1. wrap the environment with the naive wrapper or a custom gymcts wrapper
    env = DeepCopyMCTSGymEnvWrapper(env)

    # 2. create the agent
    agent = GymctsAgent(
        env=env,
        clear_mcts_tree_after_step=False,
        render_tree_after_step=True,
        number_of_simulations_per_step=50,
        exclude_unvisited_nodes_from_render=True
    )

    # 3. solve the environment
    actions = agent.solve()

    # 4. render the environment solution in the terminal
    print(env.render())
    # Pre-bind `info` so step 5 still works when `actions` is empty
    # (otherwise the name would only exist after the first loop iteration).
    info = {}
    for a in actions:
        obs, rew, term, trun, info = env.step(a)
        print(env.render())

    # 5. print the solution
    # Read the episode statistics from the `info` dict of the last replayed step.
    # NOTE(review): the "episode" entry looks like the one written by gymnasium's
    # RecordEpisodeStatistics wrapper, presumably applied by the wrapper from step 1 --
    # confirm against the gymcts wrapper implementation.
    # The entry is only expected on the step that ends an episode, so guard the lookup
    # instead of indexing unconditionally.
    episode_stats = info.get("episode")

    if episode_stats is None:
        print("No episode statistics available: the last replayed step did not end the episode.")
    else:
        episode_length = episode_stats["l"]
        episode_return = episode_stats["r"]

        if episode_return == 1.0:
            print(f"Environment solved in {episode_length} steps.")
        else:
            print(f"Environment not solved in {episode_length} steps.")
(N=50, Q_v=0.00, best=0.00)
├── (a=0, N=13, Q_v=0.00, best=0.00, ubc=0.39)
│   ├── (a=0, N=3, Q_v=0.00, best=0.00, ubc=0.65)
│   ├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
│   └── (a=1, N=1, Q_v=0.00, best=0.00, ubc=0.74)
│   ├── (a=1, N=3, Q_v=0.00, best=0.00, ubc=0.65)
│   ├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
│   └── (a=1, N=1, Q_v=0.00, best=0.00, ubc=0.74)
│   ├── (a=2, N=3, Q_v=0.00, best=0.00, ubc=0.65)
│   ├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
│   └── (a=3, N=1, Q_v=0.00, best=0.00, ubc=0.74)
│   └── (a=3, N=3, Q_v=0.00, best=0.00, ubc=0.65)
│       ├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
│       └── (a=1, N=1, Q_v=0.00, best=0.00, ubc=0.74)
├── (a=1, N=12, Q_v=0.00, best=0.00, ubc=0.40)
│   ├── (a=0, N=3, Q_v=0.00, best=0.00, ubc=0.64)
│   ├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
│   └── (a=1, N=1, Q_v=0.00, best=0.00, ubc=0.74)
│   ├── (a=1, N=3, Q_v=0.00, best=0.00, ubc=0.64)
│   ├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
│   └── (a=2, N=1, Q_v=0.00, best=0.00, ubc=0.74)
│   ├── (a=2, N=3, Q_v=0.00, best=0.00, ubc=0.64)
│   └── (a=3, N=2, Q_v=0.00, best=0.00, ubc=0.79)
│       └── (a=1, N=1, Q_v=0.00, best=0.00, ubc=0.59)
├── (a=2, N=12, Q_v=0.00, best=0.00, ubc=0.40)
│   ├── (a=0, N=3, Q_v=0.00, best=0.00, ubc=0.64)
│   ├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
│   └── (a=1, N=1, Q_v=0.00, best=0.00, ubc=0.74)
│   ├── (a=1, N=3, Q_v=0.00, best=0.00, ubc=0.64)
│   ├── (a=2, N=3, Q_v=0.00, best=0.00, ubc=0.64)
│   ├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
│   └── (a=2, N=1, Q_v=0.00, best=0.00, ubc=0.74)
│   └── (a=3, N=2, Q_v=0.00, best=0.00, ubc=0.79)
│       └── (a=2, N=1, Q_v=0.00, best=0.00, ubc=0.59)
└── (a=3, N=12, Q_v=0.00, best=0.00, ubc=0.40)
    ├── (a=0, N=3, Q_v=0.00, best=0.00, ubc=0.64)
    ├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
    └── (a=3, N=1, Q_v=0.00, best=0.00, ubc=0.74)
    ├── (a=1, N=3, Q_v=0.00, best=0.00, ubc=0.64)
    ├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
    └── (a=2, N=1, Q_v=0.00, best=0.00, ubc=0.74)
    ├── (a=2, N=3, Q_v=0.00, best=0.00, ubc=0.64)
    ├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
    └── (a=1, N=1, Q_v=0.00, best=0.00, ubc=0.74)
    └── (a=3, N=2, Q_v=0.00, best=0.00, ubc=0.79)
        └── (a=3, N=1, Q_v=0.00, best=0.00, ubc=0.59)
[16:43:09] INFO     selected action 0 after 50 simulations.
           INFO     current action list: [0]
(a=0, N=63, Q_v=0.00, best=0.00, ubc=0.19)
├── (a=0, N=16, Q_v=0.00, best=0.00, ubc=0.36)
├── (a=0, N=4, Q_v=0.00, best=0.00, ubc=0.59)
├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.83)
├── (a=1, N=1, Q_v=0.00, best=0.00, ubc=0.83)
└── (a=3, N=1, Q_v=0.00, best=0.00, ubc=0.83)
├── (a=1, N=4, Q_v=0.00, best=0.00, ubc=0.59)
├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.83)
├── (a=1, N=1, Q_v=0.00, best=0.00, ubc=0.83)
└── (a=2, N=1, Q_v=0.00, best=0.00, ubc=0.83)
├── (a=2, N=4, Q_v=0.00, best=0.00, ubc=0.59)
├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.83)
├── (a=1, N=1, Q_v=0.00, best=0.00, ubc=0.83)
└── (a=3, N=1, Q_v=0.00, best=0.00, ubc=0.83)
└── (a=3, N=3, Q_v=0.00, best=0.00, ubc=0.68)
    ├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
    └── (a=1, N=1, Q_v=0.00, best=0.00, ubc=0.74)
├── (a=1, N=16, Q_v=0.00, best=0.00, ubc=0.36)
├── (a=0, N=4, Q_v=0.00, best=0.00, ubc=0.59)
├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.83)
├── (a=1, N=1, Q_v=0.00, best=0.00, ubc=0.83)
└── (a=2, N=1, Q_v=0.00, best=0.00, ubc=0.83)
├── (a=1, N=4, Q_v=0.00, best=0.00, ubc=0.59)
├── (a=2, N=4, Q_v=0.00, best=0.00, ubc=0.59)
├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.83)
├── (a=1, N=1, Q_v=0.00, best=0.00, ubc=0.83)
└── (a=2, N=1, Q_v=0.00, best=0.00, ubc=0.83)
└── (a=3, N=3, Q_v=0.00, best=0.00, ubc=0.68)
    ├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
    └── (a=1, N=1, Q_v=0.00, best=0.00, ubc=0.74)
├── (a=2, N=15, Q_v=0.00, best=0.00, ubc=0.37)
├── (a=0, N=4, Q_v=0.00, best=0.00, ubc=0.58)
├── (a=1, N=4, Q_v=0.00, best=0.00, ubc=0.58)
├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.83)
├── (a=1, N=1, Q_v=0.00, best=0.00, ubc=0.83)
└── (a=2, N=1, Q_v=0.00, best=0.00, ubc=0.83)
├── (a=2, N=3, Q_v=0.00, best=0.00, ubc=0.67)
├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
└── (a=2, N=1, Q_v=0.00, best=0.00, ubc=0.74)
└── (a=3, N=3, Q_v=0.00, best=0.00, ubc=0.67)
    ├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
    └── (a=2, N=1, Q_v=0.00, best=0.00, ubc=0.74)
└── (a=3, N=15, Q_v=0.00, best=0.00, ubc=0.37)
    ├── (a=0, N=4, Q_v=0.00, best=0.00, ubc=0.58)
    ├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.83)
    ├── (a=1, N=1, Q_v=0.00, best=0.00, ubc=0.83)
    └── (a=2, N=1, Q_v=0.00, best=0.00, ubc=0.83)
    ├── (a=1, N=4, Q_v=0.00, best=0.00, ubc=0.58)
    ├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.83)
    ├── (a=1, N=1, Q_v=0.00, best=0.00, ubc=0.83)
    └── (a=2, N=1, Q_v=0.00, best=0.00, ubc=0.83)
    ├── (a=2, N=3, Q_v=0.00, best=0.00, ubc=0.67)
    ├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
    └── (a=1, N=1, Q_v=0.00, best=0.00, ubc=0.74)
    └── (a=3, N=3, Q_v=0.00, best=0.00, ubc=0.67)
        ├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
        └── (a=3, N=1, Q_v=0.00, best=0.00, ubc=0.74)
           INFO     selected action 0 after 50 simulations.
           INFO     current action list: [0, 0]
(a=0, N=66, Q_v=0.09, best=1.00, ubc=0.28)
├── (a=0, N=22, Q_v=0.14, best=1.00, ubc=0.44)
├── (a=0, N=4, Q_v=0.00, best=0.00, ubc=0.62)
├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.83)
├── (a=1, N=1, Q_v=0.00, best=0.00, ubc=0.83)
└── (a=2, N=1, Q_v=0.00, best=0.00, ubc=0.83)
├── (a=1, N=10, Q_v=0.30, best=1.00, ubc=0.69)
├── (a=0, N=3, Q_v=0.33, best=1.00, ubc=0.95)
├── (a=1, N=1, Q_v=0.00, best=0.00, ubc=1.07)
├── (a=2, N=4, Q_v=0.50, best=1.00, ubc=1.04)
└── (a=3, N=1, Q_v=0.00, best=0.00, ubc=1.07)
├── (a=2, N=4, Q_v=0.00, best=0.00, ubc=0.62)
└── (a=3, N=3, Q_v=0.00, best=0.00, ubc=0.72)
    ├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
    └── (a=1, N=1, Q_v=0.00, best=0.00, ubc=0.74)
├── (a=1, N=15, Q_v=0.07, best=1.00, ubc=0.44)
├── (a=0, N=5, Q_v=0.20, best=1.00, ubc=0.72)
├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.90)
├── (a=1, N=1, Q_v=0.00, best=0.00, ubc=0.90)
├── (a=2, N=1, Q_v=0.00, best=0.00, ubc=0.90)
└── (a=3, N=1, Q_v=1.00, best=1.00, ubc=1.90)
├── (a=1, N=3, Q_v=0.00, best=0.00, ubc=0.67)
├── (a=2, N=3, Q_v=0.00, best=0.00, ubc=0.67)
├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
└── (a=2, N=1, Q_v=0.00, best=0.00, ubc=0.74)
└── (a=3, N=3, Q_v=0.00, best=0.00, ubc=0.67)
    ├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
    └── (a=1, N=1, Q_v=0.00, best=0.00, ubc=0.74)
├── (a=2, N=15, Q_v=0.07, best=1.00, ubc=0.44)
├── (a=0, N=3, Q_v=0.00, best=0.00, ubc=0.67)
├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
└── (a=1, N=1, Q_v=0.00, best=0.00, ubc=0.74)
├── (a=1, N=5, Q_v=0.20, best=1.00, ubc=0.72)
├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.90)
├── (a=1, N=1, Q_v=1.00, best=1.00, ubc=1.90)
├── (a=2, N=1, Q_v=0.00, best=0.00, ubc=0.90)
└── (a=3, N=1, Q_v=0.00, best=0.00, ubc=0.90)
├── (a=2, N=3, Q_v=0.00, best=0.00, ubc=0.67)
├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
└── (a=3, N=1, Q_v=0.00, best=0.00, ubc=0.74)
└── (a=3, N=3, Q_v=0.00, best=0.00, ubc=0.67)
    ├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
    └── (a=3, N=1, Q_v=0.00, best=0.00, ubc=0.74)
└── (a=3, N=13, Q_v=0.08, best=1.00, ubc=0.48)
    ├── (a=0, N=3, Q_v=0.00, best=0.00, ubc=0.65)
    ├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
    └── (a=3, N=1, Q_v=0.00, best=0.00, ubc=0.74)
    ├── (a=1, N=5, Q_v=0.20, best=1.00, ubc=0.71)
    ├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.90)
    ├── (a=1, N=1, Q_v=1.00, best=1.00, ubc=1.90)
    ├── (a=2, N=1, Q_v=0.00, best=0.00, ubc=0.90)
    └── (a=3, N=1, Q_v=0.00, best=0.00, ubc=0.90)
    ├── (a=2, N=2, Q_v=0.00, best=0.00, ubc=0.80)
    └── (a=2, N=1, Q_v=0.00, best=0.00, ubc=0.59)
    └── (a=3, N=2, Q_v=0.00, best=0.00, ubc=0.80)
        └── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.59)
           INFO     selected action 0 after 50 simulations.
           INFO     current action list: [0, 0, 0]
(a=0, N=72, Q_v=0.08, best=1.00, ubc=0.26)
├── (a=0, N=12, Q_v=0.00, best=0.00, ubc=0.42)
├── (a=0, N=3, Q_v=0.00, best=0.00, ubc=0.64)
├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
└── (a=1, N=1, Q_v=0.00, best=0.00, ubc=0.74)
├── (a=1, N=3, Q_v=0.00, best=0.00, ubc=0.64)
├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
└── (a=2, N=1, Q_v=0.00, best=0.00, ubc=0.74)
├── (a=2, N=3, Q_v=0.00, best=0.00, ubc=0.64)
└── (a=3, N=2, Q_v=0.00, best=0.00, ubc=0.79)
    └── (a=2, N=1, Q_v=0.00, best=0.00, ubc=0.59)
├── (a=1, N=35, Q_v=0.17, best=1.00, ubc=0.42)
├── (a=0, N=10, Q_v=0.20, best=1.00, ubc=0.62)
├── (a=0, N=2, Q_v=0.00, best=0.00, ubc=0.76)
├── (a=1, N=2, Q_v=0.00, best=0.00, ubc=0.76)
├── (a=2, N=2, Q_v=0.00, best=0.00, ubc=0.76)
└── (a=3, N=3, Q_v=0.33, best=1.00, ubc=0.95)
├── (a=1, N=5, Q_v=0.00, best=0.00, ubc=0.60)
├── (a=2, N=14, Q_v=0.29, best=1.00, ubc=0.64)
├── (a=0, N=2, Q_v=0.00, best=0.00, ubc=0.81)
├── (a=1, N=4, Q_v=0.25, best=1.00, ubc=0.82)
├── (a=2, N=2, Q_v=0.00, best=0.00, ubc=0.81)
└── (a=3, N=5, Q_v=0.40, best=1.00, ubc=0.91)
└── (a=3, N=5, Q_v=0.00, best=0.00, ubc=0.60)
    ├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.90)
    ├── (a=1, N=1, Q_v=0.00, best=0.00, ubc=0.90)
    ├── (a=2, N=1, Q_v=0.00, best=0.00, ubc=0.90)
    └── (a=3, N=1, Q_v=0.00, best=0.00, ubc=0.90)
├── (a=2, N=12, Q_v=0.00, best=0.00, ubc=0.42)
└── (a=3, N=12, Q_v=0.00, best=0.00, ubc=0.42)
    ├── (a=0, N=3, Q_v=0.00, best=0.00, ubc=0.64)
    ├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
    └── (a=2, N=1, Q_v=0.00, best=0.00, ubc=0.74)
    ├── (a=1, N=3, Q_v=0.00, best=0.00, ubc=0.64)
    ├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
    └── (a=1, N=1, Q_v=0.00, best=0.00, ubc=0.74)
    ├── (a=2, N=3, Q_v=0.00, best=0.00, ubc=0.64)
    ├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
    └── (a=3, N=1, Q_v=0.00, best=0.00, ubc=0.74)
    └── (a=3, N=2, Q_v=0.00, best=0.00, ubc=0.79)
        └── (a=1, N=1, Q_v=0.00, best=0.00, ubc=0.59)
           INFO     selected action 1 after 50 simulations.
           INFO     current action list: [0, 0, 0, 1]
(a=1, N=85, Q_v=0.27, best=1.00, ubc=0.44)
├── (a=0, N=17, Q_v=0.12, best=1.00, ubc=0.48)
├── (a=0, N=4, Q_v=0.00, best=0.00, ubc=0.60)
├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.83)
├── (a=1, N=1, Q_v=0.00, best=0.00, ubc=0.83)
└── (a=2, N=1, Q_v=0.00, best=0.00, ubc=0.83)
├── (a=1, N=3, Q_v=0.00, best=0.00, ubc=0.69)
├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
└── (a=3, N=1, Q_v=0.00, best=0.00, ubc=0.74)
├── (a=2, N=3, Q_v=0.00, best=0.00, ubc=0.69)
└── (a=3, N=6, Q_v=0.17, best=1.00, ubc=0.65)
    ├── (a=0, N=2, Q_v=0.00, best=0.00, ubc=0.67)
    ├── (a=1, N=1, Q_v=0.00, best=0.00, ubc=0.95)
    ├── (a=2, N=1, Q_v=0.00, best=0.00, ubc=0.95)
    └── (a=3, N=1, Q_v=0.00, best=0.00, ubc=0.95)
├── (a=1, N=10, Q_v=0.00, best=0.00, ubc=0.47)
├── (a=2, N=47, Q_v=0.45, best=1.00, ubc=0.66)
├── (a=0, N=4, Q_v=0.00, best=0.00, ubc=0.69)
├── (a=1, N=9, Q_v=0.22, best=1.00, ubc=0.68)
├── (a=0, N=2, Q_v=0.00, best=0.00, ubc=0.74)
├── (a=1, N=3, Q_v=0.33, best=1.00, ubc=0.94)
├── (a=2, N=2, Q_v=0.00, best=0.00, ubc=0.74)
└── (a=3, N=1, Q_v=0.00, best=0.00, ubc=1.05)
├── (a=2, N=7, Q_v=0.14, best=1.00, ubc=0.67)
├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.99)
├── (a=1, N=3, Q_v=0.33, best=1.00, ubc=0.90)
├── (a=2, N=1, Q_v=0.00, best=0.00, ubc=0.99)
└── (a=3, N=1, Q_v=0.00, best=0.00, ubc=0.99)
└── (a=3, N=26, Q_v=0.65, best=1.00, ubc=0.93)
    ├── (a=0, N=19, Q_v=0.84, best=1.00, ubc=1.13)
    ├── (a=1, N=2, Q_v=0.00, best=0.00, ubc=0.90)
    ├── (a=2, N=2, Q_v=0.00, best=0.00, ubc=0.90)
    └── (a=3, N=2, Q_v=0.00, best=0.00, ubc=0.90)
└── (a=3, N=10, Q_v=0.00, best=0.00, ubc=0.47)
    ├── (a=0, N=3, Q_v=0.00, best=0.00, ubc=0.62)
    ├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
    └── (a=3, N=1, Q_v=0.00, best=0.00, ubc=0.74)
    ├── (a=1, N=2, Q_v=0.00, best=0.00, ubc=0.76)
    └── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.59)
    ├── (a=2, N=2, Q_v=0.00, best=0.00, ubc=0.76)
    └── (a=3, N=1, Q_v=0.00, best=0.00, ubc=0.59)
    └── (a=3, N=2, Q_v=0.00, best=0.00, ubc=0.76)
           INFO     selected action 2 after 50 simulations.
           INFO     current action list: [0, 0, 0, 1, 2]
(a=2, N=97, Q_v=0.69, best=1.00, ubc=0.85)
├── (a=0, N=4, Q_v=0.00, best=0.00, ubc=0.76)
├── (a=1, N=9, Q_v=0.22, best=1.00, ubc=0.73)
├── (a=0, N=2, Q_v=0.00, best=0.00, ubc=0.74)
├── (a=1, N=3, Q_v=0.33, best=1.00, ubc=0.94)
├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
└── (a=2, N=1, Q_v=1.00, best=1.00, ubc=1.74)
├── (a=2, N=2, Q_v=0.00, best=0.00, ubc=0.74)
└── (a=1, N=1, Q_v=0.00, best=0.00, ubc=0.59)
└── (a=3, N=1, Q_v=0.00, best=0.00, ubc=1.05)
├── (a=2, N=7, Q_v=0.14, best=1.00, ubc=0.71)
├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.99)
├── (a=1, N=3, Q_v=0.33, best=1.00, ubc=0.90)
├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
└── (a=1, N=1, Q_v=0.00, best=0.00, ubc=0.74)
├── (a=2, N=1, Q_v=0.00, best=0.00, ubc=0.99)
└── (a=3, N=1, Q_v=0.00, best=0.00, ubc=0.99)
└── (a=3, N=76, Q_v=0.83, best=1.00, ubc=1.00)
    ├── (a=0, N=69, Q_v=0.90, best=1.00, ubc=1.08)
    ├── (a=0, N=2, Q_v=0.00, best=0.00, ubc=1.03)
    ├── (a=1, N=2, Q_v=0.00, best=0.00, ubc=1.03)
    ├── (a=2, N=6, Q_v=0.50, best=1.00, ubc=1.09)
    └── (a=3, N=58, Q_v=1.00, best=1.00, ubc=1.19)
    ├── (a=1, N=2, Q_v=0.00, best=0.00, ubc=1.04)
    ├── (a=2, N=2, Q_v=0.00, best=0.00, ubc=1.04)
    └── (a=3, N=1, Q_v=0.00, best=0.00, ubc=0.59)
    └── (a=3, N=2, Q_v=0.00, best=0.00, ubc=1.04)
        └── (a=2, N=1, Q_v=0.00, best=0.00, ubc=0.59)
           INFO     selected action 3 after 50 simulations.
           INFO     current action list: [0, 0, 0, 1, 2, 3]
(a=3, N=126, Q_v=0.87, best=1.00, ubc=1.01)
├── (a=0, N=116, Q_v=0.94, best=1.00, ubc=1.08)
├── (a=0, N=2, Q_v=0.00, best=0.00, ubc=1.09)
└── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.59)
├── (a=1, N=2, Q_v=0.00, best=0.00, ubc=1.09)
└── (a=3, N=1, Q_v=0.00, best=0.00, ubc=0.59)
├── (a=2, N=6, Q_v=0.50, best=1.00, ubc=1.13)
├── (a=0, N=2, Q_v=0.50, best=1.00, ubc=1.17)
├── (a=1, N=1, Q_v=1.00, best=1.00, ubc=1.95)
├── (a=2, N=1, Q_v=0.00, best=0.00, ubc=0.95)
└── (a=3, N=1, Q_v=0.00, best=0.00, ubc=0.95)
└── (a=3, N=105, Q_v=1.00, best=1.00, ubc=1.15)
├── (a=1, N=3, Q_v=0.00, best=0.00, ubc=0.90)
├── (a=2, N=3, Q_v=0.00, best=0.00, ubc=0.90)
├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
└── (a=3, N=1, Q_v=0.00, best=0.00, ubc=0.74)
└── (a=3, N=3, Q_v=0.00, best=0.00, ubc=0.90)
    ├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
    └── (a=2, N=1, Q_v=0.00, best=0.00, ubc=0.74)
[16:43:10] INFO     selected action 0 after 50 simulations.
           INFO     current action list: [0, 0, 0, 1, 2, 3, 0]
(a=0, N=166, Q_v=0.95, best=1.00, ubc=1.07)
├── (a=0, N=2, Q_v=0.00, best=0.00, ubc=1.13)
└── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.59)
├── (a=1, N=2, Q_v=0.00, best=0.00, ubc=1.13)
└── (a=3, N=1, Q_v=0.00, best=0.00, ubc=0.59)
├── (a=2, N=28, Q_v=0.82, best=1.00, ubc=1.12)
├── (a=0, N=3, Q_v=0.33, best=1.00, ubc=1.08)
├── (a=0, N=1, Q_v=0.00, best=0.00, ubc=0.74)
└── (a=1, N=1, Q_v=0.00, best=0.00, ubc=0.74)
├── (a=1, N=21, Q_v=1.00, best=1.00, ubc=1.28)
├── (a=2, N=2, Q_v=0.00, best=0.00, ubc=0.91)
└── (a=3, N=1, Q_v=0.00, best=0.00, ubc=0.59)
└── (a=3, N=1, Q_v=0.00, best=0.00, ubc=1.29)
└── (a=3, N=133, Q_v=1.00, best=1.00, ubc=1.14)
           INFO     selected action 3 after 50 simulations.
           INFO     current action list: [0, 0, 0, 1, 2, 3, 0, 3]
           INFO     Final action list: [0, 0, 0, 1, 2, 3, 0, 3]

SFFF
FHFH
FFFH
HFFG

  (Left)
SFFF
FHFH
FFFH
HFFG

  (Left)
SFFF
FHFH
FFFH
HFFG

  (Left)
SFFF
FHFH
FFFH
HFFG

  (Down)
SFFF
FHFH
FFFH
HFFG

  (Right)
SFFF
FHFH
FFFH
HFFG

  (Up)
SFFF
FHFH
FFFH
HFFG

  (Left)
SFFF
FHFH
FFFH
HFFG

  (Up)
SFFF
FHFH
FFFH
HFFG

Environment solved in 8 steps.