JSSEnv (Deepcopy Wrapper)

[1]:
from jsp_vis.console import gantt_chart_console
[2]:
from gymcts.gymcts_agent import GymctsAgent
from gymcts.gymcts_deepcopy_wrapper import DeepCopyMCTSGymEnvWrapper
from gymnasium.wrappers import TransformReward, NormalizeReward
from gymcts.logger import log
[3]:
import gymnasium as gym
from typing import Any
[4]:
import bisect
import datetime
import random
[5]:
import pandas as pd
import numpy as np
import plotly.figure_factory as ff
from pathlib import Path
[6]:
class JssEnv(gym.Env):
    """
    Job shop scheduling problem modelled as a single-agent Gymnasium environment.

    Actions ``0 .. jobs - 1`` start the next operation of the given job on the
    machine that operation needs; action ``jobs`` is the "no-op" (wait) action.
    The environment keeps a sorted list of future time steps (the end times of
    the operations that were started) and jumps to the next one whenever no
    machine can be allocated at the current time.

    Observations are dictionaries with two entries:

    - ``"action_mask"``: boolean vector of length ``jobs + 1`` (last entry is
      the no-op action),
    - ``"real_obs"``: float matrix of shape ``(jobs, 7)``; see the comment above
      ``observation_space`` in ``__init__`` for the meaning of each column.
    """

    def __init__(self, env_config=None):
        """
        This environment models the job shop scheduling problem as a single agent problem:

        -The actions correspond to a job allocation + one action for no allocation at this time step (NOPE action)

        -We keep a time with next possible time steps

        -Each time we allocate a job, the end of the job is added to the stack of time steps

        -If we don't have a legal action (i.e. we can't allocate a job),
        we automatically go to the next time step until we have a legal action

        The instance file is parsed as whitespace separated integers: the first
        line holds ``<jobs> <machines>``; every following line describes one job
        as ``machines`` pairs ``<machine> <processing time>`` in operation order.

        :param env_config: Ray dictionary of config parameter; only the key
            ``"instance_path"`` is read. Defaults to the ``instances/ta80`` file
            next to this module.
        """
        if env_config is None:
            env_config = {
                "instance_path": Path(__file__).parent.absolute() / "instances" / "ta80"
            }
        instance_path = env_config["instance_path"]

        # initial values for variables used for instance
        self.jobs = 0
        self.machines = 0
        self.instance_matrix = None
        self.jobs_length = None
        self.max_time_op = 0
        self.max_time_jobs = 0
        self.nb_legal_actions = 0
        self.nb_machine_legal = 0
        # initial values for variables used for solving (to reinitialize when reset() is called)
        self.solution = None
        self.last_solution = None
        self.last_time_step = float("inf")
        self.current_time_step = float("inf")
        self.next_time_step = list()
        self.next_jobs = list()
        self.legal_actions = None
        self.time_until_available_machine = None
        self.time_until_finish_current_op_jobs = None
        self.todo_time_step_job = None
        self.total_perform_op_time_jobs = None
        self.needed_machine_jobs = None
        self.total_idle_time_jobs = None
        self.idle_time_jobs_last_op = None
        self.state = None
        self.illegal_actions = None
        self.action_illegal_no_op = None
        self.machine_legal = None
        # initial values for variables used for representation
        self.start_timestamp = datetime.datetime.now().timestamp()
        self.sum_op = 0
        with open(instance_path, "r") as instance_file:
            for line_cnt, line_str in enumerate(instance_file, start=1):
                split_data = list(map(int, line_str.split()))

                if line_cnt == 1:
                    # header line: number of jobs and number of machines
                    self.jobs, self.machines = split_data
                    # every cell of the matrix is a (machine, processing time) pair
                    self.instance_matrix = np.zeros((self.jobs, self.machines), dtype=(int, 2))
                    self.jobs_length = np.zeros(self.jobs, dtype=int)
                else:
                    # one job per line: exactly `machines` (machine, time) pairs
                    assert len(split_data) % 2 == 0 and len(split_data) // 2 == self.machines
                    job_nb = line_cnt - 2
                    for i in range(0, len(split_data), 2):
                        machine, time = split_data[i], split_data[i + 1]
                        self.instance_matrix[job_nb][i // 2] = (machine, time)
                        self.max_time_op = max(self.max_time_op, time)
                        self.jobs_length[job_nb] += time
                        self.sum_op += time
        self.max_time_jobs = max(self.jobs_length)
        # check the parsed data are correct
        assert self.max_time_op > 0
        assert self.max_time_jobs > 0
        assert self.jobs > 0
        assert self.machines > 1, "We need at least 2 machines"
        assert self.instance_matrix is not None
        # allocate a job + one to wait
        self.action_space = gym.spaces.Discrete(self.jobs + 1)
        # used for plotting
        self.colors = [
            tuple([random.random() for _ in range(3)]) for _ in range(self.machines)
        ]
        # "real_obs" is a matrix with the following columns for each job
        # (as written by step() / increase_time_step()):
        #   0 - legal job flag
        #   1 - left over time on the current op, divided by max_time_op
        #   2 - current operation index, divided by the number of machines
        #   3 - total processing time performed so far, divided by max_time_jobs
        #   4 - time until the next needed machine is available, divided by
        #       max_time_op (set to 1.0 once the job is over)
        #   5 - idle time since the last op, divided by sum_op
        #   6 - total idle time in the schedule, divided by sum_op
        self.observation_space = gym.spaces.Dict(
            {
                "action_mask": gym.spaces.Box(0, 1, shape=(self.jobs + 1,)),
                "real_obs": gym.spaces.Box(
                    low=0.0, high=1.0, shape=(self.jobs, 7), dtype=float
                ),
            }
        )

    def _get_current_state_representation(self):
        """
        Build the observation dictionary for the current state.

        Column 0 of ``self.state`` is refreshed with the legality flags of the
        real jobs (the no-op entry is excluded). The returned dictionary holds
        the internal arrays themselves, not copies.

        :return: dict with keys ``"real_obs"`` and ``"action_mask"``
        """
        self.state[:, 0] = self.legal_actions[:-1]
        return {
            "real_obs": self.state,
            "action_mask": self.legal_actions,
        }

    def get_legal_actions(self):
        """
        :return: the boolean action mask of length ``jobs + 1`` (internal array, not a copy)
        """
        return self.legal_actions

    def reset(self, seed=None, options=None):
        """
        Reset the schedule to time step 0 with nothing allocated.

        :param seed: forwarded to ``gym.Env.reset`` so the Gymnasium seeding
            contract is honoured
        :param options: unused
        :return: tuple ``(observation, info)`` where ``info`` is an empty dict
        """
        # let Gymnasium (re)seed its RNG when a seed is given
        super().reset(seed=seed)
        self.current_time_step = 0
        self.next_time_step = list()
        self.next_jobs = list()
        self.nb_legal_actions = self.jobs
        self.nb_machine_legal = 0
        # represent all the legal actions: every job is legal, the no-op is not
        self.legal_actions = np.ones(self.jobs + 1, dtype=bool)
        self.legal_actions[self.jobs] = False
        # used to represent the solution: start time of every operation, -1 = not scheduled yet
        self.solution = np.full((self.jobs, self.machines), -1, dtype=int)
        self.time_until_available_machine = np.zeros(self.machines, dtype=int)
        self.time_until_finish_current_op_jobs = np.zeros(self.jobs, dtype=int)
        self.todo_time_step_job = np.zeros(self.jobs, dtype=int)
        self.total_perform_op_time_jobs = np.zeros(self.jobs, dtype=int)
        self.needed_machine_jobs = np.zeros(self.jobs, dtype=int)
        self.total_idle_time_jobs = np.zeros(self.jobs, dtype=int)
        self.idle_time_jobs_last_op = np.zeros(self.jobs, dtype=int)
        self.illegal_actions = np.zeros((self.machines, self.jobs), dtype=bool)
        self.action_illegal_no_op = np.zeros(self.jobs, dtype=bool)
        self.machine_legal = np.zeros(self.machines, dtype=bool)
        for job in range(self.jobs):
            # machine needed by the first operation of the job
            needed_machine = self.instance_matrix[job][0][0]
            self.needed_machine_jobs[job] = needed_machine
            if not self.machine_legal[needed_machine]:
                self.machine_legal[needed_machine] = True
                self.nb_machine_legal += 1
        self.state = np.zeros((self.jobs, 7), dtype=float)
        return self._get_current_state_representation(), {}

    def _prioritization_non_final(self):
        """
        Mask out final operations that are longer than a competing non-final one.

        For every legal machine the legal jobs needing it are split into jobs on
        their last operation and jobs on an earlier operation whose *next*
        machine is currently free. If at least one such non-final job exists,
        every final job whose processing time exceeds the shortest of these
        non-final processing times is made illegal.
        """
        if self.nb_machine_legal < 1:
            return
        for machine in range(self.machines):
            if not self.machine_legal[machine]:
                continue
            final_job = list()
            non_final_job = list()
            min_non_final = float("inf")
            for job in range(self.jobs):
                if (
                    self.needed_machine_jobs[job] == machine
                    and self.legal_actions[job]
                ):
                    if self.todo_time_step_job[job] == (self.machines - 1):
                        final_job.append(job)
                    else:
                        current_time_step_non_final = self.todo_time_step_job[job]
                        time_needed_legal = self.instance_matrix[job][
                            current_time_step_non_final
                        ][1]
                        machine_needed_nextstep = self.instance_matrix[job][
                            current_time_step_non_final + 1
                        ][0]
                        # only count the job if its following machine is free right now
                        if self.time_until_available_machine[machine_needed_nextstep] == 0:
                            min_non_final = min(min_non_final, time_needed_legal)
                            non_final_job.append(job)
            if len(non_final_job) > 0:
                for job in final_job:
                    current_time_step_final = self.todo_time_step_job[job]
                    time_needed_legal = self.instance_matrix[job][
                        current_time_step_final
                    ][1]
                    if time_needed_legal > min_non_final:
                        self.legal_actions[job] = False
                        self.nb_legal_actions -= 1

    def _check_no_op(self):
        """
        Decide whether the no-op action (index ``jobs``) is legal and update the mask.

        The no-op starts out illegal and is only considered when there is at
        least one pending time step, at most 3 legal machines and at most 4
        legal actions. It stays illegal if some legal job would finish before
        the next pending time step. Otherwise the future operations of the
        currently non-legal jobs are scanned; the no-op becomes legal as soon as
        every legal machine is needed by such an operation before that
        machine's horizon.
        """
        self.legal_actions[self.jobs] = False
        if (
            len(self.next_time_step) > 0
            and self.nb_machine_legal <= 3
            and self.nb_legal_actions <= 4
        ):
            machine_next = set()
            next_time_step = self.next_time_step[0]
            max_horizon = self.current_time_step
            # per machine: earliest end time among the legal jobs that need it,
            # capped at current time + longest operation
            max_horizon_machine = [
                self.current_time_step + self.max_time_op for _ in range(self.machines)
            ]
            for job in range(self.jobs):
                if self.legal_actions[job]:
                    time_step = self.todo_time_step_job[job]
                    machine_needed = self.instance_matrix[job][time_step][0]
                    time_needed = self.instance_matrix[job][time_step][1]
                    end_job = self.current_time_step + time_needed
                    if end_job < next_time_step:
                        # a legal job fits before the next event: waiting is not allowed
                        return
                    max_horizon_machine[machine_needed] = min(
                        max_horizon_machine[machine_needed], end_job
                    )
                    max_horizon = max(max_horizon, max_horizon_machine[machine_needed])
            for job in range(self.jobs):
                if not self.legal_actions[job]:
                    if (
                        self.time_until_finish_current_op_jobs[job] > 0
                        and self.todo_time_step_job[job] + 1 < self.machines
                    ):
                        # job is currently running: look at the operations after the current one
                        time_step = self.todo_time_step_job[job] + 1
                        time_needed = (
                            self.current_time_step
                            + self.time_until_finish_current_op_jobs[job]
                        )
                        while (
                            time_step < self.machines - 1 and max_horizon > time_needed
                        ):
                            machine_needed = self.instance_matrix[job][time_step][0]
                            if (
                                max_horizon_machine[machine_needed] > time_needed
                                and self.machine_legal[machine_needed]
                            ):
                                machine_next.add(machine_needed)
                                if len(machine_next) == self.nb_machine_legal:
                                    self.legal_actions[self.jobs] = True
                                    return
                            time_needed += self.instance_matrix[job][time_step][1]
                            time_step += 1
                    elif (
                        not self.action_illegal_no_op[job]
                        and self.todo_time_step_job[job] < self.machines
                    ):
                        # job is waiting for a busy machine: start from its pending operation
                        time_step = self.todo_time_step_job[job]
                        machine_needed = self.instance_matrix[job][time_step][0]
                        time_needed = (
                            self.current_time_step
                            + self.time_until_available_machine[machine_needed]
                        )
                        while (
                            time_step < self.machines - 1 and max_horizon > time_needed
                        ):
                            machine_needed = self.instance_matrix[job][time_step][0]
                            if (
                                max_horizon_machine[machine_needed] > time_needed
                                and self.machine_legal[machine_needed]
                            ):
                                machine_next.add(machine_needed)
                                if len(machine_next) == self.nb_machine_legal:
                                    self.legal_actions[self.jobs] = True
                                    return
                            time_needed += self.instance_matrix[job][time_step][1]
                            time_step += 1

    def step(self, action: int):
        """
        Apply one action.

        For the no-op action every currently legal job is made illegal (and
        remembered in ``illegal_actions``) and time is advanced until some
        machine becomes legal again. For a job action the job's pending
        operation is started on its machine at the current time step; if no
        machine is left to allocate, time is advanced to the next event.

        The raw reward is the processing time of the started operation (0 for
        the no-op) minus the machine idle time reported by
        ``increase_time_step``; it is divided by ``max_time_op`` before being
        returned.

        :param action: job index in ``0 .. jobs - 1`` or ``jobs`` for the no-op
        :return: tuple ``(observation, scaled_reward, terminated, truncated, info)``;
            ``truncated`` is always ``False`` and ``info`` is an empty dict
        """
        reward = 0.0
        if action == self.jobs:
            self.nb_machine_legal = 0
            self.nb_legal_actions = 0
            for job in range(self.jobs):
                if self.legal_actions[job]:
                    self.legal_actions[job] = False
                    needed_machine = self.needed_machine_jobs[job]
                    self.machine_legal[needed_machine] = False
                    # remember the skipped (machine, job) pairs: increase_time_step()
                    # will not re-legalise them until a job is started on that machine
                    self.illegal_actions[needed_machine][job] = True
                    self.action_illegal_no_op[job] = True
            while self.nb_machine_legal == 0:
                reward -= self.increase_time_step()
            scaled_reward = self._reward_scaler(reward)
            self._prioritization_non_final()
            self._check_no_op()
            return (
                self._get_current_state_representation(),
                scaled_reward,
                self._is_done(),
                False,
                {},
            )
        else:
            current_time_step_job = self.todo_time_step_job[action]
            machine_needed = self.needed_machine_jobs[action]
            time_needed = self.instance_matrix[action][current_time_step_job][1]
            reward += time_needed
            self.time_until_available_machine[machine_needed] = time_needed
            self.time_until_finish_current_op_jobs[action] = time_needed
            self.state[action][1] = time_needed / self.max_time_op
            to_add_time_step = self.current_time_step + time_needed
            if to_add_time_step not in self.next_time_step:
                # keep the list of pending time steps sorted and free of duplicates
                index = bisect.bisect_left(self.next_time_step, to_add_time_step)
                self.next_time_step.insert(index, to_add_time_step)
                self.next_jobs.insert(index, action)
            self.solution[action][current_time_step_job] = self.current_time_step
            # the machine is now busy: no other job can be allocated on it
            for job in range(self.jobs):
                if (
                    self.needed_machine_jobs[job] == machine_needed
                    and self.legal_actions[job]
                ):
                    self.legal_actions[job] = False
                    self.nb_legal_actions -= 1
            self.nb_machine_legal -= 1
            self.machine_legal[machine_needed] = False
            # a job was started on this machine: forget the no-op restrictions tied to it
            for job in range(self.jobs):
                if self.illegal_actions[machine_needed][job]:
                    self.action_illegal_no_op[job] = False
                    self.illegal_actions[machine_needed][job] = False
            # if we can't allocate new job in the current timestep, we pass to the next one
            while self.nb_machine_legal == 0 and len(self.next_time_step) > 0:
                reward -= self.increase_time_step()
            self._prioritization_non_final()
            self._check_no_op()
            # we then need to scale the reward
            scaled_reward = self._reward_scaler(reward)
            return (
                self._get_current_state_representation(),
                scaled_reward,
                self._is_done(),
                False,
                {},
            )

    def _reward_scaler(self, reward):
        """
        :param reward: raw reward expressed in time units
        :return: the reward divided by the longest operation time of the instance
        """
        return reward / self.max_time_op

    def increase_time_step(self):
        """
        The heart of the logic is here, we need to increase every counter when we have a nope action called
        and return the time elapsed.

        The earliest pending time step is popped and becomes the current time
        step. The per-job counters and the matching ``self.state`` columns are
        updated, machine availability is decreased, and jobs whose needed
        machine became free are made legal again unless they are blocked by a
        previous no-op (``illegal_actions``).

        :return: the idle time accumulated over all machines during the jump,
            i.e. for every machine the part of the elapsed interval in which it
            was not busy
        """
        hole_planning = 0
        next_time_step_to_pick = self.next_time_step.pop(0)
        self.next_jobs.pop(0)
        difference = next_time_step_to_pick - self.current_time_step
        self.current_time_step = next_time_step_to_pick
        for job in range(self.jobs):
            was_left_time = self.time_until_finish_current_op_jobs[job]
            if was_left_time > 0:
                # the job was running during (part of) the elapsed interval
                performed_op_job = min(difference, was_left_time)
                self.time_until_finish_current_op_jobs[job] = max(
                    0, self.time_until_finish_current_op_jobs[job] - difference
                )
                self.state[job][1] = (
                    self.time_until_finish_current_op_jobs[job] / self.max_time_op
                )
                self.total_perform_op_time_jobs[job] += performed_op_job
                self.state[job][3] = (
                    self.total_perform_op_time_jobs[job] / self.max_time_jobs
                )
                if self.time_until_finish_current_op_jobs[job] == 0:
                    # the operation finished: the remainder of the interval is idle time
                    self.total_idle_time_jobs[job] += difference - was_left_time
                    self.state[job][6] = self.total_idle_time_jobs[job] / self.sum_op
                    self.idle_time_jobs_last_op[job] = difference - was_left_time
                    self.state[job][5] = self.idle_time_jobs_last_op[job] / self.sum_op
                    self.todo_time_step_job[job] += 1
                    self.state[job][2] = self.todo_time_step_job[job] / self.machines
                    if self.todo_time_step_job[job] < self.machines:
                        self.needed_machine_jobs[job] = self.instance_matrix[job][
                            self.todo_time_step_job[job]
                        ][0]
                        self.state[job][4] = (
                            max(
                                0,
                                self.time_until_available_machine[
                                    self.needed_machine_jobs[job]
                                ]
                                - difference,
                            )
                            / self.max_time_op
                        )
                    else:
                        self.needed_machine_jobs[job] = -1
                        # this allow to have 1 is job is over (not 0 because, 0 strongly indicate that the job is a
                        # good candidate)
                        self.state[job][4] = 1.0
                        if self.legal_actions[job]:
                            self.legal_actions[job] = False
                            self.nb_legal_actions -= 1
            elif self.todo_time_step_job[job] < self.machines:
                # unfinished job that was not running: the whole interval is idle time
                self.total_idle_time_jobs[job] += difference
                self.idle_time_jobs_last_op[job] += difference
                self.state[job][5] = self.idle_time_jobs_last_op[job] / self.sum_op
                self.state[job][6] = self.total_idle_time_jobs[job] / self.sum_op
        for machine in range(self.machines):
            if self.time_until_available_machine[machine] < difference:
                # the machine was free for part of the interval: count it as a hole
                empty = difference - self.time_until_available_machine[machine]
                hole_planning += empty
            self.time_until_available_machine[machine] = max(
                0, self.time_until_available_machine[machine] - difference
            )
            if self.time_until_available_machine[machine] == 0:
                for job in range(self.jobs):
                    if (
                        self.needed_machine_jobs[job] == machine
                        and not self.legal_actions[job]
                        and not self.illegal_actions[machine][job]
                    ):
                        self.legal_actions[job] = True
                        self.nb_legal_actions += 1
                        if not self.machine_legal[machine]:
                            self.machine_legal[machine] = True
                            self.nb_machine_legal += 1
        return hole_planning

    def _is_done(self):
        """
        Check whether the episode is over, i.e. no job action is legal anymore.

        When it is over, the current time step and the solution matrix are
        stored in ``last_time_step`` / ``last_solution``.

        :return: ``True`` if no legal job action is left, ``False`` otherwise
        """
        if self.nb_legal_actions == 0:
            self.last_time_step = self.current_time_step
            self.last_solution = self.solution
            return True
        return False

    def render(self, mode="human"):
        """
        Draw the operations scheduled so far as a Gantt chart.

        The chart is printed to the console via ``gantt_chart_console`` and
        also built as a plotly figure. Nothing is drawn when no operation has
        been scheduled yet.

        :param mode: unused
        :return: the plotly Gantt figure, or ``None`` if nothing is scheduled yet
        """
        rows = []
        for job in range(self.jobs):
            i = 0
            # operations are scheduled in order, so stop at the first unscheduled one
            while i < self.machines and self.solution[job][i] != -1:
                start_sec = self.solution[job][i]
                finish_sec = start_sec + self.instance_matrix[job][i][1]
                rows.append(
                    {
                        "Task": f"Job {job}",
                        "Start": start_sec,
                        "Finish": finish_sec,
                        "Resource": f"Machine {self.instance_matrix[job][i][0]}",
                    }
                )
                i += 1
        fig = None
        if len(rows) > 0:
            df = pd.DataFrame(rows)
            fig = ff.create_gantt(
                df,
                index_col="Resource",
                colors=self.colors,
                show_colorbar=True,
                group_tasks=True,
            )
            fig.update_yaxes(
                autorange="reversed"
            )  # otherwise tasks are listed from the bottom up
            # only print the console chart when there is a DataFrame to draw;
            # with an empty schedule `rows` is just an empty list
            gantt_chart_console(df, n_machines=self.machines)
        return fig
[7]:
class JSSEnvRewardWrapper(gym.Wrapper):
    """
    Replace the wrapped environment's reward with a sparse, makespan-based one.

    Every step yields a reward of ``0.0`` until the underlying environment
    reports that it is done; the final step yields
    ``-last_time_step / lower_bound + 2``, which equals ``1.0`` when the
    final time step equals ``lower_bound`` and decreases as the schedule
    gets longer.
    """

    def __init__(self, env: gym.Env, lower_bound):
        """
        :param env: environment to wrap; its unwrapped base must provide
            ``_is_done()`` and ``last_time_step``
        :param lower_bound: reference makespan used to normalise the terminal reward
        """
        super().__init__(env)
        self.lower_bound = lower_bound

    def step(self, action: Any) -> tuple[Any, float, bool, bool, dict]:
        """
        Step the wrapped environment and substitute the reward.

        :param action: action forwarded unchanged to the wrapped environment
        :return: ``(observation, reward, done, truncated, info)`` where only
            ``reward`` differs from what the wrapped environment returned
        """
        observation, reward, done, truncated, info = self.env.step(action)
        # Always go through self.env: relying on a module-level `env` variable
        # breaks as soon as the wrapper is used outside the defining script.
        base_env = self.env.unwrapped
        # _is_done() also refreshes last_time_step, so it must be called first
        if base_env._is_done():
            reward = -base_env.last_time_step / self.lower_bound + 2
        else:
            reward = 0.0
        return observation, reward, done, truncated, info
[8]:
# Demo script: solve one job shop instance with MCTS (gymcts) and print the
# resulting schedule and makespan.
if __name__ == '__main__':
    # 20 is the numeric value of logging.INFO
    log.setLevel(20)

    # path of the instance file, relative to the working directory
    jsp_std_path = "ft06.txt"

    # NOTE(review): the registered id "jss-v1" is not used below; the
    # environment is constructed directly instead of via gym.make.
    gym.envs.registration.register(
        id="jss-v1",
        entry_point="JSSEnv.envs:JssEnv",
    )

    env = JssEnv(env_config={'instance_path': jsp_std_path})
    env.reset()
    env = NormalizeReward(env, gamma=0.99, epsilon=1e-8)
    # NOTE(review): 36 looks like an instance-specific scaling constant
    # (possibly the number of operations of the instance) - confirm.
    env = TransformReward(env, lambda r: r / 36)


    # alternative reward: sparse terminal reward based on the makespan
    # env = JSSEnvRewardWrapper(env, lower_bound=55.0)

    def mask_fn(env: gym.Env) -> np.ndarray:
        # Do whatever you'd like in this function to return the action mask
        # for the current env. In this example, we assume the env has a
        # helpful method we can rely on.
        return env.unwrapped.legal_actions


    # wrap the env so gymcts can query the action mask through mask_fn
    env = DeepCopyMCTSGymEnvWrapper(
        env,
        action_mask_fn=mask_fn
    )

    agent = GymctsAgent(
        env=env,
        clear_mcts_tree_after_step=False,
        render_tree_after_step=True,
        exclude_unvisited_nodes_from_render=True,
        number_of_simulations_per_step=125,
    )

    # NOTE(review): `root` is not used afterwards in this script
    root = agent.search_root_node.get_root()

    # run the search; returns the sequence of selected actions
    actions = agent.solve(render_tree_after_step=True)

    # replay the found action sequence on a freshly reset environment
    env.reset()
    for a in actions:
        obs, rew, term, trun, info = env.step(a)

    # draw the final schedule and report the time step at which it finished
    env.unwrapped.render(mode="human")
    print(f"makespan: {env.unwrapped.last_time_step}")
(N=125, Q_v=1.65, best=2.00)
├── (a=0, N=2, Q_v=0.44, best=0.46, ubc=1.54)
│   └── (a=3, N=1, Q_v=0.46, best=0.46, ubc=1.05)
├── (a=1, N=31, Q_v=1.65, best=1.83, ubc=1.93)
│   ├── (a=0, N=10, Q_v=1.66, best=1.79, ubc=2.07)
│   ├── (a=0, N=3, Q_v=1.64, best=1.74, ubc=2.26)
│   ├── (a=2, N=3, Q_v=1.75, best=1.79, ubc=2.36)
│   └── (a=4, N=3, Q_v=1.63, best=1.66, ubc=2.24)
│   ├── (a=2, N=10, Q_v=1.67, best=1.83, ubc=2.09)
│   ├── (a=0, N=3, Q_v=1.69, best=1.83, ubc=2.31)
│   ├── (a=2, N=3, Q_v=1.69, best=1.81, ubc=2.31)
│   └── (a=4, N=3, Q_v=1.64, best=1.71, ubc=2.26)
│   └── (a=4, N=10, Q_v=1.64, best=1.78, ubc=2.05)
│       ├── (a=3, N=3, Q_v=1.58, best=1.72, ubc=2.19)
│       ├── (a=5, N=3, Q_v=1.70, best=1.78, ubc=2.32)
│       └── (a=6, N=3, Q_v=1.69, best=1.77, ubc=2.31)
├── (a=2, N=6, Q_v=1.27, best=1.36, ubc=1.90)
│   ├── (a=1, N=2, Q_v=1.23, best=1.36, ubc=1.90)
│   └── (a=0, N=1, Q_v=1.10, best=1.10, ubc=1.69)
│   ├── (a=3, N=1, Q_v=1.19, best=1.19, ubc=2.13)
│   └── (a=5, N=2, Q_v=1.33, best=1.33, ubc=2.00)
│       └── (a=5, N=1, Q_v=1.33, best=1.33, ubc=1.92)
├── (a=3, N=5, Q_v=1.21, best=1.29, ubc=1.91)
│   ├── (a=0, N=1, Q_v=1.16, best=1.16, ubc=2.05)
│   ├── (a=2, N=1, Q_v=1.29, best=1.29, ubc=2.19)
│   └── (a=4, N=2, Q_v=1.25, best=1.29, ubc=1.88)
│       └── (a=1, N=1, Q_v=1.20, best=1.20, ubc=1.79)
├── (a=4, N=78, Q_v=1.77, best=2.00, ubc=1.94)
│   ├── (a=1, N=24, Q_v=1.75, best=1.92, ubc=2.06)
│   ├── (a=3, N=7, Q_v=1.75, best=1.92, ubc=2.23)
│   ├── (a=5, N=10, Q_v=1.79, best=1.90, ubc=2.19)
│   └── (a=6, N=6, Q_v=1.72, best=1.81, ubc=2.23)
│   ├── (a=3, N=21, Q_v=1.74, best=1.96, ubc=2.06)
│   ├── (a=1, N=5, Q_v=1.67, best=1.72, ubc=2.22)
│   ├── (a=3, N=8, Q_v=1.77, best=1.96, ubc=2.21)
│   └── (a=5, N=7, Q_v=1.76, best=1.87, ubc=2.23)
│   └── (a=5, N=32, Q_v=1.80, best=2.00, ubc=2.06)
│       ├── (a=1, N=8, Q_v=1.72, best=1.91, ubc=2.18)
│       ├── (a=3, N=10, Q_v=1.80, best=1.95, ubc=2.22)
│       └── (a=5, N=13, Q_v=1.84, best=2.00, ubc=2.20)
└── (a=5, N=2, Q_v=0.71, best=0.73, ubc=1.81)
    └── (a=4, N=1, Q_v=0.73, best=0.73, ubc=1.32)
[17:00:10] INFO     selected action 4 after 125 simulations.
           INFO     current action list: [4]
(a=4, N=203, Q_v=1.80, best=2.00, ubc=1.92)
├── (a=1, N=55, Q_v=1.77, best=1.95, ubc=1.99)
├── (a=3, N=17, Q_v=1.75, best=1.92, ubc=2.10)
├── (a=0, N=5, Q_v=1.70, best=1.83, ubc=2.24)
├── (a=1, N=4, Q_v=1.65, best=1.70, ubc=2.24)
└── (a=2, N=7, Q_v=1.87, best=1.92, ubc=2.32)
├── (a=5, N=23, Q_v=1.82, best=1.95, ubc=2.12)
├── (a=0, N=6, Q_v=1.76, best=1.86, ubc=2.27)
├── (a=1, N=8, Q_v=1.86, best=1.95, ubc=2.30)
└── (a=2, N=8, Q_v=1.84, best=1.94, ubc=2.28)
└── (a=6, N=14, Q_v=1.73, best=1.88, ubc=2.11)
    ├── (a=0, N=3, Q_v=1.76, best=1.83, ubc=2.42)
    ├── (a=1, N=4, Q_v=1.72, best=1.85, ubc=2.30)
    ├── (a=2, N=3, Q_v=1.71, best=1.88, ubc=2.37)
    └── (a=4, N=3, Q_v=1.72, best=1.81, ubc=2.39)
├── (a=3, N=49, Q_v=1.76, best=1.96, ubc=2.00)
├── (a=1, N=11, Q_v=1.70, best=1.84, ubc=2.12)
└── (a=3, N=10, Q_v=1.70, best=1.84, ubc=2.05)
├── (a=3, N=17, Q_v=1.77, best=1.96, ubc=2.11)
├── (a=1, N=5, Q_v=1.67, best=1.86, ubc=2.20)
└── (a=5, N=11, Q_v=1.83, best=1.96, ubc=2.19)
└── (a=5, N=20, Q_v=1.80, best=1.92, ubc=2.11)
    └── (a=3, N=19, Q_v=1.80, best=1.92, ubc=2.08)
└── (a=5, N=98, Q_v=1.83, best=2.00, ubc=2.00)
    ├── (a=1, N=28, Q_v=1.81, best=1.97, ubc=2.10)
    └── (a=5, N=27, Q_v=1.82, best=1.97, ubc=2.07)
    ├── (a=3, N=33, Q_v=1.83, best=1.95, ubc=2.10)
    └── (a=5, N=32, Q_v=1.84, best=1.95, ubc=2.07)
    └── (a=5, N=36, Q_v=1.85, best=2.00, ubc=2.10)
        ├── (a=1, N=20, Q_v=1.87, best=2.00, ubc=2.17)
        └── (a=3, N=15, Q_v=1.81, best=1.98, ubc=2.16)
[17:00:11] INFO     selected action 5 after 125 simulations.
           INFO     current action list: [4, 5]
(a=5, N=223, Q_v=1.84, best=2.01, ubc=1.96)
├── (a=1, N=64, Q_v=1.83, best=2.00, ubc=2.03)
└── (a=5, N=63, Q_v=1.83, best=2.00, ubc=2.01)
    └── (a=5, N=62, Q_v=1.83, best=2.00, ubc=2.02)
├── (a=3, N=76, Q_v=1.84, best=1.99, ubc=2.03)
└── (a=5, N=75, Q_v=1.85, best=1.99, ubc=2.02)
    └── (a=5, N=74, Q_v=1.84, best=1.99, ubc=2.02)
└── (a=5, N=82, Q_v=1.85, best=2.01, ubc=2.03)
    ├── (a=1, N=46, Q_v=1.87, best=2.01, ubc=2.09)
    └── (a=5, N=45, Q_v=1.87, best=2.01, ubc=2.08)
    └── (a=3, N=35, Q_v=1.84, best=1.98, ubc=2.09)
        └── (a=5, N=34, Q_v=1.83, best=1.98, ubc=2.06)
           INFO     selected action 5 after 125 simulations.
           INFO     current action list: [4, 5, 5]
(a=5, N=207, Q_v=1.85, best=2.01, ubc=1.97)
├── (a=1, N=115, Q_v=1.86, best=2.01, ubc=2.01)
└── (a=5, N=114, Q_v=1.86, best=2.01, ubc=2.00)
    ├── (a=0, N=76, Q_v=1.88, best=2.01, ubc=2.06)
    └── (a=2, N=37, Q_v=1.80, best=2.00, ubc=2.06)
└── (a=3, N=91, Q_v=1.84, best=1.99, ubc=2.01)
    └── (a=5, N=90, Q_v=1.84, best=1.99, ubc=2.00)
        ├── (a=1, N=31, Q_v=1.84, best=1.98, ubc=2.11)
        ├── (a=3, N=27, Q_v=1.82, best=1.99, ubc=2.11)
        └── (a=6, N=31, Q_v=1.85, best=1.98, ubc=2.12)
           INFO     selected action 1 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1]
(a=1, N=240, Q_v=1.87, best=2.02, ubc=1.98)
└── (a=5, N=239, Q_v=1.87, best=2.02, ubc=1.98)
    ├── (a=0, N=166, Q_v=1.89, best=2.02, ubc=2.02)
    ├── (a=0, N=70, Q_v=1.88, best=2.01, ubc=2.07)
    └── (a=2, N=95, Q_v=1.90, best=2.02, ubc=2.07)
    └── (a=2, N=72, Q_v=1.82, best=2.00, ubc=2.02)
        ├── (a=3, N=29, Q_v=1.79, best=2.00, ubc=2.07)
        └── (a=4, N=42, Q_v=1.85, best=1.99, ubc=2.07)
[17:00:12] INFO     selected action 5 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5]
(a=5, N=364, Q_v=1.87, best=2.02, ubc=1.96)
├── (a=0, N=265, Q_v=1.89, best=2.02, ubc=2.00)
├── (a=0, N=111, Q_v=1.88, best=2.01, ubc=2.04)
├── (a=2, N=57, Q_v=1.88, best=2.01, ubc=2.08)
└── (a=6, N=53, Q_v=1.87, best=2.01, ubc=2.09)
└── (a=2, N=153, Q_v=1.90, best=2.02, ubc=2.04)
    └── (a=0, N=152, Q_v=1.90, best=2.02, ubc=2.03)
└── (a=2, N=98, Q_v=1.82, best=2.00, ubc=1.99)
    ├── (a=3, N=38, Q_v=1.80, best=2.00, ubc=2.04)
    ├── (a=0, N=12, Q_v=1.77, best=1.89, ubc=2.16)
    ├── (a=1, N=13, Q_v=1.81, best=1.97, ubc=2.19)
    └── (a=2, N=12, Q_v=1.81, best=2.00, ubc=2.19)
    └── (a=4, N=59, Q_v=1.84, best=1.99, ubc=2.04)
        ├── (a=0, N=10, Q_v=1.81, best=1.97, ubc=2.26)
        ├── (a=1, N=15, Q_v=1.88, best=1.98, ubc=2.24)
        ├── (a=2, N=10, Q_v=1.81, best=1.93, ubc=2.27)
        ├── (a=3, N=10, Q_v=1.82, best=1.99, ubc=2.27)
        └── (a=4, N=13, Q_v=1.86, best=1.96, ubc=2.26)
           INFO     selected action 0 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5, 0]
(a=0, N=390, Q_v=1.89, best=2.02, ubc=1.98)
├── (a=0, N=171, Q_v=1.88, best=2.02, ubc=2.01)
├── (a=2, N=95, Q_v=1.89, best=2.02, ubc=2.05)
├── (a=3, N=29, Q_v=1.88, best=2.01, ubc=2.16)
├── (a=4, N=41, Q_v=1.93, best=2.02, ubc=2.16)
└── (a=6, N=24, Q_v=1.85, best=1.99, ubc=2.15)
└── (a=6, N=75, Q_v=1.87, best=2.01, ubc=2.05)
    ├── (a=1, N=20, Q_v=1.84, best=2.00, ubc=2.17)
    ├── (a=3, N=26, Q_v=1.86, best=1.98, ubc=2.15)
    └── (a=4, N=28, Q_v=1.89, best=2.01, ubc=2.17)
└── (a=2, N=218, Q_v=1.90, best=2.02, ubc=2.01)
    └── (a=0, N=217, Q_v=1.90, best=2.02, ubc=2.01)
        ├── (a=3, N=68, Q_v=1.89, best=2.01, ubc=2.09)
        ├── (a=4, N=99, Q_v=1.92, best=2.02, ubc=2.09)
        └── (a=6, N=49, Q_v=1.85, best=1.97, ubc=2.09)
           INFO     selected action 2 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5, 0, 2]
(a=2, N=343, Q_v=1.90, best=2.02, ubc=1.99)
└── (a=0, N=342, Q_v=1.90, best=2.02, ubc=1.99)
    ├── (a=3, N=98, Q_v=1.88, best=2.01, ubc=2.06)
    ├── (a=1, N=32, Q_v=1.88, best=2.01, ubc=2.15)
    ├── (a=2, N=37, Q_v=1.90, best=2.01, ubc=2.15)
    └── (a=5, N=28, Q_v=1.87, best=1.99, ubc=2.15)
    ├── (a=4, N=174, Q_v=1.93, best=2.02, ubc=2.06)
    ├── (a=0, N=61, Q_v=1.93, best=2.02, ubc=2.14)
    ├── (a=3, N=60, Q_v=1.93, best=2.02, ubc=2.14)
    └── (a=4, N=52, Q_v=1.92, best=2.02, ubc=2.14)
    └── (a=6, N=69, Q_v=1.85, best=1.97, ubc=2.06)
        └── (a=0, N=68, Q_v=1.85, best=1.97, ubc=2.03)
           INFO     selected action 0 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5, 0, 2, 0]
(a=0, N=467, Q_v=1.90, best=2.03, ubc=1.98)
├── (a=3, N=133, Q_v=1.89, best=2.02, ubc=2.04)
├── (a=1, N=43, Q_v=1.89, best=2.01, ubc=2.12)
├── (a=2, N=18, Q_v=1.86, best=1.97, ubc=2.18)
└── (a=5, N=24, Q_v=1.90, best=2.00, ubc=2.18)
├── (a=2, N=49, Q_v=1.90, best=2.02, ubc=2.12)
├── (a=1, N=23, Q_v=1.89, best=2.02, ubc=2.18)
└── (a=5, N=25, Q_v=1.91, best=2.00, ubc=2.19)
└── (a=5, N=40, Q_v=1.88, best=1.99, ubc=2.13)
    ├── (a=1, N=18, Q_v=1.86, best=1.99, ubc=2.18)
    └── (a=2, N=21, Q_v=1.89, best=1.99, ubc=2.19)
├── (a=4, N=249, Q_v=1.93, best=2.03, ubc=2.04)
├── (a=0, N=86, Q_v=1.93, best=2.02, ubc=2.11)
└── (a=4, N=85, Q_v=1.93, best=2.02, ubc=2.10)
├── (a=3, N=83, Q_v=1.93, best=2.02, ubc=2.11)
└── (a=4, N=82, Q_v=1.93, best=2.02, ubc=2.09)
└── (a=4, N=79, Q_v=1.92, best=2.03, ubc=2.11)
    ├── (a=0, N=44, Q_v=1.94, best=2.03, ubc=2.16)
    └── (a=3, N=34, Q_v=1.91, best=2.02, ubc=2.16)
└── (a=6, N=84, Q_v=1.85, best=1.97, ubc=2.04)
    └── (a=0, N=83, Q_v=1.85, best=1.97, ubc=2.01)
        ├── (a=1, N=27, Q_v=1.85, best=1.97, ubc=2.14)
        ├── (a=2, N=26, Q_v=1.84, best=1.97, ubc=2.13)
        └── (a=5, N=29, Q_v=1.86, best=1.97, ubc=2.13)
[17:00:13] INFO     selected action 4 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5, 0, 2, 0, 4]
(a=4, N=374, Q_v=1.93, best=2.03, ubc=2.02)
├── (a=0, N=138, Q_v=1.94, best=2.02, ubc=2.09)
└── (a=4, N=137, Q_v=1.94, best=2.02, ubc=2.07)
    ├── (a=1, N=41, Q_v=1.93, best=2.02, ubc=2.17)
    ├── (a=2, N=42, Q_v=1.93, best=2.02, ubc=2.17)
    └── (a=5, N=53, Q_v=1.96, best=2.02, ubc=2.17)
├── (a=3, N=119, Q_v=1.93, best=2.02, ubc=2.09)
└── (a=4, N=118, Q_v=1.93, best=2.02, ubc=2.07)
    ├── (a=1, N=40, Q_v=1.93, best=2.02, ubc=2.17)
    ├── (a=2, N=35, Q_v=1.91, best=2.02, ubc=2.17)
    └── (a=5, N=42, Q_v=1.94, best=2.01, ubc=2.18)
└── (a=4, N=116, Q_v=1.93, best=2.03, ubc=2.09)
    ├── (a=0, N=66, Q_v=1.94, best=2.03, ubc=2.13)
    ├── (a=1, N=20, Q_v=1.93, best=2.02, ubc=2.25)
    ├── (a=2, N=24, Q_v=1.96, best=2.02, ubc=2.25)
    └── (a=5, N=21, Q_v=1.93, best=2.03, ubc=2.25)
    └── (a=3, N=49, Q_v=1.91, best=2.02, ubc=2.13)
        ├── (a=1, N=14, Q_v=1.89, best=2.02, ubc=2.26)
        ├── (a=2, N=17, Q_v=1.91, best=2.02, ubc=2.25)
        └── (a=5, N=17, Q_v=1.92, best=2.01, ubc=2.26)
           INFO     selected action 0 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5, 0, 2, 0, 4, 0]
(a=0, N=263, Q_v=1.94, best=2.02, ubc=2.05)
└── (a=4, N=262, Q_v=1.94, best=2.02, ubc=2.05)
    ├── (a=1, N=80, Q_v=1.94, best=2.02, ubc=2.12)
    ├── (a=2, N=33, Q_v=1.91, best=2.02, ubc=2.17)
    └── (a=5, N=46, Q_v=1.95, best=2.02, ubc=2.17)
    ├── (a=2, N=85, Q_v=1.94, best=2.02, ubc=2.12)
    ├── (a=1, N=38, Q_v=1.93, best=2.02, ubc=2.17)
    └── (a=5, N=46, Q_v=1.95, best=2.02, ubc=2.17)
    └── (a=5, N=96, Q_v=1.95, best=2.02, ubc=2.12)
        ├── (a=1, N=48, Q_v=1.95, best=2.02, ubc=2.17)
        └── (a=2, N=47, Q_v=1.95, best=2.02, ubc=2.17)
           INFO     selected action 4 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5, 0, 2, 0, 4, 0, 4]
(a=4, N=387, Q_v=1.95, best=2.03, ubc=2.03)
├── (a=1, N=120, Q_v=1.94, best=2.03, ubc=2.10)
├── (a=2, N=49, Q_v=1.92, best=2.02, ubc=2.14)
├── (a=5, N=33, Q_v=1.95, best=2.02, ubc=2.20)
└── (a=6, N=15, Q_v=1.84, best=1.94, ubc=2.20)
└── (a=5, N=70, Q_v=1.95, best=2.03, ubc=2.14)
    └── (a=2, N=69, Q_v=1.96, best=2.03, ubc=2.13)
├── (a=2, N=119, Q_v=1.94, best=2.02, ubc=2.10)
├── (a=1, N=53, Q_v=1.93, best=2.02, ubc=2.14)
├── (a=5, N=32, Q_v=1.96, best=2.02, ubc=2.21)
└── (a=6, N=20, Q_v=1.88, best=1.95, ubc=2.20)
└── (a=5, N=65, Q_v=1.95, best=2.02, ubc=2.14)
    └── (a=1, N=64, Q_v=1.95, best=2.02, ubc=2.13)
└── (a=5, N=147, Q_v=1.96, best=2.02, ubc=2.10)
    ├── (a=1, N=74, Q_v=1.96, best=2.02, ubc=2.14)
    └── (a=2, N=73, Q_v=1.96, best=2.02, ubc=2.13)
    └── (a=2, N=72, Q_v=1.95, best=2.02, ubc=2.14)
        └── (a=1, N=71, Q_v=1.96, best=2.02, ubc=2.13)
           INFO     selected action 5 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5, 0, 2, 0, 4, 0, 4, 5]
(a=5, N=272, Q_v=1.96, best=2.02, ubc=2.06)
├── (a=1, N=141, Q_v=1.96, best=2.02, ubc=2.10)
└── (a=2, N=140, Q_v=1.96, best=2.02, ubc=2.09)
    ├── (a=2, N=64, Q_v=1.95, best=2.02, ubc=2.15)
    └── (a=4, N=75, Q_v=1.97, best=2.02, ubc=2.15)
└── (a=2, N=130, Q_v=1.95, best=2.02, ubc=2.10)
    └── (a=1, N=129, Q_v=1.95, best=2.02, ubc=2.09)
        ├── (a=2, N=59, Q_v=1.95, best=2.02, ubc=2.15)
        └── (a=4, N=69, Q_v=1.96, best=2.02, ubc=2.15)
[17:00:14] INFO     selected action 1 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5, 0, 2, 0, 4, 0, 4, 5, 1]
(a=1, N=266, Q_v=1.96, best=2.02, ubc=2.06)
└── (a=2, N=265, Q_v=1.96, best=2.02, ubc=2.06)
    ├── (a=2, N=128, Q_v=1.96, best=2.02, ubc=2.10)
    ├── (a=0, N=43, Q_v=1.96, best=2.02, ubc=2.20)
    ├── (a=1, N=44, Q_v=1.96, best=2.02, ubc=2.20)
    └── (a=3, N=40, Q_v=1.94, best=2.02, ubc=2.19)
    └── (a=4, N=136, Q_v=1.96, best=2.02, ubc=2.10)
        ├── (a=0, N=44, Q_v=1.96, best=2.02, ubc=2.20)
        ├── (a=1, N=48, Q_v=1.97, best=2.02, ubc=2.19)
        └── (a=3, N=43, Q_v=1.96, best=2.02, ubc=2.20)
           INFO     selected action 2 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5, 0, 2, 0, 4, 0, 4, 5, 1, 2]
(a=2, N=390, Q_v=1.96, best=2.02, ubc=2.05)
├── (a=2, N=184, Q_v=1.96, best=2.02, ubc=2.08)
├── (a=0, N=58, Q_v=1.95, best=2.02, ubc=2.16)
├── (a=1, N=30, Q_v=1.96, best=2.02, ubc=2.22)
└── (a=3, N=27, Q_v=1.94, best=2.01, ubc=2.21)
├── (a=1, N=66, Q_v=1.96, best=2.02, ubc=2.16)
├── (a=0, N=33, Q_v=1.96, best=2.02, ubc=2.22)
└── (a=3, N=32, Q_v=1.96, best=2.02, ubc=2.22)
└── (a=3, N=59, Q_v=1.95, best=2.02, ubc=2.16)
    ├── (a=0, N=26, Q_v=1.94, best=2.02, ubc=2.22)
    └── (a=1, N=32, Q_v=1.97, best=2.02, ubc=2.22)
└── (a=4, N=205, Q_v=1.96, best=2.02, ubc=2.08)
    ├── (a=0, N=67, Q_v=1.96, best=2.02, ubc=2.16)
    ├── (a=1, N=37, Q_v=1.98, best=2.02, ubc=2.21)
    └── (a=3, N=29, Q_v=1.94, best=2.02, ubc=2.21)
    ├── (a=1, N=71, Q_v=1.97, best=2.02, ubc=2.16)
    ├── (a=0, N=35, Q_v=1.97, best=2.02, ubc=2.22)
    └── (a=3, N=35, Q_v=1.97, best=2.02, ubc=2.21)
    └── (a=3, N=66, Q_v=1.96, best=2.02, ubc=2.16)
        ├── (a=0, N=30, Q_v=1.95, best=2.02, ubc=2.21)
        └── (a=1, N=35, Q_v=1.97, best=2.02, ubc=2.21)
           INFO     selected action 4 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5, 0, 2, 0, 4, 0, 4, 5, 1, 2, 4]
(a=4, N=330, Q_v=1.97, best=2.02, ubc=2.06)
├── (a=0, N=106, Q_v=1.96, best=2.02, ubc=2.13)
├── (a=1, N=59, Q_v=1.98, best=2.02, ubc=2.17)
└── (a=3, N=58, Q_v=1.98, best=2.02, ubc=2.16)
└── (a=3, N=46, Q_v=1.95, best=2.02, ubc=2.17)
    ├── (a=1, N=26, Q_v=1.97, best=2.02, ubc=2.24)
    └── (a=6, N=19, Q_v=1.92, best=1.96, ubc=2.23)
├── (a=1, N=121, Q_v=1.97, best=2.02, ubc=2.13)
├── (a=0, N=60, Q_v=1.97, best=2.02, ubc=2.17)
└── (a=3, N=59, Q_v=1.97, best=2.02, ubc=2.16)
└── (a=3, N=60, Q_v=1.97, best=2.02, ubc=2.17)
    └── (a=0, N=59, Q_v=1.97, best=2.02, ubc=2.16)
└── (a=3, N=102, Q_v=1.96, best=2.02, ubc=2.13)
    ├── (a=0, N=46, Q_v=1.95, best=2.02, ubc=2.17)
    ├── (a=1, N=26, Q_v=1.97, best=2.02, ubc=2.24)
    └── (a=6, N=19, Q_v=1.92, best=1.96, ubc=2.24)
    └── (a=1, N=55, Q_v=1.97, best=2.02, ubc=2.17)
        └── (a=0, N=54, Q_v=1.97, best=2.02, ubc=2.16)
           INFO     selected action 1 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5, 0, 2, 0, 4, 0, 4, 5, 1, 2, 4, 1]
(a=1, N=246, Q_v=1.97, best=2.02, ubc=2.09)
├── (a=0, N=124, Q_v=1.98, best=2.02, ubc=2.12)
└── (a=3, N=123, Q_v=1.98, best=2.02, ubc=2.12)
    ├── (a=2, N=75, Q_v=1.99, best=2.02, ubc=2.17)
    └── (a=6, N=47, Q_v=1.95, best=1.97, ubc=2.17)
└── (a=3, N=121, Q_v=1.97, best=2.02, ubc=2.12)
    └── (a=0, N=120, Q_v=1.97, best=2.02, ubc=2.11)
        ├── (a=2, N=72, Q_v=1.99, best=2.02, ubc=2.17)
        └── (a=6, N=47, Q_v=1.95, best=1.97, ubc=2.17)
           INFO     selected action 0 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5, 0, 2, 0, 4, 0, 4, 5, 1, 2, 4, 1, 0]
(a=0, N=249, Q_v=1.98, best=2.02, ubc=2.09)
└── (a=3, N=248, Q_v=1.98, best=2.02, ubc=2.08)
    ├── (a=2, N=158, Q_v=1.99, best=2.02, ubc=2.13)
    ├── (a=3, N=92, Q_v=2.01, best=2.02, ubc=2.17)
    └── (a=4, N=65, Q_v=1.97, best=2.01, ubc=2.17)
    └── (a=6, N=89, Q_v=1.95, best=1.97, ubc=2.12)
        ├── (a=3, N=44, Q_v=1.95, best=1.97, ubc=2.17)
        └── (a=4, N=44, Q_v=1.95, best=1.97, ubc=2.17)
[17:00:15] INFO     selected action 3 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5, 0, 2, 0, 4, 0, 4, 5, 1, 2, 4, 1, 0, 3]
(a=3, N=373, Q_v=1.98, best=2.02, ubc=2.07)
├── (a=2, N=250, Q_v=1.99, best=2.02, ubc=2.10)
├── (a=3, N=148, Q_v=2.01, best=2.02, ubc=2.14)
├── (a=3, N=49, Q_v=2.01, best=2.02, ubc=2.23)
├── (a=4, N=49, Q_v=2.01, best=2.02, ubc=2.23)
└── (a=5, N=49, Q_v=2.01, best=2.02, ubc=2.23)
└── (a=4, N=101, Q_v=1.98, best=2.01, ubc=2.14)
    ├── (a=3, N=60, Q_v=2.00, best=2.01, ubc=2.19)
    └── (a=4, N=40, Q_v=1.95, best=2.01, ubc=2.19)
└── (a=6, N=122, Q_v=1.95, best=1.97, ubc=2.10)
    ├── (a=3, N=61, Q_v=1.95, best=1.97, ubc=2.15)
    └── (a=0, N=60, Q_v=1.95, best=1.97, ubc=2.13)
    └── (a=4, N=60, Q_v=1.95, best=1.97, ubc=2.15)
        └── (a=0, N=59, Q_v=1.95, best=1.97, ubc=2.13)
           INFO     selected action 2 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5, 0, 2, 0, 4, 0, 4, 5, 1, 2, 4, 1, 0, 3, 2]
(a=2, N=375, Q_v=2.00, best=2.02, ubc=2.09)
├── (a=3, N=224, Q_v=2.01, best=2.02, ubc=2.12)
├── (a=3, N=75, Q_v=2.01, best=2.02, ubc=2.20)
├── (a=4, N=36, Q_v=2.00, best=2.02, ubc=2.25)
└── (a=5, N=38, Q_v=2.01, best=2.02, ubc=2.25)
├── (a=4, N=74, Q_v=2.01, best=2.02, ubc=2.20)
├── (a=3, N=37, Q_v=2.01, best=2.02, ubc=2.25)
└── (a=5, N=36, Q_v=2.00, best=2.02, ubc=2.25)
└── (a=5, N=74, Q_v=2.01, best=2.02, ubc=2.20)
    ├── (a=3, N=37, Q_v=2.01, best=2.02, ubc=2.25)
    └── (a=4, N=36, Q_v=2.00, best=2.02, ubc=2.25)
└── (a=4, N=150, Q_v=1.98, best=2.01, ubc=2.12)
    ├── (a=3, N=90, Q_v=2.00, best=2.01, ubc=2.16)
    └── (a=4, N=89, Q_v=2.00, best=2.01, ubc=2.15)
    └── (a=4, N=59, Q_v=1.96, best=2.01, ubc=2.16)
        ├── (a=3, N=41, Q_v=1.99, best=2.01, ubc=2.22)
        └── (a=6, N=17, Q_v=1.87, best=1.91, ubc=2.22)
           INFO     selected action 3 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5, 0, 2, 0, 4, 0, 4, 5, 1, 2, 4, 1, 0, 3, 2, 3]
(a=3, N=349, Q_v=2.01, best=2.02, ubc=2.10)
├── (a=3, N=116, Q_v=2.01, best=2.02, ubc=2.17)
├── (a=4, N=57, Q_v=2.01, best=2.02, ubc=2.21)
└── (a=5, N=56, Q_v=2.01, best=2.02, ubc=2.19)
└── (a=5, N=58, Q_v=2.01, best=2.02, ubc=2.21)
    ├── (a=4, N=28, Q_v=2.01, best=2.02, ubc=2.27)
    └── (a=6, N=29, Q_v=2.01, best=2.02, ubc=2.27)
├── (a=4, N=115, Q_v=2.00, best=2.02, ubc=2.16)
├── (a=3, N=57, Q_v=2.01, best=2.02, ubc=2.21)
└── (a=5, N=56, Q_v=2.00, best=2.02, ubc=2.19)
└── (a=5, N=57, Q_v=2.00, best=2.02, ubc=2.21)
    └── (a=3, N=56, Q_v=2.00, best=2.02, ubc=2.19)
└── (a=5, N=117, Q_v=2.01, best=2.02, ubc=2.16)
    ├── (a=3, N=59, Q_v=2.01, best=2.02, ubc=2.21)
    ├── (a=4, N=29, Q_v=2.01, best=2.02, ubc=2.27)
    └── (a=6, N=29, Q_v=2.01, best=2.02, ubc=2.27)
    └── (a=4, N=57, Q_v=2.01, best=2.02, ubc=2.21)
        └── (a=3, N=56, Q_v=2.01, best=2.02, ubc=2.20)
           INFO     selected action 5 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5, 0, 2, 0, 4, 0, 4, 5, 1, 2, 4, 1, 0, 3, 2, 3, 5]
(a=5, N=242, Q_v=2.01, best=2.02, ubc=2.12)
├── (a=3, N=122, Q_v=2.01, best=2.02, ubc=2.16)
├── (a=4, N=59, Q_v=2.01, best=2.02, ubc=2.21)
├── (a=0, N=31, Q_v=2.01, best=2.02, ubc=2.27)
└── (a=1, N=27, Q_v=1.99, best=2.01, ubc=2.27)
└── (a=6, N=62, Q_v=2.01, best=2.02, ubc=2.21)
    ├── (a=0, N=21, Q_v=2.01, best=2.02, ubc=2.33)
    ├── (a=1, N=20, Q_v=2.01, best=2.01, ubc=2.33)
    └── (a=2, N=20, Q_v=2.01, best=2.02, ubc=2.33)
└── (a=4, N=119, Q_v=2.01, best=2.02, ubc=2.16)
    └── (a=3, N=118, Q_v=2.01, best=2.02, ubc=2.15)
        ├── (a=0, N=65, Q_v=2.01, best=2.02, ubc=2.21)
        └── (a=1, N=52, Q_v=1.99, best=2.01, ubc=2.21)
           INFO     selected action 3 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5, 0, 2, 0, 4, 0, 4, 5, 1, 2, 4, 1, 0, 3, 2, 3, 5, 3]
(a=3, N=247, Q_v=2.01, best=2.02, ubc=2.12)
├── (a=4, N=120, Q_v=2.01, best=2.02, ubc=2.16)
├── (a=0, N=65, Q_v=2.01, best=2.02, ubc=2.21)
├── (a=2, N=32, Q_v=2.01, best=2.02, ubc=2.27)
└── (a=4, N=32, Q_v=2.01, best=2.02, ubc=2.27)
└── (a=1, N=54, Q_v=2.00, best=2.01, ubc=2.21)
    ├── (a=2, N=29, Q_v=2.01, best=2.01, ubc=2.27)
    └── (a=4, N=24, Q_v=1.98, best=2.01, ubc=2.27)
└── (a=6, N=126, Q_v=2.01, best=2.02, ubc=2.16)
    ├── (a=0, N=43, Q_v=2.01, best=2.02, ubc=2.25)
    └── (a=2, N=42, Q_v=2.01, best=2.02, ubc=2.22)
    ├── (a=1, N=40, Q_v=2.01, best=2.01, ubc=2.25)
    └── (a=2, N=39, Q_v=2.01, best=2.01, ubc=2.22)
    └── (a=2, N=42, Q_v=2.01, best=2.02, ubc=2.25)
        ├── (a=0, N=21, Q_v=2.01, best=2.02, ubc=2.31)
        └── (a=1, N=20, Q_v=2.01, best=2.01, ubc=2.31)
           INFO     selected action 6 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5, 0, 2, 0, 4, 0, 4, 5, 1, 2, 4, 1, 0, 3, 2, 3, 5, 3, 6]
(a=6, N=251, Q_v=2.01, best=2.02, ubc=2.12)
├── (a=0, N=87, Q_v=2.01, best=2.02, ubc=2.19)
└── (a=2, N=86, Q_v=2.01, best=2.02, ubc=2.17)
    ├── (a=0, N=43, Q_v=2.02, best=2.02, ubc=2.24)
    └── (a=1, N=42, Q_v=2.01, best=2.02, ubc=2.24)
├── (a=1, N=80, Q_v=2.01, best=2.01, ubc=2.19)
└── (a=2, N=79, Q_v=2.01, best=2.01, ubc=2.17)
    ├── (a=3, N=39, Q_v=2.01, best=2.01, ubc=2.24)
    └── (a=5, N=39, Q_v=2.01, best=2.01, ubc=2.24)
└── (a=2, N=83, Q_v=2.01, best=2.02, ubc=2.19)
    ├── (a=0, N=42, Q_v=2.01, best=2.02, ubc=2.24)
    ├── (a=0, N=21, Q_v=2.02, best=2.02, ubc=2.31)
    └── (a=1, N=20, Q_v=2.01, best=2.02, ubc=2.32)
    └── (a=1, N=40, Q_v=2.01, best=2.01, ubc=2.24)
        ├── (a=3, N=20, Q_v=2.01, best=2.01, ubc=2.31)
        └── (a=5, N=19, Q_v=2.01, best=2.01, ubc=2.32)
           INFO     selected action 0 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5, 0, 2, 0, 4, 0, 4, 5, 1, 2, 4, 1, 0, 3, 2, 3, 5, 3, 6, 0]
(a=0, N=212, Q_v=2.01, best=2.02, ubc=2.13)
└── (a=2, N=211, Q_v=2.01, best=2.02, ubc=2.13)
    ├── (a=0, N=108, Q_v=2.02, best=2.02, ubc=2.17)
    └── (a=1, N=107, Q_v=2.02, best=2.02, ubc=2.16)
    └── (a=1, N=102, Q_v=2.01, best=2.02, ubc=2.17)
        ├── (a=0, N=53, Q_v=2.02, best=2.02, ubc=2.22)
        └── (a=6, N=48, Q_v=2.01, best=2.01, ubc=2.23)
           INFO     selected action 2 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5, 0, 2, 0, 4, 0, 4, 5, 1, 2, 4, 1, 0, 3, 2, 3, 5, 3, 6, 0,
                    2]
(a=2, N=336, Q_v=2.01, best=2.02, ubc=2.11)
├── (a=0, N=173, Q_v=2.02, best=2.02, ubc=2.15)
└── (a=1, N=172, Q_v=2.02, best=2.02, ubc=2.14)
    ├── (a=3, N=86, Q_v=2.02, best=2.02, ubc=2.19)
    └── (a=5, N=85, Q_v=2.02, best=2.02, ubc=2.19)
└── (a=1, N=162, Q_v=2.01, best=2.02, ubc=2.14)
    ├── (a=0, N=85, Q_v=2.02, best=2.02, ubc=2.19)
    ├── (a=3, N=42, Q_v=2.02, best=2.02, ubc=2.25)
    └── (a=5, N=42, Q_v=2.02, best=2.02, ubc=2.25)
    └── (a=6, N=76, Q_v=2.01, best=2.01, ubc=2.19)
        ├── (a=3, N=38, Q_v=2.01, best=2.01, ubc=2.24)
        └── (a=5, N=37, Q_v=2.01, best=2.01, ubc=2.25)
[17:00:16] INFO     selected action 0 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5, 0, 2, 0, 4, 0, 4, 5, 1, 2, 4, 1, 0, 3, 2, 3, 5, 3, 6, 0,
                    2, 0]
(a=0, N=298, Q_v=2.02, best=2.02, ubc=2.12)
└── (a=1, N=297, Q_v=2.02, best=2.02, ubc=2.11)
    ├── (a=3, N=148, Q_v=2.02, best=2.02, ubc=2.15)
    └── (a=5, N=147, Q_v=2.02, best=2.02, ubc=2.15)
    └── (a=5, N=148, Q_v=2.02, best=2.02, ubc=2.15)
        └── (a=3, N=147, Q_v=2.02, best=2.02, ubc=2.15)
           INFO     selected action 1 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5, 0, 2, 0, 4, 0, 4, 5, 1, 2, 4, 1, 0, 3, 2, 3, 5, 3, 6, 0,
                    2, 0, 1]
(a=1, N=422, Q_v=2.02, best=2.02, ubc=2.10)
├── (a=3, N=211, Q_v=2.02, best=2.02, ubc=2.14)
└── (a=5, N=210, Q_v=2.02, best=2.02, ubc=2.13)
    ├── (a=2, N=69, Q_v=2.02, best=2.02, ubc=2.21)
    ├── (a=3, N=70, Q_v=2.02, best=2.02, ubc=2.21)
    └── (a=4, N=70, Q_v=2.02, best=2.02, ubc=2.21)
└── (a=5, N=210, Q_v=2.02, best=2.02, ubc=2.14)
    └── (a=3, N=209, Q_v=2.02, best=2.02, ubc=2.13)
        ├── (a=2, N=69, Q_v=2.02, best=2.02, ubc=2.21)
        ├── (a=3, N=70, Q_v=2.02, best=2.02, ubc=2.21)
        └── (a=4, N=69, Q_v=2.02, best=2.02, ubc=2.21)
           INFO     selected action 3 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5, 0, 2, 0, 4, 0, 4, 5, 1, 2, 4, 1, 0, 3, 2, 3, 5, 3, 6, 0,
                    2, 0, 1, 3]
(a=3, N=336, Q_v=2.02, best=2.02, ubc=2.11)
└── (a=5, N=335, Q_v=2.02, best=2.02, ubc=2.11)
    ├── (a=2, N=111, Q_v=2.02, best=2.02, ubc=2.18)
    ├── (a=3, N=55, Q_v=2.02, best=2.02, ubc=2.22)
    └── (a=4, N=55, Q_v=2.02, best=2.02, ubc=2.22)
    ├── (a=3, N=112, Q_v=2.02, best=2.02, ubc=2.18)
    ├── (a=2, N=55, Q_v=2.02, best=2.02, ubc=2.22)
    └── (a=4, N=56, Q_v=2.02, best=2.02, ubc=2.22)
    └── (a=4, N=111, Q_v=2.02, best=2.02, ubc=2.18)
        ├── (a=2, N=55, Q_v=2.02, best=2.02, ubc=2.22)
        └── (a=3, N=55, Q_v=2.02, best=2.02, ubc=2.22)
           INFO     selected action 5 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5, 0, 2, 0, 4, 0, 4, 5, 1, 2, 4, 1, 0, 3, 2, 3, 5, 3, 6, 0,
                    2, 0, 1, 3, 5]
(a=5, N=460, Q_v=2.02, best=2.02, ubc=2.10)
├── (a=2, N=152, Q_v=2.02, best=2.02, ubc=2.16)
├── (a=3, N=76, Q_v=2.02, best=2.02, ubc=2.20)
└── (a=4, N=75, Q_v=2.02, best=2.02, ubc=2.19)
└── (a=4, N=75, Q_v=2.02, best=2.02, ubc=2.20)
    └── (a=3, N=74, Q_v=2.02, best=2.02, ubc=2.19)
├── (a=3, N=154, Q_v=2.02, best=2.02, ubc=2.16)
├── (a=2, N=76, Q_v=2.02, best=2.02, ubc=2.20)
└── (a=4, N=75, Q_v=2.02, best=2.02, ubc=2.19)
└── (a=4, N=77, Q_v=2.02, best=2.02, ubc=2.20)
    └── (a=2, N=76, Q_v=2.02, best=2.02, ubc=2.18)
└── (a=4, N=153, Q_v=2.02, best=2.02, ubc=2.16)
    ├── (a=2, N=76, Q_v=2.02, best=2.02, ubc=2.20)
    └── (a=3, N=75, Q_v=2.02, best=2.02, ubc=2.19)
    └── (a=3, N=76, Q_v=2.02, best=2.02, ubc=2.20)
        └── (a=2, N=75, Q_v=2.02, best=2.02, ubc=2.19)
           INFO     selected action 3 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5, 0, 2, 0, 4, 0, 4, 5, 1, 2, 4, 1, 0, 3, 2, 3, 5, 3, 6, 0,
                    2, 0, 1, 3, 5, 3]
(a=3, N=279, Q_v=2.02, best=2.02, ubc=2.12)
├── (a=2, N=139, Q_v=2.02, best=2.02, ubc=2.16)
└── (a=4, N=138, Q_v=2.02, best=2.02, ubc=2.15)
    └── (a=4, N=137, Q_v=2.02, best=2.02, ubc=2.15)
└── (a=4, N=139, Q_v=2.02, best=2.02, ubc=2.16)
    └── (a=2, N=138, Q_v=2.02, best=2.02, ubc=2.15)
        └── (a=4, N=137, Q_v=2.02, best=2.02, ubc=2.15)
           INFO     selected action 4 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5, 0, 2, 0, 4, 0, 4, 5, 1, 2, 4, 1, 0, 3, 2, 3, 5, 3, 6, 0,
                    2, 0, 1, 3, 5, 3, 4]
(a=4, N=264, Q_v=2.02, best=2.02, ubc=2.12)
└── (a=2, N=263, Q_v=2.02, best=2.02, ubc=2.12)
    └── (a=4, N=262, Q_v=2.02, best=2.02, ubc=2.12)
        └── (a=1, N=261, Q_v=2.02, best=2.02, ubc=2.12)
           INFO     selected action 2 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5, 0, 2, 0, 4, 0, 4, 5, 1, 2, 4, 1, 0, 3, 2, 3, 5, 3, 6, 0,
                    2, 0, 1, 3, 5, 3, 4, 2]
(a=2, N=388, Q_v=2.02, best=2.02, ubc=2.10)
└── (a=4, N=387, Q_v=2.02, best=2.02, ubc=2.10)
    └── (a=1, N=386, Q_v=2.02, best=2.02, ubc=2.10)
        ├── (a=2, N=192, Q_v=2.02, best=2.02, ubc=2.14)
        └── (a=3, N=193, Q_v=2.02, best=2.02, ubc=2.14)
           INFO     selected action 4 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5, 0, 2, 0, 4, 0, 4, 5, 1, 2, 4, 1, 0, 3, 2, 3, 5, 3, 6, 0,
                    2, 0, 1, 3, 5, 3, 4, 2, 4]
(a=4, N=512, Q_v=2.02, best=2.02, ubc=2.09)
└── (a=1, N=511, Q_v=2.02, best=2.02, ubc=2.09)
    ├── (a=2, N=255, Q_v=2.02, best=2.02, ubc=2.13)
    └── (a=3, N=254, Q_v=2.02, best=2.02, ubc=2.12)
    └── (a=3, N=255, Q_v=2.02, best=2.02, ubc=2.13)
        └── (a=2, N=254, Q_v=2.02, best=2.02, ubc=2.12)
           INFO     selected action 1 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5, 0, 2, 0, 4, 0, 4, 5, 1, 2, 4, 1, 0, 3, 2, 3, 5, 3, 6, 0,
                    2, 0, 1, 3, 5, 3, 4, 2, 4, 1]
(a=1, N=636, Q_v=2.02, best=2.02, ubc=2.09)
├── (a=2, N=317, Q_v=2.02, best=2.02, ubc=2.12)
└── (a=3, N=316, Q_v=2.02, best=2.02, ubc=2.11)
    └── (a=1, N=315, Q_v=2.02, best=2.02, ubc=2.11)
└── (a=3, N=318, Q_v=2.02, best=2.02, ubc=2.12)
    └── (a=2, N=317, Q_v=2.02, best=2.02, ubc=2.11)
        └── (a=1, N=316, Q_v=2.02, best=2.02, ubc=2.11)
           INFO     selected action 3 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5, 0, 2, 0, 4, 0, 4, 5, 1, 2, 4, 1, 0, 3, 2, 3, 5, 3, 6, 0,
                    2, 0, 1, 3, 5, 3, 4, 2, 4, 1, 3]
(a=3, N=443, Q_v=2.02, best=2.02, ubc=2.10)
└── (a=2, N=442, Q_v=2.02, best=2.02, ubc=2.10)
    └── (a=1, N=441, Q_v=2.02, best=2.02, ubc=2.10)
           INFO     selected action 2 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5, 0, 2, 0, 4, 0, 4, 5, 1, 2, 4, 1, 0, 3, 2, 3, 5, 3, 6, 0,
                    2, 0, 1, 3, 5, 3, 4, 2, 4, 1, 3, 2]
(a=2, N=567, Q_v=2.02, best=2.02, ubc=2.09)
└── (a=1, N=566, Q_v=2.02, best=2.02, ubc=2.09)
           INFO     selected action 1 after 125 simulations.
           INFO     current action list: [4, 5, 5, 1, 5, 0, 2, 0, 4, 0, 4, 5, 1, 2, 4, 1, 0, 3, 2, 3, 5, 3, 6, 0,
                    2, 0, 1, 3, 5, 3, 4, 2, 4, 1, 3, 2, 1]
           INFO     Final action list: [4, 5, 5, 1, 5, 0, 2, 0, 4, 0, 4, 5, 1, 2, 4, 1, 0, 3, 2, 3, 5, 3, 6, 0, 2,
                    0, 1, 3, 5, 3, 4, 2, 4, 1, 3, 2, 1]
         ╔═══════════════════════════════════════════════════════╗
Job 0    ║         ║ Machine 0   
Job 1    ║   ║ Machine 1   
Job 2    ║          ║ Machine 2   
Job 3    ║                   ║ Machine 3   
Job 4    ║║ Machine 4   
Job 5    ║║ Machine 5   
         ╚╦════╤════╤════╤════╤════╦════╤════╤════╤════╤════╦════╝
          0.0                      26.8                     53.6
makespan: 58