7 changes: 4 additions & 3 deletions docs/source/tutorials/algos/customize_algos.md
@@ -119,16 +119,17 @@ Here is an example of unit testing in LightZero. In this example, we test the `i
 ```Python
 import pytest
 import torch
-from lzero.policy.scaling_transform import inverse_scalar_transform, InverseScalarTransform
+from lzero.policy.scaling_transform import DiscreteSupport, inverse_scalar_transform, InverseScalarTransform

 @pytest.mark.unittest
 def test_scaling_transform():
     import time
     logit = torch.randn(16, 601)
+    discrete_support = DiscreteSupport(-300., 301., 1.)
     start = time.time()
-    output_1 = inverse_scalar_transform(logit, 300)
+    output_1 = inverse_scalar_transform(logit, discrete_support)
     print('t1', time.time() - start)
-    handle = InverseScalarTransform(300)
+    handle = InverseScalarTransform(discrete_support)
     start = time.time()
     output_2 = handle(logit)
     print('t2', time.time() - start)
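Note on the change above: the test now passes a `DiscreteSupport` object where a bare scale integer (`300`) used to suffice. Below is a minimal sketch of such a wrapper, assuming it simply materializes its atoms with `torch.arange(start, stop, step)`; the real LightZero class may carry more state. Under that assumption, `DiscreteSupport(-300., 301., 1.)` enumerates the 601 atoms from -300 to 300, matching the width of the `(16, 601)` logits in the test.

```python
import torch

class DiscreteSupportSketch:
    """Hypothetical stand-in for lzero's DiscreteSupport: wraps (start, stop, step)."""

    def __init__(self, start: float, stop: float, step: float = 1.0) -> None:
        self.arange = torch.arange(start, stop, step)  # the support atoms
        self.size = self.arange.shape[0]               # number of atoms

support = DiscreteSupportSketch(-300., 301., 1.)
assert support.size == 601  # consistent with torch.randn(16, 601) in the test
```

An explicit support object can also express asymmetric or non-unit-step supports, which a single scale integer could not.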
7 changes: 4 additions & 3 deletions docs/source/tutorials/algos/customize_algos_zh.md
@@ -120,16 +120,17 @@ if timestep.done:
 ```Python
 import pytest
 import torch
-from lzero.policy.scaling_transform import inverse_scalar_transform, InverseScalarTransform
+from lzero.policy.scaling_transform import DiscreteSupport, inverse_scalar_transform, InverseScalarTransform

 @pytest.mark.unittest
 def test_scaling_transform():
     import time
     logit = torch.randn(16, 601)
+    discrete_support = DiscreteSupport(-300., 301., 1.)
     start = time.time()
-    output_1 = inverse_scalar_transform(logit, 300)
+    output_1 = inverse_scalar_transform(logit, discrete_support)
     print('t1', time.time() - start)
-    handle = InverseScalarTransform(300)
+    handle = InverseScalarTransform(discrete_support)
     start = time.time()
     output_2 = handle(logit)
     print('t2', time.time() - start)
3 changes: 2 additions & 1 deletion docs/source/tutorials/config/config.md
@@ -44,7 +44,8 @@ The `main_config` dictionary contains the main parameter settings for running th
 - `downsample`: Whether to downsample the input.
 - `norm_type`: The type of normalization used.
 - `num_channels`: The number of channels in the convolutional layers (number of features extracted).
-- `support_scale`: The range of the value support set (`-support_scale` to `support_scale`).
+- `reward_support_range`: The range of the reward support set (`(start, stop, step)`).
+- `value_support_range`: The range of the value support set (`(start, stop, step)`).
 - `bias`: Whether to use bias terms in the layers.
 - `discrete_action_encoding_type`: How discrete actions are encoded.
 - `self_supervised_learning_loss`: Whether to use a self-supervised learning loss (as in EfficientZero).
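To make the tuple format concrete: the ranges used throughout this PR reproduce the old support sizes exactly. A quick check, assuming the atoms are enumerated `arange`-style from `(start, stop, step)`:

```python
import torch

# (start, stop, step) -> number of support atoms, assuming arange-style enumeration.
assert torch.arange(-10., 11., 1.).numel() == 21     # replaces support_scale=10, *_support_size=21
assert torch.arange(-300., 301., 1.).numel() == 601  # matches the 601-way logits in the unit test
```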
3 changes: 2 additions & 1 deletion docs/source/tutorials/config/config_zh.md
@@ -43,7 +43,8 @@
 - `downsample`: 是否进行降采样。
 - `norm_type`: 归一化使用的方法。
 - `num_channels`: 卷积层提取的特征个数。
-- `support_scale`: 价值支持集的范围 (-support_scale, support_scale)。
+- `reward_support_range`: 奖励支持集的范围 (`(start, stop, step)`)。
+- `value_support_range`: 价值支持集的范围 (`(start, stop, step)`)。
 - `bias`: 是否使用偏置。
 - `discrete_action_encoding_type`: 离散化动作空间使用的编码类型。
 - `self_supervised_learning_loss`: 是否使用自监督学习损失(参照EfficientZero的实现)。
5 changes: 2 additions & 3 deletions lzero/agent/config/gumbel_muzero/gomoku_play_with_bot.py
@@ -44,9 +44,8 @@
     image_channel=3,
     num_res_blocks=1,
     num_channels=32,
-    support_scale=10,
-    reward_support_size=21,
-    value_support_size=21,
+    reward_support_range=(-10., 11., 1.),
+    value_support_range=(-10., 11., 1.),
 ),
 cuda=True,
 env_type='board_games',
5 changes: 2 additions & 3 deletions lzero/agent/config/gumbel_muzero/tictactoe_play_with_bot.py
@@ -38,9 +38,8 @@
     reward_head_hidden_channels=[8],
     value_head_hidden_channels=[8],
     policy_head_hidden_channels=[8],
-    support_scale=10,
-    reward_support_size=21,
-    value_support_size=21,
+    reward_support_range=(-10., 11., 1.),
+    value_support_range=(-10., 11., 1.),
 ),
 cuda=True,
 env_type='board_games',
5 changes: 2 additions & 3 deletions lzero/agent/config/muzero/gomoku_play_with_bot.py
@@ -44,9 +44,8 @@
     image_channel=3,
     num_res_blocks=1,
     num_channels=32,
-    support_scale=10,
-    reward_support_size=21,
-    value_support_size=21,
+    reward_support_range=(-10., 11., 1.),
+    value_support_range=(-10., 11., 1.),
 ),
 cuda=True,
 env_type='board_games',
5 changes: 2 additions & 3 deletions lzero/agent/config/muzero/tictactoe_play_with_bot.py
@@ -38,9 +38,8 @@
     reward_head_hidden_channels=[8],
     value_head_hidden_channels=[8],
     policy_head_hidden_channels=[8],
-    support_scale=10,
-    reward_support_size=21,
-    value_support_size=21,
+    reward_support_range=(-10., 11., 1.),
+    value_support_range=(-10., 11., 1.),
     norm_type='BN',
 ),
 cuda=True,
9 changes: 6 additions & 3 deletions lzero/mcts/buffer/game_buffer_efficientzero.py
@@ -7,7 +7,7 @@
 from lzero.mcts.tree_search.mcts_ctree import EfficientZeroMCTSCtree as MCTSCtree
 from lzero.mcts.tree_search.mcts_ptree import EfficientZeroMCTSPtree as MCTSPtree
 from lzero.mcts.utils import prepare_observation
-from lzero.policy import to_detach_cpu_numpy, concat_output, concat_output_value, inverse_scalar_transform
+from lzero.policy import DiscreteSupport, to_detach_cpu_numpy, concat_output, concat_output_value, inverse_scalar_transform
 from .game_buffer_muzero import MuZeroGameBuffer


@@ -45,6 +45,9 @@ def __init__(self, cfg: dict):
         self.base_idx = 0
         self.clear_time = 0

+        self.value_support = DiscreteSupport(*self._cfg.model.value_support_range)
+        self.reward_support = DiscreteSupport(*self._cfg.model.reward_support_range)
+
     def sample(self, batch_size: int, policy: Any) -> List[Any]:
         """
         Overview:
@@ -209,7 +212,7 @@ def _compute_target_reward_value(self, reward_value_context: List[Any], model: A
             [m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
                 [
                     m_output.latent_state,
-                    inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
+                    inverse_scalar_transform(m_output.value, self.value_support),
                     m_output.policy_logits
                 ]
             )
@@ -359,7 +362,7 @@ def _compute_target_policy_reanalyzed(self, policy_re_context: List[Any], model:
             [m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
                 [
                     m_output.latent_state,
-                    inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
+                    inverse_scalar_transform(m_output.value, self.value_support),
                     m_output.policy_logits
                 ]
             )
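The buffers below all repeat the two-step pattern introduced here: construct the value and reward supports once in `__init__`, then pass them to `inverse_scalar_transform` wherever a categorical value head is decoded. For orientation, here is a sketch of what that decoding plausibly involves, assuming the standard MuZero recipe (expectation over support atoms, then the inverse of the value transform h(x) = sign(x)(sqrt(|x| + 1) - 1) + eps * x); this is an illustration under those assumptions, not the LightZero source:

```python
import torch

def inverse_scalar_transform_sketch(
    logits: torch.Tensor, support_atoms: torch.Tensor, epsilon: float = 0.001
) -> torch.Tensor:
    """Sketch: decode categorical logits over support atoms into scalar values."""
    # Expected value under the softmax distribution over the support atoms.
    probs = torch.softmax(logits, dim=-1)
    x = (probs * support_atoms).sum(dim=-1, keepdim=True)
    # Invert h(x) = sign(x) * (sqrt(|x| + 1) - 1) + epsilon * x (MuZero appendix).
    sign = torch.sign(x)
    y = (torch.sqrt(1. + 4. * epsilon * (x.abs() + 1. + epsilon)) - 1.) / (2. * epsilon)
    return sign * (y ** 2 - 1.)

# e.g. decoding a batch of 601-way value logits over the atoms -300 ... 300:
values = inverse_scalar_transform_sketch(torch.randn(16, 601), torch.arange(-300., 301., 1.))
```

Caching the `DiscreteSupport` objects on the buffer means the atom tensor is built once rather than on every decode, which is presumably why each `__init__` constructs them up front.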
9 changes: 6 additions & 3 deletions lzero/mcts/buffer/game_buffer_muzero.py
@@ -8,7 +8,7 @@
 from lzero.mcts.tree_search.mcts_ctree import MuZeroMCTSCtree as MCTSCtree
 from lzero.mcts.tree_search.mcts_ptree import MuZeroMCTSPtree as MCTSPtree
 from lzero.mcts.utils import prepare_observation
-from lzero.policy import to_detach_cpu_numpy, concat_output, concat_output_value, inverse_scalar_transform
+from lzero.policy import DiscreteSupport, to_detach_cpu_numpy, concat_output, concat_output_value, inverse_scalar_transform
 from .game_buffer import GameBuffer

 if TYPE_CHECKING:
@@ -61,6 +61,9 @@ def __init__(self, cfg: dict):
         self.sample_times = 0
         self.active_root_num = 0

+        self.value_support = DiscreteSupport(*self._cfg.model.value_support_range)
+        self.reward_support = DiscreteSupport(*self._cfg.model.reward_support_range)
+
     def reset_runtime_metrics(self):
         """
         Overview:
@@ -473,7 +476,7 @@ def _compute_target_reward_value(self, reward_value_context: List[Any], model: A
             [m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
                 [
                     m_output.latent_state,
-                    inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
+                    inverse_scalar_transform(m_output.value, self.value_support),
                     m_output.policy_logits
                 ]
             )
@@ -598,7 +601,7 @@ def _compute_target_policy_reanalyzed(self, policy_re_context: List[Any], model:
             [m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
                 [
                     m_output.latent_state,
-                    inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
+                    inverse_scalar_transform(m_output.value, self.value_support),
                     m_output.policy_logits
                 ]
             )
7 changes: 5 additions & 2 deletions lzero/mcts/buffer/game_buffer_rezero_ez.py
@@ -6,7 +6,7 @@

 from lzero.mcts.tree_search.mcts_ctree import EfficientZeroMCTSCtree as MCTSCtree
 from lzero.mcts.utils import prepare_observation
-from lzero.policy import to_detach_cpu_numpy, concat_output, inverse_scalar_transform
+from lzero.policy import DiscreteSupport, to_detach_cpu_numpy, concat_output, inverse_scalar_transform
 from .game_buffer_efficientzero import EfficientZeroGameBuffer
 from .game_buffer_rezero_mz import ReZeroMZGameBuffer, compute_all_filters

@@ -71,6 +71,9 @@ def __init__(self, cfg: dict):
         self.active_root_num = 0
         self.average_infer = 0

+        self.value_support = DiscreteSupport(*self._cfg.model.value_support_range)
+        self.reward_support = DiscreteSupport(*self._cfg.model.reward_support_range)
+
     def sample(
         self, batch_size: int, policy: Union["MuZeroPolicy", "EfficientZeroPolicy", "SampledEfficientZeroPolicy"]
     ) -> List[Any]:
@@ -172,7 +175,7 @@ def _compute_target_policy_reanalyzed(self, policy_re_context: List[Any], model:
             [m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
                 [
                     m_output.latent_state,
-                    inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
+                    inverse_scalar_transform(m_output.value, self.value_support),
                     m_output.policy_logits
                 ]
             )
7 changes: 5 additions & 2 deletions lzero/mcts/buffer/game_buffer_rezero_mz.py
@@ -8,7 +8,7 @@
 from lzero.mcts.tree_search.mcts_ctree import MuZeroMCTSCtree as MCTSCtree
 from lzero.mcts.tree_search.mcts_ptree import MuZeroMCTSPtree as MCTSPtree
 from lzero.mcts.utils import prepare_observation
-from lzero.policy import to_detach_cpu_numpy, concat_output, inverse_scalar_transform
+from lzero.policy import DiscreteSupport, to_detach_cpu_numpy, concat_output, inverse_scalar_transform
 from .game_buffer_muzero import MuZeroGameBuffer

 # from line_profiler import line_profiler
@@ -76,6 +76,9 @@ def __init__(self, cfg: dict):
         self.active_root_num = 0
         self.average_infer = 0

+        self.value_support = DiscreteSupport(*self._cfg.model.value_support_range)
+        self.reward_support = DiscreteSupport(*self._cfg.model.reward_support_range)
+
     def reanalyze_buffer(
         self, batch_size: int, policy: Union["MuZeroPolicy", "EfficientZeroPolicy", "SampledEfficientZeroPolicy"]
     ) -> List[Any]:
@@ -244,7 +247,7 @@ def _compute_target_policy_reanalyzed(self, policy_re_context: List[Any], model:
             m_output.latent_state, m_output.value, m_output.policy_logits = to_detach_cpu_numpy(
                 [
                     m_output.latent_state,
-                    inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
+                    inverse_scalar_transform(m_output.value, self.value_support),
                     m_output.policy_logits
                 ]
             )
9 changes: 6 additions & 3 deletions lzero/mcts/buffer/game_buffer_sampled_efficientzero.py
@@ -7,7 +7,7 @@
 from lzero.mcts.tree_search.mcts_ctree_sampled import SampledEfficientZeroMCTSCtree as MCTSCtree
 from lzero.mcts.tree_search.mcts_ptree_sampled import SampledEfficientZeroMCTSPtree as MCTSPtree
 from lzero.mcts.utils import prepare_observation, generate_random_actions_discrete
-from lzero.policy import to_detach_cpu_numpy, concat_output, concat_output_value, inverse_scalar_transform
+from lzero.policy import DiscreteSupport, to_detach_cpu_numpy, concat_output, concat_output_value, inverse_scalar_transform
 from .game_buffer_efficientzero import EfficientZeroGameBuffer


@@ -45,6 +45,9 @@ def __init__(self, cfg: dict):
         self.base_idx = 0
         self.clear_time = 0

+        self.value_support = DiscreteSupport(*self._cfg.model.value_support_range)
+        self.reward_support = DiscreteSupport(*self._cfg.model.reward_support_range)
+
     def sample(self, batch_size: int, policy: Any) -> List[Any]:
         """
         Overview:
@@ -291,7 +294,7 @@ def _compute_target_reward_value(self, reward_value_context: List[Any], model: A
             [m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
                 [
                     m_output.latent_state,
-                    inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
+                    inverse_scalar_transform(m_output.value, self.value_support),
                     m_output.policy_logits
                 ]
             )
@@ -469,7 +472,7 @@ def _compute_target_policy_reanalyzed(self, policy_re_context: List[Any], model:
             [m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
                 [
                     m_output.latent_state,
-                    inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
+                    inverse_scalar_transform(m_output.value, self.value_support),
                     m_output.policy_logits
                 ]
             )
9 changes: 6 additions & 3 deletions lzero/mcts/buffer/game_buffer_sampled_muzero.py
@@ -7,7 +7,7 @@
 from lzero.mcts.tree_search.mcts_ctree_sampled import SampledMuZeroMCTSCtree as MCTSCtree
 # from lzero.mcts.tree_search.mcts_ptree_sampled import SampledMuZeroMCTSPtree as MCTSPtree
 from lzero.mcts.utils import prepare_observation, generate_random_actions_discrete
-from lzero.policy import to_detach_cpu_numpy, concat_output, concat_output_value, inverse_scalar_transform
+from lzero.policy import DiscreteSupport, to_detach_cpu_numpy, concat_output, concat_output_value, inverse_scalar_transform
 from .game_buffer_muzero import MuZeroGameBuffer


@@ -45,6 +45,9 @@ def __init__(self, cfg: dict):
         self.base_idx = 0
         self.clear_time = 0

+        self.value_support = DiscreteSupport(*self._cfg.model.value_support_range)
+        self.reward_support = DiscreteSupport(*self._cfg.model.reward_support_range)
+
     def sample(self, batch_size: int, policy: Any) -> List[Any]:
         """
         Overview:
@@ -291,7 +294,7 @@ def _compute_target_reward_value(self, reward_value_context: List[Any], model: A
             [m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
                 [
                     m_output.latent_state,
-                    inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
+                    inverse_scalar_transform(m_output.value, self.value_support),
                     m_output.policy_logits
                 ]
             )
@@ -454,7 +457,7 @@ def _compute_target_policy_reanalyzed(self, policy_re_context: List[Any], model:
             [m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
                 [
                     m_output.latent_state,
-                    inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
+                    inverse_scalar_transform(m_output.value, self.value_support),
                     m_output.policy_logits
                 ]
             )
9 changes: 6 additions & 3 deletions lzero/mcts/buffer/game_buffer_sampled_unizero.py
@@ -7,7 +7,7 @@
 from lzero.mcts.tree_search.mcts_ctree_sampled import SampledUniZeroMCTSCtree as MCTSCtree
 # from lzero.mcts.tree_search.mcts_ptree import MuZeroMCTSPtree as MCTSPtree
 from lzero.mcts.utils import prepare_observation, generate_random_actions_discrete
-from lzero.policy import to_detach_cpu_numpy, concat_output, concat_output_value, inverse_scalar_transform
+from lzero.policy import DiscreteSupport, to_detach_cpu_numpy, concat_output, concat_output_value, inverse_scalar_transform
 from .game_buffer_unizero import UniZeroGameBuffer

 if TYPE_CHECKING:
@@ -51,6 +51,9 @@ def __init__(self, cfg: dict):
         # self.task_id = self._cfg.task_id
         self.sample_type = self._cfg.sample_type  # 'transition' or 'episode'

+        self.value_support = DiscreteSupport(*self._cfg.model.value_support_range)
+        self.reward_support = DiscreteSupport(*self._cfg.model.reward_support_range)
+
     def reanalyze_buffer(
         self, batch_size: int, policy: Union["MuZeroPolicy", "EfficientZeroPolicy", "SampledEfficientZeroPolicy"]
     ) -> List[Any]:
@@ -493,7 +496,7 @@ def _compute_target_policy_reanalyzed(self, policy_re_context: List[Any], model:
             [m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
                 [
                     m_output.latent_state,
-                    inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
+                    inverse_scalar_transform(m_output.value, self.value_support),
                     m_output.policy_logits
                 ]
             )
@@ -651,7 +654,7 @@ def _compute_target_reward_value(self, reward_value_context: List[Any], model: A
             [m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
                 [
                     m_output.latent_state,
-                    inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
+                    inverse_scalar_transform(m_output.value, self.value_support),
                     m_output.policy_logits
                 ]
             )