From e571f50e3d2f0571f1e12337c135767337aa11c0 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 4 Nov 2022 14:04:06 +0800 Subject: [PATCH 001/244] td3 fix --- ding/framework/middleware/functional/logger.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/ding/framework/middleware/functional/logger.py b/ding/framework/middleware/functional/logger.py index d13b4a9c2f..46b16a9561 100644 --- a/ding/framework/middleware/functional/logger.py +++ b/ding/framework/middleware/functional/logger.py @@ -148,7 +148,14 @@ def _plot(ctx: "OnlineRLContext"): return for metric in metric_list: if metric in ctx.train_output[0]: - metric_value = np.mean([item[metric] for item in ctx.train_output]) + # metric_value = np.mean([item[metric] for item in ctx.train_output]) + metric_value_list = [] + for item in ctx.train_output: + if isinstance(item[metric], torch.Tensor): + metric_value_list.append(item[metric].cpu().detach().numpy()) + else: + metric_value_list.append(item[metric]) + metric_value = np.mean(metric_value_list) wandb.log({metric: metric_value}) if ctx.eval_value != -np.inf: From 9060c53ef44c5871fe1c021c3b155f6f589924ce Mon Sep 17 00:00:00 2001 From: zjowowen Date: Mon, 19 Dec 2022 16:39:08 +0800 Subject: [PATCH 002/244] Add benchmark config file. --- benchmark/hopper_td3_config.py | 72 ++++++++++ benchmark/hopper_td3_pipeline.py | 50 +++++++ benchmark/hopper_td3_wanb_pipeline.py | 129 +++++++++++++++++ benchmark/lunarlander_dqn_config.py | 80 +++++++++++ benchmark/lunarlander_dqn_deploy.py | 39 ++++++ benchmark/lunarlander_dqn_eval.py | 64 +++++++++ benchmark/lunarlander_dqn_pipeline.py | 52 +++++++ benchmark/pendulum_td3_config.py | 64 +++++++++ benchmark/pendulum_td3_pipeline.py | 49 +++++++ benchmark/pendulum_td3_wandb.py | 61 ++++++++ benchmark/qbert_dqn_config.py | 60 ++++++++ benchmark/qbert_dqn_wandb.py | 192 ++++++++++++++++++++++++++ 12 files changed, 912 insertions(+) create mode 100644 benchmark/hopper_td3_config.py create mode 100644 benchmark/hopper_td3_pipeline.py create mode 100644 benchmark/hopper_td3_wanb_pipeline.py create mode 100644 benchmark/lunarlander_dqn_config.py create mode 100644 benchmark/lunarlander_dqn_deploy.py create mode 100644 benchmark/lunarlander_dqn_eval.py create mode 100644 benchmark/lunarlander_dqn_pipeline.py create mode 100644 benchmark/pendulum_td3_config.py create mode 100644 benchmark/pendulum_td3_pipeline.py create mode 100644 benchmark/pendulum_td3_wandb.py create mode 100644 benchmark/qbert_dqn_config.py create mode 100644 benchmark/qbert_dqn_wandb.py diff --git a/benchmark/hopper_td3_config.py b/benchmark/hopper_td3_config.py new file mode 100644 index 0000000000..207a5dc75e --- /dev/null +++ b/benchmark/hopper_td3_config.py @@ -0,0 +1,72 @@ +from easydict import EasyDict + +hopper_td3_config = dict( + exp_name='hopper_td3_seed0', + env=dict( + env_id='Hopper-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=1, + evaluator_env_num=1, + n_evaluator_episode=1, + stop_value=6000, + ), + policy=dict( + cuda=True, + random_collect_size=25000, + model=dict( + obs_shape=11, + action_shape=3, + twin_critic=True, + actor_head_hidden_size=256, + critic_head_hidden_size=256, + action_space='regression', + ), + logger=dict(record_path='./video_hopper_td3', gradient_logger=True, plot_logger=True, action_logger='q_value'), + learn=dict( + update_per_collect=1, + batch_size=256, + learning_rate_actor=1e-3, + learning_rate_critic=1e-3, + ignore_done=False, + target_theta=0.005, + 
discount_factor=0.99, + actor_update_freq=2, + noise=True, + noise_sigma=0.2, + noise_range=dict( + min=-0.5, + max=0.5, + ), + ), + collect=dict( + n_sample=1, + unroll_len=1, + noise_sigma=0.1, + ), + other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), + ) +) + +hopper_td3_config = EasyDict(hopper_td3_config) +main_config = hopper_td3_config + +hopper_td3_create_config = dict( + env=dict( + type='mujoco', + import_names=['dizoo.mujoco.envs.mujoco_env'], + ), + env_manager=dict(type='subprocess'), + policy=dict( + type='td3', + import_names=['ding.policy.td3'], + ), + replay_buffer=dict(type='naive', ), +) +hopper_td3_create_config = EasyDict(hopper_td3_create_config) +create_config = hopper_td3_create_config + +if __name__ == "__main__": + # or you can enter `ding -m serial -c hopper_td3_config.py -s 0` + from ding.entry import serial_pipeline + serial_pipeline([main_config, create_config], seed=0) diff --git a/benchmark/hopper_td3_pipeline.py b/benchmark/hopper_td3_pipeline.py new file mode 100644 index 0000000000..5de4ea0cfd --- /dev/null +++ b/benchmark/hopper_td3_pipeline.py @@ -0,0 +1,50 @@ +import gym +from ditk import logging +from ding.model.template.qac import QAC +from ding.policy import TD3Policy +from ding.envs import DingEnvWrapper, BaseEnvManagerV2, SubprocessEnvManagerV2 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, CkptSaver +from ding.utils import set_pkg_seed +from ding.utils.log_helper import build_logger +from dizoo.mujoco.envs.mujoco_env import MujocoEnv + +from hopper_td3_config import main_config, create_config + + +def main(seed): + logging.getLogger().setLevel(logging.INFO) + cfg = compile_config(main_config, create_cfg=create_config, auto=True) + cfg.env.seed = seed + + logger_, tb_logger = build_logger(path='./log/hopper_td3/seed' + str(seed), need_tb=True) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = SubprocessEnvManagerV2( + env_fn=[lambda: MujocoEnv(cfg.env) for _ in range(cfg.env.collector_env_num)], cfg=cfg.env.manager + ) + evaluator_env = SubprocessEnvManagerV2( + env_fn=[lambda: MujocoEnv(cfg.env) for _ in range(cfg.env.evaluator_env_num)], cfg=cfg.env.manager + ) + evaluator_env.enable_save_replay(replay_path=cfg.policy.logger.record_path) + + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = QAC(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = TD3Policy(cfg.policy, model) + + task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use( + StepCollector(cfg, policy.collect_mode, collector_env, random_collect_size=cfg.policy.random_collect_size) + ) + task.use(data_pusher(cfg, buffer_)) + task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(CkptSaver(cfg, policy, train_freq=100)) + task.run() + + +if __name__ == "__main__": + main(0) diff --git a/benchmark/hopper_td3_wanb_pipeline.py b/benchmark/hopper_td3_wanb_pipeline.py new file mode 100644 index 0000000000..445fbf0db2 --- /dev/null +++ b/benchmark/hopper_td3_wanb_pipeline.py @@ -0,0 +1,129 @@ +from ditk import logging +from ding.model.template.qac import QAC +from ding.policy import TD3Policy +from ding.envs import DingEnvWrapper, BaseEnvManagerV2, SubprocessEnvManagerV2 +from ding.data import DequeBuffer +from ding.config 
import compile_config +from ding.framework import task +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, CkptSaver, termination_checker, wandb_online_logger +from ding.utils import set_pkg_seed +from ding.utils.log_helper import build_logger +from dizoo.mujoco.envs.mujoco_env import MujocoEnv +from easydict import EasyDict +import wandb + +hopper_td3_config = dict( + exp_name='hopper_td3_wandb_seed0', + seed=0, + env=dict( + env_id='Hopper-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=6000, + ), + policy=dict( + cuda=True, + random_collect_size=25000, + model=dict( + obs_shape=11, + action_shape=3, + twin_critic=True, + actor_head_hidden_size=256, + critic_head_hidden_size=256, + action_space='regression', + ), + logger=dict(record_path='./video_hopper_td3', gradient_logger=True, plot_logger=True, action_logger=None), + learn=dict( + update_per_collect=1, + batch_size=256, + learning_rate_actor=1e-3, + learning_rate_critic=1e-3, + ignore_done=False, + target_theta=0.005, + discount_factor=0.99, + actor_update_freq=2, + noise=True, + noise_sigma=0.2, + noise_range=dict( + min=-0.5, + max=0.5, + ), + ), + collect=dict( + n_sample=1, + unroll_len=1, + noise_sigma=0.1, + ), + other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), + ) +) + +hopper_td3_config = EasyDict(hopper_td3_config) +main_config = hopper_td3_config + +hopper_td3_create_config = dict( + env=dict( + type='mujoco', + import_names=['dizoo.mujoco.envs.mujoco_env'], + ), + env_manager=dict(type='subprocess'), + policy=dict( + type='td3', + import_names=['ding.policy.td3'], + ), + replay_buffer=dict(type='naive', ), +) +hopper_td3_create_config = EasyDict(hopper_td3_create_config) +create_config = hopper_td3_create_config + + +def main(seed=0, max_env_step=10000000): + logging.getLogger().setLevel(logging.INFO) + cfg = compile_config(main_config, create_cfg=create_config, auto=True) + cfg.env.seed = seed + + wandb.init( + # Set the project where this run will be logged + project='hopper-td3', + # We pass a run name (otherwise it’ll be randomly assigned, like sunshine-lollypop-10) + name=str(main_config["DI-toolkit-hpo-id"]), + # Track hyperparameters and run metadata + config=cfg + ) + + # logger_, tb_logger = build_logger(path='./log/hopper_td3/seed' + str(seed), + # need_tb=True) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = SubprocessEnvManagerV2( + env_fn=[lambda: MujocoEnv(cfg.env) for _ in range(cfg.env.collector_env_num)], cfg=cfg.env.manager + ) + evaluator_env = SubprocessEnvManagerV2( + env_fn=[lambda: MujocoEnv(cfg.env) for _ in range(cfg.env.evaluator_env_num)], cfg=cfg.env.manager + ) + cfg.policy.logger.record_path = './' + cfg.exp_name + '/video' + evaluator_env.enable_save_replay(replay_path=cfg.policy.logger.record_path) + + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = QAC(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = TD3Policy(cfg.policy, model) + + task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use( + StepCollector(cfg, policy.collect_mode, collector_env, random_collect_size=cfg.policy.random_collect_size) + ) + task.use(data_pusher(cfg, buffer_)) + task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(CkptSaver(cfg, 
policy, train_freq=100)) + task.use(wandb_online_logger(cfg.policy.logger, evaluator_env, model)) + task.use(termination_checker(max_env_step=max_env_step)) + task.run() + + +if __name__ == "__main__": + main(seed=main_config.seed, max_env_step=10000000) diff --git a/benchmark/lunarlander_dqn_config.py b/benchmark/lunarlander_dqn_config.py new file mode 100644 index 0000000000..cc4bfcc9ec --- /dev/null +++ b/benchmark/lunarlander_dqn_config.py @@ -0,0 +1,80 @@ +from easydict import EasyDict + +nstep = 3 +lunarlander_dqn_config = dict( + exp_name='lunarlander_dqn_seed0', + env=dict( + # Whether to use shared memory. Only effective if "env_manager_type" is 'subprocess' + # Env number respectively for collector and evaluator. + collector_env_num=1, + evaluator_env_num=1, + env_id='LunarLander-v2', + n_evaluator_episode=1, + stop_value=200, + # The path to save the game replay + # replay_path='./lunarlander_dqn_seed0/video', + ), + policy=dict( + # Whether to use cuda for network. + cuda=True, + load_path="./lunarlander_dqn_seed0/ckpt/ckpt_best.pth.tar", + model=dict( + obs_shape=8, + action_shape=4, + encoder_hidden_size_list=[512, 64], + # Whether to use dueling head. + dueling=True, + ), + # Reward's future discount factor, aka. gamma. + discount_factor=0.99, + # How many steps in td error. + nstep=nstep, + # learn_mode config + learn=dict( + update_per_collect=10, + batch_size=64, + learning_rate=0.001, + # Frequency of target network update. + target_update_freq=100, + ), + # collect_mode config + collect=dict( + # You can use either "n_sample" or "n_episode" in collector.collect. + # Get "n_sample" samples per collect. + n_sample=64, + # Cut trajectories into pieces with length "unroll_len". + unroll_len=1, + ), + # command_mode config + other=dict( + # Epsilon greedy with decay. + eps=dict( + # Decay type. Support ['exp', 'linear']. 
+ type='exp', + start=0.95, + end=0.1, + decay=50000, + ), + replay_buffer=dict(replay_buffer_size=100000, ) + ), + ), +) +lunarlander_dqn_config = EasyDict(lunarlander_dqn_config) +main_config = lunarlander_dqn_config + +lunarlander_dqn_create_config = dict( + env=dict( + type='lunarlander', + import_names=['dizoo.box2d.lunarlander.envs.lunarlander_env'], + ), + env_manager=dict(type='subprocess'), + # env_manager=dict(type='base'), + policy=dict(type='dqn'), +) +lunarlander_dqn_create_config = EasyDict(lunarlander_dqn_create_config) +create_config = lunarlander_dqn_create_config + +if __name__ == "__main__": + # or you can enter `ding -m serial -c lunarlander_dqn_config.py -s 0` + from ding.entry import serial_pipeline + serial_pipeline([main_config, create_config], seed=0) diff --git a/benchmark/lunarlander_dqn_deploy.py b/benchmark/lunarlander_dqn_deploy.py new file mode 100644 index 0000000000..e3bd346b48 --- /dev/null +++ b/benchmark/lunarlander_dqn_deploy.py @@ -0,0 +1,39 @@ +import gym +import torch +from easydict import EasyDict +from ding.config import compile_config +from ding.envs import DingEnvWrapper +from ding.policy import DQNPolicy, single_env_forward_wrapper +from ding.model import DQN +#from dizoo.box2d.lunarlander.config.lunarlander_dqn_config import main_config, create_config +from lunarlander_dqn_config import main_config, create_config + + +def main(main_config: EasyDict, create_config: EasyDict, ckpt_path: str): + main_config.exp_name = 'lunarlander_dqn_deploy' + + cfg = compile_config(main_config, create_cfg=create_config, auto=True) + + env = DingEnvWrapper(gym.make(cfg.env.env_id), EasyDict(env_wrapper='default')) + env.enable_save_replay(replay_path='./lunarlander_dqn_deploy/video') + + model = DQN(**cfg.policy.model) + state_dict = torch.load(ckpt_path, map_location='cpu') + model.load_state_dict(state_dict['model']) + policy = DQNPolicy(cfg.policy, model=model).eval_mode + forward_fn = single_env_forward_wrapper(policy.forward) + + obs = env.reset() + + returns = 0. 
+ while True: + action = forward_fn(obs) + obs, rew, done, info = env.step(action) + returns += rew + if done: + break + print(f'Deploy is finished, final epsiode return is: {returns}') + + +if __name__ == "__main__": + main(main_config=main_config, create_config=create_config, ckpt_path='lunarlander_dqn_seed0/ckpt/final.pth.tar') \ No newline at end of file diff --git a/benchmark/lunarlander_dqn_eval.py b/benchmark/lunarlander_dqn_eval.py new file mode 100644 index 0000000000..3309bb308c --- /dev/null +++ b/benchmark/lunarlander_dqn_eval.py @@ -0,0 +1,64 @@ +import os +import gym +import torch +from tensorboardX import SummaryWriter +from easydict import EasyDict + +from ding.config import compile_config +from ding.worker import BaseLearner, SampleSerialCollector, InteractionSerialEvaluator, AdvancedReplayBuffer +from ding.envs import BaseEnvManager, DingEnvWrapper +from ding.policy import DQNPolicy +from ding.model import DQN +from ding.utils import set_pkg_seed +from ding.rl_utils import get_epsilon_greedy_fn +#from dizoo.box2d.lunarlander.config.lunarlander_dqn_config import main_config, create_config +from lunarlander_dqn_config import main_config, create_config + + +# Get DI-engine form env class +def wrapped_cartpole_env(): + return DingEnvWrapper( + gym.make(main_config['env']['env_id']), + EasyDict(env_wrapper='default'), + ) + + +def main(cfg, seed=0): + cfg['exp_name'] = 'lunarlander_dqn_eval' + cfg = compile_config( + cfg, + BaseEnvManager, + DQNPolicy, + BaseLearner, + SampleSerialCollector, + InteractionSerialEvaluator, + AdvancedReplayBuffer, + save_cfg=True + ) + cfg.policy.load_path = 'lunarlander_dqn_seed0/ckpt/final.pth.tar' + evaluator_env_num = cfg.env.evaluator_env_num + evaluator_env = BaseEnvManager(env_fn=[wrapped_cartpole_env for _ in range(evaluator_env_num)], cfg=cfg.env.manager) + + # switch save replay interface + # evaluator_env.enable_save_replay(cfg.env.replay_path) + evaluator_env.enable_save_replay(replay_path='./lunarlander_dqn_eval/video') + + # Set random seed for all package and instance + evaluator_env.seed(seed, dynamic_seed=False) + set_pkg_seed(seed, use_cuda=cfg.policy.cuda) + + # Set up RL Policy + model = DQN(**cfg.policy.model) + policy = DQNPolicy(cfg.policy, model=model) + policy.eval_mode.load_state_dict(torch.load(cfg.policy.load_path, map_location='cpu')) + + # evaluate + tb_logger = SummaryWriter(os.path.join('./{}/log/'.format(cfg.exp_name), 'serial')) + evaluator = InteractionSerialEvaluator( + cfg.policy.eval.evaluator, evaluator_env, policy.eval_mode, tb_logger, exp_name=cfg.exp_name + ) + evaluator.eval() + + +if __name__ == "__main__": + main(main_config) \ No newline at end of file diff --git a/benchmark/lunarlander_dqn_pipeline.py b/benchmark/lunarlander_dqn_pipeline.py new file mode 100644 index 0000000000..24e7e6a0d7 --- /dev/null +++ b/benchmark/lunarlander_dqn_pipeline.py @@ -0,0 +1,52 @@ +import gym +from ditk import logging +from ding.model import DQN +from ding.policy import DQNPolicy +from ding.envs import DingEnvWrapper, BaseEnvManagerV2, SubprocessEnvManagerV2 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, online_logger, nstep_reward_enhancer +from ding.utils import set_pkg_seed +from ding.utils.log_helper import build_logger +#from 
dizoo.box2d.lunarlander.config.lunarlander_dqn_config import main_config, create_config +from lunarlander_dqn_config import main_config, create_config + + +def main(): + logging.getLogger().setLevel(logging.INFO) + cfg = compile_config(main_config, create_cfg=create_config, auto=True) + ding_init(cfg) + + #logger_, tb_logger = build_logger(path='./lunarlander_dqn_seed0/log/', need_tb=True) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = SubprocessEnvManagerV2( + env_fn=[lambda: DingEnvWrapper(gym.make(cfg.env.env_id)) for _ in range(cfg.env.collector_env_num)], + cfg=cfg.env.manager + ) + evaluator_env = SubprocessEnvManagerV2( + env_fn=[lambda: DingEnvWrapper(gym.make(cfg.env.env_id)) for _ in range(cfg.env.evaluator_env_num)], + cfg=cfg.env.manager + ) + + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNPolicy(cfg.policy, model=model) + + task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use(StepCollector(cfg, policy.collect_mode, collector_env)) + task.use(nstep_reward_enhancer(cfg)) + task.use(data_pusher(cfg, buffer_)) + task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use(CkptSaver(cfg, policy, train_freq=100)) + task.run() + + +if __name__ == "__main__": + main() diff --git a/benchmark/pendulum_td3_config.py b/benchmark/pendulum_td3_config.py new file mode 100644 index 0000000000..48e868b215 --- /dev/null +++ b/benchmark/pendulum_td3_config.py @@ -0,0 +1,64 @@ +from easydict import EasyDict + +pendulum_td3_config = dict( + exp_name='pendulum_td3_seed0', + env=dict( + collector_env_num=1, + evaluator_env_num=1, + # (bool) Scale output action into legal range. 
+ act_scale=True, + n_evaluator_episode=1, + stop_value=-250, + ), + policy=dict( + cuda=False, + priority=False, + random_collect_size=800, + logger=dict(record_path='./video_pendulum_td3', gradient_logger=True, plot_logger=True, action_logger=None), + model=dict( + obs_shape=3, + action_shape=1, + twin_critic=True, + action_space='regression', + ), + learn=dict( + update_per_collect=2, + batch_size=128, + learning_rate_actor=0.001, + learning_rate_critic=0.001, + ignore_done=True, + actor_update_freq=2, + noise=True, + noise_sigma=0.1, + noise_range=dict( + min=-0.5, + max=0.5, + ), + ), + collect=dict( + n_sample=48, + noise_sigma=0.1, + collector=dict(collect_print_freq=1000, ), + ), + eval=dict(evaluator=dict(eval_freq=100, ), ), + other=dict(replay_buffer=dict(replay_buffer_size=20000, ), ), + ), +) +pendulum_td3_config = EasyDict(pendulum_td3_config) +main_config = pendulum_td3_config + +pendulum_td3_create_config = dict( + env=dict( + type='pendulum', + import_names=['dizoo.classic_control.pendulum.envs.pendulum_env'], + ), + env_manager=dict(type='base'), + policy=dict(type='td3'), +) +pendulum_td3_create_config = EasyDict(pendulum_td3_create_config) +create_config = pendulum_td3_create_config + +if __name__ == "__main__": + # or you can enter `ding -m serial -c pendulum_td3_config.py -s 0` + from ding.entry import serial_pipeline + serial_pipeline([main_config, create_config], seed=0) diff --git a/benchmark/pendulum_td3_pipeline.py b/benchmark/pendulum_td3_pipeline.py new file mode 100644 index 0000000000..45794e8464 --- /dev/null +++ b/benchmark/pendulum_td3_pipeline.py @@ -0,0 +1,49 @@ +import gym +from ditk import logging +from ding.model.template.qac import QAC +from ding.policy import TD3Policy +from ding.envs import DingEnvWrapper, BaseEnvManagerV2, SubprocessEnvManagerV2 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, CkptSaver +from ding.utils import set_pkg_seed +from ding.utils.log_helper import build_logger +from dizoo.classic_control.pendulum.envs.pendulum_env import PendulumEnv +from pendulum_td3_config import main_config, create_config + + +def main(seed): + logging.getLogger().setLevel(logging.INFO) + cfg = compile_config(main_config, create_cfg=create_config, auto=True) + cfg.env.seed = seed + + logger_, tb_logger = build_logger(path='./log/pendulum_td3/seed' + str(seed), need_tb=True) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = SubprocessEnvManagerV2( + env_fn=[lambda: PendulumEnv(cfg.env) for _ in range(cfg.env.collector_env_num)], cfg=cfg.env.manager + ) + evaluator_env = SubprocessEnvManagerV2( + env_fn=[lambda: PendulumEnv(cfg.env) for _ in range(cfg.env.evaluator_env_num)], cfg=cfg.env.manager + ) + evaluator_env.enable_save_replay(replay_path=cfg.policy.logger.record_path) + + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = QAC(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = TD3Policy(cfg.policy, model) + + task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use( + StepCollector(cfg, policy.collect_mode, collector_env, random_collect_size=cfg.policy.random_collect_size) + ) + task.use(data_pusher(cfg, buffer_)) + task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(CkptSaver(cfg, policy, 
train_freq=100)) + task.run() + + +if __name__ == "__main__": + main(0) diff --git a/benchmark/pendulum_td3_wandb.py b/benchmark/pendulum_td3_wandb.py new file mode 100644 index 0000000000..cdfcc6568f --- /dev/null +++ b/benchmark/pendulum_td3_wandb.py @@ -0,0 +1,61 @@ +import gym +from ditk import logging +from ding.model.template.qac import QAC +from ding.policy import TD3Policy +from ding.envs import DingEnvWrapper, BaseEnvManagerV2, SubprocessEnvManagerV2 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, CkptSaver, wandb_online_logger +from ding.utils import set_pkg_seed +from ding.utils.log_helper import build_logger +from dizoo.classic_control.pendulum.envs.pendulum_env import PendulumEnv + +from pendulum_td3_config import main_config, create_config +import wandb + + +def main(seed): + logging.getLogger().setLevel(logging.INFO) + cfg = compile_config(main_config, create_cfg=create_config, auto=True) + cfg.env.seed = seed + + wandb.init( + # Set the project where this run will be logged + project='pendulum-td3-0', + # We pass a run name (otherwise it’ll be randomly assigned, like sunshine-lollypop-10) + name=f"td3", + # Track hyperparameters and run metadata + config=cfg + ) + + logger_, tb_logger = build_logger(path='./log/pendulum_td3/seed' + str(seed), need_tb=True) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = SubprocessEnvManagerV2( + env_fn=[lambda: PendulumEnv(cfg.env) for _ in range(cfg.env.collector_env_num)], cfg=cfg.env.manager + ) + evaluator_env = SubprocessEnvManagerV2( + env_fn=[lambda: PendulumEnv(cfg.env) for _ in range(cfg.env.evaluator_env_num)], cfg=cfg.env.manager + ) + evaluator_env.enable_save_replay(replay_path=cfg.policy.logger.record_path) + + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = QAC(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = TD3Policy(cfg.policy, model) + + task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use( + StepCollector(cfg, policy.collect_mode, collector_env, random_collect_size=cfg.policy.random_collect_size) + ) + task.use(data_pusher(cfg, buffer_)) + task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(wandb_online_logger(cfg.policy.logger, evaluator_env, model)) + task.use(CkptSaver(cfg, policy, train_freq=100)) + task.run() + + +if __name__ == "__main__": + main(0) diff --git a/benchmark/qbert_dqn_config.py b/benchmark/qbert_dqn_config.py new file mode 100644 index 0000000000..f109fc9deb --- /dev/null +++ b/benchmark/qbert_dqn_config.py @@ -0,0 +1,60 @@ +from easydict import EasyDict + +qbert_dqn_config = dict( + exp_name='qbert_dqn_normal', + env=dict( + collector_env_num=1, + evaluator_env_num=1, + n_evaluator_episode=1, + stop_value=30000, + env_id='QbertNoFrameskip-v4', + frame_stack=4 + ), + policy=dict( + cuda=True, + priority=False, + model=dict( + obs_shape=[4, 84, 84], + action_shape=6, + encoder_hidden_size_list=[128, 128, 512], + ), + nstep=3, + discount_factor=0.99, + learn=dict( + update_per_collect=10, + batch_size=32, + learning_rate=0.0001, + target_update_freq=500, + reward_scale_function='normal', + ), + collect=dict(n_sample=100, ), + logger=dict(record_path='./video_qbert_dqn', gradient_logger=True, plot_logger=True, action_logger='q_value'), + 
eval=dict(evaluator=dict(eval_freq=4000, )), + other=dict( + eps=dict( + type='exp', + start=1., + end=0.05, + decay=1000000, + ), + replay_buffer=dict(replay_buffer_size=400000, ), + ), + ), +) +qbert_dqn_config = EasyDict(qbert_dqn_config) +main_config = qbert_dqn_config +qbert_dqn_create_config = dict( + env=dict( + type='atari', + import_names=['dizoo.atari.envs.atari_env'], + ), + env_manager=dict(type='subprocess'), + policy=dict(type='dqn'), +) +qbert_dqn_create_config = EasyDict(qbert_dqn_create_config) +create_config = qbert_dqn_create_config + +if __name__ == '__main__': + # or you can enter ding -m serial -c qbert_dqn_config.py -s 0 + from ding.entry import serial_pipeline + serial_pipeline((main_config, create_config), seed=0) \ No newline at end of file diff --git a/benchmark/qbert_dqn_wandb.py b/benchmark/qbert_dqn_wandb.py new file mode 100644 index 0000000000..06ed9a2033 --- /dev/null +++ b/benchmark/qbert_dqn_wandb.py @@ -0,0 +1,192 @@ +from typing import TYPE_CHECKING, Callable, Any, List, Union +import sys +from copy import deepcopy +from collections import deque +import gym +import torch +import treetensor.torch as ttorch +import numpy as np + +from ditk import logging +from ding.model import DQN +from ding.policy import DQNPolicy +from ding.policy import Policy +from ding.envs import DingEnvWrapper, BaseEnvManagerV2, BaseEnvManager, SubprocessEnvManagerV2 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task +from ding.framework.context import OfflineRLContext +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, StepCollector, data_pusher, \ + eps_greedy_handler, CkptSaver, termination_checker, nstep_reward_enhancer, wandb_online_logger, interaction_evaluator +from ding.utils import set_pkg_seed +from ding.utils import lists_to_dicts +from ding.utils.log_helper import build_logger +from ding.torch_utils import tensor_to_list, to_ndarray +from dizoo.atari.envs.atari_env import AtariEnv +import wandb +from easydict import EasyDict + +from qbert_dqn_config import main_config, create_config + + +class VectorEvalMonitor(object): + """ + Overview: + In some cases, different environment in evaluator may collect different length episode. For example, \ + suppose we want to collect 12 episodes in evaluator but only have 5 environments, if we didn’t do \ + any thing, it is likely that we will get more short episodes than long episodes. As a result, \ + our average reward will have a bias and may not be accurate. we use VectorEvalMonitor to solve the problem. + Interfaces: + __init__, is_finished, update_info, update_reward, get_episode_reward, get_latest_reward, get_current_episode,\ + get_episode_info + """ + + def __init__(self, env_num: int, n_episode: int) -> None: + """ + Overview: + Init method. 
According to the number of episodes and the number of environments, determine how many \ + episodes need to be opened for each environment, and initialize the reward, info and other \ + information + Arguments: + - env_num (:obj:`int`): the number of episodes need to be open + - n_episode (:obj:`int`): the number of environments + """ + assert n_episode >= env_num, "n_episode < env_num, please decrease the number of eval env" + self._env_num = env_num + self._n_episode = n_episode + each_env_episode = [n_episode // env_num for _ in range(env_num)] + for i in range(n_episode % env_num): + each_env_episode[i] += 1 + self._reward = {env_id: deque(maxlen=maxlen) for env_id, maxlen in enumerate(each_env_episode)} + self._info = {env_id: deque(maxlen=maxlen) for env_id, maxlen in enumerate(each_env_episode)} + + def is_finished(self) -> bool: + """ + Overview: + Determine whether the evaluator has completed the work. + Return: + - result: (:obj:`bool`): whether the evaluator has completed the work + """ + return all([len(v) == v.maxlen for v in self._reward.values()]) + + def update_info(self, env_id: int, info: Any) -> None: + """ + Overview: + Update the information of the environment indicated by env_id. + Arguments: + - env_id: (:obj:`int`): the id of the environment we need to update information + - info: (:obj:`Any`): the information we need to update + """ + info = tensor_to_list(info) + self._info[env_id].append(info) + + def update_reward(self, env_id: Union[int, np.ndarray], reward: Any) -> None: + """ + Overview: + Update the reward indicated by env_id. + Arguments: + - env_id: (:obj:`int`): the id of the environment we need to update the reward + - reward: (:obj:`Any`): the reward we need to update + """ + if isinstance(reward, torch.Tensor): + reward = reward.item() + if isinstance(env_id, np.ndarray): + env_id = env_id.item() + self._reward[env_id].append(reward) + + def get_episode_reward(self) -> list: + """ + Overview: + Get the total reward of one episode. + """ + return sum([list(v) for v in self._reward.values()], []) # sum(iterable, start) + + def get_latest_reward(self, env_id: int) -> int: + """ + Overview: + Get the latest reward of a certain environment. + Arguments: + - env_id: (:obj:`int`): the id of the environment we need to get reward. + """ + return self._reward[env_id][-1] + + def get_current_episode(self) -> int: + """ + Overview: + Get the current episode. We can know which episode our evaluator is executing now. + """ + return sum([len(v) for v in self._reward.values()]) + + def get_episode_info(self) -> dict: + """ + Overview: + Get all episode information, such as total reward of one episode. 
+ """ + if len(self._info[0]) == 0: + return None + else: + total_info = sum([list(v) for v in self._info.values()], []) + total_info = lists_to_dicts(total_info) + new_dict = {} + for k in total_info.keys(): + if np.isscalar(total_info[k][0]): + new_dict[k + '_mean'] = np.mean(total_info[k]) + total_info.update(new_dict) + return total_info + + +num_seed = 1 + + +def main(seed): + logging.getLogger().setLevel(logging.INFO) + cfg = compile_config(main_config, create_cfg=create_config, auto=True) + cfg.env.seed = seed + + for i in range(num_seed): + wandb.init( + # Set the project where this run will be logged + project='zjow-QbertNoFrameskip-v4-3', + # We pass a run name (otherwise it’ll be randomly assigned, like sunshine-lollypop-10) + name=f"dqn", + # Track hyperparameters and run metadata + config=cfg + ) + logger_, tb_logger = build_logger(path='./log/qbert_dqn/seed' + str(seed), need_tb=True) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_cfg = deepcopy(cfg.env) + collector_cfg.is_train = True + evaluator_cfg = deepcopy(cfg.env) + evaluator_cfg.is_train = False + collector_env = SubprocessEnvManagerV2( + env_fn=[lambda: AtariEnv(collector_cfg) for _ in range(cfg.env.collector_env_num)], cfg=cfg.env.manager + ) + evaluator_env = SubprocessEnvManagerV2( + env_fn=[lambda: AtariEnv(evaluator_cfg) for _ in range(cfg.env.evaluator_env_num)], cfg=cfg.env.manager + ) + + # collector_env.enable_save_replay(replay_path='./lunarlander_video_train') + evaluator_env.enable_save_replay(replay_path=cfg.policy.logger.record_path) + + set_pkg_seed(seed, use_cuda=cfg.policy.cuda) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNPolicy(cfg.policy, model=model) + + task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use(StepCollector(cfg, policy.collect_mode, collector_env)) + task.use(nstep_reward_enhancer(cfg)) + task.use(data_pusher(cfg, buffer_)) + task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(wandb_online_logger(cfg.policy.logger, evaluator_env, model)) + #task.use(CkptSaver(cfg, policy, train_freq=100)) + task.use(termination_checker(max_env_step=int(10e6))) + #task.use(_add_scalar) + task.run() + + +if __name__ == "__main__": + main(0) \ No newline at end of file From 82a4944ed79a5335c398a68810a5a8d92a191b8f Mon Sep 17 00:00:00 2001 From: zjowowen Date: Sun, 15 Jan 2023 14:17:01 +0800 Subject: [PATCH 003/244] add main --- benchmark/hopper_td3_pipeline.py | 3 ++- scheduler_main.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) create mode 100644 scheduler_main.py diff --git a/benchmark/hopper_td3_pipeline.py b/benchmark/hopper_td3_pipeline.py index 5de4ea0cfd..2f47731d5b 100644 --- a/benchmark/hopper_td3_pipeline.py +++ b/benchmark/hopper_td3_pipeline.py @@ -1,3 +1,4 @@ +import os import gym from ditk import logging from ding.model.template.qac import QAC @@ -42,7 +43,7 @@ def main(seed): ) task.use(data_pusher(cfg, buffer_)) task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(CkptSaver(cfg, policy, train_freq=100)) + task.use(CkptSaver(policy=policy,save_dir=os.path.join(cfg["exp_name"],"model"), train_freq=100)) task.run() diff --git a/scheduler_main.py b/scheduler_main.py new file mode 100644 index 0000000000..fa9bd1a9bb --- /dev/null +++ b/scheduler_main.py @@ -0,0 +1,28 @@ +import os + +from ditk import logging + +from lighttuner.hpo import R, 
uniform, choice +from lighttuner.hpo import hpo +from lighttuner.scheduler import run_scheduler_local + + +def demo(): + dir_name = os.path.abspath('./benchmark') + + with run_scheduler_local(task_config_template_path=os.path.join(dir_name, "hopper_td3_wandb_pipeline.py"), + dijob_project_name="hopper_td3_wandb",max_number_of_running_task=4) as scheduler: + + opt = hpo(scheduler.get_hpo_callable()) + cfg, ret, metrics = opt.grid() \ + .max_steps(5) \ + .max_workers(4) \ + .maximize(R['eval_value']) \ + .spaces({'seed': choice([0,1])}).run() + print(cfg) + print(ret) + + +if __name__ == "__main__": + logging.try_init_root(logging.INFO) + demo() From ad616ff9a05fb60a0d8d135581610f09bdf3c7fd Mon Sep 17 00:00:00 2001 From: zjowowen Date: Sun, 15 Jan 2023 18:29:40 +0800 Subject: [PATCH 004/244] fix --- ..._td3_wanb_pipeline.py => hopper_td3_wandb_pipeline.py} | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) rename benchmark/{hopper_td3_wanb_pipeline.py => hopper_td3_wandb_pipeline.py} (95%) diff --git a/benchmark/hopper_td3_wanb_pipeline.py b/benchmark/hopper_td3_wandb_pipeline.py similarity index 95% rename from benchmark/hopper_td3_wanb_pipeline.py rename to benchmark/hopper_td3_wandb_pipeline.py index 445fbf0db2..4e0c488b13 100644 --- a/benchmark/hopper_td3_wanb_pipeline.py +++ b/benchmark/hopper_td3_wandb_pipeline.py @@ -1,3 +1,5 @@ +import os +import pathlib from ditk import logging from ding.model.template.qac import QAC from ding.policy import TD3Policy @@ -88,7 +90,7 @@ def main(seed=0, max_env_step=10000000): wandb.init( # Set the project where this run will be logged - project='hopper-td3', + project='hopper-td3-0111', # We pass a run name (otherwise it’ll be randomly assigned, like sunshine-lollypop-10) name=str(main_config["DI-toolkit-hpo-id"]), # Track hyperparameters and run metadata @@ -119,11 +121,11 @@ def main(seed=0, max_env_step=10000000): ) task.use(data_pusher(cfg, buffer_)) task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(CkptSaver(cfg, policy, train_freq=100)) + task.use(CkptSaver(policy=policy,save_dir=os.path.join(cfg["exp_name"],"model"), train_freq=100)) task.use(wandb_online_logger(cfg.policy.logger, evaluator_env, model)) task.use(termination_checker(max_env_step=max_env_step)) task.run() if __name__ == "__main__": - main(seed=main_config.seed, max_env_step=10000000) + main(seed=main_config.seed, max_env_step=10000000) From f1aba9c122e0d94b39544b53c1d1ded82cf419aa Mon Sep 17 00:00:00 2001 From: zjowowen Date: Sun, 15 Jan 2023 19:19:42 +0800 Subject: [PATCH 005/244] fix --- benchmark/hopper_td3_wandb_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/hopper_td3_wandb_pipeline.py b/benchmark/hopper_td3_wandb_pipeline.py index 4e0c488b13..c6e7e2e641 100644 --- a/benchmark/hopper_td3_wandb_pipeline.py +++ b/benchmark/hopper_td3_wandb_pipeline.py @@ -122,7 +122,7 @@ def main(seed=0, max_env_step=10000000): task.use(data_pusher(cfg, buffer_)) task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) task.use(CkptSaver(policy=policy,save_dir=os.path.join(cfg["exp_name"],"model"), train_freq=100)) - task.use(wandb_online_logger(cfg.policy.logger, evaluator_env, model)) + task.use(wandb_online_logger(record_path=cfg.policy.logger.record_path,cfg=cfg.policy.logger, env=evaluator_env, model=model)) task.use(termination_checker(max_env_step=max_env_step)) task.run() From 448daa13cdfd4a1376334ca5e0b0b695543a5924 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 10 Feb 2023 07:04:46 +0000 Subject: [PATCH 
006/244] add feature to wandb;fix bugs --- ding/framework/context.py | 3 +- .../middleware/functional/collector.py | 4 +- .../middleware/functional/ctx_helper.py | 4 +- .../middleware/functional/evaluator.py | 7 +- .../framework/middleware/functional/logger.py | 210 ++++++++++++------ ding/policy/ddpg.py | 18 +- 6 files changed, 162 insertions(+), 84 deletions(-) diff --git a/ding/framework/context.py b/ding/framework/context.py index 1dbe998b49..c5a23251dc 100644 --- a/ding/framework/context.py +++ b/ding/framework/context.py @@ -65,13 +65,14 @@ class OnlineRLContext(Context): # eval eval_value: float = -np.inf last_eval_iter: int = -1 + last_eval_value: int = -np.inf eval_output: List = dataclasses.field(default_factory=dict) def __post_init__(self): # This method is called just after __init__ method. Here, concretely speaking, # this method is called just after the object initialize its fields. # We use this method here to keep the fields needed for each iteration. - self.keep('env_step', 'env_episode', 'train_iter', 'last_eval_iter') + self.keep('env_step', 'env_episode', 'train_iter', 'last_eval_iter', 'last_eval_value') @dataclasses.dataclass diff --git a/ding/framework/middleware/functional/collector.py b/ding/framework/middleware/functional/collector.py index 20820d7d00..afd92d7649 100644 --- a/ding/framework/middleware/functional/collector.py +++ b/ding/framework/middleware/functional/collector.py @@ -1,6 +1,7 @@ from typing import TYPE_CHECKING, Callable, List, Tuple, Any from easydict import EasyDict from functools import reduce +import numpy as np import treetensor.torch as ttorch from ding.envs import BaseEnvManager from ding.policy import Policy @@ -77,7 +78,8 @@ def _inference(ctx: "OnlineRLContext"): obs = {i: obs[i] for i in range(get_shape0(obs))} # TBD inference_output = policy.forward(obs, **ctx.collect_kwargs) - ctx.action = [to_ndarray(v['action']) for v in inference_output.values()] # TBD + # ctx.action = [to_ndarray(v['action']) for v in inference_output.values()] # TBD + ctx.action = np.array([to_ndarray(v['action']) for v in inference_output.values()]) # TBD ctx.inference_output = inference_output return _inference diff --git a/ding/framework/middleware/functional/ctx_helper.py b/ding/framework/middleware/functional/ctx_helper.py index 2994889eb5..c6d563f869 100644 --- a/ding/framework/middleware/functional/ctx_helper.py +++ b/ding/framework/middleware/functional/ctx_helper.py @@ -11,11 +11,13 @@ def final_ctx_saver(name: str) -> Callable: def _save(ctx: "Context"): if task.finish: + # make sure the items to be recorded are all kept in the context with open(os.path.join(name, 'result.pkl'), 'wb') as f: final_data = { 'total_step': ctx.total_step, 'train_iter': ctx.train_iter, - 'eval_value': ctx.eval_value, + 'last_eval_iter': ctx.last_eval_iter, + 'eval_value': ctx.last_eval_value, } if ctx.has_attr('env_step'): final_data['env_step'] = ctx.env_step diff --git a/ding/framework/middleware/functional/evaluator.py b/ding/framework/middleware/functional/evaluator.py index 58ed1268fc..8ded8404cb 100644 --- a/ding/framework/middleware/functional/evaluator.py +++ b/ding/framework/middleware/functional/evaluator.py @@ -257,7 +257,8 @@ def _evaluate(ctx: Union["OnlineRLContext", "OfflineRLContext"]): eval_monitor.update_video(env.ready_imgs) eval_monitor.update_output(inference_output) output = [v for v in inference_output.values()] - action = [to_ndarray(v['action']) for v in output] # TBD + # action = [to_ndarray(v['action']) for v in output] # TBD + action = 
np.array([to_ndarray(v['action']) for v in output]) # TBD timesteps = env.step(action) for timestep in timesteps: env_id = timestep.env_id.item() @@ -282,7 +283,8 @@ def _evaluate(ctx: Union["OnlineRLContext", "OfflineRLContext"]): raise TypeError("not supported ctx type: {}".format(type(ctx))) ctx.last_eval_iter = ctx.train_iter ctx.eval_value = episode_return - ctx.eval_output = {'reward': episode_return} + ctx.last_eval_value = ctx.eval_value + ctx.eval_output = {'episode_return': episode_return} episode_info = eval_monitor.get_episode_info() if episode_info is not None: ctx.eval_output['episode_info'] = episode_info @@ -374,6 +376,7 @@ def _evaluate(ctx: "OnlineRLContext"): ) ctx.last_eval_iter = ctx.train_iter ctx.eval_value = episode_return_mean + ctx.last_eval_value = ctx.eval_value ctx.eval_output = {'episode_return': episode_return} episode_info = eval_monitor.get_episode_info() if episode_info is not None: diff --git a/ding/framework/middleware/functional/logger.py b/ding/framework/middleware/functional/logger.py index 604300b6dd..d3b4b93cba 100644 --- a/ding/framework/middleware/functional/logger.py +++ b/ding/framework/middleware/functional/logger.py @@ -147,7 +147,7 @@ def wandb_online_logger( return task.void() color_list = ["orange", "red", "blue", "purple", "green", "darkcyan"] if metric_list is None: - metric_list = ["q_value", "target q_value", "loss", "lr", "entropy"] + metric_list = ["q_value", "target q_value", "loss", "lr", "entropy", "target_q_value", "td_error"] # Initialize wandb with default settings # Settings can be covered by calling wandb.init() at the top of the script if anonymous: @@ -176,6 +176,8 @@ def wandb_online_logger( ) def _plot(ctx: "OnlineRLContext"): + info_for_logging={} + if not cfg.plot_logger: one_time_warning( "If you want to use wandb to visualize the result, please set plot_logger = True in the config." 
@@ -191,10 +193,10 @@ def _plot(ctx: "OnlineRLContext"): else: metric_value_list.append(item[metric]) metric_value = np.mean(metric_value_list) - wandb.log({metric: metric_value}) + info_for_logging.update({metric: metric_value}) if ctx.eval_value != -np.inf: - wandb.log({"reward": ctx.eval_value, "train iter": ctx.train_iter, "env step": ctx.env_step}) + info_for_logging.update({"episode return mean": ctx.eval_value, "train iter": ctx.train_iter, "env step": ctx.env_step}) eval_output = ctx.eval_output['output'] episode_return = ctx.eval_output['episode_return'] @@ -209,26 +211,39 @@ def _plot(ctx: "OnlineRLContext"): file_list.append(p) file_list.sort(key=lambda fn: os.path.getmtime(os.path.join(record_path, fn))) video_path = os.path.join(record_path, file_list[-2]) - wandb.log({"video": wandb.Video(video_path, format="mp4")}) + info_for_logging.update({"video": wandb.Video(video_path, format="mp4")}) action_path = os.path.join(record_path, (str(ctx.env_step) + "_action.gif")) return_path = os.path.join(record_path, (str(ctx.env_step) + "_return.gif")) - if cfg.action_logger in ['q_value', 'action probability']: - if isinstance(eval_output, tnp.ndarray): - action_prob = softmax(eval_output.logit) - else: - action_prob = [softmax(to_ndarray(v['logit'])) for v in eval_output] - fig, ax = plt.subplots() - plt.ylim([-1, 1]) - action_dim = len(action_prob[1]) - x_range = [str(x + 1) for x in range(action_dim)] - ln = ax.bar(x_range, [0 for x in range(action_dim)], color=color_list[:action_dim]) - ani = animation.FuncAnimation( - fig, action_prob, fargs=(action_prob, ln), blit=True, save_count=len(action_prob) - ) - ani.save(action_path, writer='pillow') - wandb.log({cfg.action_logger: wandb.Video(action_path, format="gif")}) - plt.clf() + if cfg.action_logger: + if all(['logit' in v for v in eval_output]) or hasattr(eval_output, "logit"): + if isinstance(eval_output, tnp.ndarray): + action_prob = softmax(eval_output.logit) + else: + action_prob = [softmax(to_ndarray(v['logit'])) for v in eval_output] + fig, ax = plt.subplots() + plt.ylim([-1, 1]) + action_dim = len(action_prob[1]) + x_range = [str(x + 1) for x in range(action_dim)] + ln = ax.bar(x_range, [0 for x in range(action_dim)], color=color_list[:action_dim]) + ani = animation.FuncAnimation( + fig, action_prob, fargs=(action_prob, ln), blit=True, save_count=len(action_prob) + ) + ani.save(action_path, writer='pillow') + info_for_logging.update({"action": wandb.Video(action_path, format="gif")}) + + elif all(['action' in v for v in eval_output[0]]): + + num_trajectory=len(eval_output) + for i,action_trajectory in enumerate(eval_output): + fig, ax = plt.subplots() + fig_data=np.array([[i+1,*v['action']] for i,v in enumerate(action_trajectory)]) + steps=fig_data[:,0] + actions=fig_data[:,1:] + plt.ylim([-1, 1]) + for j in range(actions.shape[1]): + ax.scatter(steps, actions[:,j]) + info_for_logging.update({"actions_of_trajectory_{}".format(i): fig}) if cfg.return_logger: fig, ax = plt.subplots() @@ -239,45 +254,63 @@ def _plot(ctx: "OnlineRLContext"): ln_return = ax.bar(x_dim, hist, width=1, color='r', linewidth=0.7) ani = animation.FuncAnimation(fig, return_prob, fargs=(hist, ln_return), blit=True, save_count=1) ani.save(return_path, writer='pillow') - wandb.log({"return distribution": wandb.Video(return_path, format="gif")}) + info_for_logging.update({"return distribution": wandb.Video(return_path, format="gif")}) + + wandb.log(data=info_for_logging,step=ctx.env_step) + plt.clf() return _plot def wandb_offline_logger( - cfg: EasyDict, 
- env: BaseEnvManagerV2, - model: torch.nn.Module, + record_path: str, datasetpath: str, + cfg: Union[str, EasyDict] = 'default', + metric_list: Optional[List[str]] = None, + env: Optional[BaseEnvManagerV2] = None, + model: Optional[torch.nn.Module] = None, anonymous: bool = False ) -> Callable: ''' Overview: Wandb visualizer to track the experiment. Arguments: - - cfg (:obj:`EasyDict`): Config, a dict of following settings: - - record_path: string. The path to save the replay of simulation. + - record_path (:obj:`str`): The path to save the replay of simulation. + - cfg (:obj:`Union[str, EasyDict]`): Config, a dict of following settings: - gradient_logger: boolean. Whether to track the gradient. - plot_logger: boolean. Whether to track the metrics like reward and loss. - action_logger: `q_value` or `action probability`. + - metric_list (:obj:`Optional[List[str]]`): Logged metric list, specialized by different policies. - env (:obj:`BaseEnvManagerV2`): Evaluator environment. - - model (:obj:`nn.Module`): Model. - - datasetpath (:obj:`str`): The path of offline dataset. + - model (:obj:`nn.Module`): Policy neural network model. - anonymous (:obj:`bool`): Open the anonymous mode of wandb or not. The anonymous mode allows visualization of data without wandb count. ''' - + if task.router.is_active and not task.has_role(task.role.LEARNER): + return task.void() color_list = ["orange", "red", "blue", "purple", "green", "darkcyan"] - metric_list = ["q_value", "target q_value", "loss", "lr", "entropy", "target_q_value", "td_error"] + if metric_list is None: + metric_list = ["q_value", "target q_value", "loss", "lr", "entropy", "target_q_value", "td_error"] # Initialize wandb with default settings # Settings can be covered by calling wandb.init() at the top of the script if anonymous: wandb.init(anonymous="must") else: wandb.init() + if cfg == 'default': + cfg = EasyDict( + dict( + gradient_logger=False, + plot_logger=True, + video_logger=False, + action_logger=False, + return_logger=False, + ) + ) # The visualizer is called to save the replay of the simulation # which will be uploaded to wandb later - env.enable_save_replay(replay_path=cfg.record_path) + if env is not None: + env.enable_save_replay(replay_path=record_path) if cfg.gradient_logger: wandb.watch(model) else: @@ -333,60 +366,89 @@ def _vis_dataset(datasetpath: str): if cfg.vis_dataset is True: _vis_dataset(datasetpath) - def _plot(ctx: "OfflineRLContext"): + + def _plot(ctx: "OnlineRLContext"): + info_for_logging={} + if not cfg.plot_logger: one_time_warning( "If you want to use wandb to visualize the result, please set plot_logger = True in the config." 
) return for metric in metric_list: - if metric in ctx.train_output: - metric_value = ctx.train_output[metric] - wandb.log({metric: metric_value}) + if metric in ctx.train_output[0]: + # metric_value = np.mean([item[metric] for item in ctx.train_output]) + metric_value_list = [] + for item in ctx.train_output: + if isinstance(item[metric], torch.Tensor): + metric_value_list.append(item[metric].cpu().detach().numpy()) + else: + metric_value_list.append(item[metric]) + metric_value = np.mean(metric_value_list) + info_for_logging.update({metric: metric_value}) if ctx.eval_value != -np.inf: - wandb.log({"reward": ctx.eval_value, "train iter": ctx.train_iter}) + info_for_logging.update({"episode return mean": ctx.eval_value, "train iter": ctx.train_iter, "env step": ctx.env_step}) eval_output = ctx.eval_output['output'] episode_return = ctx.eval_output['episode_return'] - if 'logit' in eval_output[0]: - action_value = [to_ndarray(F.softmax(v['logit'], dim=-1)) for v in eval_output] - - file_list = [] - for p in os.listdir(cfg.record_path): - if os.path.splitext(p)[-1] == ".mp4": - file_list.append(p) - file_list.sort(key=lambda fn: os.path.getmtime(os.path.join(cfg.record_path, fn))) - - video_path = os.path.join(cfg.record_path, file_list[-2]) - action_path = os.path.join(cfg.record_path, (str(ctx.train_iter) + "_action.gif")) - return_path = os.path.join(cfg.record_path, (str(ctx.train_iter) + "_return.gif")) - if cfg.action_logger in ['q_value', 'action probability']: + episode_return = np.array(episode_return) + if len(episode_return.shape) == 2: + episode_return = episode_return.squeeze(1) + + if cfg.video_logger: + file_list = [] + for p in os.listdir(record_path): + if os.path.splitext(p)[-1] == ".mp4": + file_list.append(p) + file_list.sort(key=lambda fn: os.path.getmtime(os.path.join(record_path, fn))) + video_path = os.path.join(record_path, file_list[-2]) + info_for_logging.update({"video": wandb.Video(video_path, format="mp4")}) + + action_path = os.path.join(record_path, (str(ctx.env_step) + "_action.gif")) + return_path = os.path.join(record_path, (str(ctx.env_step) + "_return.gif")) + if cfg.action_logger: + if all(['logit' in v for v in eval_output]) or hasattr(eval_output, "logit"): + if isinstance(eval_output, tnp.ndarray): + action_prob = softmax(eval_output.logit) + else: + action_prob = [softmax(to_ndarray(v['logit'])) for v in eval_output] + fig, ax = plt.subplots() + plt.ylim([-1, 1]) + action_dim = len(action_prob[1]) + x_range = [str(x + 1) for x in range(action_dim)] + ln = ax.bar(x_range, [0 for x in range(action_dim)], color=color_list[:action_dim]) + ani = animation.FuncAnimation( + fig, action_prob, fargs=(action_prob, ln), blit=True, save_count=len(action_prob) + ) + ani.save(action_path, writer='pillow') + info_for_logging.update({"action": wandb.Video(action_path, format="gif")}) + + elif all(['action' in v for v in eval_output[0]]): + + num_trajectory=len(eval_output) + for i,action_trajectory in enumerate(eval_output): + fig, ax = plt.subplots() + fig_data=np.array([[i+1,*v['action']] for i,v in enumerate(action_trajectory)]) + steps=fig_data[:,0] + actions=fig_data[:,1:] + plt.ylim([-1, 1]) + for j in range(actions.shape[1]): + ax.scatter(steps, actions[:,j]) + info_for_logging.update({"actions_of_trajectory_{}".format(i): fig}) + + if cfg.return_logger: fig, ax = plt.subplots() - plt.ylim([-1, 1]) - action_dim = len(action_value[0]) - x_range = [str(x + 1) for x in range(action_dim)] - ln = ax.bar(x_range, [0 for x in range(action_dim)], 
color=color_list[:action_dim]) - ani = animation.FuncAnimation( - fig, action_prob, fargs=(action_value, ln), blit=True, save_count=len(action_value) - ) - ani.save(action_path, writer='pillow') - wandb.log({cfg.action_logger: wandb.Video(action_path, format="gif")}) - plt.clf() - - fig, ax = plt.subplots() - ax = plt.gca() - ax.set_ylim([0, 1]) - hist, x_dim = return_distribution(episode_return) - assert len(hist) == len(x_dim) - ln_return = ax.bar(x_dim, hist, width=1, color='r', linewidth=0.7) - ani = animation.FuncAnimation(fig, return_prob, fargs=(hist, ln_return), blit=True, save_count=1) - ani.save(return_path, writer='pillow') - wandb.log( - { - "video": wandb.Video(video_path, format="mp4"), - "return distribution": wandb.Video(return_path, format="gif") - } - ) + ax = plt.gca() + ax.set_ylim([0, 1]) + hist, x_dim = return_distribution(episode_return) + assert len(hist) == len(x_dim) + ln_return = ax.bar(x_dim, hist, width=1, color='r', linewidth=0.7) + ani = animation.FuncAnimation(fig, return_prob, fargs=(hist, ln_return), blit=True, save_count=1) + ani.save(return_path, writer='pillow') + info_for_logging.update({"return distribution": wandb.Video(return_path, format="gif")}) + + wandb.log(data=info_for_logging,step=ctx.env_step) + plt.clf() return _plot diff --git a/ding/policy/ddpg.py b/ding/policy/ddpg.py index 32c07e2718..19238bb823 100644 --- a/ding/policy/ddpg.py +++ b/ding/policy/ddpg.py @@ -242,17 +242,24 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: reward = (reward - reward.mean()) / (reward.std() + 1e-8) # current q value q_value = self._learn_model.forward(data, mode='compute_critic')['q_value'] + + # target q value. + with torch.no_grad(): + next_actor_data = self._target_model.forward(next_obs, mode='compute_actor') + next_actor_data['obs'] = next_obs + target_q_value = self._target_model.forward(next_actor_data, mode='compute_critic')['q_value'] + q_value_dict = {} + target_q_value_dict = {} if self._twin_critic: q_value_dict['q_value'] = q_value[0].mean() q_value_dict['q_value_twin'] = q_value[1].mean() + target_q_value_dict['target q_value'] = target_q_value[0].mean() + target_q_value_dict['target q_value_twin'] = target_q_value[1].mean() else: q_value_dict['q_value'] = q_value.mean() - # target q value. 
- with torch.no_grad(): - next_actor_data = self._target_model.forward(next_obs, mode='compute_actor') - next_actor_data['obs'] = next_obs - target_q_value = self._target_model.forward(next_actor_data, mode='compute_critic')['q_value'] + target_q_value_dict['target q_value'] = target_q_value[0].mean() + if self._twin_critic: # TD3: two critic networks target_q_value = torch.min(target_q_value[0], target_q_value[1]) # find min one as target q value @@ -314,6 +321,7 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: 'td_error': td_error_per_sample.abs().mean(), **loss_dict, **q_value_dict, + **target_q_value_dict, } def _state_dict_learn(self) -> Dict[str, Any]: From 8de9b9e41b3366b8e5a45d94ba6eab26a914c2c9 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 10 Feb 2023 07:13:00 +0000 Subject: [PATCH 007/244] format code --- .../framework/middleware/functional/logger.py | 57 +++++++++++-------- ding/policy/ddpg.py | 2 +- 2 files changed, 33 insertions(+), 26 deletions(-) diff --git a/ding/framework/middleware/functional/logger.py b/ding/framework/middleware/functional/logger.py index d3b4b93cba..9e80ddc4db 100644 --- a/ding/framework/middleware/functional/logger.py +++ b/ding/framework/middleware/functional/logger.py @@ -176,7 +176,7 @@ def wandb_online_logger( ) def _plot(ctx: "OnlineRLContext"): - info_for_logging={} + info_for_logging = {} if not cfg.plot_logger: one_time_warning( @@ -196,7 +196,13 @@ def _plot(ctx: "OnlineRLContext"): info_for_logging.update({metric: metric_value}) if ctx.eval_value != -np.inf: - info_for_logging.update({"episode return mean": ctx.eval_value, "train iter": ctx.train_iter, "env step": ctx.env_step}) + info_for_logging.update( + { + "episode return mean": ctx.eval_value, + "train iter": ctx.train_iter, + "env step": ctx.env_step + } + ) eval_output = ctx.eval_output['output'] episode_return = ctx.eval_output['episode_return'] @@ -231,18 +237,16 @@ def _plot(ctx: "OnlineRLContext"): ) ani.save(action_path, writer='pillow') info_for_logging.update({"action": wandb.Video(action_path, format="gif")}) - + elif all(['action' in v for v in eval_output[0]]): - - num_trajectory=len(eval_output) - for i,action_trajectory in enumerate(eval_output): + for i, action_trajectory in enumerate(eval_output): fig, ax = plt.subplots() - fig_data=np.array([[i+1,*v['action']] for i,v in enumerate(action_trajectory)]) - steps=fig_data[:,0] - actions=fig_data[:,1:] + fig_data = np.array([[i + 1, *v['action']] for i, v in enumerate(action_trajectory)]) + steps = fig_data[:, 0] + actions = fig_data[:, 1:] plt.ylim([-1, 1]) for j in range(actions.shape[1]): - ax.scatter(steps, actions[:,j]) + ax.scatter(steps, actions[:, j]) info_for_logging.update({"actions_of_trajectory_{}".format(i): fig}) if cfg.return_logger: @@ -255,8 +259,8 @@ def _plot(ctx: "OnlineRLContext"): ani = animation.FuncAnimation(fig, return_prob, fargs=(hist, ln_return), blit=True, save_count=1) ani.save(return_path, writer='pillow') info_for_logging.update({"return distribution": wandb.Video(return_path, format="gif")}) - - wandb.log(data=info_for_logging,step=ctx.env_step) + + wandb.log(data=info_for_logging, step=ctx.env_step) plt.clf() return _plot @@ -366,9 +370,8 @@ def _vis_dataset(datasetpath: str): if cfg.vis_dataset is True: _vis_dataset(datasetpath) - def _plot(ctx: "OnlineRLContext"): - info_for_logging={} + info_for_logging = {} if not cfg.plot_logger: one_time_warning( @@ -388,7 +391,13 @@ def _plot(ctx: "OnlineRLContext"): info_for_logging.update({metric: metric_value}) if ctx.eval_value != 
-np.inf: - info_for_logging.update({"episode return mean": ctx.eval_value, "train iter": ctx.train_iter, "env step": ctx.env_step}) + info_for_logging.update( + { + "episode return mean": ctx.eval_value, + "train iter": ctx.train_iter, + "env step": ctx.env_step + } + ) eval_output = ctx.eval_output['output'] episode_return = ctx.eval_output['episode_return'] @@ -423,18 +432,16 @@ def _plot(ctx: "OnlineRLContext"): ) ani.save(action_path, writer='pillow') info_for_logging.update({"action": wandb.Video(action_path, format="gif")}) - + elif all(['action' in v for v in eval_output[0]]): - - num_trajectory=len(eval_output) - for i,action_trajectory in enumerate(eval_output): + for i, action_trajectory in enumerate(eval_output): fig, ax = plt.subplots() - fig_data=np.array([[i+1,*v['action']] for i,v in enumerate(action_trajectory)]) - steps=fig_data[:,0] - actions=fig_data[:,1:] + fig_data = np.array([[i + 1, *v['action']] for i, v in enumerate(action_trajectory)]) + steps = fig_data[:, 0] + actions = fig_data[:, 1:] plt.ylim([-1, 1]) for j in range(actions.shape[1]): - ax.scatter(steps, actions[:,j]) + ax.scatter(steps, actions[:, j]) info_for_logging.update({"actions_of_trajectory_{}".format(i): fig}) if cfg.return_logger: @@ -447,8 +454,8 @@ def _plot(ctx: "OnlineRLContext"): ani = animation.FuncAnimation(fig, return_prob, fargs=(hist, ln_return), blit=True, save_count=1) ani.save(return_path, writer='pillow') info_for_logging.update({"return distribution": wandb.Video(return_path, format="gif")}) - - wandb.log(data=info_for_logging,step=ctx.env_step) + + wandb.log(data=info_for_logging, step=ctx.env_step) plt.clf() return _plot diff --git a/ding/policy/ddpg.py b/ding/policy/ddpg.py index 19238bb823..850254e6e8 100644 --- a/ding/policy/ddpg.py +++ b/ding/policy/ddpg.py @@ -259,7 +259,7 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: else: q_value_dict['q_value'] = q_value.mean() target_q_value_dict['target q_value'] = target_q_value[0].mean() - + if self._twin_critic: # TD3: two critic networks target_q_value = torch.min(target_q_value[0], target_q_value[1]) # find min one as target q value From f36bec871181a722c69640bef0d3a172a81da38f Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 10 Feb 2023 07:21:12 +0000 Subject: [PATCH 008/244] remove files. 
--- benchmark/hopper_td3_config.py | 72 ---------- benchmark/hopper_td3_pipeline.py | 51 ------- benchmark/hopper_td3_wandb_pipeline.py | 131 ----------------- benchmark/lunarlander_dqn_config.py | 80 ----------- benchmark/lunarlander_dqn_deploy.py | 39 ----- benchmark/lunarlander_dqn_eval.py | 64 --------- benchmark/lunarlander_dqn_pipeline.py | 52 ------- benchmark/pendulum_td3_config.py | 64 --------- benchmark/pendulum_td3_pipeline.py | 49 ------- benchmark/pendulum_td3_wandb.py | 61 -------- benchmark/qbert_dqn_config.py | 60 -------- benchmark/qbert_dqn_wandb.py | 192 ------------------------- scheduler_main.py | 28 ---- 13 files changed, 943 deletions(-) delete mode 100644 benchmark/hopper_td3_config.py delete mode 100644 benchmark/hopper_td3_pipeline.py delete mode 100644 benchmark/hopper_td3_wandb_pipeline.py delete mode 100644 benchmark/lunarlander_dqn_config.py delete mode 100644 benchmark/lunarlander_dqn_deploy.py delete mode 100644 benchmark/lunarlander_dqn_eval.py delete mode 100644 benchmark/lunarlander_dqn_pipeline.py delete mode 100644 benchmark/pendulum_td3_config.py delete mode 100644 benchmark/pendulum_td3_pipeline.py delete mode 100644 benchmark/pendulum_td3_wandb.py delete mode 100644 benchmark/qbert_dqn_config.py delete mode 100644 benchmark/qbert_dqn_wandb.py delete mode 100644 scheduler_main.py diff --git a/benchmark/hopper_td3_config.py b/benchmark/hopper_td3_config.py deleted file mode 100644 index 207a5dc75e..0000000000 --- a/benchmark/hopper_td3_config.py +++ /dev/null @@ -1,72 +0,0 @@ -from easydict import EasyDict - -hopper_td3_config = dict( - exp_name='hopper_td3_seed0', - env=dict( - env_id='Hopper-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=1, - evaluator_env_num=1, - n_evaluator_episode=1, - stop_value=6000, - ), - policy=dict( - cuda=True, - random_collect_size=25000, - model=dict( - obs_shape=11, - action_shape=3, - twin_critic=True, - actor_head_hidden_size=256, - critic_head_hidden_size=256, - action_space='regression', - ), - logger=dict(record_path='./video_hopper_td3', gradient_logger=True, plot_logger=True, action_logger='q_value'), - learn=dict( - update_per_collect=1, - batch_size=256, - learning_rate_actor=1e-3, - learning_rate_critic=1e-3, - ignore_done=False, - target_theta=0.005, - discount_factor=0.99, - actor_update_freq=2, - noise=True, - noise_sigma=0.2, - noise_range=dict( - min=-0.5, - max=0.5, - ), - ), - collect=dict( - n_sample=1, - unroll_len=1, - noise_sigma=0.1, - ), - other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), - ) -) - -hopper_td3_config = EasyDict(hopper_td3_config) -main_config = hopper_td3_config - -hopper_td3_create_config = dict( - env=dict( - type='mujoco', - import_names=['dizoo.mujoco.envs.mujoco_env'], - ), - env_manager=dict(type='subprocess'), - policy=dict( - type='td3', - import_names=['ding.policy.td3'], - ), - replay_buffer=dict(type='naive', ), -) -hopper_td3_create_config = EasyDict(hopper_td3_create_config) -create_config = hopper_td3_create_config - -if __name__ == "__main__": - # or you can enter `ding -m serial -c hopper_td3_config.py -s 0` - from ding.entry import serial_pipeline - serial_pipeline([main_config, create_config], seed=0) diff --git a/benchmark/hopper_td3_pipeline.py b/benchmark/hopper_td3_pipeline.py deleted file mode 100644 index 2f47731d5b..0000000000 --- a/benchmark/hopper_td3_pipeline.py +++ /dev/null @@ -1,51 +0,0 @@ -import os -import gym -from ditk import logging -from ding.model.template.qac import QAC -from 
ding.policy import TD3Policy -from ding.envs import DingEnvWrapper, BaseEnvManagerV2, SubprocessEnvManagerV2 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, CkptSaver -from ding.utils import set_pkg_seed -from ding.utils.log_helper import build_logger -from dizoo.mujoco.envs.mujoco_env import MujocoEnv - -from hopper_td3_config import main_config, create_config - - -def main(seed): - logging.getLogger().setLevel(logging.INFO) - cfg = compile_config(main_config, create_cfg=create_config, auto=True) - cfg.env.seed = seed - - logger_, tb_logger = build_logger(path='./log/hopper_td3/seed' + str(seed), need_tb=True) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = SubprocessEnvManagerV2( - env_fn=[lambda: MujocoEnv(cfg.env) for _ in range(cfg.env.collector_env_num)], cfg=cfg.env.manager - ) - evaluator_env = SubprocessEnvManagerV2( - env_fn=[lambda: MujocoEnv(cfg.env) for _ in range(cfg.env.evaluator_env_num)], cfg=cfg.env.manager - ) - evaluator_env.enable_save_replay(replay_path=cfg.policy.logger.record_path) - - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - model = QAC(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = TD3Policy(cfg.policy, model) - - task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use( - StepCollector(cfg, policy.collect_mode, collector_env, random_collect_size=cfg.policy.random_collect_size) - ) - task.use(data_pusher(cfg, buffer_)) - task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(CkptSaver(policy=policy,save_dir=os.path.join(cfg["exp_name"],"model"), train_freq=100)) - task.run() - - -if __name__ == "__main__": - main(0) diff --git a/benchmark/hopper_td3_wandb_pipeline.py b/benchmark/hopper_td3_wandb_pipeline.py deleted file mode 100644 index c6e7e2e641..0000000000 --- a/benchmark/hopper_td3_wandb_pipeline.py +++ /dev/null @@ -1,131 +0,0 @@ -import os -import pathlib -from ditk import logging -from ding.model.template.qac import QAC -from ding.policy import TD3Policy -from ding.envs import DingEnvWrapper, BaseEnvManagerV2, SubprocessEnvManagerV2 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, CkptSaver, termination_checker, wandb_online_logger -from ding.utils import set_pkg_seed -from ding.utils.log_helper import build_logger -from dizoo.mujoco.envs.mujoco_env import MujocoEnv -from easydict import EasyDict -import wandb - -hopper_td3_config = dict( - exp_name='hopper_td3_wandb_seed0', - seed=0, - env=dict( - env_id='Hopper-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=6000, - ), - policy=dict( - cuda=True, - random_collect_size=25000, - model=dict( - obs_shape=11, - action_shape=3, - twin_critic=True, - actor_head_hidden_size=256, - critic_head_hidden_size=256, - action_space='regression', - ), - logger=dict(record_path='./video_hopper_td3', gradient_logger=True, plot_logger=True, action_logger=None), - learn=dict( - update_per_collect=1, - batch_size=256, - 
learning_rate_actor=1e-3, - learning_rate_critic=1e-3, - ignore_done=False, - target_theta=0.005, - discount_factor=0.99, - actor_update_freq=2, - noise=True, - noise_sigma=0.2, - noise_range=dict( - min=-0.5, - max=0.5, - ), - ), - collect=dict( - n_sample=1, - unroll_len=1, - noise_sigma=0.1, - ), - other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), - ) -) - -hopper_td3_config = EasyDict(hopper_td3_config) -main_config = hopper_td3_config - -hopper_td3_create_config = dict( - env=dict( - type='mujoco', - import_names=['dizoo.mujoco.envs.mujoco_env'], - ), - env_manager=dict(type='subprocess'), - policy=dict( - type='td3', - import_names=['ding.policy.td3'], - ), - replay_buffer=dict(type='naive', ), -) -hopper_td3_create_config = EasyDict(hopper_td3_create_config) -create_config = hopper_td3_create_config - - -def main(seed=0, max_env_step=10000000): - logging.getLogger().setLevel(logging.INFO) - cfg = compile_config(main_config, create_cfg=create_config, auto=True) - cfg.env.seed = seed - - wandb.init( - # Set the project where this run will be logged - project='hopper-td3-0111', - # We pass a run name (otherwise it’ll be randomly assigned, like sunshine-lollypop-10) - name=str(main_config["DI-toolkit-hpo-id"]), - # Track hyperparameters and run metadata - config=cfg - ) - - # logger_, tb_logger = build_logger(path='./log/hopper_td3/seed' + str(seed), - # need_tb=True) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = SubprocessEnvManagerV2( - env_fn=[lambda: MujocoEnv(cfg.env) for _ in range(cfg.env.collector_env_num)], cfg=cfg.env.manager - ) - evaluator_env = SubprocessEnvManagerV2( - env_fn=[lambda: MujocoEnv(cfg.env) for _ in range(cfg.env.evaluator_env_num)], cfg=cfg.env.manager - ) - cfg.policy.logger.record_path = './' + cfg.exp_name + '/video' - evaluator_env.enable_save_replay(replay_path=cfg.policy.logger.record_path) - - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - model = QAC(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = TD3Policy(cfg.policy, model) - - task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use( - StepCollector(cfg, policy.collect_mode, collector_env, random_collect_size=cfg.policy.random_collect_size) - ) - task.use(data_pusher(cfg, buffer_)) - task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(CkptSaver(policy=policy,save_dir=os.path.join(cfg["exp_name"],"model"), train_freq=100)) - task.use(wandb_online_logger(record_path=cfg.policy.logger.record_path,cfg=cfg.policy.logger, env=evaluator_env, model=model)) - task.use(termination_checker(max_env_step=max_env_step)) - task.run() - - -if __name__ == "__main__": - main(seed=main_config.seed, max_env_step=10000000) diff --git a/benchmark/lunarlander_dqn_config.py b/benchmark/lunarlander_dqn_config.py deleted file mode 100644 index cc4bfcc9ec..0000000000 --- a/benchmark/lunarlander_dqn_config.py +++ /dev/null @@ -1,80 +0,0 @@ -from easydict import EasyDict - -nstep = 3 -lunarlander_dqn_config = dict( - exp_name='lunarlander_dqn_seed0', - env=dict( - # Whether to use shared memory. Only effective if "env_manager_type" is 'subprocess' - # Env number respectively for collector and evaluator. - collector_env_num=1, - evaluator_env_num=1, - env_id='LunarLander-v2', - n_evaluator_episode=1, - stop_value=200, - # The path to save the game replay - # replay_path='./lunarlander_dqn_seed0/video', - ), - policy=dict( - # Whether to use cuda for network. 
- cuda=True, - load_path="./lunarlander_dqn_seed0/ckpt/ckpt_best.pth.tar", - model=dict( - obs_shape=8, - action_shape=4, - encoder_hidden_size_list=[512, 64], - # Whether to use dueling head. - dueling=True, - ), - # Reward's future discount factor, aka. gamma. - discount_factor=0.99, - # How many steps in td error. - nstep=nstep, - # learn_mode config - learn=dict( - update_per_collect=10, - batch_size=64, - learning_rate=0.001, - # Frequency of target network update. - target_update_freq=100, - ), - # collect_mode config - collect=dict( - # You can use either "n_sample" or "n_episode" in collector.collect. - # Get "n_sample" samples per collect. - n_sample=64, - # Cut trajectories into pieces with length "unroll_len". - unroll_len=1, - ), - # command_mode config - other=dict( - # Epsilon greedy with decay. - eps=dict( - # Decay type. Support ['exp', 'linear']. - type='exp', - start=0.95, - end=0.1, - decay=50000, - ), - replay_buffer=dict(replay_buffer_size=100000, ) - ), - ), -) -lunarlander_dqn_config = EasyDict(lunarlander_dqn_config) -main_config = lunarlander_dqn_config - -lunarlander_dqn_create_config = dict( - env=dict( - type='lunarlander', - import_names=['dizoo.box2d.lunarlander.envs.lunarlander_env'], - ), - env_manager=dict(type='subprocess'), - # env_manager=dict(type='base'), - policy=dict(type='dqn'), -) -lunarlander_dqn_create_config = EasyDict(lunarlander_dqn_create_config) -create_config = lunarlander_dqn_create_config - -if __name__ == "__main__": - # or you can enter `ding -m serial -c lunarlander_dqn_config.py -s 0` - from ding.entry import serial_pipeline - serial_pipeline([main_config, create_config], seed=0) diff --git a/benchmark/lunarlander_dqn_deploy.py b/benchmark/lunarlander_dqn_deploy.py deleted file mode 100644 index e3bd346b48..0000000000 --- a/benchmark/lunarlander_dqn_deploy.py +++ /dev/null @@ -1,39 +0,0 @@ -import gym -import torch -from easydict import EasyDict -from ding.config import compile_config -from ding.envs import DingEnvWrapper -from ding.policy import DQNPolicy, single_env_forward_wrapper -from ding.model import DQN -#from dizoo.box2d.lunarlander.config.lunarlander_dqn_config import main_config, create_config -from lunarlander_dqn_config import main_config, create_config - - -def main(main_config: EasyDict, create_config: EasyDict, ckpt_path: str): - main_config.exp_name = 'lunarlander_dqn_deploy' - - cfg = compile_config(main_config, create_cfg=create_config, auto=True) - - env = DingEnvWrapper(gym.make(cfg.env.env_id), EasyDict(env_wrapper='default')) - env.enable_save_replay(replay_path='./lunarlander_dqn_deploy/video') - - model = DQN(**cfg.policy.model) - state_dict = torch.load(ckpt_path, map_location='cpu') - model.load_state_dict(state_dict['model']) - policy = DQNPolicy(cfg.policy, model=model).eval_mode - forward_fn = single_env_forward_wrapper(policy.forward) - - obs = env.reset() - - returns = 0. 
- while True: - action = forward_fn(obs) - obs, rew, done, info = env.step(action) - returns += rew - if done: - break - print(f'Deploy is finished, final epsiode return is: {returns}') - - -if __name__ == "__main__": - main(main_config=main_config, create_config=create_config, ckpt_path='lunarlander_dqn_seed0/ckpt/final.pth.tar') \ No newline at end of file diff --git a/benchmark/lunarlander_dqn_eval.py b/benchmark/lunarlander_dqn_eval.py deleted file mode 100644 index 3309bb308c..0000000000 --- a/benchmark/lunarlander_dqn_eval.py +++ /dev/null @@ -1,64 +0,0 @@ -import os -import gym -import torch -from tensorboardX import SummaryWriter -from easydict import EasyDict - -from ding.config import compile_config -from ding.worker import BaseLearner, SampleSerialCollector, InteractionSerialEvaluator, AdvancedReplayBuffer -from ding.envs import BaseEnvManager, DingEnvWrapper -from ding.policy import DQNPolicy -from ding.model import DQN -from ding.utils import set_pkg_seed -from ding.rl_utils import get_epsilon_greedy_fn -#from dizoo.box2d.lunarlander.config.lunarlander_dqn_config import main_config, create_config -from lunarlander_dqn_config import main_config, create_config - - -# Get DI-engine form env class -def wrapped_cartpole_env(): - return DingEnvWrapper( - gym.make(main_config['env']['env_id']), - EasyDict(env_wrapper='default'), - ) - - -def main(cfg, seed=0): - cfg['exp_name'] = 'lunarlander_dqn_eval' - cfg = compile_config( - cfg, - BaseEnvManager, - DQNPolicy, - BaseLearner, - SampleSerialCollector, - InteractionSerialEvaluator, - AdvancedReplayBuffer, - save_cfg=True - ) - cfg.policy.load_path = 'lunarlander_dqn_seed0/ckpt/final.pth.tar' - evaluator_env_num = cfg.env.evaluator_env_num - evaluator_env = BaseEnvManager(env_fn=[wrapped_cartpole_env for _ in range(evaluator_env_num)], cfg=cfg.env.manager) - - # switch save replay interface - # evaluator_env.enable_save_replay(cfg.env.replay_path) - evaluator_env.enable_save_replay(replay_path='./lunarlander_dqn_eval/video') - - # Set random seed for all package and instance - evaluator_env.seed(seed, dynamic_seed=False) - set_pkg_seed(seed, use_cuda=cfg.policy.cuda) - - # Set up RL Policy - model = DQN(**cfg.policy.model) - policy = DQNPolicy(cfg.policy, model=model) - policy.eval_mode.load_state_dict(torch.load(cfg.policy.load_path, map_location='cpu')) - - # evaluate - tb_logger = SummaryWriter(os.path.join('./{}/log/'.format(cfg.exp_name), 'serial')) - evaluator = InteractionSerialEvaluator( - cfg.policy.eval.evaluator, evaluator_env, policy.eval_mode, tb_logger, exp_name=cfg.exp_name - ) - evaluator.eval() - - -if __name__ == "__main__": - main(main_config) \ No newline at end of file diff --git a/benchmark/lunarlander_dqn_pipeline.py b/benchmark/lunarlander_dqn_pipeline.py deleted file mode 100644 index 24e7e6a0d7..0000000000 --- a/benchmark/lunarlander_dqn_pipeline.py +++ /dev/null @@ -1,52 +0,0 @@ -import gym -from ditk import logging -from ding.model import DQN -from ding.policy import DQNPolicy -from ding.envs import DingEnvWrapper, BaseEnvManagerV2, SubprocessEnvManagerV2 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, online_logger, nstep_reward_enhancer -from ding.utils import set_pkg_seed -from ding.utils.log_helper import build_logger -#from 
dizoo.box2d.lunarlander.config.lunarlander_dqn_config import main_config, create_config -from lunarlander_dqn_config import main_config, create_config - - -def main(): - logging.getLogger().setLevel(logging.INFO) - cfg = compile_config(main_config, create_cfg=create_config, auto=True) - ding_init(cfg) - - #logger_, tb_logger = build_logger(path='./lunarlander_dqn_seed0/log/', need_tb=True) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = SubprocessEnvManagerV2( - env_fn=[lambda: DingEnvWrapper(gym.make(cfg.env.env_id)) for _ in range(cfg.env.collector_env_num)], - cfg=cfg.env.manager - ) - evaluator_env = SubprocessEnvManagerV2( - env_fn=[lambda: DingEnvWrapper(gym.make(cfg.env.env_id)) for _ in range(cfg.env.evaluator_env_num)], - cfg=cfg.env.manager - ) - - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = DQNPolicy(cfg.policy, model=model) - - task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use(StepCollector(cfg, policy.collect_mode, collector_env)) - task.use(nstep_reward_enhancer(cfg)) - task.use(data_pusher(cfg, buffer_)) - task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use(CkptSaver(cfg, policy, train_freq=100)) - task.run() - - -if __name__ == "__main__": - main() diff --git a/benchmark/pendulum_td3_config.py b/benchmark/pendulum_td3_config.py deleted file mode 100644 index 48e868b215..0000000000 --- a/benchmark/pendulum_td3_config.py +++ /dev/null @@ -1,64 +0,0 @@ -from easydict import EasyDict - -pendulum_td3_config = dict( - exp_name='pendulum_td3_seed0', - env=dict( - collector_env_num=1, - evaluator_env_num=1, - # (bool) Scale output action into legal range. 
- act_scale=True, - n_evaluator_episode=1, - stop_value=-250, - ), - policy=dict( - cuda=False, - priority=False, - random_collect_size=800, - logger=dict(record_path='./video_pendulum_td3', gradient_logger=True, plot_logger=True, action_logger=None), - model=dict( - obs_shape=3, - action_shape=1, - twin_critic=True, - action_space='regression', - ), - learn=dict( - update_per_collect=2, - batch_size=128, - learning_rate_actor=0.001, - learning_rate_critic=0.001, - ignore_done=True, - actor_update_freq=2, - noise=True, - noise_sigma=0.1, - noise_range=dict( - min=-0.5, - max=0.5, - ), - ), - collect=dict( - n_sample=48, - noise_sigma=0.1, - collector=dict(collect_print_freq=1000, ), - ), - eval=dict(evaluator=dict(eval_freq=100, ), ), - other=dict(replay_buffer=dict(replay_buffer_size=20000, ), ), - ), -) -pendulum_td3_config = EasyDict(pendulum_td3_config) -main_config = pendulum_td3_config - -pendulum_td3_create_config = dict( - env=dict( - type='pendulum', - import_names=['dizoo.classic_control.pendulum.envs.pendulum_env'], - ), - env_manager=dict(type='base'), - policy=dict(type='td3'), -) -pendulum_td3_create_config = EasyDict(pendulum_td3_create_config) -create_config = pendulum_td3_create_config - -if __name__ == "__main__": - # or you can enter `ding -m serial -c pendulum_td3_config.py -s 0` - from ding.entry import serial_pipeline - serial_pipeline([main_config, create_config], seed=0) diff --git a/benchmark/pendulum_td3_pipeline.py b/benchmark/pendulum_td3_pipeline.py deleted file mode 100644 index 45794e8464..0000000000 --- a/benchmark/pendulum_td3_pipeline.py +++ /dev/null @@ -1,49 +0,0 @@ -import gym -from ditk import logging -from ding.model.template.qac import QAC -from ding.policy import TD3Policy -from ding.envs import DingEnvWrapper, BaseEnvManagerV2, SubprocessEnvManagerV2 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, CkptSaver -from ding.utils import set_pkg_seed -from ding.utils.log_helper import build_logger -from dizoo.classic_control.pendulum.envs.pendulum_env import PendulumEnv -from pendulum_td3_config import main_config, create_config - - -def main(seed): - logging.getLogger().setLevel(logging.INFO) - cfg = compile_config(main_config, create_cfg=create_config, auto=True) - cfg.env.seed = seed - - logger_, tb_logger = build_logger(path='./log/pendulum_td3/seed' + str(seed), need_tb=True) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = SubprocessEnvManagerV2( - env_fn=[lambda: PendulumEnv(cfg.env) for _ in range(cfg.env.collector_env_num)], cfg=cfg.env.manager - ) - evaluator_env = SubprocessEnvManagerV2( - env_fn=[lambda: PendulumEnv(cfg.env) for _ in range(cfg.env.evaluator_env_num)], cfg=cfg.env.manager - ) - evaluator_env.enable_save_replay(replay_path=cfg.policy.logger.record_path) - - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - model = QAC(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = TD3Policy(cfg.policy, model) - - task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use( - StepCollector(cfg, policy.collect_mode, collector_env, random_collect_size=cfg.policy.random_collect_size) - ) - task.use(data_pusher(cfg, buffer_)) - task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(CkptSaver(cfg, policy, 
train_freq=100)) - task.run() - - -if __name__ == "__main__": - main(0) diff --git a/benchmark/pendulum_td3_wandb.py b/benchmark/pendulum_td3_wandb.py deleted file mode 100644 index cdfcc6568f..0000000000 --- a/benchmark/pendulum_td3_wandb.py +++ /dev/null @@ -1,61 +0,0 @@ -import gym -from ditk import logging -from ding.model.template.qac import QAC -from ding.policy import TD3Policy -from ding.envs import DingEnvWrapper, BaseEnvManagerV2, SubprocessEnvManagerV2 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, CkptSaver, wandb_online_logger -from ding.utils import set_pkg_seed -from ding.utils.log_helper import build_logger -from dizoo.classic_control.pendulum.envs.pendulum_env import PendulumEnv - -from pendulum_td3_config import main_config, create_config -import wandb - - -def main(seed): - logging.getLogger().setLevel(logging.INFO) - cfg = compile_config(main_config, create_cfg=create_config, auto=True) - cfg.env.seed = seed - - wandb.init( - # Set the project where this run will be logged - project='pendulum-td3-0', - # We pass a run name (otherwise it’ll be randomly assigned, like sunshine-lollypop-10) - name=f"td3", - # Track hyperparameters and run metadata - config=cfg - ) - - logger_, tb_logger = build_logger(path='./log/pendulum_td3/seed' + str(seed), need_tb=True) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = SubprocessEnvManagerV2( - env_fn=[lambda: PendulumEnv(cfg.env) for _ in range(cfg.env.collector_env_num)], cfg=cfg.env.manager - ) - evaluator_env = SubprocessEnvManagerV2( - env_fn=[lambda: PendulumEnv(cfg.env) for _ in range(cfg.env.evaluator_env_num)], cfg=cfg.env.manager - ) - evaluator_env.enable_save_replay(replay_path=cfg.policy.logger.record_path) - - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - model = QAC(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = TD3Policy(cfg.policy, model) - - task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use( - StepCollector(cfg, policy.collect_mode, collector_env, random_collect_size=cfg.policy.random_collect_size) - ) - task.use(data_pusher(cfg, buffer_)) - task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(wandb_online_logger(cfg.policy.logger, evaluator_env, model)) - task.use(CkptSaver(cfg, policy, train_freq=100)) - task.run() - - -if __name__ == "__main__": - main(0) diff --git a/benchmark/qbert_dqn_config.py b/benchmark/qbert_dqn_config.py deleted file mode 100644 index f109fc9deb..0000000000 --- a/benchmark/qbert_dqn_config.py +++ /dev/null @@ -1,60 +0,0 @@ -from easydict import EasyDict - -qbert_dqn_config = dict( - exp_name='qbert_dqn_normal', - env=dict( - collector_env_num=1, - evaluator_env_num=1, - n_evaluator_episode=1, - stop_value=30000, - env_id='QbertNoFrameskip-v4', - frame_stack=4 - ), - policy=dict( - cuda=True, - priority=False, - model=dict( - obs_shape=[4, 84, 84], - action_shape=6, - encoder_hidden_size_list=[128, 128, 512], - ), - nstep=3, - discount_factor=0.99, - learn=dict( - update_per_collect=10, - batch_size=32, - learning_rate=0.0001, - target_update_freq=500, - reward_scale_function='normal', - ), - collect=dict(n_sample=100, ), - logger=dict(record_path='./video_qbert_dqn', gradient_logger=True, plot_logger=True, 
action_logger='q_value'), - eval=dict(evaluator=dict(eval_freq=4000, )), - other=dict( - eps=dict( - type='exp', - start=1., - end=0.05, - decay=1000000, - ), - replay_buffer=dict(replay_buffer_size=400000, ), - ), - ), -) -qbert_dqn_config = EasyDict(qbert_dqn_config) -main_config = qbert_dqn_config -qbert_dqn_create_config = dict( - env=dict( - type='atari', - import_names=['dizoo.atari.envs.atari_env'], - ), - env_manager=dict(type='subprocess'), - policy=dict(type='dqn'), -) -qbert_dqn_create_config = EasyDict(qbert_dqn_create_config) -create_config = qbert_dqn_create_config - -if __name__ == '__main__': - # or you can enter ding -m serial -c qbert_dqn_config.py -s 0 - from ding.entry import serial_pipeline - serial_pipeline((main_config, create_config), seed=0) \ No newline at end of file diff --git a/benchmark/qbert_dqn_wandb.py b/benchmark/qbert_dqn_wandb.py deleted file mode 100644 index 06ed9a2033..0000000000 --- a/benchmark/qbert_dqn_wandb.py +++ /dev/null @@ -1,192 +0,0 @@ -from typing import TYPE_CHECKING, Callable, Any, List, Union -import sys -from copy import deepcopy -from collections import deque -import gym -import torch -import treetensor.torch as ttorch -import numpy as np - -from ditk import logging -from ding.model import DQN -from ding.policy import DQNPolicy -from ding.policy import Policy -from ding.envs import DingEnvWrapper, BaseEnvManagerV2, BaseEnvManager, SubprocessEnvManagerV2 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task -from ding.framework.context import OfflineRLContext -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, StepCollector, data_pusher, \ - eps_greedy_handler, CkptSaver, termination_checker, nstep_reward_enhancer, wandb_online_logger, interaction_evaluator -from ding.utils import set_pkg_seed -from ding.utils import lists_to_dicts -from ding.utils.log_helper import build_logger -from ding.torch_utils import tensor_to_list, to_ndarray -from dizoo.atari.envs.atari_env import AtariEnv -import wandb -from easydict import EasyDict - -from qbert_dqn_config import main_config, create_config - - -class VectorEvalMonitor(object): - """ - Overview: - In some cases, different environment in evaluator may collect different length episode. For example, \ - suppose we want to collect 12 episodes in evaluator but only have 5 environments, if we didn’t do \ - any thing, it is likely that we will get more short episodes than long episodes. As a result, \ - our average reward will have a bias and may not be accurate. we use VectorEvalMonitor to solve the problem. - Interfaces: - __init__, is_finished, update_info, update_reward, get_episode_reward, get_latest_reward, get_current_episode,\ - get_episode_info - """ - - def __init__(self, env_num: int, n_episode: int) -> None: - """ - Overview: - Init method. 
According to the number of episodes and the number of environments, determine how many \ - episodes need to be opened for each environment, and initialize the reward, info and other \ - information - Arguments: - - env_num (:obj:`int`): the number of episodes need to be open - - n_episode (:obj:`int`): the number of environments - """ - assert n_episode >= env_num, "n_episode < env_num, please decrease the number of eval env" - self._env_num = env_num - self._n_episode = n_episode - each_env_episode = [n_episode // env_num for _ in range(env_num)] - for i in range(n_episode % env_num): - each_env_episode[i] += 1 - self._reward = {env_id: deque(maxlen=maxlen) for env_id, maxlen in enumerate(each_env_episode)} - self._info = {env_id: deque(maxlen=maxlen) for env_id, maxlen in enumerate(each_env_episode)} - - def is_finished(self) -> bool: - """ - Overview: - Determine whether the evaluator has completed the work. - Return: - - result: (:obj:`bool`): whether the evaluator has completed the work - """ - return all([len(v) == v.maxlen for v in self._reward.values()]) - - def update_info(self, env_id: int, info: Any) -> None: - """ - Overview: - Update the information of the environment indicated by env_id. - Arguments: - - env_id: (:obj:`int`): the id of the environment we need to update information - - info: (:obj:`Any`): the information we need to update - """ - info = tensor_to_list(info) - self._info[env_id].append(info) - - def update_reward(self, env_id: Union[int, np.ndarray], reward: Any) -> None: - """ - Overview: - Update the reward indicated by env_id. - Arguments: - - env_id: (:obj:`int`): the id of the environment we need to update the reward - - reward: (:obj:`Any`): the reward we need to update - """ - if isinstance(reward, torch.Tensor): - reward = reward.item() - if isinstance(env_id, np.ndarray): - env_id = env_id.item() - self._reward[env_id].append(reward) - - def get_episode_reward(self) -> list: - """ - Overview: - Get the total reward of one episode. - """ - return sum([list(v) for v in self._reward.values()], []) # sum(iterable, start) - - def get_latest_reward(self, env_id: int) -> int: - """ - Overview: - Get the latest reward of a certain environment. - Arguments: - - env_id: (:obj:`int`): the id of the environment we need to get reward. - """ - return self._reward[env_id][-1] - - def get_current_episode(self) -> int: - """ - Overview: - Get the current episode. We can know which episode our evaluator is executing now. - """ - return sum([len(v) for v in self._reward.values()]) - - def get_episode_info(self) -> dict: - """ - Overview: - Get all episode information, such as total reward of one episode. 
- """ - if len(self._info[0]) == 0: - return None - else: - total_info = sum([list(v) for v in self._info.values()], []) - total_info = lists_to_dicts(total_info) - new_dict = {} - for k in total_info.keys(): - if np.isscalar(total_info[k][0]): - new_dict[k + '_mean'] = np.mean(total_info[k]) - total_info.update(new_dict) - return total_info - - -num_seed = 1 - - -def main(seed): - logging.getLogger().setLevel(logging.INFO) - cfg = compile_config(main_config, create_cfg=create_config, auto=True) - cfg.env.seed = seed - - for i in range(num_seed): - wandb.init( - # Set the project where this run will be logged - project='zjow-QbertNoFrameskip-v4-3', - # We pass a run name (otherwise it’ll be randomly assigned, like sunshine-lollypop-10) - name=f"dqn", - # Track hyperparameters and run metadata - config=cfg - ) - logger_, tb_logger = build_logger(path='./log/qbert_dqn/seed' + str(seed), need_tb=True) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_cfg = deepcopy(cfg.env) - collector_cfg.is_train = True - evaluator_cfg = deepcopy(cfg.env) - evaluator_cfg.is_train = False - collector_env = SubprocessEnvManagerV2( - env_fn=[lambda: AtariEnv(collector_cfg) for _ in range(cfg.env.collector_env_num)], cfg=cfg.env.manager - ) - evaluator_env = SubprocessEnvManagerV2( - env_fn=[lambda: AtariEnv(evaluator_cfg) for _ in range(cfg.env.evaluator_env_num)], cfg=cfg.env.manager - ) - - # collector_env.enable_save_replay(replay_path='./lunarlander_video_train') - evaluator_env.enable_save_replay(replay_path=cfg.policy.logger.record_path) - - set_pkg_seed(seed, use_cuda=cfg.policy.cuda) - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = DQNPolicy(cfg.policy, model=model) - - task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use(StepCollector(cfg, policy.collect_mode, collector_env)) - task.use(nstep_reward_enhancer(cfg)) - task.use(data_pusher(cfg, buffer_)) - task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(wandb_online_logger(cfg.policy.logger, evaluator_env, model)) - #task.use(CkptSaver(cfg, policy, train_freq=100)) - task.use(termination_checker(max_env_step=int(10e6))) - #task.use(_add_scalar) - task.run() - - -if __name__ == "__main__": - main(0) \ No newline at end of file diff --git a/scheduler_main.py b/scheduler_main.py deleted file mode 100644 index fa9bd1a9bb..0000000000 --- a/scheduler_main.py +++ /dev/null @@ -1,28 +0,0 @@ -import os - -from ditk import logging - -from lighttuner.hpo import R, uniform, choice -from lighttuner.hpo import hpo -from lighttuner.scheduler import run_scheduler_local - - -def demo(): - dir_name = os.path.abspath('./benchmark') - - with run_scheduler_local(task_config_template_path=os.path.join(dir_name, "hopper_td3_wandb_pipeline.py"), - dijob_project_name="hopper_td3_wandb",max_number_of_running_task=4) as scheduler: - - opt = hpo(scheduler.get_hpo_callable()) - cfg, ret, metrics = opt.grid() \ - .max_steps(5) \ - .max_workers(4) \ - .maximize(R['eval_value']) \ - .spaces({'seed': choice([0,1])}).run() - print(cfg) - print(ret) - - -if __name__ == "__main__": - logging.try_init_root(logging.INFO) - demo() From e5ec188d3f4f256b231b5e0e15a9625f327723b5 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 10 Feb 2023 07:26:24 +0000 Subject: [PATCH 009/244] polish code --- ding/framework/middleware/functional/collector.py | 1 - ding/framework/middleware/functional/evaluator.py | 1 - 
ding/framework/middleware/functional/logger.py | 2 -- 3 files changed, 4 deletions(-) diff --git a/ding/framework/middleware/functional/collector.py b/ding/framework/middleware/functional/collector.py index afd92d7649..dfc94107be 100644 --- a/ding/framework/middleware/functional/collector.py +++ b/ding/framework/middleware/functional/collector.py @@ -78,7 +78,6 @@ def _inference(ctx: "OnlineRLContext"): obs = {i: obs[i] for i in range(get_shape0(obs))} # TBD inference_output = policy.forward(obs, **ctx.collect_kwargs) - # ctx.action = [to_ndarray(v['action']) for v in inference_output.values()] # TBD ctx.action = np.array([to_ndarray(v['action']) for v in inference_output.values()]) # TBD ctx.inference_output = inference_output diff --git a/ding/framework/middleware/functional/evaluator.py b/ding/framework/middleware/functional/evaluator.py index 8ded8404cb..22083759b2 100644 --- a/ding/framework/middleware/functional/evaluator.py +++ b/ding/framework/middleware/functional/evaluator.py @@ -257,7 +257,6 @@ def _evaluate(ctx: Union["OnlineRLContext", "OfflineRLContext"]): eval_monitor.update_video(env.ready_imgs) eval_monitor.update_output(inference_output) output = [v for v in inference_output.values()] - # action = [to_ndarray(v['action']) for v in output] # TBD action = np.array([to_ndarray(v['action']) for v in output]) # TBD timesteps = env.step(action) for timestep in timesteps: diff --git a/ding/framework/middleware/functional/logger.py b/ding/framework/middleware/functional/logger.py index 9e80ddc4db..05e75b56bd 100644 --- a/ding/framework/middleware/functional/logger.py +++ b/ding/framework/middleware/functional/logger.py @@ -185,7 +185,6 @@ def _plot(ctx: "OnlineRLContext"): return for metric in metric_list: if metric in ctx.train_output[0]: - # metric_value = np.mean([item[metric] for item in ctx.train_output]) metric_value_list = [] for item in ctx.train_output: if isinstance(item[metric], torch.Tensor): @@ -380,7 +379,6 @@ def _plot(ctx: "OnlineRLContext"): return for metric in metric_list: if metric in ctx.train_output[0]: - # metric_value = np.mean([item[metric] for item in ctx.train_output]) metric_value_list = [] for item in ctx.train_output: if isinstance(item[metric], torch.Tensor): From 6a9a565ed8e8d2a2978866798e934281d8ed4bc8 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 24 Feb 2023 17:55:51 +0800 Subject: [PATCH 010/244] fix td3 policy --- ding/policy/ddpg.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/ding/policy/ddpg.py b/ding/policy/ddpg.py index 850254e6e8..ebbf885621 100644 --- a/ding/policy/ddpg.py +++ b/ding/policy/ddpg.py @@ -251,18 +251,13 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: q_value_dict = {} target_q_value_dict = {} - if self._twin_critic: - q_value_dict['q_value'] = q_value[0].mean() - q_value_dict['q_value_twin'] = q_value[1].mean() - target_q_value_dict['target q_value'] = target_q_value[0].mean() - target_q_value_dict['target q_value_twin'] = target_q_value[1].mean() - else: - q_value_dict['q_value'] = q_value.mean() - target_q_value_dict['target q_value'] = target_q_value[0].mean() if self._twin_critic: # TD3: two critic networks target_q_value = torch.min(target_q_value[0], target_q_value[1]) # find min one as target q value + q_value_dict['q_value'] = q_value[0].mean() + q_value_dict['q_value_twin'] = q_value[1].mean() + target_q_value_dict['target q_value'] = target_q_value.mean() # critic network1 td_data = v_1step_td_data(q_value[0], target_q_value, reward, data['done'], data['weight']) 
critic_loss, td_error_per_sample1 = v_1step_td_error(td_data, self._gamma) @@ -274,6 +269,8 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: td_error_per_sample = (td_error_per_sample1 + td_error_per_sample2) / 2 else: # DDPG: single critic network + q_value_dict['q_value'] = q_value.mean() + target_q_value_dict['target q_value'] = target_q_value.mean() td_data = v_1step_td_data(q_value, target_q_value, reward, data['done'], data['weight']) critic_loss, td_error_per_sample = v_1step_td_error(td_data, self._gamma) loss_dict['critic_loss'] = critic_loss From 0222c04d8c8a3b31d2ca2328d886dd7e9e788f9f Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 28 Feb 2023 19:22:01 +0800 Subject: [PATCH 011/244] Add td3 --- ding/bonus/config.py | 192 ++++++++++------ ding/bonus/ppof.py | 36 ++- ding/bonus/td3.py | 205 ++++++++++++++++++ ding/config/config.py | 2 +- .../middleware/functional/evaluator.py | 8 +- .../framework/middleware/functional/logger.py | 34 +-- ding/policy/ppof.py | 5 +- 7 files changed, 385 insertions(+), 97 deletions(-) create mode 100644 ding/bonus/td3.py diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 210936afbb..19c1b699d4 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -4,81 +4,129 @@ from ding.envs import BaseEnv, DingEnvWrapper from ding.envs.env_wrappers import MaxAndSkipWrapper, WarpFrameWrapper, ScaledFloatFrameWrapper, FrameStackWrapper, \ EvalEpisodeReturnEnv, TransposeWrapper, TimeLimitWrapper -from ding.policy import PPOFPolicy +from ding.policy import PPOFPolicy, TD3Policy -def get_instance_config(env: str) -> EasyDict: - cfg = PPOFPolicy.default_config() - if env == 'lunarlander_discrete': - cfg.n_sample = 400 - elif env == 'lunarlander_continuous': - cfg.action_space = 'continuous' - cfg.n_sample = 400 - elif env == 'bipedalwalker': - cfg.learning_rate = 1e-3 - cfg.action_space = 'continuous' - cfg.n_sample = 1024 - elif env == 'rocket_landing': - cfg.n_sample = 2048 - cfg.adv_norm = False - cfg.model = dict( - encoder_hidden_size_list=[64, 64, 128], - actor_head_hidden_size=128, - critic_head_hidden_size=128, - ) - elif env == 'drone_fly': - cfg.action_space = 'continuous' - cfg.adv_norm = False - cfg.epoch_per_collect = 5 - cfg.learning_rate = 5e-5 - cfg.n_sample = 640 - elif env == 'hybrid_moving': - cfg.action_space = 'hybrid' - cfg.n_sample = 3200 - cfg.entropy_weight = 0.03 - cfg.batch_size = 320 - cfg.adv_norm = False - cfg.model = dict( - encoder_hidden_size_list=[256, 128, 64, 64], - sigma_type='fixed', - fixed_sigma_value=0.3, - bound_type='tanh', - ) - elif env == 'evogym_carrier': - cfg.action_space = 'continuous' - cfg.n_sample = 2048 - cfg.batch_size = 256 - cfg.epoch_per_collect = 10 - cfg.learning_rate = 3e-3 - elif env == 'mario': - cfg.n_sample = 256 - cfg.batch_size = 64 - cfg.epoch_per_collect = 2 - cfg.learning_rate = 1e-3 - cfg.model = dict( - encoder_hidden_size_list=[64, 64, 128], - critic_head_hidden_size=128, - actor_head_hidden_size=128, - ) - elif env == 'di_sheep': - cfg.n_sample = 3200 - cfg.batch_size = 320 - cfg.epoch_per_collect = 10 - cfg.learning_rate = 3e-4 - cfg.adv_norm = False - cfg.entropy_weight = 0.001 - elif env == 'procgen_bigfish': - cfg.n_sample = 16384 - cfg.batch_size = 16384 - cfg.epoch_per_collect = 10 - cfg.learning_rate = 5e-4 - cfg.model = dict( - encoder_hidden_size_list=[64, 128, 256], - critic_head_hidden_size=256, - actor_head_hidden_size=256, - ) +def get_instance_config(env: str, algorithm: str) -> EasyDict: + if algorithm == 'PPO': + cfg = PPOFPolicy.default_config() 
+ if env == 'lunarlander_discrete': + cfg.n_sample = 400 + elif env == 'lunarlander_continuous': + cfg.action_space = 'continuous' + cfg.n_sample = 400 + elif env == 'bipedalwalker': + cfg.learning_rate = 1e-3 + cfg.action_space = 'continuous' + cfg.n_sample = 1024 + elif env == 'rocket_landing': + cfg.n_sample = 2048 + cfg.adv_norm = False + cfg.model = dict( + encoder_hidden_size_list=[64, 64, 128], + actor_head_hidden_size=128, + critic_head_hidden_size=128, + ) + elif env == 'drone_fly': + cfg.action_space = 'continuous' + cfg.adv_norm = False + cfg.epoch_per_collect = 5 + cfg.learning_rate = 5e-5 + cfg.n_sample = 640 + elif env == 'hybrid_moving': + cfg.action_space = 'hybrid' + cfg.n_sample = 3200 + cfg.entropy_weight = 0.03 + cfg.batch_size = 320 + cfg.adv_norm = False + cfg.model = dict( + encoder_hidden_size_list=[256, 128, 64, 64], + sigma_type='fixed', + fixed_sigma_value=0.3, + bound_type='tanh', + ) + elif env == 'evogym_carrier': + cfg.action_space = 'continuous' + cfg.n_sample = 2048 + cfg.batch_size = 256 + cfg.epoch_per_collect = 10 + cfg.learning_rate = 3e-3 + elif env == 'mario': + cfg.n_sample = 256 + cfg.batch_size = 64 + cfg.epoch_per_collect = 2 + cfg.learning_rate = 1e-3 + cfg.model = dict( + encoder_hidden_size_list=[64, 64, 128], + critic_head_hidden_size=128, + actor_head_hidden_size=128, + ) + elif env == 'di_sheep': + cfg.n_sample = 3200 + cfg.batch_size = 320 + cfg.epoch_per_collect = 10 + cfg.learning_rate = 3e-4 + cfg.adv_norm = False + cfg.entropy_weight = 0.001 + elif env == 'procgen_bigfish': + cfg.n_sample = 16384 + cfg.batch_size = 16384 + cfg.epoch_per_collect = 10 + cfg.learning_rate = 5e-4 + cfg.model = dict( + encoder_hidden_size_list=[64, 128, 256], + critic_head_hidden_size=256, + actor_head_hidden_size=256, + ) + else: + raise KeyError("not supported env type: {}".format(env)) + elif algorithm == 'TD3': + cfg = TD3Policy.default_config() + if env == 'hopper': + cfg.action_space = 'continuous' + cfg.random_collect_size=25000, + cfg.model=dict( + obs_shape=11, + action_shape=3, + twin_critic=True, + actor_head_hidden_size=256, + critic_head_hidden_size=256, + action_space='regression', + ) + cfg.logger=dict(record_path='./video_hopper_td3', + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ) + cfg.learn=dict( + update_per_collect=1, + batch_size=256, + learning_rate_actor=1e-3, + learning_rate_critic=1e-3, + ignore_done=False, + target_theta=0.005, + discount_factor=0.99, + actor_update_freq=2, + noise=True, + noise_sigma=0.2, + noise_range=dict( + min=-0.5, + max=0.5, + ), + ) + cfg.collect=dict( + n_sample=1, + unroll_len=1, + noise_sigma=0.1, + ) + cfg.other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ) + else: + raise KeyError("not supported env type: {}".format(env)) else: - raise KeyError("not supported env type: {}".format(env)) + raise KeyError("not supported algorithm type: {}".format(algorithm)) + return cfg diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py index a5f12e7398..6be0815a04 100644 --- a/ding/bonus/ppof.py +++ b/ding/bonus/ppof.py @@ -14,6 +14,8 @@ from .model import PPOFModel from .config import get_instance_config, get_instance_env, get_hybrid_shape +class TrainingReturn: + wandb_url:str class PPOF: supported_env_list = [ @@ -43,8 +45,11 @@ def __init__( if isinstance(env, str): assert env in PPOF.supported_env_list, "Please use supported envs: {}".format(PPOF.supported_env_list) self.env = get_instance_env(env) - assert cfg is None, 'It should be 
default env tuned config' - self.cfg = get_instance_config(env) + if cfg is None: + # 'It should be default env tuned config' + self.cfg = get_instance_config(env) + else: + self.cfg = cfg elif isinstance(env, BaseEnv): self.cfg = cfg raise NotImplementedError @@ -71,6 +76,11 @@ def __init__( ) self.policy = PPOFPolicy(self.cfg, model=model) + def load_policy(self,policy_state_dict, config): + self.policy.load_state_dict(policy_state_dict) + self.policy._cfg = config + + def train( self, step: int = int(1e7), @@ -80,13 +90,14 @@ def train( n_iter_save_ckpt: int = 1000, context: Optional[str] = None, debug: bool = False - ) -> None: + ) -> TrainingReturn: if debug: logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) # define env and policy collector_env = self._setup_env_manager(collector_env_num, context, debug) evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug) + wandb_url_return=[] with task.start(ctx=OnlineRLContext()): task.use(interaction_evaluator_ttorch(self.seed, self.policy, evaluator_env)) @@ -94,9 +105,11 @@ def train( task.use(ppof_adv_estimator(self.policy)) task.use(multistep_trainer(self.policy, log_freq=n_iter_log_show)) task.use(CkptSaver(self.policy, save_dir=self.exp_name, train_freq=n_iter_save_ckpt)) - task.use(wandb_online_logger(self.exp_name, metric_list=self.policy.monitor_vars(), anonymous=True)) + task.use(wandb_online_logger(self.exp_name, metric_list=self.policy.monitor_vars(), anonymous=True, project_name=self.exp_name, wandb_url_return=wandb_url_return)) task.use(termination_checker(max_env_step=step)) task.run() + + return TrainingReturn(wandb_url=wandb_url_return[0]) def deploy(self, ckpt_path: str = None, enable_save_replay: bool = False, debug: bool = False) -> None: if debug: @@ -163,7 +176,9 @@ def batch_evaluate( ckpt_path: Optional[str] = None, n_evaluator_episode: int = 4, context: Optional[str] = None, - debug: bool = False + debug: bool = False, + render: bool = False, + replay_video_path: str = None, ) -> None: if debug: logging.getLogger().setLevel(logging.DEBUG) @@ -176,7 +191,16 @@ def batch_evaluate( # main execution task with task.start(ctx=OnlineRLContext()): - task.use(interaction_evaluator_ttorch(self.seed, self.policy, env, n_evaluator_episode)) + task.use(interaction_evaluator_ttorch(self.seed, self.policy, env, n_evaluator_episode, render=render, replay_video_path=replay_video_path)) + # task.use(wandb_online_logger(record_path='./video', + # cfg=EasyDict(dict( + # gradient_logger=False, + # video_logger=True, + # plot_logger=False, + # action_logger=False, + # return_logger=False + # )), + # env=env)) task.run(max_step=1) def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: bool = False) -> BaseEnvManagerV2: diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py new file mode 100644 index 0000000000..f016a14d59 --- /dev/null +++ b/ding/bonus/td3.py @@ -0,0 +1,205 @@ +from typing import Optional, Union +from ditk import logging +from easydict import EasyDict +import os +import gym +import torch +from ding.framework import task, OnlineRLContext +from ding.framework.middleware import interaction_evaluator_ttorch, CkptSaver, multistep_trainer, \ + wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, data_pusher, \ + OffPolicyLearner, final_ctx_saver +from ding.envs import BaseEnv, BaseEnvManagerV2, SubprocessEnvManagerV2 +from ding.policy import TD3Policy, single_env_forward_wrapper_ttorch +from ding.utils import 
set_pkg_seed +from ding.config import save_config_py +from ding.model import QAC +from ding.data import DequeBuffer +from ding.bonus.config import get_instance_config, get_instance_env, get_hybrid_shape + +class TrainingReturn: + wandb_url:str + +class TD3: + supported_env_list = [ + 'hopper', + ] + algorithm='TD3' + + def __init__( + self, + env: Union[str, BaseEnv], + seed: int = 0, + exp_name: str = 'default_experiment', + cfg: Optional[EasyDict] = None + ) -> None: + if isinstance(env, str): + assert env in TD3.supported_env_list, "Please use supported envs: {}".format(TD3.supported_env_list) + self.env = get_instance_env(env) + if cfg is None: + # 'It should be default env tuned config' + self.cfg = get_instance_config(env) + else: + self.cfg = cfg + elif isinstance(env, BaseEnv): + self.cfg = cfg + raise NotImplementedError + else: + raise TypeError("not support env type: {}, only strings and instances of `BaseEnv` now".format(type(env))) + logging.getLogger().setLevel(logging.INFO) + self.seed = seed + set_pkg_seed(self.seed) + self.exp_name = exp_name + if not os.path.exists(self.exp_name): + os.makedirs(self.exp_name) + save_config_py(self.cfg, os.path.join(self.exp_name, 'policy_config.py')) + + action_space = self.env.action_space + if isinstance(action_space, gym.spaces.Discrete): + action_shape = action_space.n + elif isinstance(action_space, gym.spaces.Tuple): + action_shape = get_hybrid_shape(action_space) + else: + action_shape = action_space.shape + model = QAC(**self.cfg.policy.model) + # model = QAC( + # self.env.observation_space.shape, action_shape, action_space=self.cfg.action_space, **self.cfg.model + # ) + self.buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + self.policy = TD3Policy(self.cfg, model=model) + + def load_policy(self,policy_state_dict, config): + self.policy.load_state_dict(policy_state_dict) + self.policy._cfg = config + + + def train( + self, + step: int = int(1e7), + collector_env_num: int = 4, + evaluator_env_num: int = 4, + n_iter_log_show: int = 500, + n_iter_save_ckpt: int = 1000, + context: Optional[str] = None, + debug: bool = False + ) -> dict: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + logging.debug(self.policy._model) + # define env and policy + collector_env = self._setup_env_manager(collector_env_num, context, debug) + evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug) + wandb_url_return=[] + + self.cfg.policy.logger.record_path = './' + self.cfg.exp_name + '/video' + evaluator_env.enable_save_replay(replay_path=self.cfg.policy.logger.record_path) + + with task.start(ctx=OnlineRLContext()): + task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env,render=True)) + task.use( + StepCollector(self.cfg, self.policy.collect_mode, collector_env, random_collect_size=self.cfg.policy.random_collect_size) + ) + task.use(data_pusher(self.cfg, self.buffer_)) + task.use(multistep_trainer(self.policy, log_freq=n_iter_log_show)) + task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_)) + task.use(CkptSaver(policy=self.policy,save_dir=os.path.join(self.cfg["exp_name"],"model"), train_freq=n_iter_save_ckpt)) + task.use(wandb_online_logger(self.exp_name, metric_list=self.policy.monitor_vars(), anonymous=True, project_name=self.exp_name, wandb_url_return=wandb_url_return)) + task.use(termination_checker(max_env_step=step)) + task.use(final_ctx_saver(name=self.cfg["exp_name"])) + task.run() + + return_dict={"wandb_url":wandb_url_return[0]} + return 
return_dict + + def deploy(self, ckpt_path: str = None, enable_save_replay: bool = False, debug: bool = False) -> None: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + # define env and policy + env = self.env.clone() + env.seed(self.seed, dynamic_seed=False) + if enable_save_replay: + env.enable_save_replay(replay_path=os.path.join(self.exp_name, 'videos')) + if ckpt_path is None: + ckpt_path = os.path.join(self.exp_name, 'ckpt/eval.pth.tar') + state_dict = torch.load(ckpt_path, map_location='cpu') + self.policy.load_state_dict(state_dict) + forward_fn = single_env_forward_wrapper_ttorch(self.policy.eval) + + # main loop + return_ = 0. + step = 0 + obs = env.reset() + while True: + action = forward_fn(obs) + obs, rew, done, info = env.step(action) + return_ += rew + step += 1 + if done: + break + logging.info(f'TD3 deploy is finished, final episode return with {step} steps is: {return_}') + + def collect_data( + self, + env_num: int = 8, + ckpt_path: Optional[str] = None, + save_data_path: Optional[str] = None, + n_sample: Optional[int] = None, + n_episode: Optional[int] = None, + context: Optional[str] = None, + debug: bool = False + ) -> None: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + if n_episode is not None: + raise NotImplementedError + # define env and policy + env = self._setup_env_manager(env_num, context, debug) + if ckpt_path is None: + ckpt_path = os.path.join(self.exp_name, 'ckpt/eval.pth.tar') + if save_data_path is None: + save_data_path = os.path.join(self.exp_name, 'demo_data') + state_dict = torch.load(ckpt_path, map_location='cpu') + self.policy.load_state_dict(state_dict) + + # main execution task + with task.start(ctx=OnlineRLContext()): + task.use( + StepCollector(self.cfg, self.policy.collect_mode, env, random_collect_size=self.cfg.policy.random_collect_size) + ) + task.use(offline_data_saver(save_data_path, data_type='hdf5')) + task.run(max_step=1) + logging.info( + f'TD3 collecting is finished, more than {n_sample} samples are collected and saved in `{save_data_path}`' + ) + + def batch_evaluate( + self, + env_num: int = 4, + ckpt_path: Optional[str] = None, + n_evaluator_episode: int = 4, + context: Optional[str] = None, + debug: bool = False + ) -> None: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + # define env and policy + env = self._setup_env_manager(env_num, context, debug) + if ckpt_path is None: + ckpt_path = os.path.join(self.exp_name, 'ckpt/eval.pth.tar') + state_dict = torch.load(ckpt_path, map_location='cpu') + self.policy.load_state_dict(state_dict) + + # main execution task + with task.start(ctx=OnlineRLContext()): + task.use(interaction_evaluator_ttorch(self.seed, self.policy, env, n_evaluator_episode)) + task.run(max_step=1) + + def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: bool = False) -> BaseEnvManagerV2: + if debug: + env_cls = BaseEnvManagerV2 + manager_cfg = env_cls.default_config() + else: + env_cls = SubprocessEnvManagerV2 + manager_cfg = env_cls.default_config() + if context is not None: + manager_cfg.context = context + return env_cls([self.env.clone for _ in range(env_num)], manager_cfg) diff --git a/ding/config/config.py b/ding/config/config.py index 574f3170e3..690101e61e 100644 --- a/ding/config/config.py +++ b/ding/config/config.py @@ -144,7 +144,7 @@ def save_config_yaml(config_: dict, path: str) -> NoReturn: yaml.safe_dump(json.loads(config_string), f) -def save_config_py(config_: dict, path: str) -> NoReturn: +def save_config_py(config_: dict, path: 
str): # -> NoReturn: """ Overview: save configuration to python file diff --git a/ding/framework/middleware/functional/evaluator.py b/ding/framework/middleware/functional/evaluator.py index 22083759b2..c515086133 100644 --- a/ding/framework/middleware/functional/evaluator.py +++ b/ding/framework/middleware/functional/evaluator.py @@ -306,7 +306,8 @@ def interaction_evaluator_ttorch( n_evaluator_episode: Optional[int] = None, stop_value: float = np.inf, eval_freq: int = 1000, - render: bool = False + render: bool = False, + replay_video_path: str = None, ) -> Callable: """ Overview: @@ -323,6 +324,9 @@ def interaction_evaluator_ttorch( if n_evaluator_episode is None: n_evaluator_episode = env.env_num + if replay_video_path: + env.enable_save_replay(replay_path=replay_video_path) + def _evaluate(ctx: "OnlineRLContext"): """ Overview: @@ -354,7 +358,7 @@ def _evaluate(ctx: "OnlineRLContext"): inference_output = inference_output.cpu() if render: eval_monitor.update_video(env.ready_imgs) - eval_monitor.update_output(inference_output) + # eval_monitor.update_output(inference_output) action = inference_output.action.numpy() timesteps = env.step(action) for timestep in timesteps: diff --git a/ding/framework/middleware/functional/logger.py b/ding/framework/middleware/functional/logger.py index 05e75b56bd..c791aa14f4 100644 --- a/ding/framework/middleware/functional/logger.py +++ b/ding/framework/middleware/functional/logger.py @@ -126,7 +126,9 @@ def wandb_online_logger( metric_list: Optional[List[str]] = None, env: Optional[BaseEnvManagerV2] = None, model: Optional[torch.nn.Module] = None, - anonymous: bool = False + anonymous: bool = False, + project_name:str = 'default-project', + wandb_url_return: List = [], ) -> Callable: ''' Overview: @@ -151,9 +153,11 @@ def wandb_online_logger( # Initialize wandb with default settings # Settings can be covered by calling wandb.init() at the top of the script if anonymous: - wandb.init(anonymous="must") + wandb.init(project=project_name,reinit=True,anonymous="must") + wandb_url_return.append(wandb.run.get_project_url()) else: - wandb.init() + wandb.init(project=project_name,reinit=True) + wandb_url_return.append(wandb.run.get_project_url()) if cfg == 'default': cfg = EasyDict( dict( @@ -178,21 +182,21 @@ def wandb_online_logger( def _plot(ctx: "OnlineRLContext"): info_for_logging = {} - if not cfg.plot_logger: + if cfg.plot_logger: + for metric in metric_list: + if metric in ctx.train_output[0]: + metric_value_list = [] + for item in ctx.train_output: + if isinstance(item[metric], torch.Tensor): + metric_value_list.append(item[metric].cpu().detach().numpy()) + else: + metric_value_list.append(item[metric]) + metric_value = np.mean(metric_value_list) + info_for_logging.update({metric: metric_value}) + else: one_time_warning( "If you want to use wandb to visualize the result, please set plot_logger = True in the config." 
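                    # (note on the branch above: metrics in train_output that are torch.Tensors are
                    #  detached and moved to CPU before np.mean, because numpy cannot convert CUDA
                    #  tensors or tensors that still require grad; plain Python numbers pass through
                    #  unchanged)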
) - return - for metric in metric_list: - if metric in ctx.train_output[0]: - metric_value_list = [] - for item in ctx.train_output: - if isinstance(item[metric], torch.Tensor): - metric_value_list.append(item[metric].cpu().detach().numpy()) - else: - metric_value_list.append(item[metric]) - metric_value = np.mean(metric_value_list) - info_for_logging.update({metric: metric_value}) if ctx.eval_value != -np.inf: info_for_logging.update( diff --git a/ding/policy/ppof.py b/ding/policy/ppof.py index baa402626a..d9f8091720 100644 --- a/ding/policy/ppof.py +++ b/ding/policy/ppof.py @@ -62,7 +62,10 @@ def __init__(self, cfg: "EasyDict", model: torch.nn.Module, enable_mode: List[st self._model = self.default_model() else: self._model = model - if self._cfg.cuda and torch.cuda.is_available(): + if hasattr(self._cfg,"cuda") and self._cfg.cuda and torch.cuda.is_available(): + self._device = 'cuda' + self._model.cuda() + elif not hasattr(self._cfg,"cuda") and torch.cuda.is_available(): self._device = 'cuda' self._model.cuda() else: From 929776be056bbca9e7b14abcf854ed7b6dddfc3e Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 28 Feb 2023 20:42:57 +0800 Subject: [PATCH 012/244] Add td3 env --- ding/bonus/config.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 19c1b699d4..141d911a7c 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -200,6 +200,12 @@ def get_instance_env(env: str) -> BaseEnv: }, seed_api=False, ) + elif env== 'hopper': + from dizoo.mujoco.envs import MujocoEnv + cfg = EasyDict({ + 'env_id': 'Hopper-v3', + }) + return MujocoEnv(cfg) else: raise KeyError("not supported env type: {}".format(env)) From 4fba3b987870af93cd4b9444aa8399bd5c759d81 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 28 Feb 2023 20:48:30 +0800 Subject: [PATCH 013/244] Add td3 env --- ding/bonus/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 141d911a7c..3570d0296c 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -204,6 +204,7 @@ def get_instance_env(env: str) -> BaseEnv: from dizoo.mujoco.envs import MujocoEnv cfg = EasyDict({ 'env_id': 'Hopper-v3', + **MujocoEnv.config }) return MujocoEnv(cfg) else: From 0257ae9367b19a98c5c3feba5e6b3fb6d585f680 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 28 Feb 2023 20:50:28 +0800 Subject: [PATCH 014/244] polish code --- ding/bonus/td3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index f016a14d59..9f9356ab0c 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -37,7 +37,7 @@ def __init__( self.env = get_instance_env(env) if cfg is None: # 'It should be default env tuned config' - self.cfg = get_instance_config(env) + self.cfg = get_instance_config(env, algorithm='td3') else: self.cfg = cfg elif isinstance(env, BaseEnv): From cccd585e7e01df1bcc80e5bfb66da88ebd51b4d9 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 28 Feb 2023 20:51:51 +0800 Subject: [PATCH 015/244] polish code --- ding/bonus/td3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 9f9356ab0c..377528259f 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -37,7 +37,7 @@ def __init__( self.env = get_instance_env(env) if cfg is None: # 'It should be default env tuned config' - self.cfg = get_instance_config(env, algorithm='td3') + self.cfg = get_instance_config(env, algorithm=TD3.algorithm) else: self.cfg = cfg elif 
isinstance(env, BaseEnv): From d7f272ef6c852b4cdf7b19cb709b1153c7e27cea Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 28 Feb 2023 21:03:19 +0800 Subject: [PATCH 016/244] polish code --- ding/bonus/td3.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 377528259f..0600aadb2c 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -52,18 +52,7 @@ def __init__( if not os.path.exists(self.exp_name): os.makedirs(self.exp_name) save_config_py(self.cfg, os.path.join(self.exp_name, 'policy_config.py')) - - action_space = self.env.action_space - if isinstance(action_space, gym.spaces.Discrete): - action_shape = action_space.n - elif isinstance(action_space, gym.spaces.Tuple): - action_shape = get_hybrid_shape(action_space) - else: - action_shape = action_space.shape model = QAC(**self.cfg.policy.model) - # model = QAC( - # self.env.observation_space.shape, action_shape, action_space=self.cfg.action_space, **self.cfg.model - # ) self.buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) self.policy = TD3Policy(self.cfg, model=model) From 902f9b07bc7d02859248cd83fbf8e7e4398739ae Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 28 Feb 2023 21:08:07 +0800 Subject: [PATCH 017/244] polish code --- ding/bonus/config.py | 87 +++++++++++++++++++++++++------------------- 1 file changed, 50 insertions(+), 37 deletions(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 3570d0296c..90bb727892 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -83,45 +83,58 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: elif algorithm == 'TD3': cfg = TD3Policy.default_config() if env == 'hopper': - cfg.action_space = 'continuous' - cfg.random_collect_size=25000, - cfg.model=dict( - obs_shape=11, - action_shape=3, - twin_critic=True, - actor_head_hidden_size=256, - critic_head_hidden_size=256, - action_space='regression', - ) - cfg.logger=dict(record_path='./video_hopper_td3', - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ) - cfg.learn=dict( - update_per_collect=1, - batch_size=256, - learning_rate_actor=1e-3, - learning_rate_critic=1e-3, - ignore_done=False, - target_theta=0.005, - discount_factor=0.99, - actor_update_freq=2, - noise=True, - noise_sigma=0.2, - noise_range=dict( - min=-0.5, - max=0.5, + cfg.exp_name='hopper_td3_wandb_seed0', + cfg.seed=0, + cfg.env=dict( + env_id='Hopper-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=6000, + ), + cfg.policy=dict( + cuda=True, + random_collect_size=25000, + model=dict( + obs_shape=11, + action_shape=3, + twin_critic=True, + actor_head_hidden_size=256, + critic_head_hidden_size=256, + action_space='regression', ), + logger=dict(record_path='./video_hopper_td3', + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + learn=dict( + update_per_collect=1, + batch_size=256, + learning_rate_actor=1e-3, + learning_rate_critic=1e-3, + ignore_done=False, + target_theta=0.005, + discount_factor=0.99, + actor_update_freq=2, + noise=True, + noise_sigma=0.2, + noise_range=dict( + min=-0.5, + max=0.5, + ), + ), + collect=dict( + n_sample=1, + unroll_len=1, + noise_sigma=0.1, + ), + other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), ) - cfg.collect=dict( - n_sample=1, - unroll_len=1, - noise_sigma=0.1, - ) - 
cfg.other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ) else: raise KeyError("not supported env type: {}".format(env)) else: From 17ba3a66d1af8c42be8042884ba6f2c109d72b70 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 28 Feb 2023 21:44:15 +0800 Subject: [PATCH 018/244] polish code --- ding/bonus/td3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 0600aadb2c..1d621c597e 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -53,7 +53,7 @@ def __init__( os.makedirs(self.exp_name) save_config_py(self.cfg, os.path.join(self.exp_name, 'policy_config.py')) model = QAC(**self.cfg.policy.model) - self.buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + self.buffer_ = DequeBuffer(size=self.cfg.policy.other.replay_buffer.replay_buffer_size) self.policy = TD3Policy(self.cfg, model=model) def load_policy(self,policy_state_dict, config): From 21dcc8b7690c949d3d825b191342ca5ee3961e2f Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 28 Feb 2023 22:04:10 +0800 Subject: [PATCH 019/244] polish code --- ding/bonus/config.py | 18 +++++++++++++----- ding/bonus/ppof.py | 9 --------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 90bb727892..f59cd93588 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -214,12 +214,20 @@ def get_instance_env(env: str) -> BaseEnv: seed_api=False, ) elif env== 'hopper': + from dizoo.mujoco.envs import MujocoEnv - cfg = EasyDict({ - 'env_id': 'Hopper-v3', - **MujocoEnv.config - }) - return MujocoEnv(cfg) + cfg = EasyDict( + env_id='Hopper-v3', + env_wrapper='mujoco_default', + ) + return DingEnvWrapper(cfg=cfg) + + # from dizoo.mujoco.envs import MujocoEnv + # cfg = EasyDict({ + # 'env_id': 'Hopper-v3', + # **MujocoEnv.config + # }) + # return MujocoEnv(cfg) else: raise KeyError("not supported env type: {}".format(env)) diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py index 6be0815a04..045f9fcd64 100644 --- a/ding/bonus/ppof.py +++ b/ding/bonus/ppof.py @@ -192,15 +192,6 @@ def batch_evaluate( # main execution task with task.start(ctx=OnlineRLContext()): task.use(interaction_evaluator_ttorch(self.seed, self.policy, env, n_evaluator_episode, render=render, replay_video_path=replay_video_path)) - # task.use(wandb_online_logger(record_path='./video', - # cfg=EasyDict(dict( - # gradient_logger=False, - # video_logger=True, - # plot_logger=False, - # action_logger=False, - # return_logger=False - # )), - # env=env)) task.run(max_step=1) def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: bool = False) -> BaseEnvManagerV2: From bb0df37e617fecdbaf263eae9c678fbbc0696dd1 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 28 Feb 2023 22:09:48 +0800 Subject: [PATCH 020/244] polish code --- ding/bonus/config.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index f59cd93588..c8286c4145 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -83,7 +83,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: elif algorithm == 'TD3': cfg = TD3Policy.default_config() if env == 'hopper': - cfg.exp_name='hopper_td3_wandb_seed0', + cfg.exp_name='hopper_td3', cfg.seed=0, cfg.env=dict( env_id='Hopper-v3', @@ -105,7 +105,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: critic_head_hidden_size=256, action_space='regression', ), - 
logger=dict(record_path='./video_hopper_td3', + logger=dict(record_path='./hopper_td3/video', gradient_logger=True, video_logger=True, plot_logger=True, @@ -221,13 +221,6 @@ def get_instance_env(env: str) -> BaseEnv: env_wrapper='mujoco_default', ) return DingEnvWrapper(cfg=cfg) - - # from dizoo.mujoco.envs import MujocoEnv - # cfg = EasyDict({ - # 'env_id': 'Hopper-v3', - # **MujocoEnv.config - # }) - # return MujocoEnv(cfg) else: raise KeyError("not supported env type: {}".format(env)) From d01558d200ecb46e2ae53b4bfe3c75f5221ceda0 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 28 Feb 2023 22:15:35 +0800 Subject: [PATCH 021/244] polish code --- ding/bonus/config.py | 100 ++++++++++++++++++++++--------------------- 1 file changed, 51 insertions(+), 49 deletions(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index c8286c4145..4504aadc24 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -83,57 +83,59 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: elif algorithm == 'TD3': cfg = TD3Policy.default_config() if env == 'hopper': - cfg.exp_name='hopper_td3', - cfg.seed=0, - cfg.env=dict( - env_id='Hopper-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=6000, - ), - cfg.policy=dict( - cuda=True, - random_collect_size=25000, - model=dict( - obs_shape=11, - action_shape=3, - twin_critic=True, - actor_head_hidden_size=256, - critic_head_hidden_size=256, - action_space='regression', + cfg=dict( + exp_name='hopper_td3', + seed=0, + env=dict( + env_id='Hopper-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=6000, ), - logger=dict(record_path='./hopper_td3/video', - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - learn=dict( - update_per_collect=1, - batch_size=256, - learning_rate_actor=1e-3, - learning_rate_critic=1e-3, - ignore_done=False, - target_theta=0.005, - discount_factor=0.99, - actor_update_freq=2, - noise=True, - noise_sigma=0.2, - noise_range=dict( - min=-0.5, - max=0.5, + policy=dict( + cuda=True, + random_collect_size=25000, + model=dict( + obs_shape=11, + action_shape=3, + twin_critic=True, + actor_head_hidden_size=256, + critic_head_hidden_size=256, + action_space='regression', ), - ), - collect=dict( - n_sample=1, - unroll_len=1, - noise_sigma=0.1, - ), - other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), + logger=dict(record_path='./hopper_td3/video', + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + learn=dict( + update_per_collect=1, + batch_size=256, + learning_rate_actor=1e-3, + learning_rate_critic=1e-3, + ignore_done=False, + target_theta=0.005, + discount_factor=0.99, + actor_update_freq=2, + noise=True, + noise_sigma=0.2, + noise_range=dict( + min=-0.5, + max=0.5, + ), + ), + collect=dict( + n_sample=1, + unroll_len=1, + noise_sigma=0.1, + ), + other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), + ) ) else: raise KeyError("not supported env type: {}".format(env)) From 60f47b66f797b1d6ecca440b959494cdce56871d Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 28 Feb 2023 22:21:51 +0800 Subject: [PATCH 022/244] polish code --- ding/bonus/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ding/bonus/config.py 
b/ding/bonus/config.py index 4504aadc24..65fd8f7b4d 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -83,7 +83,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: elif algorithm == 'TD3': cfg = TD3Policy.default_config() if env == 'hopper': - cfg=dict( + cfg.update(dict( exp_name='hopper_td3', seed=0, env=dict( @@ -136,7 +136,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ), other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), ) - ) + )) else: raise KeyError("not supported env type: {}".format(env)) else: From 511d71e96472974604208db38a8e17611027b706 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 28 Feb 2023 22:46:11 +0800 Subject: [PATCH 023/244] polish code --- ding/policy/td3.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/ding/policy/td3.py b/ding/policy/td3.py index 67bb075f88..e883dfe8a9 100644 --- a/ding/policy/td3.py +++ b/ding/policy/td3.py @@ -1,3 +1,4 @@ +from typing import List from ding.utils import POLICY_REGISTRY from .ddpg import DDPGPolicy @@ -153,3 +154,15 @@ class from DDPG class by changing ``_actor_update_freq``, ``_twin_critic`` and n ), ), ) + + def monitor_vars(self) -> List[str]: + variables = [ + "q_value", + "target q_value", + "loss", + "lr", + "entropy", + "target_q_value", + "td_error" + ] + return variables From 6a9fd4569997abf58a7387e7c993e3746855eba0 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 28 Feb 2023 22:56:48 +0800 Subject: [PATCH 024/244] polish code --- ding/bonus/td3.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 1d621c597e..408646bfd5 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -1,3 +1,4 @@ +from dataclasses import dataclass from typing import Optional, Union from ditk import logging from easydict import EasyDict @@ -16,6 +17,7 @@ from ding.data import DequeBuffer from ding.bonus.config import get_instance_config, get_instance_env, get_hybrid_shape +@dataclass class TrainingReturn: wandb_url:str From d5573e9816ff3ab85a8e46d660be5a452016b4ee Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 28 Feb 2023 23:00:03 +0800 Subject: [PATCH 025/244] polish code --- ding/bonus/td3.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 408646bfd5..d4b32c9008 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -98,8 +98,7 @@ def train( task.use(final_ctx_saver(name=self.cfg["exp_name"])) task.run() - return_dict={"wandb_url":wandb_url_return[0]} - return return_dict + return TrainingReturn(wandb_url_return[0]) def deploy(self, ckpt_path: str = None, enable_save_replay: bool = False, debug: bool = False) -> None: if debug: From 39065435dc5693f8fc8237decbdd5c373eccafd3 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 2 Mar 2023 08:28:19 +0000 Subject: [PATCH 026/244] fix data type error for mujoco --- ding/bonus/td3.py | 13 ++++++------- ding/envs/env/ding_env_wrapper.py | 2 ++ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index d4b32c9008..39a11a3cb6 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -12,7 +12,7 @@ from ding.envs import BaseEnv, BaseEnvManagerV2, SubprocessEnvManagerV2 from ding.policy import TD3Policy, single_env_forward_wrapper_ttorch from ding.utils import set_pkg_seed -from ding.config import save_config_py +from ding.config import save_config_py, compile_config from ding.model import QAC from ding.data import DequeBuffer from ding.bonus.config import 
get_instance_config, get_instance_env, get_hybrid_shape @@ -39,24 +39,23 @@ def __init__( self.env = get_instance_env(env) if cfg is None: # 'It should be default env tuned config' - self.cfg = get_instance_config(env, algorithm=TD3.algorithm) - else: - self.cfg = cfg + cfg = get_instance_config(env, algorithm=TD3.algorithm) + self.cfg = compile_config(cfg, policy=TD3Policy) elif isinstance(env, BaseEnv): - self.cfg = cfg + self.cfg = compile_config(cfg, policy=TD3Policy) raise NotImplementedError else: raise TypeError("not support env type: {}, only strings and instances of `BaseEnv` now".format(type(env))) logging.getLogger().setLevel(logging.INFO) self.seed = seed - set_pkg_seed(self.seed) + set_pkg_seed(self.seed, use_cuda=self.cfg.policy.cuda) self.exp_name = exp_name if not os.path.exists(self.exp_name): os.makedirs(self.exp_name) save_config_py(self.cfg, os.path.join(self.exp_name, 'policy_config.py')) model = QAC(**self.cfg.policy.model) self.buffer_ = DequeBuffer(size=self.cfg.policy.other.replay_buffer.replay_buffer_size) - self.policy = TD3Policy(self.cfg, model=model) + self.policy = TD3Policy(self.cfg.policy, model=model) def load_policy(self,policy_state_dict, config): self.policy.load_state_dict(policy_state_dict) diff --git a/ding/envs/env/ding_env_wrapper.py b/ding/envs/env/ding_env_wrapper.py index 6cd4392708..233becebe4 100644 --- a/ding/envs/env/ding_env_wrapper.py +++ b/ding/envs/env/ding_env_wrapper.py @@ -187,6 +187,8 @@ def enable_save_replay(self, replay_path: Optional[str] = None) -> None: @property def observation_space(self) -> gym.spaces.Space: + if self._observation_space.dtype==np.float64: + self._observation_space.dtype=np.float32 return self._observation_space @property From e665493898fac9df303180e7967b9697aab1699d Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 2 Mar 2023 08:53:47 +0000 Subject: [PATCH 027/244] polish code --- ding/bonus/td3.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 39a11a3cb6..861b10fa7c 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -31,7 +31,7 @@ def __init__( self, env: Union[str, BaseEnv], seed: int = 0, - exp_name: str = 'default_experiment', + exp_name: str = None, cfg: Optional[EasyDict] = None ) -> None: if isinstance(env, str): @@ -40,6 +40,14 @@ def __init__( if cfg is None: # 'It should be default env tuned config' cfg = get_instance_config(env, algorithm=TD3.algorithm) + if exp_name is not None: + self.exp_name = exp_name + self.cfg.exp_name = exp_name + elif self.cfg.exp_name is not None: + self.exp_name = self.cfg.exp_name + else: + self.exp_name = 'default_experiment' + self.cfg.exp_name = exp_name self.cfg = compile_config(cfg, policy=TD3Policy) elif isinstance(env, BaseEnv): self.cfg = compile_config(cfg, policy=TD3Policy) @@ -49,10 +57,9 @@ def __init__( logging.getLogger().setLevel(logging.INFO) self.seed = seed set_pkg_seed(self.seed, use_cuda=self.cfg.policy.cuda) - self.exp_name = exp_name if not os.path.exists(self.exp_name): os.makedirs(self.exp_name) - save_config_py(self.cfg, os.path.join(self.exp_name, 'policy_config.py')) + save_config_py(self.cfg.policy, os.path.join(self.exp_name, 'policy_config.py')) model = QAC(**self.cfg.policy.model) self.buffer_ = DequeBuffer(size=self.cfg.policy.other.replay_buffer.replay_buffer_size) self.policy = TD3Policy(self.cfg.policy, model=model) From 88f5181c4b7490ca1e3c6115425e68a98b1cdcd0 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 2 Mar 2023 09:27:41 +0000 Subject: 
[PATCH 028/244] polish code --- ding/bonus/config.py | 2 +- ding/bonus/td3.py | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 65fd8f7b4d..0e18d9895d 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -106,7 +106,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: critic_head_hidden_size=256, action_space='regression', ), - logger=dict(record_path='./hopper_td3/video', + logger=dict( gradient_logger=True, video_logger=True, plot_logger=True, diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 861b10fa7c..a8760f9e76 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -40,14 +40,16 @@ def __init__( if cfg is None: # 'It should be default env tuned config' cfg = get_instance_config(env, algorithm=TD3.algorithm) + elif not isinstance(cfg,EasyDict): + cfg=EasyDict(cfg) if exp_name is not None: self.exp_name = exp_name - self.cfg.exp_name = exp_name - elif self.cfg.exp_name is not None: - self.exp_name = self.cfg.exp_name + cfg.exp_name = exp_name + elif cfg.exp_name is not None: + self.exp_name = cfg.exp_name else: self.exp_name = 'default_experiment' - self.cfg.exp_name = exp_name + cfg.exp_name = self.exp_name self.cfg = compile_config(cfg, policy=TD3Policy) elif isinstance(env, BaseEnv): self.cfg = compile_config(cfg, policy=TD3Policy) @@ -87,7 +89,7 @@ def train( evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug) wandb_url_return=[] - self.cfg.policy.logger.record_path = './' + self.cfg.exp_name + '/video' + self.cfg.policy.logger.record_path = os.path.join(self.exp_name,'video') evaluator_env.enable_save_replay(replay_path=self.cfg.policy.logger.record_path) with task.start(ctx=OnlineRLContext()): @@ -99,7 +101,7 @@ def train( task.use(multistep_trainer(self.policy, log_freq=n_iter_log_show)) task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_)) task.use(CkptSaver(policy=self.policy,save_dir=os.path.join(self.cfg["exp_name"],"model"), train_freq=n_iter_save_ckpt)) - task.use(wandb_online_logger(self.exp_name, metric_list=self.policy.monitor_vars(), anonymous=True, project_name=self.exp_name, wandb_url_return=wandb_url_return)) + task.use(wandb_online_logger(record_path=self.cfg.policy.logger.record_path, cfg=self.cfg.policy.logger, metric_list=self.policy.monitor_vars(), model=self.policy._model, anonymous=True, project_name=self.exp_name, wandb_url_return=wandb_url_return)) task.use(termination_checker(max_env_step=step)) task.use(final_ctx_saver(name=self.cfg["exp_name"])) task.run() From 693a4cb12cdea5e777c7373cce55b1735008de52 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 2 Mar 2023 15:12:05 +0000 Subject: [PATCH 029/244] Add features --- ding/bonus/__init__.py | 1 + ding/bonus/ppof.py | 5 ----- ding/bonus/td3.py | 19 +++++++++++-------- ding/policy/ddpg.py | 19 +++++++++++++++++++ 4 files changed, 31 insertions(+), 13 deletions(-) diff --git a/ding/bonus/__init__.py b/ding/bonus/__init__.py index 89fed38d99..c689e51fba 100644 --- a/ding/bonus/__init__.py +++ b/ding/bonus/__init__.py @@ -1 +1,2 @@ from .ppof import PPOF +from .td3 import TD3 diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py index 045f9fcd64..f912881d5d 100644 --- a/ding/bonus/ppof.py +++ b/ding/bonus/ppof.py @@ -173,7 +173,6 @@ def collect_data( def batch_evaluate( self, env_num: int = 4, - ckpt_path: Optional[str] = None, n_evaluator_episode: int = 4, context: Optional[str] = None, debug: bool = False, @@ -184,10 +183,6 @@ def batch_evaluate( 
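# The checkpoint-loading block is dropped from batch_evaluate in this patch: the method now
# evaluates whatever policy the agent object currently holds, so a checkpoint should be restored
# beforehand (deploy() shows the pattern of torch.load(...) followed by policy.load_state_dict(...),
# and the agents also expose load_policy(policy_state_dict, config)). A minimal sketch of the
# intended call order; the experiment name, checkpoint path and config argument below are
# illustrative assumptions, not values taken from this patch:
#
#     import torch
#     from ding.bonus import TD3
#
#     agent = TD3(env='hopper', exp_name='hopper_td3_demo')
#     state_dict = torch.load('hopper_td3_demo/ckpt/eval.pth.tar', map_location='cpu')
#     agent.load_policy(state_dict, config=agent.cfg.policy)
#     agent.batch_evaluate(env_num=4, n_evaluator_episode=4)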
logging.getLogger().setLevel(logging.DEBUG) # define env and policy env = self._setup_env_manager(env_num, context, debug) - if ckpt_path is None: - ckpt_path = os.path.join(self.exp_name, 'ckpt/eval.pth.tar') - state_dict = torch.load(ckpt_path, map_location='cpu') - self.policy.load_state_dict(state_dict) # main execution task with task.start(ctx=OnlineRLContext()): diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index a8760f9e76..637d48adcb 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -61,7 +61,7 @@ def __init__( set_pkg_seed(self.seed, use_cuda=self.cfg.policy.cuda) if not os.path.exists(self.exp_name): os.makedirs(self.exp_name) - save_config_py(self.cfg.policy, os.path.join(self.exp_name, 'policy_config.py')) + save_config_py(self.cfg, os.path.join(self.exp_name, 'policy_config.py')) model = QAC(**self.cfg.policy.model) self.buffer_ = DequeBuffer(size=self.cfg.policy.other.replay_buffer.replay_buffer_size) self.policy = TD3Policy(self.cfg.policy, model=model) @@ -172,23 +172,26 @@ def collect_data( def batch_evaluate( self, env_num: int = 4, - ckpt_path: Optional[str] = None, n_evaluator_episode: int = 4, context: Optional[str] = None, - debug: bool = False + debug: bool = False, + render: bool = False, + replay_video_path: str = None, ) -> None: if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy env = self._setup_env_manager(env_num, context, debug) - if ckpt_path is None: - ckpt_path = os.path.join(self.exp_name, 'ckpt/eval.pth.tar') - state_dict = torch.load(ckpt_path, map_location='cpu') - self.policy.load_state_dict(state_dict) + + if replay_video_path is not None: + env.enable_save_replay(replay_path=replay_video_path) + + evaluate_cfg=self.cfg + evaluate_cfg.env.n_evaluator_episode=n_evaluator_episode # main execution task with task.start(ctx=OnlineRLContext()): - task.use(interaction_evaluator_ttorch(self.seed, self.policy, env, n_evaluator_episode)) + task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, env, render=render)) task.run(max_step=1) def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: bool = False) -> BaseEnvManagerV2: diff --git a/ding/policy/ddpg.py b/ding/policy/ddpg.py index ebbf885621..9823c02f62 100644 --- a/ding/policy/ddpg.py +++ b/ding/policy/ddpg.py @@ -458,3 +458,22 @@ def _monitor_vars_learn(self) -> List[str]: if self._twin_critic: ret += ['critic_twin_loss'] return ret + + def state_dict(self) -> Dict[str, Any]: + state_dict = { + 'model': self._model.state_dict(), + 'target_model': self._target_model.state_dict(), + } + if 'learn' in self._enable_field: + state_dict['optimizer_actor'] = self._optimizer_actor.state_dict() + state_dict['optimizer_critic'] = self._optimizer_critic.state_dict() + return state_dict + + + def load_state_dict(self, state_dict: Dict[str, Any]) -> None: + self._model.load_state_dict(state_dict['model']) + self._target_model.load_state_dict(state_dict['target_model']) + if 'learn' in self._enable_field: + self._optimizer_actor.load_state_dict(state_dict['optimizer_actor']) + self._optimizer_critic.load_state_dict(state_dict['optimizer_critic']) + From e6bd0c5e92fa27f787d16fd3d14cdb2354bf2f47 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 3 Mar 2023 11:46:30 +0000 Subject: [PATCH 030/244] fix base env manager readyimage --- ding/bonus/td3.py | 2 ++ ding/envs/env_manager/base_env_manager.py | 12 ++++++++++++ 2 files changed, 14 insertions(+) diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 637d48adcb..4659fe0370 100644 --- 
a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -51,6 +51,8 @@ def __init__( self.exp_name = 'default_experiment' cfg.exp_name = self.exp_name self.cfg = compile_config(cfg, policy=TD3Policy) + if self.cfg.exp_name!=self.exp_name: + self.exp_name=self.cfg.exp_name elif isinstance(env, BaseEnv): self.cfg = compile_config(cfg, policy=TD3Policy) raise NotImplementedError diff --git a/ding/envs/env_manager/base_env_manager.py b/ding/envs/env_manager/base_env_manager.py index 6dec958194..59cc4e2d67 100644 --- a/ding/envs/env_manager/base_env_manager.py +++ b/ding/envs/env_manager/base_env_manager.py @@ -479,6 +479,18 @@ def ready_obs(self) -> tnp.array: obs = [tnp.array(o) for o in obs] return tnp.stack(obs) + @property + def ready_imgs(self, render_mode: Optional[str] = 'rgb_array') -> Dict[int, Any]: + """ + Overview: + Get the next ready renderd frame and corresponding env id. + Return: + - ready_imgs (:obj:`Dict[int, np.ndarray]:`): Dict with env_id keys and rendered frames. + """ + from ding.utils import render + assert render_mode in ['rgb_array', 'depth_array'] + return {i: render(self._envs[i], render_mode) for i in range(self.ready_obs.shape[0])} + def step(self, actions: List[tnp.ndarray]) -> List[tnp.ndarray]: """ Overview: From cdb9928f65e4112c6d0e722b74bedcd05b048331 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 3 Mar 2023 12:48:21 +0000 Subject: [PATCH 031/244] polish code --- ding/bonus/ppof.py | 4 +++- ding/policy/ppof.py | 5 +---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py index f912881d5d..77315ae8ab 100644 --- a/ding/bonus/ppof.py +++ b/ding/bonus/ppof.py @@ -1,3 +1,4 @@ +from dataclasses import dataclass from typing import Optional, Union from ditk import logging from easydict import EasyDict @@ -14,6 +15,7 @@ from .model import PPOFModel from .config import get_instance_config, get_instance_env, get_hybrid_shape +@dataclass class TrainingReturn: wandb_url:str @@ -47,7 +49,7 @@ def __init__( self.env = get_instance_env(env) if cfg is None: # 'It should be default env tuned config' - self.cfg = get_instance_config(env) + self.cfg = get_instance_config(env,algorithm="PPO") else: self.cfg = cfg elif isinstance(env, BaseEnv): diff --git a/ding/policy/ppof.py b/ding/policy/ppof.py index d9f8091720..baa402626a 100644 --- a/ding/policy/ppof.py +++ b/ding/policy/ppof.py @@ -62,10 +62,7 @@ def __init__(self, cfg: "EasyDict", model: torch.nn.Module, enable_mode: List[st self._model = self.default_model() else: self._model = model - if hasattr(self._cfg,"cuda") and self._cfg.cuda and torch.cuda.is_available(): - self._device = 'cuda' - self._model.cuda() - elif not hasattr(self._cfg,"cuda") and torch.cuda.is_available(): + if self._cfg.cuda and torch.cuda.is_available(): self._device = 'cuda' self._model.cuda() else: From 3015a921ac339185fcf8cbbbeb772722c9f48ec2 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 3 Mar 2023 14:12:07 +0000 Subject: [PATCH 032/244] remove NoReturn --- ding/config/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ding/config/config.py b/ding/config/config.py index 690101e61e..8a3a485504 100644 --- a/ding/config/config.py +++ b/ding/config/config.py @@ -131,7 +131,7 @@ def read_config_yaml(path: str) -> EasyDict: return EasyDict(config_) -def save_config_yaml(config_: dict, path: str) -> NoReturn: +def save_config_yaml(config_: dict, path: str) -> None: """ Overview: save configuration to path @@ -144,7 +144,7 @@ def save_config_yaml(config_: dict, path: str) -> 
NoReturn: yaml.safe_dump(json.loads(config_string), f) -def save_config_py(config_: dict, path: str): # -> NoReturn: +def save_config_py(config_: dict, path: str) -> None: """ Overview: save configuration to python file From 6e7041b1e8f23494d19499cc245588a345c71013 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 3 Mar 2023 14:47:31 +0000 Subject: [PATCH 033/244] remove NoReturn --- ding/config/config.py | 4 ++-- ding/config/utils.py | 4 ++-- ding/worker/replay_buffer/advanced_buffer.py | 4 ++-- dizoo/metadrive/env/drive_utils.py | 2 +- dizoo/metadrive/env/drive_wrapper.py | 3 +-- 5 files changed, 8 insertions(+), 9 deletions(-) diff --git a/ding/config/config.py b/ding/config/config.py index 8a3a485504..c3b447672b 100644 --- a/ding/config/config.py +++ b/ding/config/config.py @@ -9,7 +9,7 @@ import subprocess import datetime from importlib import import_module -from typing import Optional, Tuple, NoReturn +from typing import Optional, Tuple from easydict import EasyDict from copy import deepcopy @@ -218,7 +218,7 @@ def read_config_with_system(path: str) -> Tuple[dict, dict, dict]: raise KeyError("invalid config file suffix: {}".format(suffix)) -def save_config(config_: dict, path: str, type_: str = 'py', save_formatted: bool = False) -> NoReturn: +def save_config(config_: dict, path: str, type_: str = 'py', save_formatted: bool = False) -> None: """ Overview: save configuration to python file or yaml file diff --git a/ding/config/utils.py b/ding/config/utils.py index 8b11028034..5a9a2d6664 100644 --- a/ding/config/utils.py +++ b/ding/config/utils.py @@ -1,4 +1,4 @@ -from typing import Optional, List, NoReturn +from typing import Optional, List import copy from easydict import EasyDict @@ -240,7 +240,7 @@ def parallel_transform_k8s( return cfg -def save_config_formatted(config_: dict, path: str = 'formatted_total_config.py') -> NoReturn: +def save_config_formatted(config_: dict, path: str = 'formatted_total_config.py') -> None: """ Overview: save formatted configuration to python file that can be read by serial_pipeline directly. 
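# The NoReturn -> None substitutions in this patch follow standard `typing` semantics: NoReturn is
# reserved for functions that never hand control back to the caller (every path raises or exits),
# while a function that completes and returns nothing should be annotated -> None. A tiny
# self-contained sketch of the distinction; the function names are illustrative only and not part
# of the codebase:
from typing import NoReturn

def abort(msg: str) -> NoReturn:
    # never returns normally: always raises
    raise RuntimeError(msg)

def save_stub(config_: dict, path: str) -> None:
    # completes and returns nothing, so None (not NoReturn) is the correct annotation
    print('would save {} keys to {}'.format(len(config_), path))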
diff --git a/ding/worker/replay_buffer/advanced_buffer.py b/ding/worker/replay_buffer/advanced_buffer.py index 3a59ef21fe..0d917d07fc 100644 --- a/ding/worker/replay_buffer/advanced_buffer.py +++ b/ding/worker/replay_buffer/advanced_buffer.py @@ -1,7 +1,7 @@ import os import copy import time -from typing import Union, NoReturn, Any, Optional, List, Dict, Tuple +from typing import Union, Any, Optional, List, Dict, Tuple import numpy as np import hickle @@ -721,7 +721,7 @@ def beta(self) -> float: return self._beta @beta.setter - def beta(self, beta: float) -> NoReturn: + def beta(self, beta: float) -> None: self._beta = beta def state_dict(self) -> dict: diff --git a/dizoo/metadrive/env/drive_utils.py b/dizoo/metadrive/env/drive_utils.py index 99415c9b93..2009e5a52d 100644 --- a/dizoo/metadrive/env/drive_utils.py +++ b/dizoo/metadrive/env/drive_utils.py @@ -1,4 +1,4 @@ -from typing import NoReturn, Optional, List +from typing import Optional, List from gym import utils from abc import ABC, abstractmethod from typing import Any, Dict, Optional diff --git a/dizoo/metadrive/env/drive_wrapper.py b/dizoo/metadrive/env/drive_wrapper.py index 5b48826ab8..c4ec7dd0a4 100644 --- a/dizoo/metadrive/env/drive_wrapper.py +++ b/dizoo/metadrive/env/drive_wrapper.py @@ -1,7 +1,6 @@ from typing import Any, Dict, Optional from easydict import EasyDict -from itertools import product -from typing import NoReturn, Optional, List +from typing import Optional import matplotlib.pyplot as plt import gym import copy From fe415b2311efbbfe0765ab324561ef105ff35a21 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 7 Mar 2023 15:57:19 +0800 Subject: [PATCH 034/244] format code --- ding/bonus/config.py | 106 +++++++++--------- ding/bonus/ppof.py | 34 ++++-- ding/bonus/td3.py | 58 +++++++--- ding/envs/env/ding_env_wrapper.py | 4 +- .../framework/middleware/functional/logger.py | 6 +- ding/policy/ddpg.py | 2 - ding/policy/td3.py | 10 +- 7 files changed, 127 insertions(+), 93 deletions(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 0e18d9895d..c221707b43 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -83,60 +83,62 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: elif algorithm == 'TD3': cfg = TD3Policy.default_config() if env == 'hopper': - cfg.update(dict( - exp_name='hopper_td3', - seed=0, - env=dict( - env_id='Hopper-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=6000, - ), - policy=dict( - cuda=True, - random_collect_size=25000, - model=dict( - obs_shape=11, - action_shape=3, - twin_critic=True, - actor_head_hidden_size=256, - critic_head_hidden_size=256, - action_space='regression', + cfg.update( + dict( + exp_name='hopper_td3', + seed=0, + env=dict( + env_id='Hopper-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=6000, ), - logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - learn=dict( - update_per_collect=1, - batch_size=256, - learning_rate_actor=1e-3, - learning_rate_critic=1e-3, - ignore_done=False, - target_theta=0.005, - discount_factor=0.99, - actor_update_freq=2, - noise=True, - noise_sigma=0.2, - noise_range=dict( - min=-0.5, - max=0.5, + policy=dict( + cuda=True, + random_collect_size=25000, + model=dict( + obs_shape=11, + action_shape=3, + 
twin_critic=True, + actor_head_hidden_size=256, + critic_head_hidden_size=256, + action_space='regression', ), - ), - collect=dict( - n_sample=1, - unroll_len=1, - noise_sigma=0.1, - ), - other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), + logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + learn=dict( + update_per_collect=1, + batch_size=256, + learning_rate_actor=1e-3, + learning_rate_critic=1e-3, + ignore_done=False, + target_theta=0.005, + discount_factor=0.99, + actor_update_freq=2, + noise=True, + noise_sigma=0.2, + noise_range=dict( + min=-0.5, + max=0.5, + ), + ), + collect=dict( + n_sample=1, + unroll_len=1, + noise_sigma=0.1, + ), + other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), + ) ) - )) + ) else: raise KeyError("not supported env type: {}".format(env)) else: @@ -215,7 +217,7 @@ def get_instance_env(env: str) -> BaseEnv: }, seed_api=False, ) - elif env== 'hopper': + elif env == 'hopper': from dizoo.mujoco.envs import MujocoEnv cfg = EasyDict( diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py index 77315ae8ab..fe8fecb9e7 100644 --- a/ding/bonus/ppof.py +++ b/ding/bonus/ppof.py @@ -15,9 +15,11 @@ from .model import PPOFModel from .config import get_instance_config, get_instance_env, get_hybrid_shape + @dataclass class TrainingReturn: - wandb_url:str + wandb_url: str + class PPOF: supported_env_list = [ @@ -49,7 +51,7 @@ def __init__( self.env = get_instance_env(env) if cfg is None: # 'It should be default env tuned config' - self.cfg = get_instance_config(env,algorithm="PPO") + self.cfg = get_instance_config(env, algorithm="PPO") else: self.cfg = cfg elif isinstance(env, BaseEnv): @@ -78,11 +80,10 @@ def __init__( ) self.policy = PPOFPolicy(self.cfg, model=model) - def load_policy(self,policy_state_dict, config): + def load_policy(self, policy_state_dict, config): self.policy.load_state_dict(policy_state_dict) self.policy._cfg = config - def train( self, step: int = int(1e7), @@ -99,7 +100,7 @@ def train( # define env and policy collector_env = self._setup_env_manager(collector_env_num, context, debug) evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug) - wandb_url_return=[] + wandb_url_return = [] with task.start(ctx=OnlineRLContext()): task.use(interaction_evaluator_ttorch(self.seed, self.policy, evaluator_env)) @@ -107,10 +108,18 @@ def train( task.use(ppof_adv_estimator(self.policy)) task.use(multistep_trainer(self.policy, log_freq=n_iter_log_show)) task.use(CkptSaver(self.policy, save_dir=self.exp_name, train_freq=n_iter_save_ckpt)) - task.use(wandb_online_logger(self.exp_name, metric_list=self.policy.monitor_vars(), anonymous=True, project_name=self.exp_name, wandb_url_return=wandb_url_return)) + task.use( + wandb_online_logger( + self.exp_name, + metric_list=self.policy.monitor_vars(), + anonymous=True, + project_name=self.exp_name, + wandb_url_return=wandb_url_return + ) + ) task.use(termination_checker(max_env_step=step)) task.run() - + return TrainingReturn(wandb_url=wandb_url_return[0]) def deploy(self, ckpt_path: str = None, enable_save_replay: bool = False, debug: bool = False) -> None: @@ -188,7 +197,16 @@ def batch_evaluate( # main execution task with task.start(ctx=OnlineRLContext()): - task.use(interaction_evaluator_ttorch(self.seed, self.policy, env, n_evaluator_episode, render=render, replay_video_path=replay_video_path)) + task.use( + interaction_evaluator_ttorch( + self.seed, + self.policy, + env, + n_evaluator_episode, + 
render=render, + replay_video_path=replay_video_path + ) + ) task.run(max_step=1) def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: bool = False) -> BaseEnvManagerV2: diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 4659fe0370..cd5bb06ace 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -17,15 +17,17 @@ from ding.data import DequeBuffer from ding.bonus.config import get_instance_config, get_instance_env, get_hybrid_shape + @dataclass class TrainingReturn: - wandb_url:str + wandb_url: str + class TD3: supported_env_list = [ 'hopper', ] - algorithm='TD3' + algorithm = 'TD3' def __init__( self, @@ -40,8 +42,8 @@ def __init__( if cfg is None: # 'It should be default env tuned config' cfg = get_instance_config(env, algorithm=TD3.algorithm) - elif not isinstance(cfg,EasyDict): - cfg=EasyDict(cfg) + elif not isinstance(cfg, EasyDict): + cfg = EasyDict(cfg) if exp_name is not None: self.exp_name = exp_name cfg.exp_name = exp_name @@ -51,8 +53,8 @@ def __init__( self.exp_name = 'default_experiment' cfg.exp_name = self.exp_name self.cfg = compile_config(cfg, policy=TD3Policy) - if self.cfg.exp_name!=self.exp_name: - self.exp_name=self.cfg.exp_name + if self.cfg.exp_name != self.exp_name: + self.exp_name = self.cfg.exp_name elif isinstance(env, BaseEnv): self.cfg = compile_config(cfg, policy=TD3Policy) raise NotImplementedError @@ -68,11 +70,10 @@ def __init__( self.buffer_ = DequeBuffer(size=self.cfg.policy.other.replay_buffer.replay_buffer_size) self.policy = TD3Policy(self.cfg.policy, model=model) - def load_policy(self,policy_state_dict, config): + def load_policy(self, policy_state_dict, config): self.policy.load_state_dict(policy_state_dict) self.policy._cfg = config - def train( self, step: int = int(1e7), @@ -89,21 +90,42 @@ def train( # define env and policy collector_env = self._setup_env_manager(collector_env_num, context, debug) evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug) - wandb_url_return=[] + wandb_url_return = [] - self.cfg.policy.logger.record_path = os.path.join(self.exp_name,'video') + self.cfg.policy.logger.record_path = os.path.join(self.exp_name, 'video') evaluator_env.enable_save_replay(replay_path=self.cfg.policy.logger.record_path) with task.start(ctx=OnlineRLContext()): - task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env,render=True)) + task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env, render=True)) task.use( - StepCollector(self.cfg, self.policy.collect_mode, collector_env, random_collect_size=self.cfg.policy.random_collect_size) + StepCollector( + self.cfg, + self.policy.collect_mode, + collector_env, + random_collect_size=self.cfg.policy.random_collect_size + ) ) task.use(data_pusher(self.cfg, self.buffer_)) task.use(multistep_trainer(self.policy, log_freq=n_iter_log_show)) task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_)) - task.use(CkptSaver(policy=self.policy,save_dir=os.path.join(self.cfg["exp_name"],"model"), train_freq=n_iter_save_ckpt)) - task.use(wandb_online_logger(record_path=self.cfg.policy.logger.record_path, cfg=self.cfg.policy.logger, metric_list=self.policy.monitor_vars(), model=self.policy._model, anonymous=True, project_name=self.exp_name, wandb_url_return=wandb_url_return)) + task.use( + CkptSaver( + policy=self.policy, + save_dir=os.path.join(self.cfg["exp_name"], "model"), + train_freq=n_iter_save_ckpt + ) + ) + task.use( + wandb_online_logger( + record_path=self.cfg.policy.logger.record_path, 
+ cfg=self.cfg.policy.logger, + metric_list=self.policy.monitor_vars(), + model=self.policy._model, + anonymous=True, + project_name=self.exp_name, + wandb_url_return=wandb_url_return + ) + ) task.use(termination_checker(max_env_step=step)) task.use(final_ctx_saver(name=self.cfg["exp_name"])) task.run() @@ -163,7 +185,9 @@ def collect_data( # main execution task with task.start(ctx=OnlineRLContext()): task.use( - StepCollector(self.cfg, self.policy.collect_mode, env, random_collect_size=self.cfg.policy.random_collect_size) + StepCollector( + self.cfg, self.policy.collect_mode, env, random_collect_size=self.cfg.policy.random_collect_size + ) ) task.use(offline_data_saver(save_data_path, data_type='hdf5')) task.run(max_step=1) @@ -188,8 +212,8 @@ def batch_evaluate( if replay_video_path is not None: env.enable_save_replay(replay_path=replay_video_path) - evaluate_cfg=self.cfg - evaluate_cfg.env.n_evaluator_episode=n_evaluator_episode + evaluate_cfg = self.cfg + evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode # main execution task with task.start(ctx=OnlineRLContext()): diff --git a/ding/envs/env/ding_env_wrapper.py b/ding/envs/env/ding_env_wrapper.py index 233becebe4..479fd8048d 100644 --- a/ding/envs/env/ding_env_wrapper.py +++ b/ding/envs/env/ding_env_wrapper.py @@ -187,8 +187,8 @@ def enable_save_replay(self, replay_path: Optional[str] = None) -> None: @property def observation_space(self) -> gym.spaces.Space: - if self._observation_space.dtype==np.float64: - self._observation_space.dtype=np.float32 + if self._observation_space.dtype == np.float64: + self._observation_space.dtype = np.float32 return self._observation_space @property diff --git a/ding/framework/middleware/functional/logger.py b/ding/framework/middleware/functional/logger.py index c791aa14f4..c4df27bac6 100644 --- a/ding/framework/middleware/functional/logger.py +++ b/ding/framework/middleware/functional/logger.py @@ -127,7 +127,7 @@ def wandb_online_logger( env: Optional[BaseEnvManagerV2] = None, model: Optional[torch.nn.Module] = None, anonymous: bool = False, - project_name:str = 'default-project', + project_name: str = 'default-project', wandb_url_return: List = [], ) -> Callable: ''' @@ -153,10 +153,10 @@ def wandb_online_logger( # Initialize wandb with default settings # Settings can be covered by calling wandb.init() at the top of the script if anonymous: - wandb.init(project=project_name,reinit=True,anonymous="must") + wandb.init(project=project_name, reinit=True, anonymous="must") wandb_url_return.append(wandb.run.get_project_url()) else: - wandb.init(project=project_name,reinit=True) + wandb.init(project=project_name, reinit=True) wandb_url_return.append(wandb.run.get_project_url()) if cfg == 'default': cfg = EasyDict( diff --git a/ding/policy/ddpg.py b/ding/policy/ddpg.py index 9823c02f62..440a349476 100644 --- a/ding/policy/ddpg.py +++ b/ding/policy/ddpg.py @@ -469,11 +469,9 @@ def state_dict(self) -> Dict[str, Any]: state_dict['optimizer_critic'] = self._optimizer_critic.state_dict() return state_dict - def load_state_dict(self, state_dict: Dict[str, Any]) -> None: self._model.load_state_dict(state_dict['model']) self._target_model.load_state_dict(state_dict['target_model']) if 'learn' in self._enable_field: self._optimizer_actor.load_state_dict(state_dict['optimizer_actor']) self._optimizer_critic.load_state_dict(state_dict['optimizer_critic']) - diff --git a/ding/policy/td3.py b/ding/policy/td3.py index e883dfe8a9..528e27a9a6 100644 --- a/ding/policy/td3.py +++ b/ding/policy/td3.py @@ -156,13 
+156,5 @@ class from DDPG class by changing ``_actor_update_freq``, ``_twin_critic`` and n ) def monitor_vars(self) -> List[str]: - variables = [ - "q_value", - "target q_value", - "loss", - "lr", - "entropy", - "target_q_value", - "td_error" - ] + variables = ["q_value", "target q_value", "loss", "lr", "entropy", "target_q_value", "td_error"] return variables From 343275407308c4084db7bec57179a8f06978d353 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 7 Mar 2023 16:27:46 +0800 Subject: [PATCH 035/244] format code --- ding/bonus/ppof.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py index 6aa1090a4a..4d691b151c 100644 --- a/ding/bonus/ppof.py +++ b/ding/bonus/ppof.py @@ -105,7 +105,7 @@ def train( # define env and policy collector_env = self._setup_env_manager(collector_env_num, context, debug, 'collector') evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug, 'evaluator') - wandb_url_return=[] + wandb_url_return = [] with task.start(ctx=OnlineRLContext()): task.use(interaction_evaluator_ttorch(self.seed, self.policy, evaluator_env)) task.use(PPOFStepCollector(self.seed, self.policy, collector_env, self.cfg.n_sample)) From 3f6ef3d0f3a5b12d625920cfe5fcc426d56ed22d Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 7 Mar 2023 16:56:26 +0800 Subject: [PATCH 036/244] polish code --- ding/framework/middleware/functional/collector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ding/framework/middleware/functional/collector.py b/ding/framework/middleware/functional/collector.py index dfc94107be..b570ffc200 100644 --- a/ding/framework/middleware/functional/collector.py +++ b/ding/framework/middleware/functional/collector.py @@ -78,7 +78,7 @@ def _inference(ctx: "OnlineRLContext"): obs = {i: obs[i] for i in range(get_shape0(obs))} # TBD inference_output = policy.forward(obs, **ctx.collect_kwargs) - ctx.action = np.array([to_ndarray(v['action']) for v in inference_output.values()]) # TBD + ctx.action = [to_ndarray(v['action']) for v in inference_output.values()] # TBD ctx.inference_output = inference_output return _inference From 535fd777ade1d29a3f9ab47548c7a60c61ded187 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 7 Mar 2023 17:03:11 +0800 Subject: [PATCH 037/244] polish code --- ding/policy/ddpg.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ding/policy/ddpg.py b/ding/policy/ddpg.py index 440a349476..ffe8d298da 100644 --- a/ding/policy/ddpg.py +++ b/ding/policy/ddpg.py @@ -255,9 +255,9 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: if self._twin_critic: # TD3: two critic networks target_q_value = torch.min(target_q_value[0], target_q_value[1]) # find min one as target q value - q_value_dict['q_value'] = q_value[0].mean() - q_value_dict['q_value_twin'] = q_value[1].mean() - target_q_value_dict['target q_value'] = target_q_value.mean() + q_value_dict['q_value'] = q_value[0].mean().data.item() + q_value_dict['q_value_twin'] = q_value[1].mean().data.item() + target_q_value_dict['target q_value'] = target_q_value.mean().data.item() # critic network1 td_data = v_1step_td_data(q_value[0], target_q_value, reward, data['done'], data['weight']) critic_loss, td_error_per_sample1 = v_1step_td_error(td_data, self._gamma) @@ -269,8 +269,8 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: td_error_per_sample = (td_error_per_sample1 + td_error_per_sample2) / 2 else: # DDPG: single critic network - q_value_dict['q_value'] = q_value.mean() - 
target_q_value_dict['target q_value'] = target_q_value.mean() + q_value_dict['q_value'] = q_value.mean().data.item() + target_q_value_dict['target q_value'] = target_q_value.mean().data.item() td_data = v_1step_td_data(q_value, target_q_value, reward, data['done'], data['weight']) critic_loss, td_error_per_sample = v_1step_td_error(td_data, self._gamma) loss_dict['critic_loss'] = critic_loss From 427161094047ec2e52d3f8316494a82c0ce3525a Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 7 Mar 2023 19:09:18 +0800 Subject: [PATCH 038/244] fix logger --- ding/framework/middleware/tests/test_logger.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ding/framework/middleware/tests/test_logger.py b/ding/framework/middleware/tests/test_logger.py index 7513c1a437..b9514c92ad 100644 --- a/ding/framework/middleware/tests/test_logger.py +++ b/ding/framework/middleware/tests/test_logger.py @@ -209,12 +209,13 @@ def test_wandb_online_logger(): model = TheModelClass() wandb.init(config=cfg, anonymous="must") - def mock_metric_logger(metric_dict): + def mock_metric_logger(metric_dict, step): metric_list = [ "q_value", "target q_value", "loss", "lr", "entropy", "reward", "q value", "video", "q value distribution", - "train iter" + "train iter", "episode return mean", "env step", "action", "actions_of_trajectory_0", + "actions_of_trajectory_1", "actions_of_trajectory_2", "actions_of_trajectory_3", "return distribution", ] - assert set(metric_dict.keys()) < set(metric_list) + assert set(metric_dict.keys()) <= set(metric_list) def mock_gradient_logger(input_model): assert input_model == model From ba0979b4a0f495f1bd07dc7e0e08cf368f09b682 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 7 Mar 2023 19:10:45 +0800 Subject: [PATCH 039/244] format code --- .../framework/middleware/tests/test_logger.py | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/ding/framework/middleware/tests/test_logger.py b/ding/framework/middleware/tests/test_logger.py index b9514c92ad..9a6aa9bc6f 100644 --- a/ding/framework/middleware/tests/test_logger.py +++ b/ding/framework/middleware/tests/test_logger.py @@ -211,9 +211,24 @@ def test_wandb_online_logger(): def mock_metric_logger(metric_dict, step): metric_list = [ - "q_value", "target q_value", "loss", "lr", "entropy", "reward", "q value", "video", "q value distribution", - "train iter", "episode return mean", "env step", "action", "actions_of_trajectory_0", - "actions_of_trajectory_1", "actions_of_trajectory_2", "actions_of_trajectory_3", "return distribution", + "q_value", + "target q_value", + "loss", + "lr", + "entropy", + "reward", + "q value", + "video", + "q value distribution", + "train iter", + "episode return mean", + "env step", + "action", + "actions_of_trajectory_0", + "actions_of_trajectory_1", + "actions_of_trajectory_2", + "actions_of_trajectory_3", + "return distribution", ] assert set(metric_dict.keys()) <= set(metric_list) From 82826e2c1631757977386503caa6d7007dd882ad Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 7 Mar 2023 20:54:54 +0800 Subject: [PATCH 040/244] format code --- ding/bonus/td3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index cd5bb06ace..aaf1181791 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -8,7 +8,7 @@ from ding.framework import task, OnlineRLContext from ding.framework.middleware import interaction_evaluator_ttorch, CkptSaver, multistep_trainer, \ wandb_online_logger, offline_data_saver, 
termination_checker, interaction_evaluator, StepCollector, data_pusher, \ - OffPolicyLearner, final_ctx_saver + OffPolicyLearner, final_ctx_saver from ding.envs import BaseEnv, BaseEnvManagerV2, SubprocessEnvManagerV2 from ding.policy import TD3Policy, single_env_forward_wrapper_ttorch from ding.utils import set_pkg_seed From 5340658bf5c169bedd51acc705daacbe822e6ddc Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 10 Mar 2023 11:55:11 +0000 Subject: [PATCH 041/244] change api for ckpt; polish code --- ding/bonus/__init__.py | 2 +- ding/bonus/config.py | 19 ------- ding/bonus/td3.py | 51 +++++++++---------- ding/framework/context.py | 8 ++- .../middleware/functional/collector.py | 2 - .../middleware/functional/evaluator.py | 4 +- .../framework/middleware/functional/logger.py | 10 ++-- ding/policy/td3.py | 3 +- 8 files changed, 42 insertions(+), 57 deletions(-) diff --git a/ding/bonus/__init__.py b/ding/bonus/__init__.py index c689e51fba..54841678ea 100644 --- a/ding/bonus/__init__.py +++ b/ding/bonus/__init__.py @@ -1,2 +1,2 @@ from .ppof import PPOF -from .td3 import TD3 +from .td3 import TD3OffPolicyAgent diff --git a/ding/bonus/config.py b/ding/bonus/config.py index d95f45e569..3c06a7b0c5 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -113,7 +113,6 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: model=dict( obs_shape=11, action_shape=3, - twin_critic=True, actor_head_hidden_size=256, critic_head_hidden_size=256, action_space='regression', @@ -125,26 +124,8 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: action_logger=True, return_logger=False ), - learn=dict( - update_per_collect=1, - batch_size=256, - learning_rate_actor=1e-3, - learning_rate_critic=1e-3, - ignore_done=False, - target_theta=0.005, - discount_factor=0.99, - actor_update_freq=2, - noise=True, - noise_sigma=0.2, - noise_range=dict( - min=-0.5, - max=0.5, - ), - ), collect=dict( n_sample=1, - unroll_len=1, - noise_sigma=0.1, ), other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), ) diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index aaf1181791..0af6801fd3 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -12,7 +12,7 @@ from ding.envs import BaseEnv, BaseEnvManagerV2, SubprocessEnvManagerV2 from ding.policy import TD3Policy, single_env_forward_wrapper_ttorch from ding.utils import set_pkg_seed -from ding.config import save_config_py, compile_config +from ding.config import Config, save_config_py, compile_config from ding.model import QAC from ding.data import DequeBuffer from ding.bonus.config import get_instance_config, get_instance_env, get_hybrid_shape @@ -23,7 +23,7 @@ class TrainingReturn: wandb_url: str -class TD3: +class TD3OffPolicyAgent: supported_env_list = [ 'hopper', ] @@ -34,16 +34,22 @@ def __init__( env: Union[str, BaseEnv], seed: int = 0, exp_name: str = None, - cfg: Optional[EasyDict] = None + cfg: Optional[Union[EasyDict, dict, str]] = None, + ckpt_path: str = None, ) -> None: if isinstance(env, str): - assert env in TD3.supported_env_list, "Please use supported envs: {}".format(TD3.supported_env_list) + assert env in TD3OffPolicyAgent.supported_env_list, "Please use supported envs: {}".format(TD3OffPolicyAgent.supported_env_list) self.env = get_instance_env(env) if cfg is None: # 'It should be default env tuned config' - cfg = get_instance_config(env, algorithm=TD3.algorithm) - elif not isinstance(cfg, EasyDict): + cfg = get_instance_config(env, algorithm=TD3OffPolicyAgent.algorithm) + elif isinstance(cfg, EasyDict): + pass + 
elif isinstance(cfg, dict): cfg = EasyDict(cfg) + elif isinstance(cfg, str): + cfg = EasyDict(Config.file_to_dict(cfg)) + if exp_name is not None: self.exp_name = exp_name cfg.exp_name = exp_name @@ -69,10 +75,13 @@ def __init__( model = QAC(**self.cfg.policy.model) self.buffer_ = DequeBuffer(size=self.cfg.policy.other.replay_buffer.replay_buffer_size) self.policy = TD3Policy(self.cfg.policy, model=model) + if ckpt_path is not None: + self.policy.load_state_dict(torch.load(ckpt_path)) def load_policy(self, policy_state_dict, config): self.policy.load_state_dict(policy_state_dict) self.policy._cfg = config + self.cfg = config def train( self, @@ -82,6 +91,7 @@ def train( n_iter_log_show: int = 500, n_iter_save_ckpt: int = 1000, context: Optional[str] = None, + render: bool = False, debug: bool = False ) -> dict: if debug: @@ -92,11 +102,12 @@ def train( evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug) wandb_url_return = [] - self.cfg.policy.logger.record_path = os.path.join(self.exp_name, 'video') - evaluator_env.enable_save_replay(replay_path=self.cfg.policy.logger.record_path) + if render: + self.cfg.policy.logger.record_path = os.path.join(self.exp_name, 'video') + evaluator_env.enable_save_replay(replay_path=self.cfg.policy.logger.record_path) with task.start(ctx=OnlineRLContext()): - task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env, render=True)) + task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env, render=render)) task.use( StepCollector( self.cfg, @@ -132,7 +143,7 @@ def train( return TrainingReturn(wandb_url_return[0]) - def deploy(self, ckpt_path: str = None, enable_save_replay: bool = False, debug: bool = False) -> None: + def deploy(self, enable_save_replay: bool = False, debug: bool = False) -> None: if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -140,10 +151,7 @@ def deploy(self, ckpt_path: str = None, enable_save_replay: bool = False, debug: env.seed(self.seed, dynamic_seed=False) if enable_save_replay: env.enable_save_replay(replay_path=os.path.join(self.exp_name, 'videos')) - if ckpt_path is None: - ckpt_path = os.path.join(self.exp_name, 'ckpt/eval.pth.tar') - state_dict = torch.load(ckpt_path, map_location='cpu') - self.policy.load_state_dict(state_dict) + forward_fn = single_env_forward_wrapper_ttorch(self.policy.eval) # main loop @@ -162,7 +170,6 @@ def deploy(self, ckpt_path: str = None, enable_save_replay: bool = False, debug: def collect_data( self, env_num: int = 8, - ckpt_path: Optional[str] = None, save_data_path: Optional[str] = None, n_sample: Optional[int] = None, n_episode: Optional[int] = None, @@ -175,12 +182,9 @@ def collect_data( raise NotImplementedError # define env and policy env = self._setup_env_manager(env_num, context, debug) - if ckpt_path is None: - ckpt_path = os.path.join(self.exp_name, 'ckpt/eval.pth.tar') + if save_data_path is None: save_data_path = os.path.join(self.exp_name, 'demo_data') - state_dict = torch.load(ckpt_path, map_location='cpu') - self.policy.load_state_dict(state_dict) # main execution task with task.start(ctx=OnlineRLContext()): @@ -200,24 +204,19 @@ def batch_evaluate( env_num: int = 4, n_evaluator_episode: int = 4, context: Optional[str] = None, - debug: bool = False, - render: bool = False, - replay_video_path: str = None, + debug: bool = False ) -> None: if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy env = self._setup_env_manager(env_num, context, debug) - if replay_video_path 
is not None: - env.enable_save_replay(replay_path=replay_video_path) - evaluate_cfg = self.cfg evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode # main execution task with task.start(ctx=OnlineRLContext()): - task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, env, render=render)) + task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, env)) task.run(max_step=1) def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: bool = False) -> BaseEnvManagerV2: diff --git a/ding/framework/context.py b/ding/framework/context.py index c5a23251dc..886d4933f6 100644 --- a/ding/framework/context.py +++ b/ding/framework/context.py @@ -67,12 +67,14 @@ class OnlineRLContext(Context): last_eval_iter: int = -1 last_eval_value: int = -np.inf eval_output: List = dataclasses.field(default_factory=dict) + # wandb + wandb_url: str = "" def __post_init__(self): # This method is called just after __init__ method. Here, concretely speaking, # this method is called just after the object initialize its fields. # We use this method here to keep the fields needed for each iteration. - self.keep('env_step', 'env_episode', 'train_iter', 'last_eval_iter', 'last_eval_value') + self.keep('env_step', 'env_episode', 'train_iter', 'last_eval_iter', 'last_eval_value', 'wandb_url') @dataclasses.dataclass @@ -88,9 +90,11 @@ class OfflineRLContext(Context): eval_value: float = -np.inf last_eval_iter: int = -1 eval_output: List = dataclasses.field(default_factory=dict) + # wandb + wandb_url: str = "" def __post_init__(self): # This method is called just after __init__ method. Here, concretely speaking, # this method is called just after the object initialize its fields. # We use this method here to keep the fields needed for each iteration. 
- self.keep('train_iter', 'last_eval_iter') + self.keep('train_iter', 'last_eval_iter', 'wandb_url') diff --git a/ding/framework/middleware/functional/collector.py b/ding/framework/middleware/functional/collector.py index dd9c9312aa..92b07a7f73 100644 --- a/ding/framework/middleware/functional/collector.py +++ b/ding/framework/middleware/functional/collector.py @@ -1,7 +1,5 @@ from typing import TYPE_CHECKING, Callable, List, Tuple, Any -from easydict import EasyDict from functools import reduce -import numpy as np import treetensor.torch as ttorch from ding.envs import BaseEnvManager from ding.policy import Policy diff --git a/ding/framework/middleware/functional/evaluator.py b/ding/framework/middleware/functional/evaluator.py index c515086133..162af3dd47 100644 --- a/ding/framework/middleware/functional/evaluator.py +++ b/ding/framework/middleware/functional/evaluator.py @@ -12,7 +12,7 @@ from ding.policy import Policy from ding.data import Dataset, DataLoader from ding.framework import task -from ding.torch_utils import tensor_to_list, to_list, to_ndarray, get_shape0 +from ding.torch_utils import to_ndarray, get_shape0 from ding.utils import lists_to_dicts @@ -257,7 +257,7 @@ def _evaluate(ctx: Union["OnlineRLContext", "OfflineRLContext"]): eval_monitor.update_video(env.ready_imgs) eval_monitor.update_output(inference_output) output = [v for v in inference_output.values()] - action = np.array([to_ndarray(v['action']) for v in output]) # TBD + action = [to_ndarray(v['action']) for v in output] # TBD timesteps = env.step(action) for timestep in timesteps: env_id = timestep.env_id.item() diff --git a/ding/framework/middleware/functional/logger.py b/ding/framework/middleware/functional/logger.py index c4df27bac6..3112b58228 100644 --- a/ding/framework/middleware/functional/logger.py +++ b/ding/framework/middleware/functional/logger.py @@ -128,7 +128,6 @@ def wandb_online_logger( model: Optional[torch.nn.Module] = None, anonymous: bool = False, project_name: str = 'default-project', - wandb_url_return: List = [], ) -> Callable: ''' Overview: @@ -154,10 +153,8 @@ def wandb_online_logger( # Settings can be covered by calling wandb.init() at the top of the script if anonymous: wandb.init(project=project_name, reinit=True, anonymous="must") - wandb_url_return.append(wandb.run.get_project_url()) else: wandb.init(project=project_name, reinit=True) - wandb_url_return.append(wandb.run.get_project_url()) if cfg == 'default': cfg = EasyDict( dict( @@ -178,8 +175,15 @@ def wandb_online_logger( one_time_warning( "If you want to use wandb to visualize the gradient, please set gradient_logger = True in the config." 
) + + first_plot=True def _plot(ctx: "OnlineRLContext"): + nonlocal first_plot + if first_plot: + first_plot=False + ctx.wandb_url=wandb.run.get_project_url() + info_for_logging = {} if cfg.plot_logger: diff --git a/ding/policy/td3.py b/ding/policy/td3.py index 528e27a9a6..90f6688e68 100644 --- a/ding/policy/td3.py +++ b/ding/policy/td3.py @@ -156,5 +156,4 @@ class from DDPG class by changing ``_actor_update_freq``, ``_twin_critic`` and n ) def monitor_vars(self) -> List[str]: - variables = ["q_value", "target q_value", "loss", "lr", "entropy", "target_q_value", "td_error"] - return variables + return ["q_value", "target q_value", "loss", "lr", "entropy", "target_q_value", "td_error"] From 2d3f6c81a098d614f35a98a94ae08152c2b5829c Mon Sep 17 00:00:00 2001 From: zjowowen Date: Mon, 13 Mar 2023 06:05:43 +0000 Subject: [PATCH 042/244] polish code --- ding/bonus/config.py | 14 ++--- ding/bonus/td3.py | 57 ++++++++++++------- .../framework/middleware/functional/logger.py | 31 ++++++---- 3 files changed, 63 insertions(+), 39 deletions(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 3c06a7b0c5..787f354b3d 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -117,18 +117,18 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: critic_head_hidden_size=256, action_space='regression', ), - logger=dict( + collect=dict( + n_sample=1, + ), + other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), + ), + wandb_logger=dict( gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), - collect=dict( - n_sample=1, - ), - other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), - ) + ), ) ) else: diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 0af6801fd3..8837389c4a 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -5,21 +5,26 @@ import os import gym import torch +import treetensor.torch as ttorch from ding.framework import task, OnlineRLContext -from ding.framework.middleware import interaction_evaluator_ttorch, CkptSaver, multistep_trainer, \ +from ding.framework.middleware import CkptSaver, multistep_trainer, \ wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, data_pusher, \ OffPolicyLearner, final_ctx_saver from ding.envs import BaseEnv, BaseEnvManagerV2, SubprocessEnvManagerV2 -from ding.policy import TD3Policy, single_env_forward_wrapper_ttorch +from ding.policy import TD3Policy from ding.utils import set_pkg_seed from ding.config import Config, save_config_py, compile_config from ding.model import QAC from ding.data import DequeBuffer -from ding.bonus.config import get_instance_config, get_instance_env, get_hybrid_shape +from ding.bonus.config import get_instance_config, get_instance_env @dataclass class TrainingReturn: + ''' + Attributions + wandb_url: The weight & biases (wandb) project url of the trainning experiment. 
+ ''' wandb_url: str @@ -35,7 +40,7 @@ def __init__( seed: int = 0, exp_name: str = None, cfg: Optional[Union[EasyDict, dict, str]] = None, - ckpt_path: str = None, + policy_state_dict: str = None, ) -> None: if isinstance(env, str): assert env in TD3OffPolicyAgent.supported_env_list, "Please use supported envs: {}".format(TD3OffPolicyAgent.supported_env_list) @@ -75,8 +80,8 @@ def __init__( model = QAC(**self.cfg.policy.model) self.buffer_ = DequeBuffer(size=self.cfg.policy.other.replay_buffer.replay_buffer_size) self.policy = TD3Policy(self.cfg.policy, model=model) - if ckpt_path is not None: - self.policy.load_state_dict(torch.load(ckpt_path)) + if policy_state_dict is not None: + self.policy.load_state_dict(policy_state_dict) def load_policy(self, policy_state_dict, config): self.policy.load_state_dict(policy_state_dict) @@ -91,7 +96,6 @@ def train( n_iter_log_show: int = 500, n_iter_save_ckpt: int = 1000, context: Optional[str] = None, - render: bool = False, debug: bool = False ) -> dict: if debug: @@ -100,14 +104,9 @@ def train( # define env and policy collector_env = self._setup_env_manager(collector_env_num, context, debug) evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug) - wandb_url_return = [] - - if render: - self.cfg.policy.logger.record_path = os.path.join(self.exp_name, 'video') - evaluator_env.enable_save_replay(replay_path=self.cfg.policy.logger.record_path) with task.start(ctx=OnlineRLContext()): - task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env, render=render)) + task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) task.use( StepCollector( self.cfg, @@ -128,31 +127,45 @@ def train( ) task.use( wandb_online_logger( - record_path=self.cfg.policy.logger.record_path, - cfg=self.cfg.policy.logger, metric_list=self.policy.monitor_vars(), model=self.policy._model, anonymous=True, - project_name=self.exp_name, - wandb_url_return=wandb_url_return + project_name=self.exp_name ) ) task.use(termination_checker(max_env_step=step)) task.use(final_ctx_saver(name=self.cfg["exp_name"])) task.run() - return TrainingReturn(wandb_url_return[0]) + return TrainingReturn(wandb_url=task.ctx.wandb_url) - def deploy(self, enable_save_replay: bool = False, debug: bool = False) -> None: + def deploy(self, enable_save_replay: bool = False, replay_save_path:str=None, debug: bool = False) -> None: if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy env = self.env.clone() env.seed(self.seed, dynamic_seed=False) if enable_save_replay: - env.enable_save_replay(replay_path=os.path.join(self.exp_name, 'videos')) - - forward_fn = single_env_forward_wrapper_ttorch(self.policy.eval) + if replay_save_path is None: + env.enable_save_replay(replay_path=os.path.join(self.exp_name, 'videos')) + else: + env.enable_save_replay(replay_path=replay_save_path) + + def single_env_forward_wrapper(forward_fn, cuda=True): + + def _forward(obs): + # unsqueeze means add batch dim, i.e. (O, ) -> (1, O) + obs = ttorch.as_tensor(obs).unsqueeze(0) + if cuda and torch.cuda.is_available(): + obs = obs.cuda() + action = forward_fn(obs,mode='compute_actor')["action"] + # squeeze means delete batch dim, i.e. (1, A) -> (A, ) + action = action.squeeze(0).detach().cpu().numpy() + return action + + return _forward + + forward_fn = single_env_forward_wrapper(self.policy._model) # main loop return_ = 0. 
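
A minimal usage sketch of the refactored ding.bonus TD3 agent API, assuming the signatures shown in the diffs above; the experiment name and step budget below are illustrative placeholders, not values taken from the patches.

    from ding.bonus import TD3OffPolicyAgent

    # Build the agent from a supported env keyword; cfg defaults to the tuned
    # config in ding/bonus/config.py, and policy_state_dict can restore weights.
    agent = TD3OffPolicyAgent(env='hopper', exp_name='hopper_td3_demo', seed=0)

    # train() assembles the evaluator/collector/learner middleware together with
    # wandb_online_logger and returns a TrainingReturn dataclass whose wandb_url
    # field is filled from ctx.wandb_url.
    train_info = agent.train(step=1000000)
    print(train_info.wandb_url)

    # deploy() rolls out one episode with the trained policy; with
    # enable_save_replay=True the replay video is written under <exp_name>/videos.
    agent.deploy(enable_save_replay=True)
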
diff --git a/ding/framework/middleware/functional/logger.py b/ding/framework/middleware/functional/logger.py index 3112b58228..c327fc3aa6 100644 --- a/ding/framework/middleware/functional/logger.py +++ b/ding/framework/middleware/functional/logger.py @@ -121,8 +121,8 @@ def _logger(ctx: "OfflineRLContext"): def wandb_online_logger( - record_path: str, - cfg: Union[str, EasyDict] = 'default', + record_path: str = None, + cfg: Union[dict, EasyDict] = None, metric_list: Optional[List[str]] = None, env: Optional[BaseEnvManagerV2] = None, model: Optional[torch.nn.Module] = None, @@ -134,15 +134,18 @@ def wandb_online_logger( Wandb visualizer to track the experiment. Arguments: - record_path (:obj:`str`): The path to save the replay of simulation. - - cfg (:obj:`Union[str, EasyDict]`): Config, a dict of following settings: + - cfg (:obj:`Union[dict, EasyDict]`): Config, a dict of following settings: - gradient_logger: boolean. Whether to track the gradient. - plot_logger: boolean. Whether to track the metrics like reward and loss. - - action_logger: `q_value` or `action probability`. + - video_logger: boolean. Whether to upload the rendering video replay. + - action_logger: boolean. `q_value` or `action probability`. + - return_logger: boolean. Whether to track the return value. - metric_list (:obj:`Optional[List[str]]`): Logged metric list, specialized by different policies. - env (:obj:`BaseEnvManagerV2`): Evaluator environment. - model (:obj:`nn.Module`): Policy neural network model. - anonymous (:obj:`bool`): Open the anonymous mode of wandb or not. The anonymous mode allows visualization of data without wandb count. + - project_name (:obj:`str`): The name of wandb project. ''' if task.router.is_active and not task.has_role(task.role.LEARNER): return task.void() @@ -155,7 +158,7 @@ def wandb_online_logger( wandb.init(project=project_name, reinit=True, anonymous="must") else: wandb.init(project=project_name, reinit=True) - if cfg == 'default': + if cfg is None: cfg = EasyDict( dict( gradient_logger=False, @@ -165,9 +168,15 @@ def wandb_online_logger( return_logger=False, ) ) + else: + if not isinstance(cfg, EasyDict): + cfg = EasyDict(cfg) + assert tuple(cfg.keys()) == ("gradient_logger","plot_logger","video_logger","action_logger","return_logger") + assert all(value in [True, False] for value in cfg.values()) + # The visualizer is called to save the replay of the simulation # which will be uploaded to wandb later - if env is not None: + if env is not None and cfg.video_logger is True and record_path is not None: env.enable_save_replay(replay_path=record_path) if cfg.gradient_logger: wandb.watch(model) @@ -225,10 +234,10 @@ def _plot(ctx: "OnlineRLContext"): file_list.sort(key=lambda fn: os.path.getmtime(os.path.join(record_path, fn))) video_path = os.path.join(record_path, file_list[-2]) info_for_logging.update({"video": wandb.Video(video_path, format="mp4")}) - - action_path = os.path.join(record_path, (str(ctx.env_step) + "_action.gif")) - return_path = os.path.join(record_path, (str(ctx.env_step) + "_return.gif")) + + if cfg.action_logger: + action_path = os.path.join(record_path, (str(ctx.env_step) + "_action.gif")) if all(['logit' in v for v in eval_output]) or hasattr(eval_output, "logit"): if isinstance(eval_output, tnp.ndarray): action_prob = softmax(eval_output.logit) @@ -257,6 +266,7 @@ def _plot(ctx: "OnlineRLContext"): info_for_logging.update({"actions_of_trajectory_{}".format(i): fig}) if cfg.return_logger: + return_path = os.path.join(record_path, (str(ctx.env_step) + 
"_return.gif")) fig, ax = plt.subplots() ax = plt.gca() ax.set_ylim([0, 1]) @@ -267,7 +277,8 @@ def _plot(ctx: "OnlineRLContext"): ani.save(return_path, writer='pillow') info_for_logging.update({"return distribution": wandb.Video(return_path, format="gif")}) - wandb.log(data=info_for_logging, step=ctx.env_step) + if bool(info_for_logging): + wandb.log(data=info_for_logging, step=ctx.env_step) plt.clf() return _plot From 2f883d729d9ba4bab9c1d60d6384db9306cf2b98 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Mon, 13 Mar 2023 06:19:12 +0000 Subject: [PATCH 043/244] format code --- ding/bonus/config.py | 14 ++++++-------- ding/bonus/td3.py | 12 +++++++----- ding/framework/middleware/functional/logger.py | 13 ++++++------- 3 files changed, 19 insertions(+), 20 deletions(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 3c77f7756c..bd85c6cdb1 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -139,17 +139,15 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: critic_head_hidden_size=256, action_space='regression', ), - collect=dict( - n_sample=1, - ), + collect=dict(n_sample=1, ), other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), ), wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False ), ) ) diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 8837389c4a..123e3951fc 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -43,7 +43,9 @@ def __init__( policy_state_dict: str = None, ) -> None: if isinstance(env, str): - assert env in TD3OffPolicyAgent.supported_env_list, "Please use supported envs: {}".format(TD3OffPolicyAgent.supported_env_list) + assert env in TD3OffPolicyAgent.supported_env_list, "Please use supported envs: {}".format( + TD3OffPolicyAgent.supported_env_list + ) self.env = get_instance_env(env) if cfg is None: # 'It should be default env tuned config' @@ -54,7 +56,7 @@ def __init__( cfg = EasyDict(cfg) elif isinstance(cfg, str): cfg = EasyDict(Config.file_to_dict(cfg)) - + if exp_name is not None: self.exp_name = exp_name cfg.exp_name = exp_name @@ -139,7 +141,7 @@ def train( return TrainingReturn(wandb_url=task.ctx.wandb_url) - def deploy(self, enable_save_replay: bool = False, replay_save_path:str=None, debug: bool = False) -> None: + def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> None: if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -158,13 +160,13 @@ def _forward(obs): obs = ttorch.as_tensor(obs).unsqueeze(0) if cuda and torch.cuda.is_available(): obs = obs.cuda() - action = forward_fn(obs,mode='compute_actor')["action"] + action = forward_fn(obs, mode='compute_actor')["action"] # squeeze means delete batch dim, i.e. 
(1, A) -> (A, ) action = action.squeeze(0).detach().cpu().numpy() return action return _forward - + forward_fn = single_env_forward_wrapper(self.policy._model) # main loop diff --git a/ding/framework/middleware/functional/logger.py b/ding/framework/middleware/functional/logger.py index c327fc3aa6..129c2bea76 100644 --- a/ding/framework/middleware/functional/logger.py +++ b/ding/framework/middleware/functional/logger.py @@ -171,7 +171,7 @@ def wandb_online_logger( else: if not isinstance(cfg, EasyDict): cfg = EasyDict(cfg) - assert tuple(cfg.keys()) == ("gradient_logger","plot_logger","video_logger","action_logger","return_logger") + assert tuple(cfg.keys()) == ("gradient_logger", "plot_logger", "video_logger", "action_logger", "return_logger") assert all(value in [True, False] for value in cfg.values()) # The visualizer is called to save the replay of the simulation @@ -184,14 +184,14 @@ def wandb_online_logger( one_time_warning( "If you want to use wandb to visualize the gradient, please set gradient_logger = True in the config." ) - - first_plot=True + + first_plot = True def _plot(ctx: "OnlineRLContext"): nonlocal first_plot if first_plot: - first_plot=False - ctx.wandb_url=wandb.run.get_project_url() + first_plot = False + ctx.wandb_url = wandb.run.get_project_url() info_for_logging = {} @@ -234,8 +234,7 @@ def _plot(ctx: "OnlineRLContext"): file_list.sort(key=lambda fn: os.path.getmtime(os.path.join(record_path, fn))) video_path = os.path.join(record_path, file_list[-2]) info_for_logging.update({"video": wandb.Video(video_path, format="mp4")}) - - + if cfg.action_logger: action_path = os.path.join(record_path, (str(ctx.env_step) + "_action.gif")) if all(['logit' in v for v in eval_output]) or hasattr(eval_output, "logit"): From 3c15c84f1d4969487b7b4ae003bf918661afb7b0 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Mon, 13 Mar 2023 06:22:45 +0000 Subject: [PATCH 044/244] polish code --- ding/bonus/ppof.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py index 9bc7a59764..a21c0acd0d 100644 --- a/ding/bonus/ppof.py +++ b/ding/bonus/ppof.py @@ -20,6 +20,10 @@ @dataclass class TrainingReturn: + ''' + Attributions + wandb_url: The weight & biases (wandb) project url of the trainning experiment. 
+ ''' wandb_url: str From 6ce1421338baaa4e03f64c7ed6a78b38ec2e3117 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Mon, 13 Mar 2023 08:58:34 +0000 Subject: [PATCH 045/244] fix load bug --- ding/policy/ddpg.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ding/policy/ddpg.py b/ding/policy/ddpg.py index ffe8d298da..15e12e9388 100644 --- a/ding/policy/ddpg.py +++ b/ding/policy/ddpg.py @@ -470,8 +470,8 @@ def state_dict(self) -> Dict[str, Any]: return state_dict def load_state_dict(self, state_dict: Dict[str, Any]) -> None: - self._model.load_state_dict(state_dict['model']) - self._target_model.load_state_dict(state_dict['target_model']) + self._model.load_state_dict(state_dict['model'], map_location=torch.device(self._device)) + self._target_model.load_state_dict(state_dict['target_model'], map_location=torch.device(self._device)) if 'learn' in self._enable_field: - self._optimizer_actor.load_state_dict(state_dict['optimizer_actor']) - self._optimizer_critic.load_state_dict(state_dict['optimizer_critic']) + self._optimizer_actor.load_state_dict(state_dict['optimizer_actor'], map_location=torch.device(self._device)) + self._optimizer_critic.load_state_dict(state_dict['optimizer_critic'], map_location=torch.device(self._device)) From eac9434e95049c0fec04a183722e46db80b4af1a Mon Sep 17 00:00:00 2001 From: zjowowen Date: Mon, 13 Mar 2023 10:10:06 +0000 Subject: [PATCH 046/244] fix bug --- ding/policy/ddpg.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ding/policy/ddpg.py b/ding/policy/ddpg.py index 15e12e9388..ffe8d298da 100644 --- a/ding/policy/ddpg.py +++ b/ding/policy/ddpg.py @@ -470,8 +470,8 @@ def state_dict(self) -> Dict[str, Any]: return state_dict def load_state_dict(self, state_dict: Dict[str, Any]) -> None: - self._model.load_state_dict(state_dict['model'], map_location=torch.device(self._device)) - self._target_model.load_state_dict(state_dict['target_model'], map_location=torch.device(self._device)) + self._model.load_state_dict(state_dict['model']) + self._target_model.load_state_dict(state_dict['target_model']) if 'learn' in self._enable_field: - self._optimizer_actor.load_state_dict(state_dict['optimizer_actor'], map_location=torch.device(self._device)) - self._optimizer_critic.load_state_dict(state_dict['optimizer_critic'], map_location=torch.device(self._device)) + self._optimizer_actor.load_state_dict(state_dict['optimizer_actor']) + self._optimizer_critic.load_state_dict(state_dict['optimizer_critic']) From 6fda31b45216dfb998ad8639b2af646a6ca36b32 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 14 Mar 2023 04:41:41 +0000 Subject: [PATCH 047/244] fix dtype error --- ding/envs/env/ding_env_wrapper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ding/envs/env/ding_env_wrapper.py b/ding/envs/env/ding_env_wrapper.py index e1d102ec97..03869ee5e5 100644 --- a/ding/envs/env/ding_env_wrapper.py +++ b/ding/envs/env/ding_env_wrapper.py @@ -98,7 +98,7 @@ def reset(self) -> None: obs = self._env.reset() else: raise RuntimeError("not support env type: {}".format(type(self._env))) - obs = to_ndarray(obs) + obs = to_ndarray(obs, dtype=np.float32) return obs # override @@ -121,7 +121,7 @@ def step(self, action: Union[np.int64, np.ndarray]) -> BaseEnvTimestep: if self._cfg.act_scale: action = affine_transform(action, min_val=self._env.action_space.low, max_val=self._env.action_space.high) obs, rew, done, info = self._env.step(action) - obs = to_ndarray(obs) + obs = to_ndarray(obs, dtype=np.float32) rew = 
to_ndarray([rew], np.float32) return BaseEnvTimestep(obs, rew, done, info) From 6b9def4e3ed4f03d362e55e179fc8aff7fa67128 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 15 Mar 2023 08:56:04 +0000 Subject: [PATCH 048/244] polish code --- ding/bonus/config.py | 2 - ding/bonus/ppof.py | 55 ++++++++----------- ding/bonus/td3.py | 43 ++++++--------- ding/envs/env/ding_env_wrapper.py | 10 +++- .../middleware/functional/evaluator.py | 5 -- .../framework/middleware/functional/logger.py | 19 ++++--- .../framework/middleware/tests/test_logger.py | 19 ++----- ding/model/template/qac.py | 1 - ding/policy/ddpg.py | 7 --- dizoo/metadrive/env/drive_wrapper.py | 1 - 10 files changed, 63 insertions(+), 99 deletions(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index bd85c6cdb1..041a37e653 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -122,8 +122,6 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: seed=0, env=dict( env_id='Hopper-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), collector_env_num=8, evaluator_env_num=8, n_evaluator_episode=8, diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py index a21c0acd0d..d573f0fa8e 100644 --- a/ding/bonus/ppof.py +++ b/ding/bonus/ppof.py @@ -58,7 +58,8 @@ def __init__( seed: int = 0, exp_name: str = 'default_experiment', model: Optional[torch.nn.Module] = None, - cfg: Optional[EasyDict] = None + cfg: Optional[EasyDict] = None, + policy_state_dict: str = None, ) -> None: if isinstance(env, str): assert env in PPOF.supported_env_list, "Please use supported envs: {}".format(PPOF.supported_env_list) @@ -93,10 +94,8 @@ def __init__( self.env.observation_space.shape, action_shape, action_space=self.cfg.action_space, **self.cfg.model ) self.policy = PPOFPolicy(self.cfg, model=model) - - def load_policy(self, policy_state_dict, config): - self.policy.load_state_dict(policy_state_dict) - self.policy._cfg = config + if policy_state_dict is not None: + self.policy.load_state_dict(policy_state_dict) def train( self, @@ -115,7 +114,7 @@ def train( # define env and policy collector_env = self._setup_env_manager(collector_env_num, context, debug, 'collector') evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug, 'evaluator') - wandb_url_return = [] + if reward_model is not None: # self.reward_model = create_reward_model(reward_model, self.cfg.reward_model) pass @@ -128,30 +127,31 @@ def train( task.use(CkptSaver(self.policy, save_dir=self.exp_name, train_freq=n_iter_save_ckpt)) task.use( wandb_online_logger( - self.exp_name, metric_list=self.policy.monitor_vars(), + model=self.policy._model, anonymous=True, - project_name=self.exp_name, - wandb_url_return=wandb_url_return + project_name=self.exp_name ) ) task.use(termination_checker(max_env_step=step)) task.run() - return TrainingReturn(wandb_url=wandb_url_return[0]) + return TrainingReturn(wandb_url=task.ctx.wandb_url) - def deploy(self, ckpt_path: str = None, enable_save_replay: bool = False, debug: bool = False) -> None: + def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> None: if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy env = self.env.clone() env.seed(self.seed, dynamic_seed=False) - if enable_save_replay: + + if enable_save_replay and replay_save_path: + env.enable_save_replay(replay_path=replay_save_path) + elif enable_save_replay: env.enable_save_replay(replay_path=os.path.join(self.exp_name, 'videos')) - if ckpt_path is None: - ckpt_path 
= os.path.join(self.exp_name, 'ckpt/eval.pth.tar') - state_dict = torch.load(ckpt_path, map_location='cpu') - self.policy.load_state_dict(state_dict) + else: + logging.warning(f'No video would be generated during the deploy.') + forward_fn = single_env_forward_wrapper_ttorch(self.policy.eval) # main loop @@ -170,7 +170,6 @@ def deploy(self, ckpt_path: str = None, enable_save_replay: bool = False, debug: def collect_data( self, env_num: int = 8, - ckpt_path: Optional[str] = None, save_data_path: Optional[str] = None, n_sample: Optional[int] = None, n_episode: Optional[int] = None, @@ -183,12 +182,8 @@ def collect_data( raise NotImplementedError # define env and policy env = self._setup_env_manager(env_num, context, debug, 'collector') - if ckpt_path is None: - ckpt_path = os.path.join(self.exp_name, 'ckpt/eval.pth.tar') if save_data_path is None: save_data_path = os.path.join(self.exp_name, 'demo_data') - state_dict = torch.load(ckpt_path, map_location='cpu') - self.policy.load_state_dict(state_dict) # main execution task with task.start(ctx=OnlineRLContext()): @@ -205,8 +200,6 @@ def batch_evaluate( n_evaluator_episode: int = 4, context: Optional[str] = None, debug: bool = False, - render: bool = False, - replay_video_path: str = None, ) -> None: if debug: logging.getLogger().setLevel(logging.DEBUG) @@ -215,16 +208,12 @@ def batch_evaluate( # main execution task with task.start(ctx=OnlineRLContext()): - task.use( - interaction_evaluator_ttorch( - self.seed, - self.policy, - env, - n_evaluator_episode, - render=render, - replay_video_path=replay_video_path - ) - ) + task.use(interaction_evaluator_ttorch( + self.seed, + self.policy, + env, + n_evaluator_episode, + )) task.run(max_step=1) def _setup_env_manager( diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 123e3951fc..82e703b1bd 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -39,6 +39,7 @@ def __init__( env: Union[str, BaseEnv], seed: int = 0, exp_name: str = None, + model: Optional[torch.nn.Module] = None, cfg: Optional[Union[EasyDict, dict, str]] = None, policy_state_dict: str = None, ) -> None: @@ -50,24 +51,14 @@ def __init__( if cfg is None: # 'It should be default env tuned config' cfg = get_instance_config(env, algorithm=TD3OffPolicyAgent.algorithm) - elif isinstance(cfg, EasyDict): - pass - elif isinstance(cfg, dict): - cfg = EasyDict(cfg) - elif isinstance(cfg, str): - cfg = EasyDict(Config.file_to_dict(cfg)) + else: + assert isinstance(cfg, EasyDict), "Please use EasyDict as config data type." 
if exp_name is not None: - self.exp_name = exp_name cfg.exp_name = exp_name - elif cfg.exp_name is not None: - self.exp_name = cfg.exp_name - else: - self.exp_name = 'default_experiment' - cfg.exp_name = self.exp_name self.cfg = compile_config(cfg, policy=TD3Policy) - if self.cfg.exp_name != self.exp_name: - self.exp_name = self.cfg.exp_name + self.exp_name = self.cfg.exp_name + elif isinstance(env, BaseEnv): self.cfg = compile_config(cfg, policy=TD3Policy) raise NotImplementedError @@ -79,16 +70,12 @@ def __init__( if not os.path.exists(self.exp_name): os.makedirs(self.exp_name) save_config_py(self.cfg, os.path.join(self.exp_name, 'policy_config.py')) - model = QAC(**self.cfg.policy.model) + if model is None: + model = QAC(**self.cfg.policy.model) self.buffer_ = DequeBuffer(size=self.cfg.policy.other.replay_buffer.replay_buffer_size) self.policy = TD3Policy(self.cfg.policy, model=model) if policy_state_dict is not None: - self.policy.load_state_dict(policy_state_dict) - - def load_policy(self, policy_state_dict, config): - self.policy.load_state_dict(policy_state_dict) - self.policy._cfg = config - self.cfg = config + self.policy.learn_mode.load_state_dict(policy_state_dict) def train( self, @@ -99,7 +86,7 @@ def train( n_iter_save_ckpt: int = 1000, context: Optional[str] = None, debug: bool = False - ) -> dict: + ) -> TrainingReturn: if debug: logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) @@ -147,11 +134,13 @@ def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, # define env and policy env = self.env.clone() env.seed(self.seed, dynamic_seed=False) - if enable_save_replay: - if replay_save_path is None: - env.enable_save_replay(replay_path=os.path.join(self.exp_name, 'videos')) - else: - env.enable_save_replay(replay_path=replay_save_path) + + if enable_save_replay and replay_save_path: + env.enable_save_replay(replay_path=replay_save_path) + elif enable_save_replay: + env.enable_save_replay(replay_path=os.path.join(self.exp_name, 'videos')) + else: + logging.warning(f'No video would be generated during the deploy.') def single_env_forward_wrapper(forward_fn, cuda=True): diff --git a/ding/envs/env/ding_env_wrapper.py b/ding/envs/env/ding_env_wrapper.py index 03869ee5e5..fb806cbbbc 100644 --- a/ding/envs/env/ding_env_wrapper.py +++ b/ding/envs/env/ding_env_wrapper.py @@ -98,7 +98,10 @@ def reset(self) -> None: obs = self._env.reset() else: raise RuntimeError("not support env type: {}".format(type(self._env))) - obs = to_ndarray(obs, dtype=np.float32) + if self.observation_space.dtype == np.float32: + obs = to_ndarray(obs, dtype=np.float32) + else: + obs = to_ndarray(obs) return obs # override @@ -121,7 +124,10 @@ def step(self, action: Union[np.int64, np.ndarray]) -> BaseEnvTimestep: if self._cfg.act_scale: action = affine_transform(action, min_val=self._env.action_space.low, max_val=self._env.action_space.high) obs, rew, done, info = self._env.step(action) - obs = to_ndarray(obs, dtype=np.float32) + if self.observation_space.dtype == np.float32: + obs = to_ndarray(obs, dtype=np.float32) + else: + obs = to_ndarray(obs) rew = to_ndarray([rew], np.float32) return BaseEnvTimestep(obs, rew, done, info) diff --git a/ding/framework/middleware/functional/evaluator.py b/ding/framework/middleware/functional/evaluator.py index 162af3dd47..102cba70e0 100644 --- a/ding/framework/middleware/functional/evaluator.py +++ b/ding/framework/middleware/functional/evaluator.py @@ -307,7 +307,6 @@ def interaction_evaluator_ttorch( stop_value: float = 
np.inf, eval_freq: int = 1000, render: bool = False, - replay_video_path: str = None, ) -> Callable: """ Overview: @@ -315,7 +314,6 @@ def interaction_evaluator_ttorch( Arguments: - policy (:obj:`Policy`): The policy to be evaluated. - env (:obj:`BaseEnvManager`): The env for the evaluation. - - render (:obj:`bool`): Whether to render env images and policy logits. """ if task.router.is_active and not task.has_role(task.role.EVALUATOR): return task.void() @@ -324,9 +322,6 @@ def interaction_evaluator_ttorch( if n_evaluator_episode is None: n_evaluator_episode = env.env_num - if replay_video_path: - env.enable_save_replay(replay_path=replay_video_path) - def _evaluate(ctx: "OnlineRLContext"): """ Overview: diff --git a/ding/framework/middleware/functional/logger.py b/ding/framework/middleware/functional/logger.py index 129c2bea76..d5487d3ca2 100644 --- a/ding/framework/middleware/functional/logger.py +++ b/ding/framework/middleware/functional/logger.py @@ -284,28 +284,33 @@ def _plot(ctx: "OnlineRLContext"): def wandb_offline_logger( - record_path: str, - datasetpath: str, - cfg: Union[str, EasyDict] = 'default', + dataset_path: str, + record_path: str = None, + cfg: Union[dict, EasyDict] = None, metric_list: Optional[List[str]] = None, env: Optional[BaseEnvManagerV2] = None, model: Optional[torch.nn.Module] = None, - anonymous: bool = False + anonymous: bool = False, + project_name: str = 'default-project', ) -> Callable: ''' Overview: Wandb visualizer to track the experiment. Arguments: + - datasetpath (:obj:`str`): The path to save the replay of simulation. - record_path (:obj:`str`): The path to save the replay of simulation. - - cfg (:obj:`Union[str, EasyDict]`): Config, a dict of following settings: + - cfg (:obj:`Union[dict, EasyDict]`): Config, a dict of following settings: - gradient_logger: boolean. Whether to track the gradient. - plot_logger: boolean. Whether to track the metrics like reward and loss. - - action_logger: `q_value` or `action probability`. + - video_logger: boolean. Whether to upload the rendering video replay. + - action_logger: boolean. `q_value` or `action probability`. + - return_logger: boolean. Whether to track the return value. - metric_list (:obj:`Optional[List[str]]`): Logged metric list, specialized by different policies. - env (:obj:`BaseEnvManagerV2`): Evaluator environment. - model (:obj:`nn.Module`): Policy neural network model. - anonymous (:obj:`bool`): Open the anonymous mode of wandb or not. The anonymous mode allows visualization of data without wandb count. + - project_name (:obj:`str`): The name of wandb project. 
''' if task.router.is_active and not task.has_role(task.role.LEARNER): return task.void() @@ -385,7 +390,7 @@ def _vis_dataset(datasetpath: str): wandb.log({"dataset": wandb.Image("dataset.png")}) if cfg.vis_dataset is True: - _vis_dataset(datasetpath) + _vis_dataset(dataset_path) def _plot(ctx: "OnlineRLContext"): info_for_logging = {} diff --git a/ding/framework/middleware/tests/test_logger.py b/ding/framework/middleware/tests/test_logger.py index 9a6aa9bc6f..38c4c45cb6 100644 --- a/ding/framework/middleware/tests/test_logger.py +++ b/ding/framework/middleware/tests/test_logger.py @@ -192,10 +192,9 @@ def __getitem__(self, index): @pytest.mark.unittest def test_wandb_online_logger(): - + record_path = './video_qbert_dqn' cfg = EasyDict( dict( - record_path='./video_qbert_dqn', gradient_logger=True, plot_logger=True, action_logger='action probability', @@ -237,11 +236,11 @@ def mock_gradient_logger(input_model): def test_wandb_online_logger_metric(): with patch.object(wandb, 'log', new=mock_metric_logger): - wandb_online_logger(cfg.record_path, cfg, env=env, model=model, anonymous=True)(ctx) + wandb_online_logger(record_path, cfg, env=env, model=model, anonymous=True)(ctx) def test_wandb_online_logger_gradient(): with patch.object(wandb, 'watch', new=mock_gradient_logger): - wandb_online_logger(cfg.record_path, cfg, env=env, model=model, anonymous=True)(ctx) + wandb_online_logger(record_path, cfg, env=env, model=model, anonymous=True)(ctx) test_wandb_online_logger_metric() test_wandb_online_logger_gradient() @@ -251,16 +250,8 @@ def test_wandb_online_logger_gradient(): # TODO(nyz): fix CI bug when py=3.8.15 @pytest.mark.tmp def test_wandb_offline_logger(mocker): - - cfg = EasyDict( - dict( - record_path='./video_pendulum_cql', - gradient_logger=True, - plot_logger=True, - action_logger='action probability', - vis_dataset=True - ) - ) + record_path = './video_pendulum_cql' + cfg = EasyDict(dict(gradient_logger=True, plot_logger=True, action_logger='action probability', vis_dataset=True)) env = TheEnvClass() ctx = OnlineRLContext() ctx.train_output = [{'reward': 1, 'q_value': [1.0]}] diff --git a/ding/model/template/qac.py b/ding/model/template/qac.py index 276ed45565..aa0cc42b0e 100644 --- a/ding/model/template/qac.py +++ b/ding/model/template/qac.py @@ -3,7 +3,6 @@ import numpy as np import torch import torch.nn as nn -import copy from ding.utils import SequenceType, squeeze, MODEL_REGISTRY from ..common import RegressionHead, ReparameterizationHead, DiscreteHead, MultiHead, \ diff --git a/ding/policy/ddpg.py b/ding/policy/ddpg.py index ffe8d298da..4eb71f1b19 100644 --- a/ding/policy/ddpg.py +++ b/ding/policy/ddpg.py @@ -468,10 +468,3 @@ def state_dict(self) -> Dict[str, Any]: state_dict['optimizer_actor'] = self._optimizer_actor.state_dict() state_dict['optimizer_critic'] = self._optimizer_critic.state_dict() return state_dict - - def load_state_dict(self, state_dict: Dict[str, Any]) -> None: - self._model.load_state_dict(state_dict['model']) - self._target_model.load_state_dict(state_dict['target_model']) - if 'learn' in self._enable_field: - self._optimizer_actor.load_state_dict(state_dict['optimizer_actor']) - self._optimizer_critic.load_state_dict(state_dict['optimizer_critic']) diff --git a/dizoo/metadrive/env/drive_wrapper.py b/dizoo/metadrive/env/drive_wrapper.py index eb635f91b5..9b1a1373fd 100644 --- a/dizoo/metadrive/env/drive_wrapper.py +++ b/dizoo/metadrive/env/drive_wrapper.py @@ -1,6 +1,5 @@ from typing import Any, Dict, Optional from easydict import EasyDict -from typing 
import Optional import matplotlib.pyplot as plt import gym import copy From 6f49d0a8b575f0a6dc3556ad2fdae51112018b1f Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 15 Mar 2023 09:22:21 +0000 Subject: [PATCH 049/244] polish code --- ding/bonus/ppof.py | 2 +- ding/bonus/td3.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py index d573f0fa8e..6bcbd1a2c3 100644 --- a/ding/bonus/ppof.py +++ b/ding/bonus/ppof.py @@ -191,7 +191,7 @@ def collect_data( task.use(offline_data_saver(save_data_path, data_type='hdf5')) task.run(max_step=1) logging.info( - f'PPOF collecting is finished, more than {n_sample} samples are collected and saved in `{save_data_path}`' + 'PPOF collecting is finished, more than {n_sample} samples are collected and saved in `{save_data_path}`' ) def batch_evaluate( diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 82e703b1bd..8eb6397b2c 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -140,7 +140,7 @@ def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, elif enable_save_replay: env.enable_save_replay(replay_path=os.path.join(self.exp_name, 'videos')) else: - logging.warning(f'No video would be generated during the deploy.') + logging.warning('No video would be generated during the deploy.') def single_env_forward_wrapper(forward_fn, cuda=True): From cdafb5514f19f502b1eeb2d68265ce45ee05928f Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 16 Mar 2023 03:34:48 +0000 Subject: [PATCH 050/244] Add dqn agent --- ding/bonus/__init__.py | 1 + ding/bonus/config.py | 60 ++++++++++- ding/bonus/dqn.py | 234 +++++++++++++++++++++++++++++++++++++++++ ding/policy/dqn.py | 3 + 4 files changed, 297 insertions(+), 1 deletion(-) create mode 100644 ding/bonus/dqn.py diff --git a/ding/bonus/__init__.py b/ding/bonus/__init__.py index 54841678ea..89ec84d897 100644 --- a/ding/bonus/__init__.py +++ b/ding/bonus/__init__.py @@ -1,2 +1,3 @@ from .ppof import PPOF from .td3 import TD3OffPolicyAgent +from .dqn import DQNOffpolicyAgent diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 041a37e653..b373212875 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -4,7 +4,7 @@ from ding.envs import BaseEnv, DingEnvWrapper from ding.envs.env_wrappers import MaxAndSkipWrapper, WarpFrameWrapper, ScaledFloatFrameWrapper, FrameStackWrapper, \ EvalEpisodeReturnEnv, TransposeWrapper, TimeLimitWrapper, FlatObsWrapper, GymToGymnasiumWrapper -from ding.policy import PPOFPolicy, TD3Policy +from ding.policy import PPOFPolicy, TD3Policy, DQNPolicy def get_instance_config(env: str, algorithm: str) -> EasyDict: @@ -151,6 +151,64 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ) else: raise KeyError("not supported env type: {}".format(env)) + elif algorithm == 'DQN': + cfg = DQNPolicy.default_config() + if env == 'lunarlander_discrete': + cfg.update( + dict( + exp_name='lunarlander_dqn', + seed=0, + env=dict( + env_id='LunarLander-v2', + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=200, + ), + policy=dict( + cuda=True, + random_collect_size=25000, + discount_factor=0.99, + nstep=3, + learn=dict( + update_per_collect=10, + batch_size=64, + learning_rate=0.001, + # Frequency of target network update. + target_update_freq=100, + ), + model=dict( + obs_shape=8, + action_shape=4, + encoder_hidden_size_list=[512, 64], + # Whether to use dueling head. 
+ dueling=True, + ), + collect=dict( + n_sample=64, + unroll_len=1, + ), + other=dict( + eps=dict( + type='exp', + start=0.95, + end=0.1, + decay=50000, + ), + replay_buffer=dict(replay_buffer_size=100000, ) + ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) + else: + raise KeyError("not supported env type: {}".format(env)) else: raise KeyError("not supported algorithm type: {}".format(algorithm)) diff --git a/ding/bonus/dqn.py b/ding/bonus/dqn.py new file mode 100644 index 0000000000..2a5367c3d4 --- /dev/null +++ b/ding/bonus/dqn.py @@ -0,0 +1,234 @@ +from dataclasses import dataclass +from typing import Optional, Union +from ditk import logging +from easydict import EasyDict +import os +import torch +import treetensor.torch as ttorch +from ding.framework import task, OnlineRLContext +from ding.framework.middleware import CkptSaver, multistep_trainer, \ + wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, data_pusher, \ + OffPolicyLearner, final_ctx_saver +from ding.envs import BaseEnv, BaseEnvManagerV2, SubprocessEnvManagerV2 +from ding.policy import DQNPolicy +from ding.utils import set_pkg_seed +from ding.config import save_config_py, compile_config +from ding.model import DQN +from ding.data import DequeBuffer +from ding.bonus.config import get_instance_config, get_instance_env + + +@dataclass +class TrainingReturn: + ''' + Attributions + wandb_url: The weight & biases (wandb) project url of the trainning experiment. + ''' + wandb_url: str + + +class DQNOffpolicyAgent: + supported_env_list = [ + 'lunarlander_discrete', + ] + algorithm = 'DQN' + + def __init__( + self, + env: Union[str, BaseEnv], + seed: int = 0, + exp_name: str = None, + model: Optional[torch.nn.Module] = None, + cfg: Optional[Union[EasyDict, dict, str]] = None, + policy_state_dict: str = None, + ) -> None: + if isinstance(env, str): + assert env in DQNOffpolicyAgent.supported_env_list, "Please use supported envs: {}".format( + DQNOffpolicyAgent.supported_env_list + ) + self.env = get_instance_env(env) + if cfg is None: + # 'It should be default env tuned config' + cfg = get_instance_config(env, algorithm=DQNOffpolicyAgent.algorithm) + else: + assert isinstance(cfg, EasyDict), "Please use EasyDict as config data type." 
+ + if exp_name is not None: + cfg.exp_name = exp_name + self.cfg = compile_config(cfg, policy=DQNPolicy) + self.exp_name = self.cfg.exp_name + + elif isinstance(env, BaseEnv): + self.cfg = compile_config(cfg, policy=DQNPolicy) + raise NotImplementedError + else: + raise TypeError("not support env type: {}, only strings and instances of `BaseEnv` now".format(type(env))) + logging.getLogger().setLevel(logging.INFO) + self.seed = seed + set_pkg_seed(self.seed, use_cuda=self.cfg.policy.cuda) + if not os.path.exists(self.exp_name): + os.makedirs(self.exp_name) + save_config_py(self.cfg, os.path.join(self.exp_name, 'policy_config.py')) + if model is None: + model = DQN(**self.cfg.policy.model) + self.buffer_ = DequeBuffer(size=self.cfg.policy.other.replay_buffer.replay_buffer_size) + self.policy = DQNPolicy(self.cfg.policy, model=model) + if policy_state_dict is not None: + self.policy.learn_mode.load_state_dict(policy_state_dict) + + def train( + self, + step: int = int(1e7), + collector_env_num: int = 4, + evaluator_env_num: int = 4, + n_iter_log_show: int = 500, + n_iter_save_ckpt: int = 1000, + context: Optional[str] = None, + debug: bool = False + ) -> TrainingReturn: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + logging.debug(self.policy._model) + # define env and policy + collector_env = self._setup_env_manager(collector_env_num, context, debug) + evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug) + + with task.start(ctx=OnlineRLContext()): + task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) + task.use( + StepCollector( + self.cfg, + self.policy.collect_mode, + collector_env, + random_collect_size=self.cfg.policy.random_collect_size + ) + ) + task.use(data_pusher(self.cfg, self.buffer_)) + task.use(multistep_trainer(self.policy, log_freq=n_iter_log_show)) + task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_)) + task.use( + CkptSaver( + policy=self.policy, + save_dir=os.path.join(self.cfg["exp_name"], "model"), + train_freq=n_iter_save_ckpt + ) + ) + task.use( + wandb_online_logger( + metric_list=self.policy.monitor_vars(), + model=self.policy._model, + anonymous=True, + project_name=self.exp_name + ) + ) + task.use(termination_checker(max_env_step=step)) + task.use(final_ctx_saver(name=self.cfg["exp_name"])) + task.run() + + return TrainingReturn(wandb_url=task.ctx.wandb_url) + + def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> None: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + # define env and policy + env = self.env.clone() + env.seed(self.seed, dynamic_seed=False) + + if enable_save_replay and replay_save_path: + env.enable_save_replay(replay_path=replay_save_path) + elif enable_save_replay: + env.enable_save_replay(replay_path=os.path.join(self.exp_name, 'videos')) + else: + logging.warning('No video would be generated during the deploy.') + + def single_env_forward_wrapper(forward_fn, cuda=True): + + def _forward(obs): + # unsqueeze means add batch dim, i.e. (O, ) -> (1, O) + obs = ttorch.as_tensor(obs).unsqueeze(0) + if cuda and torch.cuda.is_available(): + obs = obs.cuda() + action = forward_fn(obs, mode='compute_actor')["action"] + # squeeze means delete batch dim, i.e. (1, A) -> (A, ) + action = action.squeeze(0).detach().cpu().numpy() + return action + + return _forward + + forward_fn = single_env_forward_wrapper(self.policy._model) + + # main loop + return_ = 0. 
+ step = 0 + obs = env.reset() + while True: + action = forward_fn(obs) + obs, rew, done, info = env.step(action) + return_ += rew + step += 1 + if done: + break + logging.info(f'TD3 deploy is finished, final episode return with {step} steps is: {return_}') + + def collect_data( + self, + env_num: int = 8, + save_data_path: Optional[str] = None, + n_sample: Optional[int] = None, + n_episode: Optional[int] = None, + context: Optional[str] = None, + debug: bool = False + ) -> None: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + if n_episode is not None: + raise NotImplementedError + # define env and policy + env = self._setup_env_manager(env_num, context, debug) + + if save_data_path is None: + save_data_path = os.path.join(self.exp_name, 'demo_data') + + # main execution task + with task.start(ctx=OnlineRLContext()): + task.use( + StepCollector( + self.cfg, self.policy.collect_mode, env, random_collect_size=self.cfg.policy.random_collect_size + ) + ) + task.use(offline_data_saver(save_data_path, data_type='hdf5')) + task.run(max_step=1) + logging.info( + f'TD3 collecting is finished, more than {n_sample} samples are collected and saved in `{save_data_path}`' + ) + + def batch_evaluate( + self, + env_num: int = 4, + n_evaluator_episode: int = 4, + context: Optional[str] = None, + debug: bool = False + ) -> None: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + # define env and policy + env = self._setup_env_manager(env_num, context, debug) + + evaluate_cfg = self.cfg + evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode + + # main execution task + with task.start(ctx=OnlineRLContext()): + task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, env)) + task.run(max_step=1) + + def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: bool = False) -> BaseEnvManagerV2: + if debug: + env_cls = BaseEnvManagerV2 + manager_cfg = env_cls.default_config() + else: + env_cls = SubprocessEnvManagerV2 + manager_cfg = env_cls.default_config() + if context is not None: + manager_cfg.context = context + return env_cls([self.env.clone for _ in range(env_num)], manager_cfg) diff --git a/ding/policy/dqn.py b/ding/policy/dqn.py index 9efc68350f..17ae51920b 100644 --- a/ding/policy/dqn.py +++ b/ding/policy/dqn.py @@ -395,6 +395,9 @@ def _forward_eval(self, data: Dict[int, Any]) -> Dict[int, Any]: return {i: d for i, d in zip(data_id, output)} + def monitor_vars(self) -> List[str]: + return ['cur_lr', 'total_loss', 'q_value'] + @POLICY_REGISTRY.register('dqn_stdim') class DQNSTDIMPolicy(DQNPolicy): """ From b18159769f42c758497e9e8b075d13220662a3db Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 17 Mar 2023 05:36:40 +0000 Subject: [PATCH 051/244] add config --- ding/bonus/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index b373212875..7b0abc55eb 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -114,7 +114,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: else: raise KeyError("not supported env type: {}".format(env)) elif algorithm == 'TD3': - cfg = TD3Policy.default_config() + cfg = EasyDict({"policy":TD3Policy.default_config()}) if env == 'hopper': cfg.update( dict( @@ -152,7 +152,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: else: raise KeyError("not supported env type: {}".format(env)) elif algorithm == 'DQN': - cfg = DQNPolicy.default_config() + cfg = EasyDict({"policy":DQNPolicy.default_config()}) if env == 
'lunarlander_discrete': cfg.update( dict( From 1d91b6d508fa314d57e63618e59ae414bc97f94d Mon Sep 17 00:00:00 2001 From: zhangpaipai <1124321458@qq.com> Date: Mon, 20 Mar 2023 19:31:26 +0800 Subject: [PATCH 052/244] add bonus/c51.py --- ding/bonus/__init__.py | 1 + ding/bonus/c51.py | 254 +++++++++++++++++++++++++++++++++++++++++ ding/bonus/config.py | 60 +++++++++- ding/policy/c51.py | 3 + 4 files changed, 317 insertions(+), 1 deletion(-) create mode 100644 ding/bonus/c51.py diff --git a/ding/bonus/__init__.py b/ding/bonus/__init__.py index 54841678ea..6a9e233a30 100644 --- a/ding/bonus/__init__.py +++ b/ding/bonus/__init__.py @@ -1,2 +1,3 @@ from .ppof import PPOF from .td3 import TD3OffPolicyAgent +from .c51 import C51Agent diff --git a/ding/bonus/c51.py b/ding/bonus/c51.py new file mode 100644 index 0000000000..1eb97fdd3f --- /dev/null +++ b/ding/bonus/c51.py @@ -0,0 +1,254 @@ +from dataclasses import dataclass +from typing import Optional, Union +from ditk import logging +from easydict import EasyDict +import os +import gym +import torch +import treetensor.torch as ttorch +from ding.framework import task, OnlineRLContext +from ding.framework.middleware import CkptSaver, multistep_trainer, \ + wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, data_pusher, \ + OffPolicyLearner, final_ctx_saver, eps_greedy_handler, nstep_reward_enhancer +from ding.envs import BaseEnv, BaseEnvManagerV2, SubprocessEnvManagerV2 +from ding.policy import C51Policy +from ding.utils import set_pkg_seed +from ding.config import Config, save_config_py, compile_config +from ding.model import C51DQN +from ding.data import DequeBuffer +from ding.bonus.config import get_instance_config, get_instance_env + + +@dataclass +class TrainingReturn: + ''' + Attributions + wandb_url: The weight & biases (wandb) project url of the trainning experiment. + ''' + wandb_url: str + + +class C51Agent: + supported_env_list = [ + 'lunarlander_discrete', + ] + algorithm = 'C51' + + def __init__( + self, + env: Union[str, BaseEnv], + seed: int = 0, + exp_name: str = None, + model: Optional[torch.nn.Module] = None, + cfg: Optional[Union[EasyDict, dict, str]] = None, + policy_state_dict: str = None, + ) -> None: + if isinstance(env, str): + assert env in C51Agent.supported_env_list, "Please use supported envs: {}".format( + C51Agent.supported_env_list + ) + self.env = get_instance_env(env) + if cfg is None: + # 'It should be default env tuned config' + cfg = get_instance_config(env, algorithm=C51Agent.algorithm) + else: + assert isinstance(cfg, EasyDict), "Please use EasyDict as config data type." 
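+                # compile_config below merges this cfg with the C51Policy defaults,
+                # so a hand-written config only needs to set the overridden fields.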
+ + if exp_name is not None: + cfg.exp_name = exp_name + self.cfg = compile_config(cfg, policy=C51Policy) + self.exp_name = self.cfg.exp_name + + elif isinstance(env, BaseEnv): + self.cfg = compile_config(cfg, policy=C51Policy) + raise NotImplementedError + else: + raise TypeError("not support env type: {}, only strings and instances of `BaseEnv` now".format(type(env))) + logging.getLogger().setLevel(logging.INFO) + self.seed = seed + set_pkg_seed(self.seed, use_cuda=self.cfg.policy.cuda) + if not os.path.exists(self.exp_name): + os.makedirs(self.exp_name) + save_config_py(self.cfg, os.path.join(self.exp_name, 'policy_config.py')) + if model is None: + model = C51DQN(**self.cfg.policy.model) + self.buffer_ = DequeBuffer(size=self.cfg.policy.other.replay_buffer.replay_buffer_size) + self.policy = C51Policy(self.cfg.policy, model=model) + if policy_state_dict is not None: + self.policy.learn_mode.load_state_dict(policy_state_dict) + + def train( + self, + step: int = int(1e7), + collector_env_num: int = 4, + evaluator_env_num: int = 4, + n_iter_log_show: int = 500, + n_iter_save_ckpt: int = 1000, + context: Optional[str] = None, + debug: bool = False + ) -> TrainingReturn: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + logging.debug(self.policy._model) + # define env and policy + collector_env = self._setup_env_manager(collector_env_num, context, debug) + evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug) + + with task.start(ctx=OnlineRLContext()): + task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(self.cfg)) + task.use( + StepCollector( + self.cfg, + self.policy.collect_mode, + collector_env, + random_collect_size=self.cfg.policy.random_collect_size + ) + ) + task.use(nstep_reward_enhancer(self.cfg)) + task.use(data_pusher(self.cfg, self.buffer_)) + task.use(multistep_trainer(self.policy, log_freq=n_iter_log_show)) + task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_)) + task.use( + CkptSaver( + policy=self.policy, + save_dir=os.path.join(self.cfg["exp_name"], "model"), + train_freq=n_iter_save_ckpt + ) + ) + task.use( + wandb_online_logger( + metric_list=self.policy.monitor_vars(), + model=self.policy._model, + anonymous=True, + project_name=self.exp_name + ) + ) + task.use(termination_checker(max_env_step=step)) + task.use(final_ctx_saver(name=self.cfg["exp_name"])) + task.run() + + return TrainingReturn(wandb_url=task.ctx.wandb_url) + + def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> None: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + # define env and policy + env = self.env.clone() + env.seed(self.seed, dynamic_seed=False) + + if enable_save_replay and replay_save_path: + env.enable_save_replay(replay_path=replay_save_path) + elif enable_save_replay: + env.enable_save_replay(replay_path=os.path.join(self.exp_name, 'videos')) + else: + logging.warning('No video would be generated during the deploy.') + + def single_env_forward_wrapper(forward_fn, cuda=True): + + def _forward(obs): + # unsqueeze means add batch dim, i.e. 
(O, ) -> (1, O) + obs = ttorch.as_tensor(obs).unsqueeze(0) + if cuda and torch.cuda.is_available(): + obs = obs.cuda() + output = forward_fn(obs) + assert isinstance(output, dict), "model output must be dict, but find {}".format(type(output)) + logit = output['logit'] + assert isinstance(logit, torch.Tensor) or isinstance(logit, list) + if isinstance(logit, torch.Tensor): + logit = [logit] + if 'action_mask' in output: + mask = output['action_mask'] + if isinstance(mask, torch.Tensor): + mask = [mask] + logit = [l.sub_(1e8 * (1 - m)) for l, m in zip(logit, mask)] + action = [l.argmax(dim=-1) for l in logit] + if len(action) == 1: + action, logit = action[0], logit[0] + #forward_fn.eval() + #action = forward_fn(obs)["action"] + + # squeeze means delete batch dim, i.e. (1, A) -> (A, ) + action = action.squeeze(0).detach().cpu().numpy() + return action + + return _forward + + forward_fn = single_env_forward_wrapper(self.policy._model) + #forward_fn = single_env_forward_wrapper(self.policy._eval_model) + + # main loop + return_ = 0. + step = 0 + obs = env.reset() + while True: + action = forward_fn(obs) + obs, rew, done, info = env.step(action) + return_ += rew + step += 1 + if done: + break + logging.info(f'C51 deploy is finished, final episode return with {step} steps is: {return_}') + + def collect_data( + self, + env_num: int = 8, + save_data_path: Optional[str] = None, + n_sample: Optional[int] = None, + n_episode: Optional[int] = None, + context: Optional[str] = None, + debug: bool = False + ) -> None: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + if n_episode is not None: + raise NotImplementedError + # define env and policy + env = self._setup_env_manager(env_num, context, debug) + + if save_data_path is None: + save_data_path = os.path.join(self.exp_name, 'demo_data') + + # main execution task + with task.start(ctx=OnlineRLContext()): + task.use( + StepCollector( + self.cfg, self.policy.collect_mode, env, random_collect_size=self.cfg.policy.random_collect_size + ) + ) + task.use(offline_data_saver(save_data_path, data_type='hdf5')) + task.run(max_step=1) + logging.info( + f'C51 collecting is finished, more than {n_sample} samples are collected and saved in `{save_data_path}`' + ) + + def batch_evaluate( + self, + env_num: int = 4, + n_evaluator_episode: int = 4, + context: Optional[str] = None, + debug: bool = False + ) -> None: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + # define env and policy + env = self._setup_env_manager(env_num, context, debug) + + evaluate_cfg = self.cfg + evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode + + # main execution task + with task.start(ctx=OnlineRLContext()): + task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, env)) + task.run(max_step=1) + + def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: bool = False) -> BaseEnvManagerV2: + if debug: + env_cls = BaseEnvManagerV2 + manager_cfg = env_cls.default_config() + else: + env_cls = SubprocessEnvManagerV2 + manager_cfg = env_cls.default_config() + if context is not None: + manager_cfg.context = context + return env_cls([self.env.clone for _ in range(env_num)], manager_cfg) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 041a37e653..1e1da4f01a 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -4,7 +4,7 @@ from ding.envs import BaseEnv, DingEnvWrapper from ding.envs.env_wrappers import MaxAndSkipWrapper, WarpFrameWrapper, ScaledFloatFrameWrapper, FrameStackWrapper, \ EvalEpisodeReturnEnv, 
TransposeWrapper, TimeLimitWrapper, FlatObsWrapper, GymToGymnasiumWrapper -from ding.policy import PPOFPolicy, TD3Policy +from ding.policy import PPOFPolicy, TD3Policy, C51Policy def get_instance_config(env: str, algorithm: str) -> EasyDict: @@ -151,6 +151,64 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ) else: raise KeyError("not supported env type: {}".format(env)) + elif algorithm == 'C51': + cfg = EasyDict({"policy":C51Policy.default_config()}) + if env == 'lunarlander_discrete': + cfg.update( + dict( + exp_name='lunarlander_c51', + seed=0, + env=dict( + collector_env_num=8, + evaluator_env_num=8, + env_id='LunarLander-v2', + n_evaluator_episode=8, + stop_value=200, + ), + policy=dict( + cuda=False, + priority=True, + model=dict( + obs_shape=8, + action_shape=4, + encoder_hidden_size_list=[128, 128, 64], + v_min=-10, + v_max=10, + n_atom=51, + ), + discount_factor=0.97, + nstep=3, + learn=dict( + update_per_collect=3, + batch_size=64, + learning_rate=0.001, + target_update_freq=100, + ), + collect=dict( + n_sample=80, + unroll_len=1, + ), + other=dict( + eps=dict( + type='exp', + start=0.95, + end=0.1, + decay=10000, + ), replay_buffer=dict(replay_buffer_size=20000, ) + ), + random_collect_size=0, + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) + else: + raise KeyError("not supported env type: {}".format(env)) else: raise KeyError("not supported algorithm type: {}".format(algorithm)) diff --git a/ding/policy/c51.py b/ding/policy/c51.py index 441c1ee3de..bfafe3e59a 100644 --- a/ding/policy/c51.py +++ b/ding/policy/c51.py @@ -257,3 +257,6 @@ def _get_train_sample(self, data: list) -> Union[None, List[Any]]: """ data = get_nstep_return_data(data, self._nstep, gamma=self._gamma) return get_train_sample(data, self._unroll_len) + + def monitor_vars(self) -> List[str]: + return ['cur_lr', 'total_loss', 'priority'] From ef5f1d536050c0d9b9f63a71ce4de80f27831733 Mon Sep 17 00:00:00 2001 From: zhangpaipai <1124321458@qq.com> Date: Mon, 27 Mar 2023 16:39:26 +0800 Subject: [PATCH 053/244] add c51 logit monitor --- ding/policy/c51.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ding/policy/c51.py b/ding/policy/c51.py index bfafe3e59a..ce630c9213 100644 --- a/ding/policy/c51.py +++ b/ding/policy/c51.py @@ -159,6 +159,7 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: self._target_model.train() # Current q value (main model) q_value = self._learn_model.forward(data['obs'])['distribution'] + logit = self._learn_model.forward(data['obs'])['logit'] # Target q value with torch.no_grad(): target_q_value = self._target_model.forward(data['next_obs'])['distribution'] @@ -187,6 +188,7 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: # ============= self._target_model.update(self._learn_model.state_dict()) return { + 'logit': logit.mean().item(), 'cur_lr': self._optimizer.defaults['lr'], 'total_loss': loss.item(), 'priority': td_error_per_sample.abs().tolist(), @@ -259,4 +261,4 @@ def _get_train_sample(self, data: list) -> Union[None, List[Any]]: return get_train_sample(data, self._unroll_len) def monitor_vars(self) -> List[str]: - return ['cur_lr', 'total_loss', 'priority'] + return ['logit', 'cur_lr', 'total_loss', 'priority'] From 87822ba7cb6b0b28b6e5f800c133c84a87193333 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 28 Mar 2023 05:33:30 +0000 Subject: [PATCH 054/244] add sac dqn agent --- ding/bonus/__init__.py | 2 + ding/bonus/a2c.py | 0 
ding/bonus/config.py | 235 ++++++++++++++++- ding/bonus/dqn.py | 43 ++- ding/bonus/impala.py | 237 +++++++++++++++++ ding/bonus/ppof.py | 4 +- ding/bonus/sac.py | 246 ++++++++++++++++++ ding/bonus/td3.py | 4 +- .../middleware/functional/evaluator.py | 2 + ding/policy/ddpg.py | 10 - ding/policy/impala.py | 3 + ding/policy/sac.py | 4 + .../config/lunarlander_cont_td3_config.py | 8 +- 13 files changed, 762 insertions(+), 36 deletions(-) create mode 100644 ding/bonus/a2c.py create mode 100644 ding/bonus/impala.py create mode 100644 ding/bonus/sac.py diff --git a/ding/bonus/__init__.py b/ding/bonus/__init__.py index 89ec84d897..af3d089c6e 100644 --- a/ding/bonus/__init__.py +++ b/ding/bonus/__init__.py @@ -1,3 +1,5 @@ from .ppof import PPOF from .td3 import TD3OffPolicyAgent from .dqn import DQNOffpolicyAgent +from .sac import SACOffPolicyAgent +from .impala import IMPALAOffPolicyAgent diff --git a/ding/bonus/a2c.py b/ding/bonus/a2c.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 7b0abc55eb..8acb64b24f 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -4,7 +4,7 @@ from ding.envs import BaseEnv, DingEnvWrapper from ding.envs.env_wrappers import MaxAndSkipWrapper, WarpFrameWrapper, ScaledFloatFrameWrapper, FrameStackWrapper, \ EvalEpisodeReturnEnv, TransposeWrapper, TimeLimitWrapper, FlatObsWrapper, GymToGymnasiumWrapper -from ding.policy import PPOFPolicy, TD3Policy, DQNPolicy +from ding.policy import PPOFPolicy, TD3Policy, SACPolicy, DQNPolicy, IMPALAPolicy def get_instance_config(env: str, algorithm: str) -> EasyDict: @@ -111,6 +111,12 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: critic_head_hidden_size=128, critic_head_layer_num=2, ) + elif env in ['hopper']: + cfg.action_space = "continuous" + cfg.n_sample = 3200 + cfg.batch_size = 320 + cfg.epoch_per_collect = 10 + cfg.learning_rate = 3e-4 else: raise KeyError("not supported env type: {}".format(env)) elif algorithm == 'TD3': @@ -149,6 +155,139 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ), ) ) + elif env == 'lunarlander_continuous': + cfg.update( + dict( + exp_name='LunarLanderContinuous-V2-TD3', + seed=0, + env=dict( + env_id='Hopper-v3', + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + act_scale=True, + stop_value=240, + ), + policy=dict( + cuda=True, + random_collect_size=25000, + model=dict( + obs_shape=8, + action_shape=2, + action_space='regression', + ), + learn=dict( + update_per_collect=256, + batch_size=256, + learning_rate_actor=3e-4, + learning_rate_critic=3e-4, + noise_sigma=0.1, + ), + collect=dict(n_sample=256, + noise_sigma=0.1, + ), + other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) + else: + raise KeyError("not supported env type: {}".format(env)) + elif algorithm == 'SAC': + cfg = EasyDict({"policy":SACPolicy.default_config()}) + if env == 'hopper': + cfg.update( + dict( + exp_name='Hopper-v3-SAC', + seed=0, + env=dict( + env_id='Hopper-v3', + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=6000, + ), + policy=dict( + cuda=True, + random_collect_size=10000, + model=dict( + obs_shape=11, + action_shape=3, + action_space='reparameterization', + actor_head_hidden_size=256, + critic_head_hidden_size=256, + ), + learn=dict( + update_per_collect=1, + batch_size=256, + 
learning_rate_q=1e-3, + learning_rate_policy=1e-3, + reparameterization=True, + auto_alpha=False, + ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) + elif env == 'lunarlander_continuous': + cfg.update( + dict( + exp_name='LunarLander-v2-SAC', + seed=0, + env=dict( + env_id='LunarLanderContinuous-v2', + collector_env_num=8, + evaluator_env_num=8, + act_scale=True, + n_evaluator_episode=8, + stop_value=240, + ), + policy=dict( + cuda=True, + random_collect_size=25000, + model=dict( + obs_shape=8, + action_shape=2, + action_space='reparameterization', + actor_head_hidden_size=256, + critic_head_hidden_size=256, + ), + learn=dict( + update_per_collect=256, + batch_size=256, + learning_rate_actor=3e-4, + learning_rate_critic=3e-4, + reparameterization=True, + auto_alpha=False, + ), + collect=dict( + n_sample=256, + ), + other=dict(replay_buffer=dict(replay_buffer_size=int(1e6), ), ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) + + pass else: raise KeyError("not supported env type: {}".format(env)) elif algorithm == 'DQN': @@ -156,14 +295,14 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: if env == 'lunarlander_discrete': cfg.update( dict( - exp_name='lunarlander_dqn', + exp_name='LunarLander-v2-DQN', seed=0, env=dict( env_id='LunarLander-v2', collector_env_num=8, evaluator_env_num=8, n_evaluator_episode=8, - stop_value=200, + stop_value=240, ), policy=dict( cuda=True, @@ -209,6 +348,79 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ) else: raise KeyError("not supported env type: {}".format(env)) + elif algorithm == 'IMPALA': + cfg = EasyDict({"policy":IMPALAPolicy.default_config()}) + if env == 'SpaceInvaders': + cfg.update( + dict( + exp_name='SpaceInvaders-v4-IMPALA', + seed=0, + env=dict( + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=10000000000, + env_id='SpaceInvaders-v4', + frame_stack=4, + manager=dict(shared_memory=False, ) + ), + policy=dict( + cuda=True, + #unroll_len=32, + random_collect_size=500, + model=dict( + obs_shape=[4, 84, 84], + action_shape=6, + encoder_hidden_size_list=[128, 128, 256, 512], + critic_head_hidden_size=512, + critic_head_layer_num=3, + actor_head_hidden_size=512, + actor_head_layer_num=3, + ), + learn=dict( + # (int) collect n_sample data, train model update_per_collect times + # here we follow impala serial pipeline + update_per_collect=3, # update_per_collect show be in [1, 10] + # (int) the number of data for a train iteration + batch_size=128, + grad_clip_type='clip_norm', + clip_value=5, + learning_rate=0.0003, + # (float) loss weight of the value network, the weight of policy network is set to 1 + value_weight=0.5, + # (float) loss weight of the entropy regularization, the weight of policy network is set to 1 + entropy_weight=0.01, + # (float) discount factor for future reward, defaults int [0, 1] + discount_factor=0.99, + # (float) additional discounting parameter + lambda_=0.95, + # (float) clip ratio of importance weights + rho_clip_ratio=1.0, + # (float) clip ratio of importance weights + c_clip_ratio=1.0, + # (float) clip ratio of importance sampling + rho_pg_clip_ratio=1.0, + ), + collect=dict( + unroll_len=32, + # (int) collect n_sample data, train model n_iteration times + n_sample=16, + collector=dict(collect_print_freq=1000, ), + ), + 
eval=dict(evaluator=dict(eval_freq=5000, )), + other=dict(replay_buffer=dict(replay_buffer_size=10000, ), ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) + else: + raise KeyError("not supported env type: {}".format(env)) else: raise KeyError("not supported algorithm type: {}".format(algorithm)) @@ -288,25 +500,32 @@ def get_instance_env(env: str) -> BaseEnv: seed_api=False, ) elif env == 'hopper': - from dizoo.mujoco.envs import MujocoEnv cfg = EasyDict( env_id='Hopper-v3', env_wrapper='mujoco_default', ) - return DingEnvWrapper(cfg=cfg) - elif env in ['atari_qbert', 'atari_kangaroo', 'atari_bowling']: + return DingEnvWrapper(gym.make('Hopper-v3'), cfg=cfg) + elif env == "SpaceInvaders": + cfg = EasyDict({ + 'env_id': "SpaceInvaders-v4", + 'env_wrapper': 'atari_default', + }) + return DingEnvWrapper(gym.make("SpaceInvaders-v4"), cfg=cfg) + elif env in ['atari_qbert', 'atari_kangaroo', 'atari_bowling', 'atari_breakout','atari_spaceinvader', 'atari_gopher']: from dizoo.atari.envs.atari_env import AtariEnv atari_env_list = { 'atari_qbert': 'QbertNoFrameskip-v4', 'atari_kangaroo': 'KangarooNoFrameskip-v4', - 'atari_bowling': 'BowlingNoFrameskip-v4' + 'atari_bowling': 'BowlingNoFrameskip-v4', + 'atari_breakout': 'BreakoutNoFrameskip-v4', + 'atari_spaceinvader': 'SpaceInvadersNoFrameskip-v4', + 'atari_gopher': 'GopherNoFrameskip-v4' } cfg = EasyDict({ 'env_id': atari_env_list[env], 'env_wrapper': 'atari_default', }) ding_env_atari = DingEnvWrapper(gym.make(atari_env_list[env]), cfg=cfg) - ding_env_atari.enable_save_replay(env + '_log/') return ding_env_atari elif env == 'minigrid_fourroom': import gymnasium diff --git a/ding/bonus/dqn.py b/ding/bonus/dqn.py index 2a5367c3d4..fe3b5918d6 100644 --- a/ding/bonus/dqn.py +++ b/ding/bonus/dqn.py @@ -5,15 +5,17 @@ import os import torch import treetensor.torch as ttorch +import numpy as np from ding.framework import task, OnlineRLContext from ding.framework.middleware import CkptSaver, multistep_trainer, \ wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, data_pusher, \ - OffPolicyLearner, final_ctx_saver + OffPolicyLearner, final_ctx_saver, nstep_reward_enhancer, eps_greedy_handler from ding.envs import BaseEnv, BaseEnvManagerV2, SubprocessEnvManagerV2 from ding.policy import DQNPolicy from ding.utils import set_pkg_seed from ding.config import save_config_py, compile_config from ding.model import DQN +from ding.model import model_wrap from ding.data import DequeBuffer from ding.bonus.config import get_instance_config, get_instance_env @@ -26,6 +28,14 @@ class TrainingReturn: ''' wandb_url: str +@dataclass +class EvalReturn: + ''' + Attributions + wandb_url: The weight & biases (wandb) project url of the trainning experiment. 
+ ''' + eval_value: np.float32 + eval_value_std: np.float32 class DQNOffpolicyAgent: supported_env_list = [ @@ -81,7 +91,6 @@ def train( step: int = int(1e7), collector_env_num: int = 4, evaluator_env_num: int = 4, - n_iter_log_show: int = 500, n_iter_save_ckpt: int = 1000, context: Optional[str] = None, debug: bool = False @@ -95,16 +104,24 @@ def train( with task.start(ctx=OnlineRLContext()): task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(self.cfg)) + # task.use( + # StepCollector( + # self.cfg, + # self.policy.collect_mode, + # collector_env, + # random_collect_size=self.cfg.policy.random_collect_size + # ) + # ) task.use( StepCollector( self.cfg, self.policy.collect_mode, - collector_env, - random_collect_size=self.cfg.policy.random_collect_size + collector_env ) ) + task.use(nstep_reward_enhancer(self.cfg)) task.use(data_pusher(self.cfg, self.buffer_)) - task.use(multistep_trainer(self.policy, log_freq=n_iter_log_show)) task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_)) task.use( CkptSaver( @@ -143,19 +160,21 @@ def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, def single_env_forward_wrapper(forward_fn, cuda=True): + forward_fn=model_wrap(forward_fn, wrapper_name='argmax_sample').forward + def _forward(obs): # unsqueeze means add batch dim, i.e. (O, ) -> (1, O) obs = ttorch.as_tensor(obs).unsqueeze(0) if cuda and torch.cuda.is_available(): obs = obs.cuda() - action = forward_fn(obs, mode='compute_actor')["action"] + action = forward_fn(obs)["action"] # squeeze means delete batch dim, i.e. (1, A) -> (A, ) action = action.squeeze(0).detach().cpu().numpy() return action return _forward - - forward_fn = single_env_forward_wrapper(self.policy._model) + + forward_fn = single_env_forward_wrapper(self.policy._model, self.cfg.policy.cuda) # main loop return_ = 0. 
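+        # The `argmax_sample` wrapper above makes the deploy-time forward return the
+        # greedy action (argmax over the predicted Q-values) instead of raw logits.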
@@ -168,7 +187,7 @@ def _forward(obs): step += 1 if done: break - logging.info(f'TD3 deploy is finished, final episode return with {step} steps is: {return_}') + logging.info(f'DQN deploy is finished, final episode return with {step} steps is: {return_}') def collect_data( self, @@ -199,7 +218,7 @@ def collect_data( task.use(offline_data_saver(save_data_path, data_type='hdf5')) task.run(max_step=1) logging.info( - f'TD3 collecting is finished, more than {n_sample} samples are collected and saved in `{save_data_path}`' + f'DQN collecting is finished, more than {n_sample} samples are collected and saved in `{save_data_path}`' ) def batch_evaluate( @@ -208,7 +227,7 @@ def batch_evaluate( n_evaluator_episode: int = 4, context: Optional[str] = None, debug: bool = False - ) -> None: + ) -> EvalReturn: if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -222,6 +241,8 @@ def batch_evaluate( task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, env)) task.run(max_step=1) + return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) + def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: bool = False) -> BaseEnvManagerV2: if debug: env_cls = BaseEnvManagerV2 diff --git a/ding/bonus/impala.py b/ding/bonus/impala.py new file mode 100644 index 0000000000..b7900f9ce1 --- /dev/null +++ b/ding/bonus/impala.py @@ -0,0 +1,237 @@ +from dataclasses import dataclass +from typing import Optional, Union +from ditk import logging +from easydict import EasyDict +import os +import gym +import torch +import treetensor.torch as ttorch +from ding.framework import task, OnlineRLContext +from ding.framework.middleware import CkptSaver, multistep_trainer, \ + wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, data_pusher, \ + OffPolicyLearner, final_ctx_saver, eps_greedy_handler, nstep_reward_enhancer, epoch_timer +from ding.envs import BaseEnv, BaseEnvManagerV2, SubprocessEnvManagerV2 +from ding.policy import IMPALAPolicy +from ding.utils import set_pkg_seed +from ding.config import Config, save_config_py, compile_config +from ding.model import VAC +from ding.model import model_wrap +from ding.data import DequeBuffer +from ding.bonus.config import get_instance_config, get_instance_env + + +@dataclass +class TrainingReturn: + ''' + Attributions + wandb_url: The weight & biases (wandb) project url of the trainning experiment. + ''' + wandb_url: str + + +class IMPALAOffPolicyAgent: + supported_env_list = [ + 'SpaceInvaders', + ] + algorithm = 'IMPALA' + + def __init__( + self, + env: Union[str, BaseEnv], + seed: int = 0, + exp_name: str = None, + model: Optional[torch.nn.Module] = None, + cfg: Optional[Union[EasyDict, dict, str]] = None, + policy_state_dict: str = None, + ) -> None: + if isinstance(env, str): + assert env in IMPALAOffPolicyAgent.supported_env_list, "Please use supported envs: {}".format( + IMPALAOffPolicyAgent.supported_env_list + ) + self.env = get_instance_env(env) + if cfg is None: + # 'It should be default env tuned config' + cfg = get_instance_config(env, algorithm=IMPALAOffPolicyAgent.algorithm) + else: + assert isinstance(cfg, EasyDict), "Please use EasyDict as config data type." 
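+                # The tuned config layout matches get_instance_config('SpaceInvaders', 'IMPALA');
+                # its policy.model section is passed straight to the VAC actor-critic built below.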
+ + if exp_name is not None: + cfg.exp_name = exp_name + self.cfg = compile_config(cfg, policy=IMPALAPolicy) + self.exp_name = self.cfg.exp_name + + elif isinstance(env, BaseEnv): + self.cfg = compile_config(cfg, policy=IMPALAPolicy) + raise NotImplementedError + else: + raise TypeError("not support env type: {}, only strings and instances of `BaseEnv` now".format(type(env))) + logging.getLogger().setLevel(logging.INFO) + self.seed = seed + set_pkg_seed(self.seed, use_cuda=self.cfg.policy.cuda) + if not os.path.exists(self.exp_name): + os.makedirs(self.exp_name) + save_config_py(self.cfg, os.path.join(self.exp_name, 'policy_config.py')) + if model is None: + model = VAC(**self.cfg.policy.model) + self.buffer_ = DequeBuffer(size=self.cfg.policy.other.replay_buffer.replay_buffer_size) + self.policy = IMPALAPolicy(self.cfg.policy, model=model) + if policy_state_dict is not None: + self.policy.learn_mode.load_state_dict(policy_state_dict) + + def train( + self, + step: int = int(1e7), + collector_env_num: int = 4, + evaluator_env_num: int = 4, + n_iter_log_show: int = 500, + n_iter_save_ckpt: int = 1000, + context: Optional[str] = None, + debug: bool = False + ) -> TrainingReturn: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + logging.debug(self.policy._model) + # define env and policy + collector_env = self._setup_env_manager(collector_env_num, context, debug) + evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug) + + with task.start(ctx=OnlineRLContext()): + task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) + task.use( + StepCollector( + self.cfg, + self.policy.collect_mode, + collector_env, + random_collect_size=self.cfg.policy.random_collect_size + ) + ) + task.use(data_pusher(self.cfg, self.buffer_, group_by_env=True)) + task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_)) + task.use( + CkptSaver( + policy=self.policy, + save_dir=os.path.join(self.cfg["exp_name"], "model"), + train_freq=n_iter_save_ckpt + ) + ) + task.use(epoch_timer()) + task.use( + wandb_online_logger( + metric_list=self.policy.monitor_vars(), + model=self.policy._model, + anonymous=True, + project_name=self.exp_name + ) + ) + task.use(termination_checker(max_env_step=step)) + task.use(final_ctx_saver(name=self.cfg["exp_name"])) + task.run() + + return TrainingReturn(wandb_url=task.ctx.wandb_url) + + def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> None: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + # define env and policy + env = self.env.clone() + env.seed(self.seed, dynamic_seed=False) + + if enable_save_replay and replay_save_path: + env.enable_save_replay(replay_path=replay_save_path) + elif enable_save_replay: + env.enable_save_replay(replay_path=os.path.join(self.exp_name, 'videos')) + else: + logging.warning('No video would be generated during the deploy.') + + def single_env_forward_wrapper(forward_fn, cuda=True): + + forward_fn=model_wrap(forward_fn, wrapper_name='base').forward + + def _forward(obs): + # unsqueeze means add batch dim, i.e. (O, ) -> (1, O) + obs = ttorch.as_tensor(obs).unsqueeze(0) + if cuda and torch.cuda.is_available(): + obs = obs.cuda() + (mu, sigma) = forward_fn(obs, mode='compute_actor')['logit'] + action = torch.tanh(mu).detach().cpu().numpy()[0] # deterministic_eval + return action + + return _forward + + forward_fn = single_env_forward_wrapper(self.policy._model, self.cfg.policy.cuda) + + # main loop + return_ = 0. 
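+        # NOTE: the readout above treats the actor output as a continuous (mu, sigma) head;
+        # the SpaceInvaders config in this patch is discrete (action_shape=6), so an
+        # argmax over the logit is presumably the intended readout for that env.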
+ step = 0 + obs = env.reset() + while True: + action = forward_fn(obs) + obs, rew, done, info = env.step(action) + return_ += rew + step += 1 + if done: + break + logging.info(f'IMPALA deploy is finished, final episode return with {step} steps is: {return_}') + + def collect_data( + self, + env_num: int = 8, + save_data_path: Optional[str] = None, + n_sample: Optional[int] = None, + n_episode: Optional[int] = None, + context: Optional[str] = None, + debug: bool = False + ) -> None: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + if n_episode is not None: + raise NotImplementedError + # define env and policy + env = self._setup_env_manager(env_num, context, debug) + + if save_data_path is None: + save_data_path = os.path.join(self.exp_name, 'demo_data') + + # main execution task + with task.start(ctx=OnlineRLContext()): + task.use( + StepCollector( + self.cfg, self.policy.collect_mode, env, random_collect_size=self.cfg.policy.random_collect_size + ) + ) + task.use(offline_data_saver(save_data_path, data_type='hdf5')) + task.run(max_step=1) + logging.info( + f'IMPALA collecting is finished, more than {n_sample} samples are collected and saved in `{save_data_path}`' + ) + + def batch_evaluate( + self, + env_num: int = 4, + n_evaluator_episode: int = 4, + context: Optional[str] = None, + debug: bool = False + ) -> None: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + # define env and policy + env = self._setup_env_manager(env_num, context, debug) + + evaluate_cfg = self.cfg + evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode + + # main execution task + with task.start(ctx=OnlineRLContext()): + task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, env)) + task.run(max_step=1) + + def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: bool = False) -> BaseEnvManagerV2: + if debug: + env_cls = BaseEnvManagerV2 + manager_cfg = env_cls.default_config() + else: + env_cls = SubprocessEnvManagerV2 + manager_cfg = env_cls.default_config() + if context is not None: + manager_cfg.context = context + return env_cls([self.env.clone for _ in range(env_num)], manager_cfg) diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py index 67c8e059b9..f86afbbb8f 100644 --- a/ding/bonus/ppof.py +++ b/ding/bonus/ppof.py @@ -50,6 +50,8 @@ class PPOF: 'atari_qbert', 'atari_kangaroo', 'atari_bowling', + # mujoco + 'hopper', ] def __init__( @@ -152,7 +154,7 @@ def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, else: logging.warning('No video would be generated during the deploy.') - forward_fn = single_env_forward_wrapper_ttorch(self.policy.eval) + forward_fn = single_env_forward_wrapper_ttorch(self.policy.eval, self.cfg.policy.cuda) # main loop return_ = 0. 
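For orientation, the bonus agents added or touched in this patch series expose the same entry points (train, deploy, collect_data, batch_evaluate). A minimal usage sketch of that API, given as a hypothetical script rather than part of the patch:

from ding.bonus import SACOffPolicyAgent

agent = SACOffPolicyAgent(env='lunarlander_continuous', exp_name='LunarLander-v2-SAC')
train_info = agent.train(step=200000, collector_env_num=4, evaluator_env_num=4)
print(train_info.wandb_url)                      # TrainingReturn carries the wandb run url
agent.deploy(enable_save_replay=True)            # one deterministic episode, optionally recorded
result = agent.batch_evaluate(env_num=4, n_evaluator_episode=4)
print(result.eval_value, result.eval_value_std)  # EvalReturn fields added by the follow-up patch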
diff --git a/ding/bonus/sac.py b/ding/bonus/sac.py new file mode 100644 index 0000000000..7344c8f57e --- /dev/null +++ b/ding/bonus/sac.py @@ -0,0 +1,246 @@ +from dataclasses import dataclass +from typing import Optional, Union +from ditk import logging +from easydict import EasyDict +import os +import gym +import torch +import treetensor.torch as ttorch +import numpy as np +from ding.framework import task, OnlineRLContext +from ding.framework.middleware import CkptSaver, multistep_trainer, \ + wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, data_pusher, \ + OffPolicyLearner, final_ctx_saver +from ding.envs import BaseEnv, BaseEnvManagerV2, SubprocessEnvManagerV2 +from ding.policy import SACPolicy +from ding.utils import set_pkg_seed +from ding.config import Config, save_config_py, compile_config +from ding.model import QAC +from ding.model import model_wrap +from ding.data import DequeBuffer +from ding.bonus.config import get_instance_config, get_instance_env + + +@dataclass +class TrainingReturn: + ''' + Attributions + wandb_url: The weight & biases (wandb) project url of the trainning experiment. + ''' + wandb_url: str + +@dataclass +class EvalReturn: + ''' + Attributions + wandb_url: The weight & biases (wandb) project url of the trainning experiment. + ''' + eval_value: np.float32 + eval_value_std: np.float32 + +class SACOffPolicyAgent: + supported_env_list = [ + 'hopper', + 'lunarlander_continuous', + ] + algorithm = 'SAC' + + def __init__( + self, + env: Union[str, BaseEnv], + seed: int = 0, + exp_name: str = None, + model: Optional[torch.nn.Module] = None, + cfg: Optional[Union[EasyDict, dict, str]] = None, + policy_state_dict: str = None, + ) -> None: + if isinstance(env, str): + assert env in SACOffPolicyAgent.supported_env_list, "Please use supported envs: {}".format( + SACOffPolicyAgent.supported_env_list + ) + self.env = get_instance_env(env) + if cfg is None: + # 'It should be default env tuned config' + cfg = get_instance_config(env, algorithm=SACOffPolicyAgent.algorithm) + else: + assert isinstance(cfg, EasyDict), "Please use EasyDict as config data type." 
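+                # SAC here reuses the shared QAC actor-critic with action_space='reparameterization',
+                # so the actor outputs a (mu, sigma) pair that the deterministic deploy readout consumes.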
+ + if exp_name is not None: + cfg.exp_name = exp_name + self.cfg = compile_config(cfg, policy=SACPolicy) + self.exp_name = self.cfg.exp_name + + elif isinstance(env, BaseEnv): + self.cfg = compile_config(cfg, policy=SACPolicy) + raise NotImplementedError + else: + raise TypeError("not support env type: {}, only strings and instances of `BaseEnv` now".format(type(env))) + logging.getLogger().setLevel(logging.INFO) + self.seed = seed + set_pkg_seed(self.seed, use_cuda=self.cfg.policy.cuda) + if not os.path.exists(self.exp_name): + os.makedirs(self.exp_name) + save_config_py(self.cfg, os.path.join(self.exp_name, 'policy_config.py')) + if model is None: + model = QAC(**self.cfg.policy.model) + self.buffer_ = DequeBuffer(size=self.cfg.policy.other.replay_buffer.replay_buffer_size) + self.policy = SACPolicy(self.cfg.policy, model=model) + if policy_state_dict is not None: + self.policy.learn_mode.load_state_dict(policy_state_dict) + + def train( + self, + step: int = int(1e7), + collector_env_num: int = 4, + evaluator_env_num: int = 4, + n_iter_log_show: int = 500, + n_iter_save_ckpt: int = 1000, + context: Optional[str] = None, + debug: bool = False + ) -> TrainingReturn: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + logging.debug(self.policy._model) + # define env and policy + collector_env = self._setup_env_manager(collector_env_num, context, debug) + evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug) + + with task.start(ctx=OnlineRLContext()): + task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) + task.use( + StepCollector( + self.cfg, + self.policy.collect_mode, + collector_env, + random_collect_size=self.cfg.policy.random_collect_size + ) + ) + task.use(data_pusher(self.cfg, self.buffer_)) + task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_)) + task.use( + CkptSaver( + policy=self.policy, + save_dir=os.path.join(self.cfg["exp_name"], "model"), + train_freq=n_iter_save_ckpt + ) + ) + task.use( + wandb_online_logger( + metric_list=self.policy.monitor_vars(), + model=self.policy._model, + anonymous=True, + project_name=self.exp_name + ) + ) + task.use(termination_checker(max_env_step=step)) + task.use(final_ctx_saver(name=self.cfg["exp_name"])) + task.run() + + return TrainingReturn(wandb_url=task.ctx.wandb_url) + + def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> None: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + # define env and policy + env = self.env.clone() + env.seed(self.seed, dynamic_seed=False) + + if enable_save_replay and replay_save_path: + env.enable_save_replay(replay_path=replay_save_path) + elif enable_save_replay: + env.enable_save_replay(replay_path=os.path.join(self.exp_name, 'videos')) + else: + logging.warning('No video would be generated during the deploy.') + + def single_env_forward_wrapper(forward_fn, cuda=True): + + forward_fn=model_wrap(forward_fn, wrapper_name='base').forward + + def _forward(obs): + # unsqueeze means add batch dim, i.e. (O, ) -> (1, O) + obs = ttorch.as_tensor(obs).unsqueeze(0) + if cuda and torch.cuda.is_available(): + obs = obs.cuda() + (mu, sigma) = forward_fn(obs, mode='compute_actor')['logit'] + action = torch.tanh(mu).detach().cpu().numpy()[0] # deterministic_eval + return action + + return _forward + + forward_fn = single_env_forward_wrapper(self.policy._model, self.cfg.policy.cuda) + + # main loop + return_ = 0. 
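+        # Deterministic evaluation: the wrapper above takes tanh(mu) from the policy head
+        # instead of sampling from the squashed Gaussian used during training.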
+ step = 0 + obs = env.reset() + while True: + action = forward_fn(obs) + obs, rew, done, info = env.step(action) + return_ += rew + step += 1 + if done: + break + logging.info(f'SAC deploy is finished, final episode return with {step} steps is: {return_}') + + def collect_data( + self, + env_num: int = 8, + save_data_path: Optional[str] = None, + n_sample: Optional[int] = None, + n_episode: Optional[int] = None, + context: Optional[str] = None, + debug: bool = False + ) -> None: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + if n_episode is not None: + raise NotImplementedError + # define env and policy + env = self._setup_env_manager(env_num, context, debug) + + if save_data_path is None: + save_data_path = os.path.join(self.exp_name, 'demo_data') + + # main execution task + with task.start(ctx=OnlineRLContext()): + task.use( + StepCollector( + self.cfg, self.policy.collect_mode, env, random_collect_size=self.cfg.policy.random_collect_size + ) + ) + task.use(offline_data_saver(save_data_path, data_type='hdf5')) + task.run(max_step=1) + logging.info( + f'SAC collecting is finished, more than {n_sample} samples are collected and saved in `{save_data_path}`' + ) + + def batch_evaluate( + self, + env_num: int = 4, + n_evaluator_episode: int = 4, + context: Optional[str] = None, + debug: bool = False + ) -> None: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + # define env and policy + env = self._setup_env_manager(env_num, context, debug) + + evaluate_cfg = self.cfg + evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode + + # main execution task + with task.start(ctx=OnlineRLContext()): + task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, env)) + task.run(max_step=1) + + def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: bool = False) -> BaseEnvManagerV2: + if debug: + env_cls = BaseEnvManagerV2 + manager_cfg = env_cls.default_config() + else: + env_cls = SubprocessEnvManagerV2 + manager_cfg = env_cls.default_config() + if context is not None: + manager_cfg.context = context + return env_cls([self.env.clone for _ in range(env_num)], manager_cfg) diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 8eb6397b2c..40f0145cfa 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -31,6 +31,7 @@ class TrainingReturn: class TD3OffPolicyAgent: supported_env_list = [ 'hopper', + 'lunarlander_continuous', ] algorithm = 'TD3' @@ -105,7 +106,6 @@ def train( ) ) task.use(data_pusher(self.cfg, self.buffer_)) - task.use(multistep_trainer(self.policy, log_freq=n_iter_log_show)) task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_)) task.use( CkptSaver( @@ -156,7 +156,7 @@ def _forward(obs): return _forward - forward_fn = single_env_forward_wrapper(self.policy._model) + forward_fn = single_env_forward_wrapper(self.policy._model, self.cfg.policy.cuda) # main loop return_ = 0. 
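The evaluator change just below records the spread of the per-episode returns next to the existing mean, so downstream agents can report both. Conceptually, a simplified sketch with made-up numbers:

import numpy as np

episode_return = [200.3, 187.9, 214.6, 195.0]   # per-episode returns from eval_monitor
eval_value = np.mean(episode_return)            # already exposed as ctx.eval_value
eval_value_std = np.std(episode_return)         # newly exposed as ctx.eval_value_std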
diff --git a/ding/framework/middleware/functional/evaluator.py b/ding/framework/middleware/functional/evaluator.py index 102cba70e0..bdc6d01f3a 100644 --- a/ding/framework/middleware/functional/evaluator.py +++ b/ding/framework/middleware/functional/evaluator.py @@ -268,6 +268,7 @@ def _evaluate(ctx: Union["OnlineRLContext", "OfflineRLContext"]): if 'episode_info' in timestep.info: eval_monitor.update_info(env_id, timestep.info.episode_info) episode_return = eval_monitor.get_episode_return() + episode_return_std = np.std(episode_return) episode_return = np.mean(episode_return) stop_flag = episode_return >= cfg.env.stop_value and ctx.train_iter > 0 if isinstance(ctx, OnlineRLContext): @@ -282,6 +283,7 @@ def _evaluate(ctx: Union["OnlineRLContext", "OfflineRLContext"]): raise TypeError("not supported ctx type: {}".format(type(ctx))) ctx.last_eval_iter = ctx.train_iter ctx.eval_value = episode_return + ctx.eval_value_std = episode_return_std ctx.last_eval_value = ctx.eval_value ctx.eval_output = {'episode_return': episode_return} episode_info = eval_monitor.get_episode_info() diff --git a/ding/policy/ddpg.py b/ding/policy/ddpg.py index 4eb71f1b19..967c963950 100644 --- a/ding/policy/ddpg.py +++ b/ding/policy/ddpg.py @@ -458,13 +458,3 @@ def _monitor_vars_learn(self) -> List[str]: if self._twin_critic: ret += ['critic_twin_loss'] return ret - - def state_dict(self) -> Dict[str, Any]: - state_dict = { - 'model': self._model.state_dict(), - 'target_model': self._target_model.state_dict(), - } - if 'learn' in self._enable_field: - state_dict['optimizer_actor'] = self._optimizer_actor.state_dict() - state_dict['optimizer_critic'] = self._optimizer_critic.state_dict() - return state_dict diff --git a/ding/policy/impala.py b/ding/policy/impala.py index 5262257a90..6b35e0c959 100644 --- a/ding/policy/impala.py +++ b/ding/policy/impala.py @@ -445,3 +445,6 @@ def _monitor_vars_learn(self) -> List[str]: by import_names path. 
For IMPALA, ``ding.model.interface.IMPALA`` """ return super()._monitor_vars_learn() + ['policy_loss', 'value_loss', 'entropy_loss'] + + def monitor_vars(self): + return self._monitor_vars_learn() diff --git a/ding/policy/sac.py b/ding/policy/sac.py index 33717235bc..dc82eeadfd 100644 --- a/ding/policy/sac.py +++ b/ding/policy/sac.py @@ -827,6 +827,10 @@ def _monitor_vars_learn(self) -> List[str]: ] + twin_critic + alpha_loss + def monitor_vars(self) -> List[str]: + return self._monitor_vars_learn() + + @POLICY_REGISTRY.register('sqil_sac') class SQILSACPolicy(SACPolicy): diff --git a/dizoo/box2d/lunarlander/config/lunarlander_cont_td3_config.py b/dizoo/box2d/lunarlander/config/lunarlander_cont_td3_config.py index e57c296555..e5a4f634d5 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_cont_td3_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_cont_td3_config.py @@ -12,9 +12,9 @@ stop_value=200, ), policy=dict( - cuda=False, + cuda=True, priority=False, - random_collect_size=0, + random_collect_size=25000, model=dict( obs_shape=8, action_shape=2, @@ -23,7 +23,7 @@ ), learn=dict( update_per_collect=256, - batch_size=128, + batch_size=256, learning_rate_actor=3e-4, learning_rate_critic=3e-4, ignore_done=False, @@ -41,7 +41,7 @@ collector=dict(collect_print_freq=1000, ), ), eval=dict(evaluator=dict(eval_freq=100, ), ), - other=dict(replay_buffer=dict(replay_buffer_size=int(1e5), ), ), + other=dict(replay_buffer=dict(replay_buffer_size=int(1e6), ), ), ), ) lunarlander_td3_config = EasyDict(lunarlander_td3_config) From c86d89789491050901db2086f1e7804d4bf75c7f Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 28 Mar 2023 07:08:52 +0000 Subject: [PATCH 055/244] add sac dqn agent demo in dizoo --- ding/bonus/dqn.py | 3 ++- ding/bonus/impala.py | 14 ++++++++++++- ding/bonus/ppof.py | 14 ++++++++++++- ding/bonus/sac.py | 7 +++++-- ding/bonus/td3.py | 14 ++++++++++++- .../middleware/functional/evaluator.py | 2 ++ dizoo/common/dqn/lunarlander_dqn.py | 18 +++++++++++++++++ dizoo/common/dqn/lunarlander_dqn_download.py | 11 ++++++++++ dizoo/common/ppo/lunarlander_ppo.py | 18 +++++++++++++++++ dizoo/common/ppo/lunarlander_ppo_download.py | 11 ++++++++++ dizoo/common/sac/lunarlander_sac.py | 18 +++++++++++++++++ dizoo/common/sac/lunarlander_sac_download.py | 11 ++++++++++ dizoo/common/td3/lunarlander_td3.py | 20 +++++++++++++++++++ dizoo/common/td3/lunarlander_td3_download.py | 11 ++++++++++ 14 files changed, 166 insertions(+), 6 deletions(-) create mode 100644 dizoo/common/dqn/lunarlander_dqn.py create mode 100644 dizoo/common/dqn/lunarlander_dqn_download.py create mode 100644 dizoo/common/ppo/lunarlander_ppo.py create mode 100644 dizoo/common/ppo/lunarlander_ppo_download.py create mode 100644 dizoo/common/sac/lunarlander_sac.py create mode 100644 dizoo/common/sac/lunarlander_sac_download.py create mode 100644 dizoo/common/td3/lunarlander_td3.py create mode 100644 dizoo/common/td3/lunarlander_td3_download.py diff --git a/ding/bonus/dqn.py b/ding/bonus/dqn.py index fe3b5918d6..977b050de1 100644 --- a/ding/bonus/dqn.py +++ b/ding/bonus/dqn.py @@ -32,7 +32,8 @@ class TrainingReturn: class EvalReturn: ''' Attributions - wandb_url: The weight & biases (wandb) project url of the trainning experiment. + eval_value: The mean of evaluation return. + eval_value_std: The standard deviation of evaluation return. 
''' eval_value: np.float32 eval_value_std: np.float32 diff --git a/ding/bonus/impala.py b/ding/bonus/impala.py index b7900f9ce1..ccd6428909 100644 --- a/ding/bonus/impala.py +++ b/ding/bonus/impala.py @@ -6,6 +6,7 @@ import gym import torch import treetensor.torch as ttorch +import numpy as np from ding.framework import task, OnlineRLContext from ding.framework.middleware import CkptSaver, multistep_trainer, \ wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, data_pusher, \ @@ -28,6 +29,15 @@ class TrainingReturn: ''' wandb_url: str +@dataclass +class EvalReturn: + ''' + Attributions + eval_value: The mean of evaluation return. + eval_value_std: The standard deviation of evaluation return. + ''' + eval_value: np.float32 + eval_value_std: np.float32 class IMPALAOffPolicyAgent: supported_env_list = [ @@ -211,7 +221,7 @@ def batch_evaluate( n_evaluator_episode: int = 4, context: Optional[str] = None, debug: bool = False - ) -> None: + ) -> EvalReturn: if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -225,6 +235,8 @@ def batch_evaluate( task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, env)) task.run(max_step=1) + return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) + def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: bool = False) -> BaseEnvManagerV2: if debug: env_cls = BaseEnvManagerV2 diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py index f86afbbb8f..bfa677f0b4 100644 --- a/ding/bonus/ppof.py +++ b/ding/bonus/ppof.py @@ -7,6 +7,7 @@ import gym import gymnasium import torch +import numpy as np from ding.framework import task, OnlineRLContext from ding.framework.middleware import interaction_evaluator_ttorch, PPOFStepCollector, multistep_trainer, CkptSaver, \ wandb_online_logger, offline_data_saver, termination_checker, ppof_adv_estimator @@ -26,6 +27,15 @@ class TrainingReturn: ''' wandb_url: str +@dataclass +class EvalReturn: + ''' + Attributions + eval_value: The mean of evaluation return. + eval_value_std: The standard deviation of evaluation return. + ''' + eval_value: np.float32 + eval_value_std: np.float32 class PPOF: supported_env_list = [ @@ -202,7 +212,7 @@ def batch_evaluate( n_evaluator_episode: int = 4, context: Optional[str] = None, debug: bool = False, - ) -> None: + ) -> EvalReturn: if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -218,6 +228,8 @@ def batch_evaluate( )) task.run(max_step=1) + return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) + def _setup_env_manager( self, env_num: int, diff --git a/ding/bonus/sac.py b/ding/bonus/sac.py index 7344c8f57e..267d40a759 100644 --- a/ding/bonus/sac.py +++ b/ding/bonus/sac.py @@ -33,7 +33,8 @@ class TrainingReturn: class EvalReturn: ''' Attributions - wandb_url: The weight & biases (wandb) project url of the trainning experiment. + eval_value: The mean of evaluation return. + eval_value_std: The standard deviation of evaluation return. 
''' eval_value: np.float32 eval_value_std: np.float32 @@ -220,7 +221,7 @@ def batch_evaluate( n_evaluator_episode: int = 4, context: Optional[str] = None, debug: bool = False - ) -> None: + ) -> EvalReturn: if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -234,6 +235,8 @@ def batch_evaluate( task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, env)) task.run(max_step=1) + return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) + def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: bool = False) -> BaseEnvManagerV2: if debug: env_cls = BaseEnvManagerV2 diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 40f0145cfa..20629952c4 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -6,6 +6,7 @@ import gym import torch import treetensor.torch as ttorch +import numpy as np from ding.framework import task, OnlineRLContext from ding.framework.middleware import CkptSaver, multistep_trainer, \ wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, data_pusher, \ @@ -27,6 +28,15 @@ class TrainingReturn: ''' wandb_url: str +@dataclass +class EvalReturn: + ''' + Attributions + eval_value: The mean of evaluation return. + eval_value_std: The standard deviation of evaluation return. + ''' + eval_value: np.float32 + eval_value_std: np.float32 class TD3OffPolicyAgent: supported_env_list = [ @@ -209,7 +219,7 @@ def batch_evaluate( n_evaluator_episode: int = 4, context: Optional[str] = None, debug: bool = False - ) -> None: + ) -> EvalReturn: if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -223,6 +233,8 @@ def batch_evaluate( task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, env)) task.run(max_step=1) + return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) + def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: bool = False) -> BaseEnvManagerV2: if debug: env_cls = BaseEnvManagerV2 diff --git a/ding/framework/middleware/functional/evaluator.py b/ding/framework/middleware/functional/evaluator.py index bdc6d01f3a..03e6c9720a 100644 --- a/ding/framework/middleware/functional/evaluator.py +++ b/ding/framework/middleware/functional/evaluator.py @@ -367,6 +367,7 @@ def _evaluate(ctx: "OnlineRLContext"): if 'episode_info' in timestep.info: eval_monitor.update_info(env_id, timestep.info.episode_info) episode_return = eval_monitor.get_episode_return() + episode_return_std = np.std(episode_return) episode_return_mean = np.mean(episode_return) stop_flag = episode_return_mean >= stop_value and ctx.train_iter > 0 logging.info( @@ -376,6 +377,7 @@ def _evaluate(ctx: "OnlineRLContext"): ) ctx.last_eval_iter = ctx.train_iter ctx.eval_value = episode_return_mean + ctx.eval_value_std = episode_return_std ctx.last_eval_value = ctx.eval_value ctx.eval_output = {'episode_return': episode_return} episode_info = eval_monitor.get_episode_info() diff --git a/dizoo/common/dqn/lunarlander_dqn.py b/dizoo/common/dqn/lunarlander_dqn.py new file mode 100644 index 0000000000..a41e845029 --- /dev/null +++ b/dizoo/common/dqn/lunarlander_dqn.py @@ -0,0 +1,18 @@ +from ding.bonus import DQNOffpolicyAgent +from huggingface_ding import push_model_to_hub + +# Instantiate the agent +agent = DQNOffpolicyAgent(env="lunarlander_discrete", exp_name="Lunarlander-v2-DQN") +# Train the agent +return_=agent.train(step=int(4000000), collector_env_num=8, evaluator_env_num=8, debug=False) +# 
Push model to huggingface hub +push_model_to_hub(agent=agent, + env_name="OpenAI/Gym/Box2d", + task_name="LunarLander-v2", + algo_name="DQN", + wandb_url=return_.wandb_url, + github_repo_url="https://github.com/opendilab/DI-engine", + github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/dqn.html", + github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", + usage_file_path="./dizoo/common/dqn/lunarlander_dqn_download.py", + repo_id="OpenDILabCommunity/Lunarlander-v2-DQN") diff --git a/dizoo/common/dqn/lunarlander_dqn_download.py b/dizoo/common/dqn/lunarlander_dqn_download.py new file mode 100644 index 0000000000..1320196c10 --- /dev/null +++ b/dizoo/common/dqn/lunarlander_dqn_download.py @@ -0,0 +1,11 @@ +from ding.bonus import DQNOffpolicyAgent +from huggingface_ding import pull_model_from_hub + +# Pull model from Hugggingface hub +policy_state_dict, cfg=pull_model_from_hub(repo_id="OpenDILabCommunity/Lunarlander-v2-DQN") +# Instantiate the agent +agent = DQNOffpolicyAgent(env="lunarlander_discrete",exp_name="Lunarlander-v2-DQN-test", cfg=cfg.exp_config, policy_state_dict=policy_state_dict) +# Continue training +agent.train(step=5000) +# Render the new agent performance +agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/ppo/lunarlander_ppo.py b/dizoo/common/ppo/lunarlander_ppo.py new file mode 100644 index 0000000000..88cb6088fd --- /dev/null +++ b/dizoo/common/ppo/lunarlander_ppo.py @@ -0,0 +1,18 @@ +from ding.bonus import PPOF +from huggingface_ding import push_model_to_hub + +# Instantiate the agent +agent = PPOF("lunarlander_discrete", exp_name="LunarLander-v2-PPO") +# Train the agent +return_=agent.train(step=int(200000), collector_env_num=4, evaluator_env_num=4) +# Push model to huggingface hub +push_model_to_hub(agent=agent, + env_name="OpenAI/Gym/Box2d", + task_name="LunarLander-v2", + algo_name="PPO", + wandb_url=return_.wandb_url, + github_repo_url="https://github.com/opendilab/DI-engine", + github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/ppo.html", + github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", + usage_file_path="./dizoo/common/ppo/lunarlander_ppo_download.py", + repo_id="OpenDILabCommunity/LunarLander-v2-PPO") diff --git a/dizoo/common/ppo/lunarlander_ppo_download.py b/dizoo/common/ppo/lunarlander_ppo_download.py new file mode 100644 index 0000000000..a55d164f7d --- /dev/null +++ b/dizoo/common/ppo/lunarlander_ppo_download.py @@ -0,0 +1,11 @@ +from ding.bonus import PPOF +from huggingface_ding import pull_model_from_hub + +# Pull model from Hugggingface hub +policy_state_dict, cfg=pull_model_from_hub(repo_id="OpenDILabCommunity/LunarLander-v2-PPO") +# Instantiate the agent +agent = PPOF(env="lunarlander_discrete",exp_name="lunarlander-ppo", cfg=cfg.exp_config, policy_state_dict=policy_state_dict) +# Continue training +agent.train(step=5000) +# Render the new agent performance +agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/sac/lunarlander_sac.py b/dizoo/common/sac/lunarlander_sac.py new file mode 100644 index 0000000000..defc41f4b1 --- /dev/null +++ b/dizoo/common/sac/lunarlander_sac.py @@ -0,0 +1,18 @@ +from ding.bonus import SACOffPolicyAgent +from huggingface_ding import push_model_to_hub + +# Instantiate the agent +agent = SACOffPolicyAgent("lunarlander_continuous", exp_name="LunarLander-v2-SAC") +# Train the agent +return_=agent.train(step=int(2000000), collector_env_num=4, 
evaluator_env_num=4) +# Push model to huggingface hub +push_model_to_hub(agent=agent, + env_name="OpenAI/Gym/Box2d", + task_name="LunarLander-v2", + algo_name="SAC", + wandb_url=return_.wandb_url, + github_repo_url="https://github.com/opendilab/DI-engine", + github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/sac.html", + github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", + usage_file_path="./dizoo/common/sac/lunarlander_sac_download.py", + repo_id="OpenDILabCommunity/LunarLander-v2-SAC") diff --git a/dizoo/common/sac/lunarlander_sac_download.py b/dizoo/common/sac/lunarlander_sac_download.py new file mode 100644 index 0000000000..20eb75d3ef --- /dev/null +++ b/dizoo/common/sac/lunarlander_sac_download.py @@ -0,0 +1,11 @@ +from ding.bonus import SACOffPolicyAgent +from huggingface_ding import pull_model_from_hub + +# Pull model from Hugggingface hub +policy_state_dict, cfg=pull_model_from_hub(repo_id="OpenDILabCommunity/LunarLander-v2-SAC") +# Instantiate the agent +agent = SACOffPolicyAgent(env="lunarlander_continuous",exp_name="lunarlander-sac", cfg=cfg.exp_config, policy_state_dict=policy_state_dict) +# Continue training +agent.train(step=5000) +# Render the new agent performance +agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/td3/lunarlander_td3.py b/dizoo/common/td3/lunarlander_td3.py new file mode 100644 index 0000000000..3384d2a4c7 --- /dev/null +++ b/dizoo/common/td3/lunarlander_td3.py @@ -0,0 +1,20 @@ +from ding.bonus import TD3OffPolicyAgent +from huggingface_ding import push_model_to_hub + +# Instantiate the agent +agent = TD3OffPolicyAgent("lunarlander_continuous", exp_name="LunarLander-v2-TD3") +# Train the agent +return_ = agent.train(step=int(200000), collector_env_num=4, evaluator_env_num=4) +# Push model to huggingface hub +push_model_to_hub( + agent=agent, + env_name="OpenAI/Gym/Box2d", + task_name="LunarLander-v2", + algo_name="TD3", + wandb_url=return_.wandb_url, + github_repo_url="https://github.com/opendilab/DI-engine", + github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/td3.html", + github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", + usage_file_path="./dizoo/common/td3/lunarlander_td3_download.py", + repo_id="OpenDILabCommunity/LunarLander-v2-TD3" +) diff --git a/dizoo/common/td3/lunarlander_td3_download.py b/dizoo/common/td3/lunarlander_td3_download.py new file mode 100644 index 0000000000..ef7133733b --- /dev/null +++ b/dizoo/common/td3/lunarlander_td3_download.py @@ -0,0 +1,11 @@ +from ding.bonus import TD3OffPolicyAgent +from huggingface_ding import pull_model_from_hub + +# Pull model from Hugggingface hub +policy_state_dict, cfg=pull_model_from_hub(repo_id="OpenDILabCommunity/LunarLander-v2-TD3") +# Instantiate the agent +agent = TD3OffPolicyAgent(env="lunarlander_continuous",exp_name="LunarLander-v2-TD3", cfg=cfg.exp_config, policy_state_dict=policy_state_dict) +# Continue training +agent.train(step=5000) +# Render the new agent performance +agent.deploy(enable_save_replay=True) From 1973d01c940fec01980a051081afbdfcfafa8829 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 28 Mar 2023 08:03:42 +0000 Subject: [PATCH 056/244] polish format --- ding/bonus/config.py | 24 ++--- ding/bonus/dqn.py | 14 +-- ding/bonus/impala.py | 6 +- ding/bonus/ppof.py | 2 + ding/bonus/sac.py | 6 +- ding/bonus/td3.py | 2 + ding/policy/dqn.py | 2 +- ding/policy/sac.py | 1 - dizoo/atari/entry/spaceinvaders_dqn_eval.py | 5 
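The download scripts above pair naturally with the new batch_evaluate return value: after pulling a checkpoint from the hub, the evaluation statistics can be reported without a full deploy. A sketch only, assuming the pulled config and state dict load exactly as in lunarlander_td3_download.py; the exp_name here is an arbitrary choice:

from ding.bonus import TD3OffPolicyAgent
from huggingface_ding import pull_model_from_hub

policy_state_dict, cfg = pull_model_from_hub(repo_id="OpenDILabCommunity/LunarLander-v2-TD3")
agent = TD3OffPolicyAgent(
    env="lunarlander_continuous",
    exp_name="LunarLander-v2-TD3-eval",
    cfg=cfg.exp_config,
    policy_state_dict=policy_state_dict
)
result = agent.batch_evaluate(n_evaluator_episode=4)
print("pulled policy return: {:.2f} +/- {:.2f}".format(result.eval_value, result.eval_value_std))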
+- dizoo/atari/example/atari_dqn_dist_ddp.py | 1 - .../carracing/config/carracing_dqn_config.py | 9 +- dizoo/box2d/carracing/envs/carracing_env.py | 1 - .../carracing/envs/test_carracing_env.py | 10 +- .../cartpole/config/cartpole_bc_config.py | 2 +- .../config/mtcar_rainbow_config.py | 95 ++++++++++--------- .../mountain_car/envs/__init__.py | 2 +- .../pendulum/config/pendulum_ibc_config.py | 13 ++- .../pendulum/config/pendulum_td3_bc_config.py | 2 +- .../pendulum/entry/pendulum_dqn_eval.py | 5 +- dizoo/common/dqn/lunarlander_dqn.py | 25 ++--- dizoo/common/dqn/lunarlander_dqn_download.py | 9 +- dizoo/common/ppo/lunarlander_ppo.py | 25 ++--- dizoo/common/ppo/lunarlander_ppo_download.py | 6 +- dizoo/common/sac/lunarlander_sac.py | 25 ++--- dizoo/common/sac/lunarlander_sac_download.py | 6 +- dizoo/common/td3/lunarlander_td3.py | 1 + dizoo/common/td3/lunarlander_td3_download.py | 9 +- .../config/halfcheetah_expert_td3bc_config.py | 2 +- .../halfcheetah_medium_expert_td3bc_config.py | 2 +- .../halfcheetah_medium_replay_td3bc_config.py | 2 +- .../config/halfcheetah_medium_td3bc_config.py | 2 +- .../config/halfcheetah_random_td3bc_config.py | 2 +- .../d4rl/config/hopper_expert_td3bc_config.py | 2 +- .../config/hopper_medium_expert_bc_config.py | 6 +- .../hopper_medium_expert_ibc_ar_config.py | 14 +-- .../config/hopper_medium_expert_ibc_config.py | 14 +-- .../hopper_medium_expert_ibc_mcmc_config.py | 14 +-- .../hopper_medium_expert_td3bc_config.py | 2 +- .../hopper_medium_replay_td3bc_config.py | 2 +- .../d4rl/config/hopper_medium_td3bc_config.py | 2 +- .../d4rl/config/hopper_random_td3bc_config.py | 2 +- .../d4rl/config/kitchen_complete_bc_config.py | 8 +- .../config/kitchen_complete_ibc_ar_config.py | 14 +-- .../config/kitchen_complete_ibc_config.py | 14 +-- .../kitchen_complete_ibc_mcmc_config.py | 14 +-- dizoo/d4rl/config/pen_human_bc_config.py | 6 +- dizoo/d4rl/config/pen_human_ibc_ar_config.py | 14 +-- dizoo/d4rl/config/pen_human_ibc_config.py | 14 +-- .../d4rl/config/pen_human_ibc_mcmc_config.py | 14 +-- .../config/walker2d_expert_td3bc_config.py | 2 +- .../walker2d_medium_expert_td3bc_config.py | 2 +- .../walker2d_medium_replay_td3bc_config.py | 2 +- .../config/walker2d_medium_td3bc_config.py | 2 +- .../config/walker2d_random_td3bc_config.py | 2 +- dizoo/d4rl/entry/d4rl_cql_main.py | 2 +- dizoo/d4rl/entry/d4rl_td3_bc_main.py | 2 +- dizoo/dmc2gym/config/dmc2gym_ppo_config.py | 1 - dizoo/dmc2gym/entry/dmc2gym_sac_pixel_main.py | 33 ++++--- dizoo/dmc2gym/entry/dmc2gym_sac_state_main.py | 33 ++++--- dizoo/dmc2gym/envs/dmc2gym_env.py | 2 + dizoo/dmc2gym/envs/test_dmc2gym_env.py | 1 - .../evogym/envs/test/visualize_simple_env.py | 1 - .../config/stocks_dqn_config.py | 6 +- .../worker/trading_serial_evaluator.py | 26 ++--- .../envs/gym-hybrid/gym_hybrid/__init__.py | 3 +- dizoo/gym_hybrid/envs/gym-hybrid/setup.py | 9 +- .../envs/gym-hybrid/tests/moving.py | 1 - dizoo/gym_hybrid/envs/test_gym_hybrid_env.py | 12 ++- .../entry/imagenet_res18_config.py | 4 +- dizoo/league_demo/league_demo_collector.py | 14 +-- dizoo/maze/entry/maze_bc_main.py | 14 +-- dizoo/minigrid/utils/eval.py | 10 +- dizoo/mujoco/config/halfcheetah_bdq_config.py | 7 +- dizoo/mujoco/config/hopper_bdq_config.py | 6 +- dizoo/mujoco/envs/mujoco_wrappers.py | 8 +- .../config/ant_mappo_config.py | 1 - .../config/ant_masac_config.py | 4 +- .../config/ptz_simple_spread_madqn_config.py | 8 +- dizoo/rocket/entry/rocket_hover_ppo_main.py | 6 +- dizoo/rocket/entry/rocket_landing_ppo_main.py | 8 +- dizoo/rocket/envs/test_rocket_env.py | 6 +- 
dizoo/smac/config/smac_3s5z_madqn_config.py | 12 +-- .../config/smac_3s5zvs3s6z_madqn_config.py | 12 +-- dizoo/smac/config/smac_5m6m_madqn_config.py | 11 +-- dizoo/smac/config/smac_8m9m_madqn_config.py | 11 +-- dizoo/smac/config/smac_MMM2_madqn_config.py | 12 +-- dizoo/smac/config/smac_MMM_madqn_config.py | 12 +-- dizoo/smac/utils/eval.py | 10 +- 88 files changed, 375 insertions(+), 417 deletions(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 8acb64b24f..3a42e482ac 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -120,7 +120,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: else: raise KeyError("not supported env type: {}".format(env)) elif algorithm == 'TD3': - cfg = EasyDict({"policy":TD3Policy.default_config()}) + cfg = EasyDict({"policy": TD3Policy.default_config()}) if env == 'hopper': cfg.update( dict( @@ -183,8 +183,9 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: learning_rate_critic=3e-4, noise_sigma=0.1, ), - collect=dict(n_sample=256, - noise_sigma=0.1, + collect=dict( + n_sample=256, + noise_sigma=0.1, ), other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), ), @@ -199,8 +200,8 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ) else: raise KeyError("not supported env type: {}".format(env)) - elif algorithm == 'SAC': - cfg = EasyDict({"policy":SACPolicy.default_config()}) + elif algorithm == 'SAC': + cfg = EasyDict({"policy": SACPolicy.default_config()}) if env == 'hopper': cfg.update( dict( @@ -272,9 +273,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: reparameterization=True, auto_alpha=False, ), - collect=dict( - n_sample=256, - ), + collect=dict(n_sample=256, ), other=dict(replay_buffer=dict(replay_buffer_size=int(1e6), ), ), ), wandb_logger=dict( @@ -286,12 +285,12 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ), ) ) - + pass else: raise KeyError("not supported env type: {}".format(env)) elif algorithm == 'DQN': - cfg = EasyDict({"policy":DQNPolicy.default_config()}) + cfg = EasyDict({"policy": DQNPolicy.default_config()}) if env == 'lunarlander_discrete': cfg.update( dict( @@ -349,7 +348,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: else: raise KeyError("not supported env type: {}".format(env)) elif algorithm == 'IMPALA': - cfg = EasyDict({"policy":IMPALAPolicy.default_config()}) + cfg = EasyDict({"policy": IMPALAPolicy.default_config()}) if env == 'SpaceInvaders': cfg.update( dict( @@ -511,7 +510,8 @@ def get_instance_env(env: str) -> BaseEnv: 'env_wrapper': 'atari_default', }) return DingEnvWrapper(gym.make("SpaceInvaders-v4"), cfg=cfg) - elif env in ['atari_qbert', 'atari_kangaroo', 'atari_bowling', 'atari_breakout','atari_spaceinvader', 'atari_gopher']: + elif env in ['atari_qbert', 'atari_kangaroo', 'atari_bowling', 'atari_breakout', 'atari_spaceinvader', + 'atari_gopher']: from dizoo.atari.envs.atari_env import AtariEnv atari_env_list = { 'atari_qbert': 'QbertNoFrameskip-v4', diff --git a/ding/bonus/dqn.py b/ding/bonus/dqn.py index 977b050de1..8faaeaf5b4 100644 --- a/ding/bonus/dqn.py +++ b/ding/bonus/dqn.py @@ -28,6 +28,7 @@ class TrainingReturn: ''' wandb_url: str + @dataclass class EvalReturn: ''' @@ -38,6 +39,7 @@ class EvalReturn: eval_value: np.float32 eval_value_std: np.float32 + class DQNOffpolicyAgent: supported_env_list = [ 'lunarlander_discrete', @@ -114,13 +116,7 @@ def train( # random_collect_size=self.cfg.policy.random_collect_size # ) # ) - task.use( - StepCollector( - self.cfg, - 
self.policy.collect_mode, - collector_env - ) - ) + task.use(StepCollector(self.cfg, self.policy.collect_mode, collector_env)) task.use(nstep_reward_enhancer(self.cfg)) task.use(data_pusher(self.cfg, self.buffer_)) task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_)) @@ -161,7 +157,7 @@ def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, def single_env_forward_wrapper(forward_fn, cuda=True): - forward_fn=model_wrap(forward_fn, wrapper_name='argmax_sample').forward + forward_fn = model_wrap(forward_fn, wrapper_name='argmax_sample').forward def _forward(obs): # unsqueeze means add batch dim, i.e. (O, ) -> (1, O) @@ -174,7 +170,7 @@ def _forward(obs): return action return _forward - + forward_fn = single_env_forward_wrapper(self.policy._model, self.cfg.policy.cuda) # main loop diff --git a/ding/bonus/impala.py b/ding/bonus/impala.py index ccd6428909..875f4f4e0c 100644 --- a/ding/bonus/impala.py +++ b/ding/bonus/impala.py @@ -29,6 +29,7 @@ class TrainingReturn: ''' wandb_url: str + @dataclass class EvalReturn: ''' @@ -39,6 +40,7 @@ class EvalReturn: eval_value: np.float32 eval_value_std: np.float32 + class IMPALAOffPolicyAgent: supported_env_list = [ 'SpaceInvaders', @@ -155,7 +157,7 @@ def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, def single_env_forward_wrapper(forward_fn, cuda=True): - forward_fn=model_wrap(forward_fn, wrapper_name='base').forward + forward_fn = model_wrap(forward_fn, wrapper_name='base').forward def _forward(obs): # unsqueeze means add batch dim, i.e. (O, ) -> (1, O) @@ -163,7 +165,7 @@ def _forward(obs): if cuda and torch.cuda.is_available(): obs = obs.cuda() (mu, sigma) = forward_fn(obs, mode='compute_actor')['logit'] - action = torch.tanh(mu).detach().cpu().numpy()[0] # deterministic_eval + action = torch.tanh(mu).detach().cpu().numpy()[0] # deterministic_eval return action return _forward diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py index bfa677f0b4..9bafd63a42 100644 --- a/ding/bonus/ppof.py +++ b/ding/bonus/ppof.py @@ -27,6 +27,7 @@ class TrainingReturn: ''' wandb_url: str + @dataclass class EvalReturn: ''' @@ -37,6 +38,7 @@ class EvalReturn: eval_value: np.float32 eval_value_std: np.float32 + class PPOF: supported_env_list = [ # common diff --git a/ding/bonus/sac.py b/ding/bonus/sac.py index 267d40a759..e1cdac9e0c 100644 --- a/ding/bonus/sac.py +++ b/ding/bonus/sac.py @@ -29,6 +29,7 @@ class TrainingReturn: ''' wandb_url: str + @dataclass class EvalReturn: ''' @@ -39,6 +40,7 @@ class EvalReturn: eval_value: np.float32 eval_value_std: np.float32 + class SACOffPolicyAgent: supported_env_list = [ 'hopper', @@ -155,7 +157,7 @@ def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, def single_env_forward_wrapper(forward_fn, cuda=True): - forward_fn=model_wrap(forward_fn, wrapper_name='base').forward + forward_fn = model_wrap(forward_fn, wrapper_name='base').forward def _forward(obs): # unsqueeze means add batch dim, i.e. 
(O, ) -> (1, O) @@ -163,7 +165,7 @@ def _forward(obs): if cuda and torch.cuda.is_available(): obs = obs.cuda() (mu, sigma) = forward_fn(obs, mode='compute_actor')['logit'] - action = torch.tanh(mu).detach().cpu().numpy()[0] # deterministic_eval + action = torch.tanh(mu).detach().cpu().numpy()[0] # deterministic_eval return action return _forward diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 20629952c4..8880121e28 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -28,6 +28,7 @@ class TrainingReturn: ''' wandb_url: str + @dataclass class EvalReturn: ''' @@ -38,6 +39,7 @@ class EvalReturn: eval_value: np.float32 eval_value_std: np.float32 + class TD3OffPolicyAgent: supported_env_list = [ 'hopper', diff --git a/ding/policy/dqn.py b/ding/policy/dqn.py index d1ee73fdd7..91db6a1840 100644 --- a/ding/policy/dqn.py +++ b/ding/policy/dqn.py @@ -418,10 +418,10 @@ def _forward_eval(self, data: Dict[int, Any]) -> Dict[int, Any]: output = default_decollate(output) return {i: d for i, d in zip(data_id, output)} - def monitor_vars(self) -> List[str]: return ['cur_lr', 'total_loss', 'q_value'] + @POLICY_REGISTRY.register('dqn_stdim') class DQNSTDIMPolicy(DQNPolicy): """ diff --git a/ding/policy/sac.py b/ding/policy/sac.py index a12d8bdc7d..a36f6a10c5 100644 --- a/ding/policy/sac.py +++ b/ding/policy/sac.py @@ -822,7 +822,6 @@ def _monitor_vars_learn(self) -> List[str]: 'td_error', ] + twin_critic + alpha_loss - def monitor_vars(self) -> List[str]: return self._monitor_vars_learn() diff --git a/dizoo/atari/entry/spaceinvaders_dqn_eval.py b/dizoo/atari/entry/spaceinvaders_dqn_eval.py index d8bfde290d..35e15a578c 100644 --- a/dizoo/atari/entry/spaceinvaders_dqn_eval.py +++ b/dizoo/atari/entry/spaceinvaders_dqn_eval.py @@ -15,8 +15,9 @@ from ding.rl_utils import get_epsilon_greedy_fn from dizoo.atari.config.serial.spaceinvaders.spaceinvaders_dqn_config import main_config, create_config + def main(rl_cfg, seed=0): - main_cfg, create_cfg =rl_cfg + main_cfg, create_cfg = rl_cfg cfg = compile_config( main_cfg, BaseEnvManager, @@ -56,4 +57,4 @@ def main(rl_cfg, seed=0): if __name__ == "__main__": - main(rl_cfg=(main_config, create_config),seed=0) + main(rl_cfg=(main_config, create_config), seed=0) diff --git a/dizoo/atari/example/atari_dqn_dist_ddp.py b/dizoo/atari/example/atari_dqn_dist_ddp.py index f194c326bc..5dbfc4e65c 100644 --- a/dizoo/atari/example/atari_dqn_dist_ddp.py +++ b/dizoo/atari/example/atari_dqn_dist_ddp.py @@ -14,7 +14,6 @@ from dizoo.atari.envs.atari_env import AtariEnv from dizoo.atari.config.serial.pong.pong_dqn_config import main_config, create_config - logging.getLogger().setLevel(logging.INFO) main_config.exp_name = 'pong_dqn_seed0_ditask_dist_ddp' diff --git a/dizoo/box2d/carracing/config/carracing_dqn_config.py b/dizoo/box2d/carracing/config/carracing_dqn_config.py index 31dd42fca8..1792056a83 100644 --- a/dizoo/box2d/carracing/config/carracing_dqn_config.py +++ b/dizoo/box2d/carracing/config/carracing_dqn_config.py @@ -29,17 +29,14 @@ learning_rate=0.0001, target_update_freq=100, ), - collect=dict( - n_sample=64, - ), + collect=dict(n_sample=64, ), other=dict( eps=dict( type='exp', start=0.95, end=0.1, decay=50000, - ), - replay_buffer=dict(replay_buffer_size=100000, ) + ), replay_buffer=dict(replay_buffer_size=100000, ) ), ), ) @@ -60,4 +57,4 @@ if __name__ == "__main__": # or you can enter `ding -m serial -c carracing_dqn_config.py -s 0` from ding.entry import serial_pipeline - serial_pipeline([main_config, create_config], seed=0) \ No newline at end of file + 
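The deploy wrappers above all reduce to the same idea of deterministic action selection: discrete agents take the argmax over Q-values, while the continuous SAC/TD3-style agents use the squashed mean tanh(mu) instead of sampling from the Gaussian. A toy sketch with fake tensors, purely to illustrate the two selection rules:

import torch

logit = torch.randn(1, 4)                        # fake Q-values for a 4-action discrete env
discrete_action = logit.argmax(dim=-1).item()    # argmax_sample behaviour

mu, sigma = torch.zeros(1, 3), torch.ones(1, 3)  # fake Gaussian actor output (logit = (mu, sigma))
continuous_action = torch.tanh(mu).numpy()[0]    # deterministic eval: squashed mean, no sampling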
serial_pipeline([main_config, create_config], seed=0) diff --git a/dizoo/box2d/carracing/envs/carracing_env.py b/dizoo/box2d/carracing/envs/carracing_env.py index 39b82a2502..60ebaa97d1 100644 --- a/dizoo/box2d/carracing/envs/carracing_env.py +++ b/dizoo/box2d/carracing/envs/carracing_env.py @@ -2,7 +2,6 @@ import copy import os - import gym import numpy as np from easydict import EasyDict diff --git a/dizoo/box2d/carracing/envs/test_carracing_env.py b/dizoo/box2d/carracing/envs/test_carracing_env.py index 7eb4a75039..47a5fa4638 100644 --- a/dizoo/box2d/carracing/envs/test_carracing_env.py +++ b/dizoo/box2d/carracing/envs/test_carracing_env.py @@ -5,15 +5,7 @@ @pytest.mark.envtest -@pytest.mark.parametrize( - 'cfg', [ - EasyDict({ - 'env_id': 'CarRacing-v2', - 'continuous': False, - 'act_scale': False - }) - ] -) +@pytest.mark.parametrize('cfg', [EasyDict({'env_id': 'CarRacing-v2', 'continuous': False, 'act_scale': False})]) class TestCarRacing: def test_naive(self, cfg): diff --git a/dizoo/classic_control/cartpole/config/cartpole_bc_config.py b/dizoo/classic_control/cartpole/config/cartpole_bc_config.py index 8315e934fe..b1975718f3 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_bc_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_bc_config.py @@ -20,7 +20,7 @@ batch_size=64, learning_rate=0.01, learner=dict(hook=dict(save_ckpt_after_iter=1000)), - train_epoch = 20, + train_epoch=20, ), eval=dict(evaluator=dict(eval_freq=40, )) ), diff --git a/dizoo/classic_control/mountain_car/config/mtcar_rainbow_config.py b/dizoo/classic_control/mountain_car/config/mtcar_rainbow_config.py index c6c4fb4db0..b293d44494 100644 --- a/dizoo/classic_control/mountain_car/config/mtcar_rainbow_config.py +++ b/dizoo/classic_control/mountain_car/config/mtcar_rainbow_config.py @@ -1,58 +1,63 @@ from easydict import EasyDict # DI-Engine uses EasyDict for configuration, by convention -mtcar_rainbow_config = EasyDict(dict( - exp_name='mtcar_rainbow_seed0', - env=dict( - collector_env_num=8, - evaluator_env_num=5, - n_evaluator_episode=5, - stop_value=195, - ), - policy=dict( - cuda=False, - priority=True, - discount_factor=0.97, - nstep=3, - model=dict( - obs_shape=2, - action_shape=3, - encoder_hidden_size_list=[128, 128, 64], +mtcar_rainbow_config = EasyDict( + dict( + exp_name='mtcar_rainbow_seed0', + env=dict( + collector_env_num=8, + evaluator_env_num=5, + n_evaluator_episode=5, + stop_value=195, ), - learn=dict( - update_per_collect=3, - batch_size=64, - learning_rate=0.001, - target_update_freq=100, + policy=dict( + cuda=False, + priority=True, + discount_factor=0.97, + nstep=3, + model=dict( + obs_shape=2, + action_shape=3, + encoder_hidden_size_list=[128, 128, 64], + ), + learn=dict( + update_per_collect=3, + batch_size=64, + learning_rate=0.001, + target_update_freq=100, + ), + collect=dict( + n_sample=80, + unroll_len=1, + ), + other=dict( + eps=dict( + type='exp', + start=0.95, + end=0.1, + decay=10000, + ), + replay_buffer=dict(replay_buffer_size=20000, ) + ), ), - collect=dict( - n_sample=80, - unroll_len=1, - ), - other=dict( - eps=dict( - type='exp', - start=0.95, - end=0.1, - decay=10000, - ), replay_buffer=dict(replay_buffer_size=20000, ) - ), - ), -)) + ) +) main_config = mtcar_rainbow_config -mtcar_rainbow_create_config = EasyDict(dict( - env=dict( - type='mountain_car', - import_names=['dizoo.classic_control.mountain_car.envs.mtcar_env'], - ), - env_manager=dict(type='base'), - policy=dict(type='rainbow'), -)) +mtcar_rainbow_create_config = EasyDict( + dict( + env=dict( + 
type='mountain_car', + import_names=['dizoo.classic_control.mountain_car.envs.mtcar_env'], + ), + env_manager=dict(type='base'), + policy=dict(type='rainbow'), + ) +) create_config = mtcar_rainbow_create_config if __name__ == "__main__": from ding.entry import serial_pipeline - serial_pipeline((main_config, create_config), seed=0) \ No newline at end of file + serial_pipeline((main_config, create_config), seed=0) diff --git a/dizoo/classic_control/mountain_car/envs/__init__.py b/dizoo/classic_control/mountain_car/envs/__init__.py index 19f7eaf1cc..9e8ca86d5f 100644 --- a/dizoo/classic_control/mountain_car/envs/__init__.py +++ b/dizoo/classic_control/mountain_car/envs/__init__.py @@ -1 +1 @@ -from .mtcar_env import MountainCarEnv \ No newline at end of file +from .mtcar_env import MountainCarEnv diff --git a/dizoo/classic_control/pendulum/config/pendulum_ibc_config.py b/dizoo/classic_control/pendulum/config/pendulum_ibc_config.py index 247fdad045..7c56f283fe 100644 --- a/dizoo/classic_control/pendulum/config/pendulum_ibc_config.py +++ b/dizoo/classic_control/pendulum/config/pendulum_ibc_config.py @@ -13,16 +13,15 @@ ), policy=dict( cuda=cuda, - model=dict( - obs_shape=3, - action_shape=1, - stochastic_optim=dict(type='mcmc', cuda=cuda,) - ), + model=dict(obs_shape=3, action_shape=1, stochastic_optim=dict( + type='mcmc', + cuda=cuda, + )), learn=dict( multi_gpu=multi_gpu, train_epoch=15, batch_size=256, - optim=dict(learning_rate=1e-5,), + optim=dict(learning_rate=1e-5, ), learner=dict(hook=dict(log_show_after_iter=1000)), ), collect=dict( @@ -30,7 +29,7 @@ data_path='./pendulum_sac_data_generation/expert_demos.hdf5', collector_logit=False, ), - eval=dict(evaluator=dict(eval_freq=-1,)), + eval=dict(evaluator=dict(eval_freq=-1, )), ), ) pendulum_ibc_config = EasyDict(main_config) diff --git a/dizoo/classic_control/pendulum/config/pendulum_td3_bc_config.py b/dizoo/classic_control/pendulum/config/pendulum_td3_bc_config.py index 82a44f034e..8583fc6ada 100644 --- a/dizoo/classic_control/pendulum/config/pendulum_td3_bc_config.py +++ b/dizoo/classic_control/pendulum/config/pendulum_td3_bc_config.py @@ -6,7 +6,7 @@ collector_env_num=8, evaluator_env_num=5, norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), # (bool) Scale output action into legal range. 
diff --git a/dizoo/classic_control/pendulum/entry/pendulum_dqn_eval.py b/dizoo/classic_control/pendulum/entry/pendulum_dqn_eval.py index a5a7b9ab32..fb80ad42ad 100644 --- a/dizoo/classic_control/pendulum/entry/pendulum_dqn_eval.py +++ b/dizoo/classic_control/pendulum/entry/pendulum_dqn_eval.py @@ -15,8 +15,9 @@ from ding.rl_utils import get_epsilon_greedy_fn from dizoo.classic_control.pendulum.config.pendulum_dqn_config import main_config, create_config + def main(rl_cfg, seed=0): - main_cfg, create_cfg =rl_cfg + main_cfg, create_cfg = rl_cfg cfg = compile_config( main_cfg, BaseEnvManager, @@ -56,4 +57,4 @@ def main(rl_cfg, seed=0): if __name__ == "__main__": - main(rl_cfg=(main_config, create_config),seed=0) + main(rl_cfg=(main_config, create_config), seed=0) diff --git a/dizoo/common/dqn/lunarlander_dqn.py b/dizoo/common/dqn/lunarlander_dqn.py index a41e845029..aabea36a56 100644 --- a/dizoo/common/dqn/lunarlander_dqn.py +++ b/dizoo/common/dqn/lunarlander_dqn.py @@ -4,15 +4,18 @@ # Instantiate the agent agent = DQNOffpolicyAgent(env="lunarlander_discrete", exp_name="Lunarlander-v2-DQN") # Train the agent -return_=agent.train(step=int(4000000), collector_env_num=8, evaluator_env_num=8, debug=False) +return_ = agent.train(step=int(4000000), collector_env_num=8, evaluator_env_num=8, debug=False) # Push model to huggingface hub -push_model_to_hub(agent=agent, - env_name="OpenAI/Gym/Box2d", - task_name="LunarLander-v2", - algo_name="DQN", - wandb_url=return_.wandb_url, - github_repo_url="https://github.com/opendilab/DI-engine", - github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/dqn.html", - github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", - usage_file_path="./dizoo/common/dqn/lunarlander_dqn_download.py", - repo_id="OpenDILabCommunity/Lunarlander-v2-DQN") +push_model_to_hub( + agent=agent, + env_name="OpenAI/Gym/Box2d", + task_name="LunarLander-v2", + algo_name="DQN", + wandb_url=return_.wandb_url, + github_repo_url="https://github.com/opendilab/DI-engine", + github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/dqn.html", + github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", + installation_guide="pip3 install DI-engine[common_env,fast]", + usage_file_path="./dizoo/common/dqn/lunarlander_dqn_download.py", + repo_id="OpenDILabCommunity/Lunarlander-v2-DQN" +) diff --git a/dizoo/common/dqn/lunarlander_dqn_download.py b/dizoo/common/dqn/lunarlander_dqn_download.py index 1320196c10..2ad25b5084 100644 --- a/dizoo/common/dqn/lunarlander_dqn_download.py +++ b/dizoo/common/dqn/lunarlander_dqn_download.py @@ -2,9 +2,14 @@ from huggingface_ding import pull_model_from_hub # Pull model from Hugggingface hub -policy_state_dict, cfg=pull_model_from_hub(repo_id="OpenDILabCommunity/Lunarlander-v2-DQN") +policy_state_dict, cfg = pull_model_from_hub(repo_id="OpenDILabCommunity/Lunarlander-v2-DQN") # Instantiate the agent -agent = DQNOffpolicyAgent(env="lunarlander_discrete",exp_name="Lunarlander-v2-DQN-test", cfg=cfg.exp_config, policy_state_dict=policy_state_dict) +agent = DQNOffpolicyAgent( + env="lunarlander_discrete", + exp_name="Lunarlander-v2-DQN-test", + cfg=cfg.exp_config, + policy_state_dict=policy_state_dict +) # Continue training agent.train(step=5000) # Render the new agent performance diff --git a/dizoo/common/ppo/lunarlander_ppo.py b/dizoo/common/ppo/lunarlander_ppo.py index 88cb6088fd..648266b15f 100644 --- a/dizoo/common/ppo/lunarlander_ppo.py +++ 
b/dizoo/common/ppo/lunarlander_ppo.py @@ -4,15 +4,18 @@ # Instantiate the agent agent = PPOF("lunarlander_discrete", exp_name="LunarLander-v2-PPO") # Train the agent -return_=agent.train(step=int(200000), collector_env_num=4, evaluator_env_num=4) +return_ = agent.train(step=int(200000), collector_env_num=4, evaluator_env_num=4) # Push model to huggingface hub -push_model_to_hub(agent=agent, - env_name="OpenAI/Gym/Box2d", - task_name="LunarLander-v2", - algo_name="PPO", - wandb_url=return_.wandb_url, - github_repo_url="https://github.com/opendilab/DI-engine", - github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/ppo.html", - github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", - usage_file_path="./dizoo/common/ppo/lunarlander_ppo_download.py", - repo_id="OpenDILabCommunity/LunarLander-v2-PPO") +push_model_to_hub( + agent=agent, + env_name="OpenAI/Gym/Box2d", + task_name="LunarLander-v2", + algo_name="PPO", + wandb_url=return_.wandb_url, + github_repo_url="https://github.com/opendilab/DI-engine", + github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/ppo.html", + github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", + installation_guide="pip3 install DI-engine[common_env,fast]", + usage_file_path="./dizoo/common/ppo/lunarlander_ppo_download.py", + repo_id="OpenDILabCommunity/LunarLander-v2-PPO" +) diff --git a/dizoo/common/ppo/lunarlander_ppo_download.py b/dizoo/common/ppo/lunarlander_ppo_download.py index a55d164f7d..dd2ac3b4c6 100644 --- a/dizoo/common/ppo/lunarlander_ppo_download.py +++ b/dizoo/common/ppo/lunarlander_ppo_download.py @@ -2,9 +2,11 @@ from huggingface_ding import pull_model_from_hub # Pull model from Hugggingface hub -policy_state_dict, cfg=pull_model_from_hub(repo_id="OpenDILabCommunity/LunarLander-v2-PPO") +policy_state_dict, cfg = pull_model_from_hub(repo_id="OpenDILabCommunity/LunarLander-v2-PPO") # Instantiate the agent -agent = PPOF(env="lunarlander_discrete",exp_name="lunarlander-ppo", cfg=cfg.exp_config, policy_state_dict=policy_state_dict) +agent = PPOF( + env="lunarlander_discrete", exp_name="lunarlander-ppo", cfg=cfg.exp_config, policy_state_dict=policy_state_dict +) # Continue training agent.train(step=5000) # Render the new agent performance diff --git a/dizoo/common/sac/lunarlander_sac.py b/dizoo/common/sac/lunarlander_sac.py index defc41f4b1..1008d23fd9 100644 --- a/dizoo/common/sac/lunarlander_sac.py +++ b/dizoo/common/sac/lunarlander_sac.py @@ -4,15 +4,18 @@ # Instantiate the agent agent = SACOffPolicyAgent("lunarlander_continuous", exp_name="LunarLander-v2-SAC") # Train the agent -return_=agent.train(step=int(2000000), collector_env_num=4, evaluator_env_num=4) +return_ = agent.train(step=int(4000000), collector_env_num=8, evaluator_env_num=8) # Push model to huggingface hub -push_model_to_hub(agent=agent, - env_name="OpenAI/Gym/Box2d", - task_name="LunarLander-v2", - algo_name="SAC", - wandb_url=return_.wandb_url, - github_repo_url="https://github.com/opendilab/DI-engine", - github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/sac.html", - github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", - usage_file_path="./dizoo/common/sac/lunarlander_sac_download.py", - repo_id="OpenDILabCommunity/LunarLander-v2-SAC") +push_model_to_hub( + agent=agent, + env_name="OpenAI/Gym/Box2d", + task_name="LunarLander-v2", + algo_name="SAC", + 
wandb_url=return_.wandb_url, + github_repo_url="https://github.com/opendilab/DI-engine", + github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/sac.html", + github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", + installation_guide="pip3 install DI-engine[common_env,fast]", + usage_file_path="./dizoo/common/sac/lunarlander_sac_download.py", + repo_id="OpenDILabCommunity/LunarLander-v2-SAC" +) diff --git a/dizoo/common/sac/lunarlander_sac_download.py b/dizoo/common/sac/lunarlander_sac_download.py index 20eb75d3ef..36863eb2fb 100644 --- a/dizoo/common/sac/lunarlander_sac_download.py +++ b/dizoo/common/sac/lunarlander_sac_download.py @@ -2,9 +2,11 @@ from huggingface_ding import pull_model_from_hub # Pull model from Hugggingface hub -policy_state_dict, cfg=pull_model_from_hub(repo_id="OpenDILabCommunity/LunarLander-v2-SAC") +policy_state_dict, cfg = pull_model_from_hub(repo_id="OpenDILabCommunity/LunarLander-v2-SAC") # Instantiate the agent -agent = SACOffPolicyAgent(env="lunarlander_continuous",exp_name="lunarlander-sac", cfg=cfg.exp_config, policy_state_dict=policy_state_dict) +agent = SACOffPolicyAgent( + env="lunarlander_continuous", exp_name="lunarlander-sac", cfg=cfg.exp_config, policy_state_dict=policy_state_dict +) # Continue training agent.train(step=5000) # Render the new agent performance diff --git a/dizoo/common/td3/lunarlander_td3.py b/dizoo/common/td3/lunarlander_td3.py index 3384d2a4c7..c97a61aa78 100644 --- a/dizoo/common/td3/lunarlander_td3.py +++ b/dizoo/common/td3/lunarlander_td3.py @@ -15,6 +15,7 @@ github_repo_url="https://github.com/opendilab/DI-engine", github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/td3.html", github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", + installation_guide="pip3 install DI-engine[common_env,fast]", usage_file_path="./dizoo/common/td3/lunarlander_td3_download.py", repo_id="OpenDILabCommunity/LunarLander-v2-TD3" ) diff --git a/dizoo/common/td3/lunarlander_td3_download.py b/dizoo/common/td3/lunarlander_td3_download.py index ef7133733b..2e0982f225 100644 --- a/dizoo/common/td3/lunarlander_td3_download.py +++ b/dizoo/common/td3/lunarlander_td3_download.py @@ -2,9 +2,14 @@ from huggingface_ding import pull_model_from_hub # Pull model from Hugggingface hub -policy_state_dict, cfg=pull_model_from_hub(repo_id="OpenDILabCommunity/LunarLander-v2-TD3") +policy_state_dict, cfg = pull_model_from_hub(repo_id="OpenDILabCommunity/LunarLander-v2-TD3") # Instantiate the agent -agent = TD3OffPolicyAgent(env="lunarlander_continuous",exp_name="LunarLander-v2-TD3", cfg=cfg.exp_config, policy_state_dict=policy_state_dict) +agent = TD3OffPolicyAgent( + env="lunarlander_continuous", + exp_name="LunarLander-v2-TD3", + cfg=cfg.exp_config, + policy_state_dict=policy_state_dict +) # Continue training agent.train(step=5000) # Render the new agent performance diff --git a/dizoo/d4rl/config/halfcheetah_expert_td3bc_config.py b/dizoo/d4rl/config/halfcheetah_expert_td3bc_config.py index e798cf66e3..77b24abcf1 100644 --- a/dizoo/d4rl/config/halfcheetah_expert_td3bc_config.py +++ b/dizoo/d4rl/config/halfcheetah_expert_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='walker2d-expert-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/halfcheetah_medium_expert_td3bc_config.py 
b/dizoo/d4rl/config/halfcheetah_medium_expert_td3bc_config.py index 8d25289131..d93ba6f445 100644 --- a/dizoo/d4rl/config/halfcheetah_medium_expert_td3bc_config.py +++ b/dizoo/d4rl/config/halfcheetah_medium_expert_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='halfcheetah-medium-expert-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/halfcheetah_medium_replay_td3bc_config.py b/dizoo/d4rl/config/halfcheetah_medium_replay_td3bc_config.py index 3561f320fb..cbc35370f4 100644 --- a/dizoo/d4rl/config/halfcheetah_medium_replay_td3bc_config.py +++ b/dizoo/d4rl/config/halfcheetah_medium_replay_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='halfcheetah-medium-replay-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/halfcheetah_medium_td3bc_config.py b/dizoo/d4rl/config/halfcheetah_medium_td3bc_config.py index ef6e2d3f40..38f78689ea 100644 --- a/dizoo/d4rl/config/halfcheetah_medium_td3bc_config.py +++ b/dizoo/d4rl/config/halfcheetah_medium_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='halfcheetah-medium-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/halfcheetah_random_td3bc_config.py b/dizoo/d4rl/config/halfcheetah_random_td3bc_config.py index dbe94d1a24..dec3f2edc1 100644 --- a/dizoo/d4rl/config/halfcheetah_random_td3bc_config.py +++ b/dizoo/d4rl/config/halfcheetah_random_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='halfcheetah-random-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/hopper_expert_td3bc_config.py b/dizoo/d4rl/config/hopper_expert_td3bc_config.py index b0874a0018..776366ba0c 100644 --- a/dizoo/d4rl/config/hopper_expert_td3bc_config.py +++ b/dizoo/d4rl/config/hopper_expert_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='hopper-expert-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/hopper_medium_expert_bc_config.py b/dizoo/d4rl/config/hopper_medium_expert_bc_config.py index e04bd28069..348361dd2d 100644 --- a/dizoo/d4rl/config/hopper_medium_expert_bc_config.py +++ b/dizoo/d4rl/config/hopper_medium_expert_bc_config.py @@ -8,7 +8,7 @@ env=dict( env_id='hopper-medium-expert-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), evaluator_env_num=8, @@ -38,7 +38,7 @@ data_type='d4rl', data_path=None, ), - eval=dict(evaluator=dict(eval_freq=-1,)), + eval=dict(evaluator=dict(eval_freq=-1, )), ), ) main_config = EasyDict(main_config) @@ -48,7 +48,7 @@ type='d4rl', import_names=['dizoo.d4rl.envs.d4rl_env'], ), - env_manager=dict(type='base',), + env_manager=dict(type='base', ), policy=dict( type='bc', import_names=['ding.policy.bc'], diff --git a/dizoo/d4rl/config/hopper_medium_expert_ibc_ar_config.py b/dizoo/d4rl/config/hopper_medium_expert_ibc_ar_config.py index 061b8b53a6..5d1090dc77 100644 --- a/dizoo/d4rl/config/hopper_medium_expert_ibc_ar_config.py +++ b/dizoo/d4rl/config/hopper_medium_expert_ibc_ar_config.py @@ -8,7 +8,7 @@ env=dict( env_id='hopper-medium-expert-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), evaluator_env_num=8, @@ -18,23 +18,19 
@@ ), policy=dict( cuda=cuda, - model=dict( - obs_shape=11, - action_shape=3, - stochastic_optim=dict(type='ardfo',) - ), + model=dict(obs_shape=11, action_shape=3, stochastic_optim=dict(type='ardfo', )), learn=dict( multi_gpu=multi_gpu, train_epoch=15, batch_size=256, - optim=dict(learning_rate=1e-5,), + optim=dict(learning_rate=1e-5, ), learner=dict(hook=dict(log_show_after_iter=1000)), ), collect=dict( data_type='d4rl', data_path=None, ), - eval=dict(evaluator=dict(eval_freq=-1,)), + eval=dict(evaluator=dict(eval_freq=-1, )), ), ) main_config = EasyDict(main_config) @@ -44,7 +40,7 @@ type='d4rl', import_names=['dizoo.d4rl.envs.d4rl_env'], ), - env_manager=dict(type='base',), + env_manager=dict(type='base', ), policy=dict( type='ibc', import_names=['ding.policy.ibc'], diff --git a/dizoo/d4rl/config/hopper_medium_expert_ibc_config.py b/dizoo/d4rl/config/hopper_medium_expert_ibc_config.py index e7a72984b6..0f040970e6 100644 --- a/dizoo/d4rl/config/hopper_medium_expert_ibc_config.py +++ b/dizoo/d4rl/config/hopper_medium_expert_ibc_config.py @@ -8,7 +8,7 @@ env=dict( env_id='hopper-medium-expert-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), evaluator_env_num=8, @@ -18,23 +18,19 @@ ), policy=dict( cuda=cuda, - model=dict( - obs_shape=11, - action_shape=3, - stochastic_optim=dict(type='dfo',) - ), + model=dict(obs_shape=11, action_shape=3, stochastic_optim=dict(type='dfo', )), learn=dict( multi_gpu=multi_gpu, train_epoch=15, batch_size=256, - optim=dict(learning_rate=1e-5,), + optim=dict(learning_rate=1e-5, ), learner=dict(hook=dict(log_show_after_iter=1000)), ), collect=dict( data_type='d4rl', data_path=None, ), - eval=dict(evaluator=dict(eval_freq=-1,)), + eval=dict(evaluator=dict(eval_freq=-1, )), ), ) main_config = EasyDict(main_config) @@ -44,7 +40,7 @@ type='d4rl', import_names=['dizoo.d4rl.envs.d4rl_env'], ), - env_manager=dict(type='base',), + env_manager=dict(type='base', ), policy=dict( type='ibc', import_names=['ding.policy.ibc'], diff --git a/dizoo/d4rl/config/hopper_medium_expert_ibc_mcmc_config.py b/dizoo/d4rl/config/hopper_medium_expert_ibc_mcmc_config.py index e5f6f3dbb1..478e0c5d44 100644 --- a/dizoo/d4rl/config/hopper_medium_expert_ibc_mcmc_config.py +++ b/dizoo/d4rl/config/hopper_medium_expert_ibc_mcmc_config.py @@ -8,7 +8,7 @@ env=dict( env_id='hopper-medium-expert-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), evaluator_env_num=8, @@ -18,23 +18,19 @@ ), policy=dict( cuda=cuda, - model=dict( - obs_shape=11, - action_shape=3, - stochastic_optim=dict(type='mcmc',) - ), + model=dict(obs_shape=11, action_shape=3, stochastic_optim=dict(type='mcmc', )), learn=dict( multi_gpu=multi_gpu, train_epoch=15, batch_size=256, - optim=dict(learning_rate=1e-5,), + optim=dict(learning_rate=1e-5, ), learner=dict(hook=dict(log_show_after_iter=1000)), ), collect=dict( data_type='d4rl', data_path=None, ), - eval=dict(evaluator=dict(eval_freq=-1,)), + eval=dict(evaluator=dict(eval_freq=-1, )), ), ) main_config = EasyDict(main_config) @@ -44,7 +40,7 @@ type='d4rl', import_names=['dizoo.d4rl.envs.d4rl_env'], ), - env_manager=dict(type='base',), + env_manager=dict(type='base', ), policy=dict( type='ibc', import_names=['ding.policy.ibc'], diff --git a/dizoo/d4rl/config/hopper_medium_expert_td3bc_config.py b/dizoo/d4rl/config/hopper_medium_expert_td3bc_config.py index 19531debad..16212d4518 100644 --- a/dizoo/d4rl/config/hopper_medium_expert_td3bc_config.py +++ 
b/dizoo/d4rl/config/hopper_medium_expert_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='hopper-medium-expert-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/hopper_medium_replay_td3bc_config.py b/dizoo/d4rl/config/hopper_medium_replay_td3bc_config.py index 8f754781db..87bc42721f 100644 --- a/dizoo/d4rl/config/hopper_medium_replay_td3bc_config.py +++ b/dizoo/d4rl/config/hopper_medium_replay_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='hopper-medium-replay-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/hopper_medium_td3bc_config.py b/dizoo/d4rl/config/hopper_medium_td3bc_config.py index cbf5fcce19..15ed2b9073 100644 --- a/dizoo/d4rl/config/hopper_medium_td3bc_config.py +++ b/dizoo/d4rl/config/hopper_medium_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='hopper-medium-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/hopper_random_td3bc_config.py b/dizoo/d4rl/config/hopper_random_td3bc_config.py index 8cf796b5fb..0f1127f16a 100644 --- a/dizoo/d4rl/config/hopper_random_td3bc_config.py +++ b/dizoo/d4rl/config/hopper_random_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='hopper-random-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/kitchen_complete_bc_config.py b/dizoo/d4rl/config/kitchen_complete_bc_config.py index 7160885da3..413696993d 100644 --- a/dizoo/d4rl/config/kitchen_complete_bc_config.py +++ b/dizoo/d4rl/config/kitchen_complete_bc_config.py @@ -8,7 +8,7 @@ env=dict( env_id='kitchen-complete-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), evaluator_env_num=8, @@ -19,7 +19,7 @@ policy=dict( cuda=cuda, continuous=True, - loss_type='mse_loss', + loss_type='mse_loss', model=dict( obs_shape=60, action_shape=9, @@ -38,7 +38,7 @@ data_type='d4rl', data_path=None, ), - eval=dict(evaluator=dict(eval_freq=1000,)), + eval=dict(evaluator=dict(eval_freq=1000, )), ), ) main_config = EasyDict(main_config) @@ -48,7 +48,7 @@ type='d4rl', import_names=['dizoo.d4rl.envs.d4rl_env'], ), - env_manager=dict(type='base',), + env_manager=dict(type='base', ), policy=dict( type='bc', import_names=['ding.policy.bc'], diff --git a/dizoo/d4rl/config/kitchen_complete_ibc_ar_config.py b/dizoo/d4rl/config/kitchen_complete_ibc_ar_config.py index 403dc52eff..bbb7198af0 100644 --- a/dizoo/d4rl/config/kitchen_complete_ibc_ar_config.py +++ b/dizoo/d4rl/config/kitchen_complete_ibc_ar_config.py @@ -8,7 +8,7 @@ env=dict( env_id='kitchen-complete-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), evaluator_env_num=8, @@ -18,23 +18,19 @@ ), policy=dict( cuda=cuda, - model=dict( - obs_shape=60, - action_shape=9, - stochastic_optim=dict(type='ardfo',) - ), + model=dict(obs_shape=60, action_shape=9, stochastic_optim=dict(type='ardfo', )), learn=dict( multi_gpu=multi_gpu, train_epoch=1000, batch_size=256, - optim=dict(learning_rate=1e-5,), + optim=dict(learning_rate=1e-5, ), learner=dict(hook=dict(log_show_after_iter=100)), ), collect=dict( data_type='d4rl', data_path=None, ), - eval=dict(evaluator=dict(eval_freq=1000,)), + eval=dict(evaluator=dict(eval_freq=1000, )), ), ) main_config = 
EasyDict(main_config) @@ -44,7 +40,7 @@ type='d4rl', import_names=['dizoo.d4rl.envs.d4rl_env'], ), - env_manager=dict(type='base',), + env_manager=dict(type='base', ), policy=dict( type='ibc', import_names=['ding.policy.ibc'], diff --git a/dizoo/d4rl/config/kitchen_complete_ibc_config.py b/dizoo/d4rl/config/kitchen_complete_ibc_config.py index 5c02f04a81..1606cb7792 100644 --- a/dizoo/d4rl/config/kitchen_complete_ibc_config.py +++ b/dizoo/d4rl/config/kitchen_complete_ibc_config.py @@ -8,7 +8,7 @@ env=dict( env_id='kitchen-complete-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), evaluator_env_num=8, @@ -18,23 +18,19 @@ ), policy=dict( cuda=cuda, - model=dict( - obs_shape=60, - action_shape=9, - stochastic_optim=dict(type='dfo',) - ), + model=dict(obs_shape=60, action_shape=9, stochastic_optim=dict(type='dfo', )), learn=dict( multi_gpu=multi_gpu, train_epoch=1000, batch_size=256, - optim=dict(learning_rate=1e-5,), + optim=dict(learning_rate=1e-5, ), learner=dict(hook=dict(log_show_after_iter=100)), ), collect=dict( data_type='d4rl', data_path=None, ), - eval=dict(evaluator=dict(eval_freq=1000,)), + eval=dict(evaluator=dict(eval_freq=1000, )), ), ) main_config = EasyDict(main_config) @@ -44,7 +40,7 @@ type='d4rl', import_names=['dizoo.d4rl.envs.d4rl_env'], ), - env_manager=dict(type='base',), + env_manager=dict(type='base', ), policy=dict( type='ibc', import_names=['ding.policy.ibc'], diff --git a/dizoo/d4rl/config/kitchen_complete_ibc_mcmc_config.py b/dizoo/d4rl/config/kitchen_complete_ibc_mcmc_config.py index d93c5eb737..14924d5257 100644 --- a/dizoo/d4rl/config/kitchen_complete_ibc_mcmc_config.py +++ b/dizoo/d4rl/config/kitchen_complete_ibc_mcmc_config.py @@ -8,7 +8,7 @@ env=dict( env_id='kitchen-complete-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), evaluator_env_num=8, @@ -18,23 +18,19 @@ ), policy=dict( cuda=cuda, - model=dict( - obs_shape=60, - action_shape=9, - stochastic_optim=dict(type='mcmc',) - ), + model=dict(obs_shape=60, action_shape=9, stochastic_optim=dict(type='mcmc', )), learn=dict( multi_gpu=multi_gpu, train_epoch=1000, batch_size=256, - optim=dict(learning_rate=1e-5,), + optim=dict(learning_rate=1e-5, ), learner=dict(hook=dict(log_show_after_iter=100)), ), collect=dict( data_type='d4rl', data_path=None, ), - eval=dict(evaluator=dict(eval_freq=1000,)), + eval=dict(evaluator=dict(eval_freq=1000, )), ), ) main_config = EasyDict(main_config) @@ -44,7 +40,7 @@ type='d4rl', import_names=['dizoo.d4rl.envs.d4rl_env'], ), - env_manager=dict(type='base',), + env_manager=dict(type='base', ), policy=dict( type='ibc', import_names=['ding.policy.ibc'], diff --git a/dizoo/d4rl/config/pen_human_bc_config.py b/dizoo/d4rl/config/pen_human_bc_config.py index 6779ffd934..215b706ffc 100644 --- a/dizoo/d4rl/config/pen_human_bc_config.py +++ b/dizoo/d4rl/config/pen_human_bc_config.py @@ -8,7 +8,7 @@ env=dict( env_id='pen-human-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), evaluator_env_num=8, @@ -38,7 +38,7 @@ data_type='d4rl', data_path=None, ), - eval=dict(evaluator=dict(eval_freq=1000,)), + eval=dict(evaluator=dict(eval_freq=1000, )), ), ) main_config = EasyDict(main_config) @@ -48,7 +48,7 @@ type='d4rl', import_names=['dizoo.d4rl.envs.d4rl_env'], ), - env_manager=dict(type='base',), + env_manager=dict(type='base', ), policy=dict( type='bc', import_names=['ding.policy.bc'], diff --git 
a/dizoo/d4rl/config/pen_human_ibc_ar_config.py b/dizoo/d4rl/config/pen_human_ibc_ar_config.py index b75e3b9f11..4f59733fd5 100644 --- a/dizoo/d4rl/config/pen_human_ibc_ar_config.py +++ b/dizoo/d4rl/config/pen_human_ibc_ar_config.py @@ -8,7 +8,7 @@ env=dict( env_id='pen-human-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), evaluator_env_num=8, @@ -19,24 +19,20 @@ policy=dict( cuda=cuda, model=dict( - obs_shape=45, - action_shape=24, - hidden_size=128, - hidden_layer_num=4, - stochastic_optim=dict(type='ardfo',) + obs_shape=45, action_shape=24, hidden_size=128, hidden_layer_num=4, stochastic_optim=dict(type='ardfo', ) ), learn=dict( multi_gpu=multi_gpu, train_epoch=1000, batch_size=256, - optim=dict(learning_rate=1e-5,), + optim=dict(learning_rate=1e-5, ), learner=dict(hook=dict(log_show_after_iter=100)), ), collect=dict( data_type='d4rl', data_path=None, ), - eval=dict(evaluator=dict(eval_freq=1000,)), + eval=dict(evaluator=dict(eval_freq=1000, )), ), ) main_config = EasyDict(main_config) @@ -46,7 +42,7 @@ type='d4rl', import_names=['dizoo.d4rl.envs.d4rl_env'], ), - env_manager=dict(type='base',), + env_manager=dict(type='base', ), policy=dict( type='ibc', import_names=['ding.policy.ibc'], diff --git a/dizoo/d4rl/config/pen_human_ibc_config.py b/dizoo/d4rl/config/pen_human_ibc_config.py index 207487d921..9ed4f6d17b 100644 --- a/dizoo/d4rl/config/pen_human_ibc_config.py +++ b/dizoo/d4rl/config/pen_human_ibc_config.py @@ -8,7 +8,7 @@ env=dict( env_id='pen-human-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), evaluator_env_num=8, @@ -18,23 +18,19 @@ ), policy=dict( cuda=cuda, - model=dict( - obs_shape=45, - action_shape=24, - stochastic_optim=dict(type='dfo',) - ), + model=dict(obs_shape=45, action_shape=24, stochastic_optim=dict(type='dfo', )), learn=dict( multi_gpu=multi_gpu, train_epoch=1000, batch_size=256, - optim=dict(learning_rate=1e-5,), + optim=dict(learning_rate=1e-5, ), learner=dict(hook=dict(log_show_after_iter=100)), ), collect=dict( data_type='d4rl', data_path=None, ), - eval=dict(evaluator=dict(eval_freq=1000,)), + eval=dict(evaluator=dict(eval_freq=1000, )), ), ) main_config = EasyDict(main_config) @@ -44,7 +40,7 @@ type='d4rl', import_names=['dizoo.d4rl.envs.d4rl_env'], ), - env_manager=dict(type='base',), + env_manager=dict(type='base', ), policy=dict( type='ibc', import_names=['ding.policy.ibc'], diff --git a/dizoo/d4rl/config/pen_human_ibc_mcmc_config.py b/dizoo/d4rl/config/pen_human_ibc_mcmc_config.py index cee0f631fd..4dd6b37f90 100644 --- a/dizoo/d4rl/config/pen_human_ibc_mcmc_config.py +++ b/dizoo/d4rl/config/pen_human_ibc_mcmc_config.py @@ -8,7 +8,7 @@ env=dict( env_id='pen-human-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), evaluator_env_num=8, @@ -18,23 +18,19 @@ ), policy=dict( cuda=cuda, - model=dict( - obs_shape=45, - action_shape=24, - stochastic_optim=dict(type='mcmc',) - ), + model=dict(obs_shape=45, action_shape=24, stochastic_optim=dict(type='mcmc', )), learn=dict( multi_gpu=multi_gpu, train_epoch=1000, batch_size=256, - optim=dict(learning_rate=1e-5,), + optim=dict(learning_rate=1e-5, ), learner=dict(hook=dict(log_show_after_iter=100)), ), collect=dict( data_type='d4rl', data_path=None, ), - eval=dict(evaluator=dict(eval_freq=1000,)), + eval=dict(evaluator=dict(eval_freq=1000, )), ), ) main_config = EasyDict(main_config) @@ -44,7 +40,7 @@ type='d4rl', import_names=['dizoo.d4rl.envs.d4rl_env'], 
), - env_manager=dict(type='base',), + env_manager=dict(type='base', ), policy=dict( type='ibc', import_names=['ding.policy.ibc'], diff --git a/dizoo/d4rl/config/walker2d_expert_td3bc_config.py b/dizoo/d4rl/config/walker2d_expert_td3bc_config.py index c12d58b230..f5530b7bfd 100644 --- a/dizoo/d4rl/config/walker2d_expert_td3bc_config.py +++ b/dizoo/d4rl/config/walker2d_expert_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='halfcheetah-expert-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/walker2d_medium_expert_td3bc_config.py b/dizoo/d4rl/config/walker2d_medium_expert_td3bc_config.py index 2aed878dd8..d85ddc134e 100644 --- a/dizoo/d4rl/config/walker2d_medium_expert_td3bc_config.py +++ b/dizoo/d4rl/config/walker2d_medium_expert_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='walker2d-medium-expert-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/walker2d_medium_replay_td3bc_config.py b/dizoo/d4rl/config/walker2d_medium_replay_td3bc_config.py index 67cc95a1c2..e997e7d8aa 100644 --- a/dizoo/d4rl/config/walker2d_medium_replay_td3bc_config.py +++ b/dizoo/d4rl/config/walker2d_medium_replay_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='walker2d-medium-replay-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/walker2d_medium_td3bc_config.py b/dizoo/d4rl/config/walker2d_medium_td3bc_config.py index dc76b5c012..619dc62a8d 100644 --- a/dizoo/d4rl/config/walker2d_medium_td3bc_config.py +++ b/dizoo/d4rl/config/walker2d_medium_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='walker2d-medium-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/walker2d_random_td3bc_config.py b/dizoo/d4rl/config/walker2d_random_td3bc_config.py index f252c14dbd..fe915b65bc 100644 --- a/dizoo/d4rl/config/walker2d_random_td3bc_config.py +++ b/dizoo/d4rl/config/walker2d_random_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='walker2d-random-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/entry/d4rl_cql_main.py b/dizoo/d4rl/entry/d4rl_cql_main.py index 9315a3644d..7a8934a90a 100644 --- a/dizoo/d4rl/entry/d4rl_cql_main.py +++ b/dizoo/d4rl/entry/d4rl_cql_main.py @@ -5,7 +5,7 @@ def train(args): # launch from anywhere - config = Path(__file__).absolute().parent.parent / 'config' / args.config + config = Path(__file__).absolute().parent.parent / 'config' / args.config config = read_config(str(config)) config[0].exp_name = config[0].exp_name.replace('0', str(args.seed)) serial_pipeline_offline(config, seed=args.seed) diff --git a/dizoo/d4rl/entry/d4rl_td3_bc_main.py b/dizoo/d4rl/entry/d4rl_td3_bc_main.py index bdf945978f..b25bf904a5 100644 --- a/dizoo/d4rl/entry/d4rl_td3_bc_main.py +++ b/dizoo/d4rl/entry/d4rl_td3_bc_main.py @@ -5,7 +5,7 @@ def train(args): # launch from anywhere - config = Path(__file__).absolute().parent.parent / 'config' / args.config + config = Path(__file__).absolute().parent.parent / 'config' / args.config config = read_config(str(config)) config[0].exp_name = config[0].exp_name.replace('0', str(args.seed)) serial_pipeline_offline(config, seed=args.seed) diff --git 
a/dizoo/dmc2gym/config/dmc2gym_ppo_config.py b/dizoo/dmc2gym/config/dmc2gym_ppo_config.py index 4f48633c5f..207b398e63 100644 --- a/dizoo/dmc2gym/config/dmc2gym_ppo_config.py +++ b/dizoo/dmc2gym/config/dmc2gym_ppo_config.py @@ -1,6 +1,5 @@ from easydict import EasyDict - cartpole_balance_ppo_config = dict( exp_name='dmc2gym_cartpole_balance_ppo', env=dict( diff --git a/dizoo/dmc2gym/entry/dmc2gym_sac_pixel_main.py b/dizoo/dmc2gym/entry/dmc2gym_sac_pixel_main.py index 60a83921ef..1f6eb2abb5 100644 --- a/dizoo/dmc2gym/entry/dmc2gym_sac_pixel_main.py +++ b/dizoo/dmc2gym/entry/dmc2gym_sac_pixel_main.py @@ -15,6 +15,7 @@ from dizoo.dmc2gym.envs.dmc2gym_env import DMC2GymEnv from dizoo.dmc2gym.config.dmc2gym_sac_pixel_config import main_config, create_config + def main(): logging.getLogger().setLevel(logging.INFO) main_config.exp_name = 'dmc2gym_sac_pixel_seed0' @@ -23,8 +24,8 @@ def main(): num_seed = 1 for seed_i in range(num_seed): - tb_logger = SummaryWriter(os.path.join('./{}/log/'.format(cfg.exp_name), 'seed'+str(seed_i))) - + tb_logger = SummaryWriter(os.path.join('./{}/log/'.format(cfg.exp_name), 'seed' + str(seed_i))) + with task.start(async_mode=False, ctx=OnlineRLContext()): collector_env = BaseEnvManagerV2( env_fn=[lambda: DMC2GymEnv(cfg.env) for _ in range(cfg.env.collector_env_num)], cfg=cfg.env.manager @@ -42,16 +43,20 @@ def main(): def _add_scalar(ctx): if ctx.eval_value != -np.inf: - tb_logger.add_scalar('evaluator_step/reward', ctx.eval_value, global_step= ctx.env_step) + tb_logger.add_scalar('evaluator_step/reward', ctx.eval_value, global_step=ctx.env_step) collector_rewards = [ctx.trajectories[i]['reward'] for i in range(len(ctx.trajectories))] collector_mean_reward = sum(collector_rewards) / len(ctx.trajectories) # collector_max_reward = max(collector_rewards) # collector_min_reward = min(collector_rewards) - tb_logger.add_scalar('collecter_step/mean_reward', collector_mean_reward, global_step= ctx.env_step) + tb_logger.add_scalar('collecter_step/mean_reward', collector_mean_reward, global_step=ctx.env_step) # tb_logger.add_scalar('collecter_step/max_reward', collector_max_reward, global_step= ctx.env_step) # tb_logger.add_scalar('collecter_step/min_reward', collector_min_reward, global_step= ctx.env_step) - tb_logger.add_scalar('collecter_step/avg_env_step_per_episode', ctx.env_step/ctx.env_episode, global_step= ctx.env_step) - + tb_logger.add_scalar( + 'collecter_step/avg_env_step_per_episode', + ctx.env_step / ctx.env_episode, + global_step=ctx.env_step + ) + def _add_train_scalar(ctx): len_train = len(ctx.train_output) cur_lr_q_avg = sum([ctx.train_output[i]['cur_lr_q'] for i in range(len_train)]) / len_train @@ -59,15 +64,17 @@ def _add_train_scalar(ctx): critic_loss_avg = sum([ctx.train_output[i]['critic_loss'] for i in range(len_train)]) / len_train policy_loss_avg = sum([ctx.train_output[i]['policy_loss'] for i in range(len_train)]) / len_train total_loss_avg = sum([ctx.train_output[i]['total_loss'] for i in range(len_train)]) / len_train - tb_logger.add_scalar('learner_step/cur_lr_q_avg', cur_lr_q_avg, global_step= ctx.env_step) - tb_logger.add_scalar('learner_step/cur_lr_p_avg', cur_lr_p_avg, global_step= ctx.env_step) - tb_logger.add_scalar('learner_step/critic_loss_avg', critic_loss_avg, global_step= ctx.env_step) - tb_logger.add_scalar('learner_step/policy_loss_avg', policy_loss_avg, global_step= ctx.env_step) - tb_logger.add_scalar('learner_step/total_loss_avg', total_loss_avg, global_step= ctx.env_step) - + tb_logger.add_scalar('learner_step/cur_lr_q_avg', 
cur_lr_q_avg, global_step=ctx.env_step) + tb_logger.add_scalar('learner_step/cur_lr_p_avg', cur_lr_p_avg, global_step=ctx.env_step) + tb_logger.add_scalar('learner_step/critic_loss_avg', critic_loss_avg, global_step=ctx.env_step) + tb_logger.add_scalar('learner_step/policy_loss_avg', policy_loss_avg, global_step=ctx.env_step) + tb_logger.add_scalar('learner_step/total_loss_avg', total_loss_avg, global_step=ctx.env_step) + task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) task.use( - StepCollector(cfg, policy.collect_mode, collector_env, random_collect_size=cfg.policy.random_collect_size) + StepCollector( + cfg, policy.collect_mode, collector_env, random_collect_size=cfg.policy.random_collect_size + ) ) task.use(_add_scalar) task.use(data_pusher(cfg, buffer_)) diff --git a/dizoo/dmc2gym/entry/dmc2gym_sac_state_main.py b/dizoo/dmc2gym/entry/dmc2gym_sac_state_main.py index 6bc7036352..7e6cf920f5 100644 --- a/dizoo/dmc2gym/entry/dmc2gym_sac_state_main.py +++ b/dizoo/dmc2gym/entry/dmc2gym_sac_state_main.py @@ -15,6 +15,7 @@ from tensorboardX import SummaryWriter import os + def main(): logging.getLogger().setLevel(logging.INFO) main_config.exp_name = 'dmc2gym_sac_state_nseed_5M' @@ -23,8 +24,8 @@ def main(): num_seed = 4 for seed_i in range(num_seed): - tb_logger = SummaryWriter(os.path.join('./{}/log/'.format(cfg.exp_name), 'seed'+str(seed_i))) - + tb_logger = SummaryWriter(os.path.join('./{}/log/'.format(cfg.exp_name), 'seed' + str(seed_i))) + with task.start(async_mode=False, ctx=OnlineRLContext()): collector_env = BaseEnvManagerV2( env_fn=[lambda: DMC2GymEnv(cfg.env) for _ in range(cfg.env.collector_env_num)], cfg=cfg.env.manager @@ -41,16 +42,20 @@ def main(): def _add_scalar(ctx): if ctx.eval_value != -np.inf: - tb_logger.add_scalar('evaluator_step/reward', ctx.eval_value, global_step= ctx.env_step) + tb_logger.add_scalar('evaluator_step/reward', ctx.eval_value, global_step=ctx.env_step) collector_rewards = [ctx.trajectories[i]['reward'] for i in range(len(ctx.trajectories))] collector_mean_reward = sum(collector_rewards) / len(ctx.trajectories) # collector_max_reward = max(collector_rewards) # collector_min_reward = min(collector_rewards) - tb_logger.add_scalar('collecter_step/mean_reward', collector_mean_reward, global_step= ctx.env_step) + tb_logger.add_scalar('collecter_step/mean_reward', collector_mean_reward, global_step=ctx.env_step) # tb_logger.add_scalar('collecter_step/max_reward', collector_max_reward, global_step= ctx.env_step) # tb_logger.add_scalar('collecter_step/min_reward', collector_min_reward, global_step= ctx.env_step) - tb_logger.add_scalar('collecter_step/avg_env_step_per_episode', ctx.env_step/ctx.env_episode, global_step= ctx.env_step) - + tb_logger.add_scalar( + 'collecter_step/avg_env_step_per_episode', + ctx.env_step / ctx.env_episode, + global_step=ctx.env_step + ) + def _add_train_scalar(ctx): len_train = len(ctx.train_output) cur_lr_q_avg = sum([ctx.train_output[i]['cur_lr_q'] for i in range(len_train)]) / len_train @@ -58,15 +63,17 @@ def _add_train_scalar(ctx): critic_loss_avg = sum([ctx.train_output[i]['critic_loss'] for i in range(len_train)]) / len_train policy_loss_avg = sum([ctx.train_output[i]['policy_loss'] for i in range(len_train)]) / len_train total_loss_avg = sum([ctx.train_output[i]['total_loss'] for i in range(len_train)]) / len_train - tb_logger.add_scalar('learner_step/cur_lr_q_avg', cur_lr_q_avg, global_step= ctx.env_step) - tb_logger.add_scalar('learner_step/cur_lr_p_avg', cur_lr_p_avg, global_step= ctx.env_step) - 
tb_logger.add_scalar('learner_step/critic_loss_avg', critic_loss_avg, global_step= ctx.env_step) - tb_logger.add_scalar('learner_step/policy_loss_avg', policy_loss_avg, global_step= ctx.env_step) - tb_logger.add_scalar('learner_step/total_loss_avg', total_loss_avg, global_step= ctx.env_step) - + tb_logger.add_scalar('learner_step/cur_lr_q_avg', cur_lr_q_avg, global_step=ctx.env_step) + tb_logger.add_scalar('learner_step/cur_lr_p_avg', cur_lr_p_avg, global_step=ctx.env_step) + tb_logger.add_scalar('learner_step/critic_loss_avg', critic_loss_avg, global_step=ctx.env_step) + tb_logger.add_scalar('learner_step/policy_loss_avg', policy_loss_avg, global_step=ctx.env_step) + tb_logger.add_scalar('learner_step/total_loss_avg', total_loss_avg, global_step=ctx.env_step) + task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) task.use( - StepCollector(cfg, policy.collect_mode, collector_env, random_collect_size=cfg.policy.random_collect_size) + StepCollector( + cfg, policy.collect_mode, collector_env, random_collect_size=cfg.policy.random_collect_size + ) ) task.use(_add_scalar) task.use(data_pusher(cfg, buffer_)) diff --git a/dizoo/dmc2gym/envs/dmc2gym_env.py b/dizoo/dmc2gym/envs/dmc2gym_env.py index 9e97629897..14c70b6f44 100644 --- a/dizoo/dmc2gym/envs/dmc2gym_env.py +++ b/dizoo/dmc2gym/envs/dmc2gym_env.py @@ -10,6 +10,7 @@ def dmc2gym_observation_space(dim, minimum=-np.inf, maximum=np.inf, dtype=np.float32) -> Callable: + def observation_space(from_pixels=True, height=84, width=84, channels_first=True) -> Box: if from_pixels: shape = [3, height, width] if channels_first else [height, width, 3] @@ -29,6 +30,7 @@ def dmc2gym_action_space(dim, minimum=-1, maximum=1, dtype=np.float32) -> Box: def dmc2gym_reward_space(minimum=0, maximum=1, dtype=np.float32) -> Callable: + def reward_space(frame_skip=1) -> Box: return Box( np.repeat(minimum * frame_skip, 1).astype(dtype), diff --git a/dizoo/dmc2gym/envs/test_dmc2gym_env.py b/dizoo/dmc2gym/envs/test_dmc2gym_env.py index 94e6d9e9a7..5245a7a86a 100644 --- a/dizoo/dmc2gym/envs/test_dmc2gym_env.py +++ b/dizoo/dmc2gym/envs/test_dmc2gym_env.py @@ -47,4 +47,3 @@ def test_naive(self): assert timestep.reward <= env.reward_space.high print(env.observation_space, env.action_space, env.reward_space) env.close() - diff --git a/dizoo/evogym/envs/test/visualize_simple_env.py b/dizoo/evogym/envs/test/visualize_simple_env.py index cde80b725c..2203209fbe 100644 --- a/dizoo/evogym/envs/test/visualize_simple_env.py +++ b/dizoo/evogym/envs/test/visualize_simple_env.py @@ -7,7 +7,6 @@ from dizoo.evogym.envs.viewer import DingEvoViewer from evogym.sim import EvoSim - if __name__ == '__main__': gym.logger.set_level(gym.logger.DEBUG) # create a random robot diff --git a/dizoo/gym_anytrading/config/stocks_dqn_config.py b/dizoo/gym_anytrading/config/stocks_dqn_config.py index c16ab0a5a5..c05a1f5974 100644 --- a/dizoo/gym_anytrading/config/stocks_dqn_config.py +++ b/dizoo/gym_anytrading/config/stocks_dqn_config.py @@ -78,13 +78,11 @@ import_names=['dizoo.gym_anytrading.envs.stocks_env'], ), env_manager=dict(type='base'), - policy=dict( - type='dqn', - ), + policy=dict(type='dqn', ), evaluator=dict( type='trading_interaction', import_names=['dizoo.gym_anytrading.worker'], - ), + ), ) stocks_dqn_create_config = EasyDict(stocks_dqn_create_config) create_config = stocks_dqn_create_config diff --git a/dizoo/gym_anytrading/worker/trading_serial_evaluator.py b/dizoo/gym_anytrading/worker/trading_serial_evaluator.py index 9c7749f722..d2fa4d22d1 100644 --- 
a/dizoo/gym_anytrading/worker/trading_serial_evaluator.py +++ b/dizoo/gym_anytrading/worker/trading_serial_evaluator.py @@ -32,13 +32,13 @@ class TradingSerialEvaluator(InteractionSerialEvaluator): ) def __init__( - self, - cfg: dict, - env: BaseEnvManager = None, - policy: namedtuple = None, - tb_logger: 'SummaryWriter' = None, # noqa - exp_name: Optional[str] = 'default_experiment', - instance_name: Optional[str] = 'evaluator', + self, + cfg: dict, + env: BaseEnvManager = None, + policy: namedtuple = None, + tb_logger: 'SummaryWriter' = None, # noqa + exp_name: Optional[str] = 'default_experiment', + instance_name: Optional[str] = 'evaluator', ) -> None: """ Overview: @@ -49,12 +49,12 @@ def __init__( super().__init__(cfg, env, policy, tb_logger, exp_name, instance_name) def eval( - self, - save_ckpt_fn: Callable = None, - train_iter: int = -1, - envstep: int = -1, - n_episode: Optional[int] = None, - force_render: bool = False, + self, + save_ckpt_fn: Callable = None, + train_iter: int = -1, + envstep: int = -1, + n_episode: Optional[int] = None, + force_render: bool = False, ) -> Tuple[bool, dict]: ''' Overview: diff --git a/dizoo/gym_hybrid/envs/gym-hybrid/gym_hybrid/__init__.py b/dizoo/gym_hybrid/envs/gym-hybrid/gym_hybrid/__init__.py index aa9f5bdf37..89cb5d7764 100644 --- a/dizoo/gym_hybrid/envs/gym-hybrid/gym_hybrid/__init__.py +++ b/dizoo/gym_hybrid/envs/gym-hybrid/gym_hybrid/__init__.py @@ -3,7 +3,6 @@ from gym_hybrid.environments import SlidingEnv from gym_hybrid.environments import HardMoveEnv - register( id='Moving-v0', entry_point='gym_hybrid:MovingEnv', @@ -15,4 +14,4 @@ register( id='HardMove-v0', entry_point='gym_hybrid:HardMoveEnv', -) \ No newline at end of file +) diff --git a/dizoo/gym_hybrid/envs/gym-hybrid/setup.py b/dizoo/gym_hybrid/envs/gym-hybrid/setup.py index af82deb670..248ccb4535 100644 --- a/dizoo/gym_hybrid/envs/gym-hybrid/setup.py +++ b/dizoo/gym_hybrid/envs/gym-hybrid/setup.py @@ -1,7 +1,8 @@ from setuptools import setup -setup(name='gym_hybrid', - version='0.0.2', # original gym_hybrid version='0.0.1' - packages=['gym_hybrid'], - install_requires=['gym', 'numpy'], +setup( + name='gym_hybrid', + version='0.0.2', # original gym_hybrid version='0.0.1' + packages=['gym_hybrid'], + install_requires=['gym', 'numpy'], ) diff --git a/dizoo/gym_hybrid/envs/gym-hybrid/tests/moving.py b/dizoo/gym_hybrid/envs/gym-hybrid/tests/moving.py index dbc230c0d7..52315decd9 100644 --- a/dizoo/gym_hybrid/envs/gym-hybrid/tests/moving.py +++ b/dizoo/gym_hybrid/envs/gym-hybrid/tests/moving.py @@ -2,7 +2,6 @@ import gym import gym_hybrid - if __name__ == '__main__': env = gym.make('Moving-v0') env.reset() diff --git a/dizoo/gym_hybrid/envs/test_gym_hybrid_env.py b/dizoo/gym_hybrid/envs/test_gym_hybrid_env.py index 7a7bc10006..896987f33f 100644 --- a/dizoo/gym_hybrid/envs/test_gym_hybrid_env.py +++ b/dizoo/gym_hybrid/envs/test_gym_hybrid_env.py @@ -8,7 +8,17 @@ class TestGymHybridEnv: def test_naive(self): - env = GymHybridEnv(EasyDict({'env_id': 'Moving-v0', 'act_scale': False, 'save_replay_gif': False, 'replay_path_gif': None, 'replay_path': None})) + env = GymHybridEnv( + EasyDict( + { + 'env_id': 'Moving-v0', + 'act_scale': False, + 'save_replay_gif': False, + 'replay_path_gif': None, + 'replay_path': None + } + ) + ) env.enable_save_replay('./video') env.seed(314, dynamic_seed=False) assert env._seed == 314 diff --git a/dizoo/image_classification/entry/imagenet_res18_config.py b/dizoo/image_classification/entry/imagenet_res18_config.py index 970ea4f2fd..bd4f473dd6 100644 --- 
a/dizoo/image_classification/entry/imagenet_res18_config.py +++ b/dizoo/image_classification/entry/imagenet_res18_config.py @@ -27,9 +27,7 @@ learn_data_path='/mnt/lustre/share/images/train', eval_data_path='/mnt/lustre/share/images/val', ), - eval=dict( - batch_size=32, evaluator=dict(eval_freq=1, stop_value=dict(loss=0.5, acc1=75.0, acc5=95.0)) - ), + eval=dict(batch_size=32, evaluator=dict(eval_freq=1, stop_value=dict(loss=0.5, acc1=75.0, acc5=95.0))), ), env=dict(), ) diff --git a/dizoo/league_demo/league_demo_collector.py b/dizoo/league_demo/league_demo_collector.py index 211e15b5e8..ce7985a6dc 100644 --- a/dizoo/league_demo/league_demo_collector.py +++ b/dizoo/league_demo/league_demo_collector.py @@ -25,13 +25,13 @@ class LeagueDemoCollector(ISerialCollector): config = dict(deepcopy_obs=False, transform_obs=False, collect_print_freq=100, get_train_sample=False) def __init__( - self, - cfg: EasyDict, - env: BaseEnvManager = None, - policy: List[namedtuple] = None, - tb_logger: 'SummaryWriter' = None, # noqa - exp_name: Optional[str] = 'default_experiment', - instance_name: Optional[str] = 'collector' + self, + cfg: EasyDict, + env: BaseEnvManager = None, + policy: List[namedtuple] = None, + tb_logger: 'SummaryWriter' = None, # noqa + exp_name: Optional[str] = 'default_experiment', + instance_name: Optional[str] = 'collector' ) -> None: """ Overview: diff --git a/dizoo/maze/entry/maze_bc_main.py b/dizoo/maze/entry/maze_bc_main.py index efd9b6d2a8..3a42d4e921 100644 --- a/dizoo/maze/entry/maze_bc_main.py +++ b/dizoo/maze/entry/maze_bc_main.py @@ -61,9 +61,7 @@ def get_vi_sequence(env, observation): cur_x, cur_y = start_x, start_y while cur_x != target_location[0] or cur_y != target_location[1]: act = vi_sequence[-1][cur_x, cur_y] - track_back.append(( - torch.FloatTensor(env.process_states([cur_x, cur_y], env.get_maze_map())), - act)) + track_back.append((torch.FloatTensor(env.process_states([cur_x, cur_y], env.get_maze_map())), act)) if act == 0: cur_x += 1 elif act == 1: @@ -89,6 +87,7 @@ def __len__(self): def load_bc_dataset(train_seeds=1, test_seeds=1, batch_size=32): + def load_env(seed): ccc = easydict.EasyDict({'size': 16}) e = Maze(ccc) @@ -111,13 +110,8 @@ def load_env(seed): data += track_back - - train_data = BCDataset( - data_train - ) - test_data = BCDataset( - data_test - ) + train_data = BCDataset(data_train) + test_data = BCDataset(data_test) train_dataset = DataLoader(train_data, batch_size=batch_size, shuffle=True) test_dataset = DataLoader(test_data, batch_size=batch_size, shuffle=True) diff --git a/dizoo/minigrid/utils/eval.py b/dizoo/minigrid/utils/eval.py index e8e4f728fa..e3c6acb9fb 100644 --- a/dizoo/minigrid/utils/eval.py +++ b/dizoo/minigrid/utils/eval.py @@ -8,11 +8,11 @@ def eval( - input_cfg: Union[str, Tuple[dict, dict]], - seed: int = 0, - model: Optional[torch.nn.Module] = None, - state_dict: Optional[dict] = None, - replay_path: Optional[str] = './video', + input_cfg: Union[str, Tuple[dict, dict]], + seed: int = 0, + model: Optional[torch.nn.Module] = None, + state_dict: Optional[dict] = None, + replay_path: Optional[str] = './video', ) -> float: r""" Overview: diff --git a/dizoo/mujoco/config/halfcheetah_bdq_config.py b/dizoo/mujoco/config/halfcheetah_bdq_config.py index 145bf8062e..25fb65ba35 100644 --- a/dizoo/mujoco/config/halfcheetah_bdq_config.py +++ b/dizoo/mujoco/config/halfcheetah_bdq_config.py @@ -22,7 +22,6 @@ action_bins_per_branch=2, # mean the action shape is 6, 2 discrete actions for each action dimension encoder_hidden_size_list=[256, 
256, 128], ), - learn=dict( batch_size=512, learning_rate=3e-4, @@ -65,4 +64,8 @@ if __name__ == "__main__": # or you can enter `ding -m serial_onpolicy -c halfcheetah_onbdq_config.py -s 0` from ding.entry import serial_pipeline - serial_pipeline((main_config, create_config), seed=0, max_env_step=10000000,) \ No newline at end of file + serial_pipeline( + (main_config, create_config), + seed=0, + max_env_step=10000000, + ) diff --git a/dizoo/mujoco/config/hopper_bdq_config.py b/dizoo/mujoco/config/hopper_bdq_config.py index de08da2a7a..34dbe21664 100644 --- a/dizoo/mujoco/config/hopper_bdq_config.py +++ b/dizoo/mujoco/config/hopper_bdq_config.py @@ -68,4 +68,8 @@ if __name__ == "__main__": # or you can enter `ding -m serial_onpolicy -c hopper_bdq_config.py -s 0` from ding.entry import serial_pipeline - serial_pipeline([main_config, create_config], seed=0, max_env_step=10000000,) + serial_pipeline( + [main_config, create_config], + seed=0, + max_env_step=10000000, + ) diff --git a/dizoo/mujoco/envs/mujoco_wrappers.py b/dizoo/mujoco/envs/mujoco_wrappers.py index 8fc19cd503..d99819783c 100644 --- a/dizoo/mujoco/envs/mujoco_wrappers.py +++ b/dizoo/mujoco/envs/mujoco_wrappers.py @@ -6,10 +6,10 @@ def wrap_mujoco( - env_id, - norm_obs: Dict = dict(use_norm=False, ), - norm_reward: Dict = dict(use_norm=False, ), - delay_reward_step: int = 1 + env_id, + norm_obs: Dict = dict(use_norm=False, ), + norm_reward: Dict = dict(use_norm=False, ), + delay_reward_step: int = 1 ) -> gym.Env: r""" Overview: diff --git a/dizoo/multiagent_mujoco/config/ant_mappo_config.py b/dizoo/multiagent_mujoco/config/ant_mappo_config.py index f221fa7c0f..d11c31be8d 100644 --- a/dizoo/multiagent_mujoco/config/ant_mappo_config.py +++ b/dizoo/multiagent_mujoco/config/ant_mappo_config.py @@ -75,7 +75,6 @@ ) create_config = EasyDict(create_config) - if __name__ == '__main__': from ding.entry import serial_pipeline_onpolicy serial_pipeline_onpolicy((main_config, create_config), seed=0, max_env_step=int(1e7)) diff --git a/dizoo/multiagent_mujoco/config/ant_masac_config.py b/dizoo/multiagent_mujoco/config/ant_masac_config.py index 1f04efe8b7..9316b095c0 100644 --- a/dizoo/multiagent_mujoco/config/ant_masac_config.py +++ b/dizoo/multiagent_mujoco/config/ant_masac_config.py @@ -34,9 +34,7 @@ target_theta=0.005, discount_factor=0.99, ), - collect=dict( - n_sample=400, - ), + collect=dict(n_sample=400, ), eval=dict(evaluator=dict(eval_freq=500, )), other=dict(replay_buffer=dict(replay_buffer_size=100000, ), ), ), diff --git a/dizoo/petting_zoo/config/ptz_simple_spread_madqn_config.py b/dizoo/petting_zoo/config/ptz_simple_spread_madqn_config.py index b7db69abbe..8ddb636abf 100644 --- a/dizoo/petting_zoo/config/ptz_simple_spread_madqn_config.py +++ b/dizoo/petting_zoo/config/ptz_simple_spread_madqn_config.py @@ -41,9 +41,7 @@ discount_factor=0.95, ), collect=dict( - collector=dict( - get_train_sample=True, - ), + collector=dict(get_train_sample=True, ), n_episode=32, unroll_len=10, env_num=collector_env_num, @@ -60,9 +58,7 @@ end=0.05, decay=10000, ), - replay_buffer=dict( - replay_buffer_size=15000, - ), + replay_buffer=dict(replay_buffer_size=15000, ), ), ), ) diff --git a/dizoo/rocket/entry/rocket_hover_ppo_main.py b/dizoo/rocket/entry/rocket_hover_ppo_main.py index 2539ff12d3..13f5714483 100644 --- a/dizoo/rocket/entry/rocket_hover_ppo_main.py +++ b/dizoo/rocket/entry/rocket_hover_ppo_main.py @@ -30,12 +30,10 @@ def main(): tb_logger = SummaryWriter(os.path.join('./{}/log/'.format(cfg.exp_name), 'seed' + str(seed_i))) with 
task.start(async_mode=False, ctx=OnlineRLContext()): collector_env = BaseEnvManagerV2( - env_fn=[lambda: RocketEnv(cfg.env) for _ in range(cfg.env.collector_env_num)], - cfg=cfg.env.manager + env_fn=[lambda: RocketEnv(cfg.env) for _ in range(cfg.env.collector_env_num)], cfg=cfg.env.manager ) evaluator_env = BaseEnvManagerV2( - env_fn=[lambda: RocketEnv(cfg.env) for _ in range(cfg.env.evaluator_env_num)], - cfg=cfg.env.manager + env_fn=[lambda: RocketEnv(cfg.env) for _ in range(cfg.env.evaluator_env_num)], cfg=cfg.env.manager ) # evaluator_env.enable_save_replay() diff --git a/dizoo/rocket/entry/rocket_landing_ppo_main.py b/dizoo/rocket/entry/rocket_landing_ppo_main.py index cc83242ce5..bf8ebb5162 100644 --- a/dizoo/rocket/entry/rocket_landing_ppo_main.py +++ b/dizoo/rocket/entry/rocket_landing_ppo_main.py @@ -27,15 +27,13 @@ def main(): cfg = compile_config(main_config, create_cfg=create_config, auto=True) num_seed = 4 for seed_i in range(num_seed): - tb_logger = SummaryWriter(os.path.join('./{}/log/'.format(cfg.exp_name), 'seed'+str(seed_i))) + tb_logger = SummaryWriter(os.path.join('./{}/log/'.format(cfg.exp_name), 'seed' + str(seed_i))) with task.start(async_mode=False, ctx=OnlineRLContext()): collector_env = BaseEnvManagerV2( - env_fn=[lambda: RocketEnv(cfg.env) for _ in range(cfg.env.collector_env_num)], - cfg=cfg.env.manager + env_fn=[lambda: RocketEnv(cfg.env) for _ in range(cfg.env.collector_env_num)], cfg=cfg.env.manager ) evaluator_env = BaseEnvManagerV2( - env_fn=[lambda: RocketEnv(cfg.env) for _ in range(cfg.env.evaluator_env_num)], - cfg=cfg.env.manager + env_fn=[lambda: RocketEnv(cfg.env) for _ in range(cfg.env.evaluator_env_num)], cfg=cfg.env.manager ) # evaluator_env.enable_save_replay() diff --git a/dizoo/rocket/envs/test_rocket_env.py b/dizoo/rocket/envs/test_rocket_env.py index e19d2879c1..a8bf030fe7 100644 --- a/dizoo/rocket/envs/test_rocket_env.py +++ b/dizoo/rocket/envs/test_rocket_env.py @@ -12,7 +12,7 @@ def test_hover(self): env.seed(314, dynamic_seed=False) assert env._seed == 314 obs = env.reset() - assert obs.shape == (8,) + assert obs.shape == (8, ) for _ in range(5): env.reset() np.random.seed(314) @@ -28,8 +28,8 @@ def test_hover(self): print('timestep', timestep, '\n') assert isinstance(timestep.obs, np.ndarray) assert isinstance(timestep.done, bool) - assert timestep.obs.shape == (8,) - assert timestep.reward.shape == (1,) + assert timestep.obs.shape == (8, ) + assert timestep.reward.shape == (1, ) assert timestep.reward >= env.reward_space.low assert timestep.reward <= env.reward_space.high print(env.observation_space, env.action_space, env.reward_space) diff --git a/dizoo/smac/config/smac_3s5z_madqn_config.py b/dizoo/smac/config/smac_3s5z_madqn_config.py index c15dfcd655..5e771baf09 100644 --- a/dizoo/smac/config/smac_3s5z_madqn_config.py +++ b/dizoo/smac/config/smac_3s5z_madqn_config.py @@ -18,9 +18,7 @@ stop_value=0.999, n_evaluator_episode=32, special_global_state=True, - manager=dict( - shared_memory=False, - ), + manager=dict(shared_memory=False, ), ), policy=dict( nstep=1, @@ -41,9 +39,7 @@ discount_factor=0.95, ), collect=dict( - collector=dict( - get_train_sample=True, - ), + collector=dict(get_train_sample=True, ), n_episode=32, unroll_len=10, env_num=collector_env_num, @@ -56,9 +52,7 @@ end=0.05, decay=10000, ), - replay_buffer=dict( - replay_buffer_size=15000, - ), + replay_buffer=dict(replay_buffer_size=15000, ), ), ), ) diff --git a/dizoo/smac/config/smac_3s5zvs3s6z_madqn_config.py b/dizoo/smac/config/smac_3s5zvs3s6z_madqn_config.py index 
23c215b63c..438025241f 100644 --- a/dizoo/smac/config/smac_3s5zvs3s6z_madqn_config.py +++ b/dizoo/smac/config/smac_3s5zvs3s6z_madqn_config.py @@ -18,9 +18,7 @@ stop_value=0.999, n_evaluator_episode=32, special_global_state=True, - manager=dict( - shared_memory=False, - ), + manager=dict(shared_memory=False, ), ), policy=dict( nstep=3, @@ -41,9 +39,7 @@ discount_factor=0.95, ), collect=dict( - collector=dict( - get_train_sample=True, - ), + collector=dict(get_train_sample=True, ), n_episode=32, unroll_len=10, env_num=collector_env_num, @@ -56,9 +52,7 @@ end=0.05, decay=100000, ), - replay_buffer=dict( - replay_buffer_size=30000, - ), + replay_buffer=dict(replay_buffer_size=30000, ), ), ), ) diff --git a/dizoo/smac/config/smac_5m6m_madqn_config.py b/dizoo/smac/config/smac_5m6m_madqn_config.py index 0aa0497712..d05bb23dcb 100644 --- a/dizoo/smac/config/smac_5m6m_madqn_config.py +++ b/dizoo/smac/config/smac_5m6m_madqn_config.py @@ -27,7 +27,7 @@ obs_shape=72, global_obs_shape=152, action_shape=12, - hidden_size_list=[256,256], + hidden_size_list=[256, 256], ), learn=dict( update_per_collect=40, @@ -38,9 +38,7 @@ discount_factor=0.95, ), collect=dict( - collector=dict( - get_train_sample=True, - ), + collector=dict(get_train_sample=True, ), n_episode=32, unroll_len=10, env_num=collector_env_num, @@ -53,9 +51,7 @@ end=0.05, decay=50000, ), - replay_buffer=dict( - replay_buffer_size=50000, - ), + replay_buffer=dict(replay_buffer_size=50000, ), ), ), ) @@ -87,7 +83,6 @@ def train(args): train(args) - def train(args): config = [main_config, create_config] serial_pipeline(config, seed=args.seed, max_env_step=1e7) diff --git a/dizoo/smac/config/smac_8m9m_madqn_config.py b/dizoo/smac/config/smac_8m9m_madqn_config.py index ccf9153a14..672330df24 100644 --- a/dizoo/smac/config/smac_8m9m_madqn_config.py +++ b/dizoo/smac/config/smac_8m9m_madqn_config.py @@ -27,7 +27,7 @@ obs_shape=108, global_obs_shape=263, action_shape=15, - hidden_size_list=[256,256], + hidden_size_list=[256, 256], ), learn=dict( update_per_collect=40, @@ -38,9 +38,7 @@ discount_factor=0.95, ), collect=dict( - collector=dict( - get_train_sample=True, - ), + collector=dict(get_train_sample=True, ), n_episode=32, unroll_len=20, env_num=collector_env_num, @@ -53,9 +51,7 @@ end=0.05, decay=50000, ), - replay_buffer=dict( - replay_buffer_size=20000, - ), + replay_buffer=dict(replay_buffer_size=20000, ), ), ), ) @@ -87,7 +83,6 @@ def train(args): train(args) - def train(args): config = [main_config, create_config] serial_pipeline(config, seed=args.seed, max_env_step=1e7) diff --git a/dizoo/smac/config/smac_MMM2_madqn_config.py b/dizoo/smac/config/smac_MMM2_madqn_config.py index 60e3123dc4..fe8e96501c 100644 --- a/dizoo/smac/config/smac_MMM2_madqn_config.py +++ b/dizoo/smac/config/smac_MMM2_madqn_config.py @@ -18,9 +18,7 @@ stop_value=0.999, n_evaluator_episode=32, special_global_state=True, - manager=dict( - shared_memory=False, - ), + manager=dict(shared_memory=False, ), ), policy=dict( nstep=1, @@ -41,9 +39,7 @@ discount_factor=0.95, ), collect=dict( - collector=dict( - get_train_sample=True, - ), + collector=dict(get_train_sample=True, ), n_episode=32, unroll_len=20, env_num=collector_env_num, @@ -56,9 +52,7 @@ end=0.05, decay=100000, ), - replay_buffer=dict( - replay_buffer_size=30000, - ), + replay_buffer=dict(replay_buffer_size=30000, ), ), ), ) diff --git a/dizoo/smac/config/smac_MMM_madqn_config.py b/dizoo/smac/config/smac_MMM_madqn_config.py index 1d9a6abeaf..892f1f5217 100644 --- a/dizoo/smac/config/smac_MMM_madqn_config.py +++ 
b/dizoo/smac/config/smac_MMM_madqn_config.py @@ -18,9 +18,7 @@ stop_value=0.999, n_evaluator_episode=32, special_global_state=True, - manager=dict( - shared_memory=False, - ), + manager=dict(shared_memory=False, ), ), policy=dict( nstep=1, @@ -41,9 +39,7 @@ discount_factor=0.95, ), collect=dict( - collector=dict( - get_train_sample=True, - ), + collector=dict(get_train_sample=True, ), n_episode=32, unroll_len=10, env_num=collector_env_num, @@ -56,9 +52,7 @@ end=0.05, decay=10000, ), - replay_buffer=dict( - replay_buffer_size=15000, - ), + replay_buffer=dict(replay_buffer_size=15000, ), ), ), ) diff --git a/dizoo/smac/utils/eval.py b/dizoo/smac/utils/eval.py index 6d683a8ace..1e112e84a7 100644 --- a/dizoo/smac/utils/eval.py +++ b/dizoo/smac/utils/eval.py @@ -10,11 +10,11 @@ def eval( - input_cfg: Union[str, Tuple[dict, dict]], - seed: int = 0, - env_setting: Optional[List[Any]] = None, - model: Optional[torch.nn.Module] = None, - state_dict: Optional[dict] = None, + input_cfg: Union[str, Tuple[dict, dict]], + seed: int = 0, + env_setting: Optional[List[Any]] = None, + model: Optional[torch.nn.Module] = None, + state_dict: Optional[dict] = None, ) -> float: r""" Overview: From 1c111c2c020a7387f2667445e419568163bf5598 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 28 Mar 2023 09:35:09 +0000 Subject: [PATCH 057/244] polish code --- dizoo/common/dqn/lunarlander_dqn.py | 1 + dizoo/common/ppo/lunarlander_ppo.py | 1 + dizoo/common/sac/lunarlander_sac.py | 1 + dizoo/common/td3/lunarlander_td3.py | 1 + 4 files changed, 4 insertions(+) diff --git a/dizoo/common/dqn/lunarlander_dqn.py b/dizoo/common/dqn/lunarlander_dqn.py index aabea36a56..ac411179d7 100644 --- a/dizoo/common/dqn/lunarlander_dqn.py +++ b/dizoo/common/dqn/lunarlander_dqn.py @@ -17,5 +17,6 @@ github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", installation_guide="pip3 install DI-engine[common_env,fast]", usage_file_path="./dizoo/common/dqn/lunarlander_dqn_download.py", + train_file_path="./dizoo/common/dqn/lunarlander_dqn.py", repo_id="OpenDILabCommunity/Lunarlander-v2-DQN" ) diff --git a/dizoo/common/ppo/lunarlander_ppo.py b/dizoo/common/ppo/lunarlander_ppo.py index 648266b15f..0456f89226 100644 --- a/dizoo/common/ppo/lunarlander_ppo.py +++ b/dizoo/common/ppo/lunarlander_ppo.py @@ -17,5 +17,6 @@ github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", installation_guide="pip3 install DI-engine[common_env,fast]", usage_file_path="./dizoo/common/ppo/lunarlander_ppo_download.py", + train_file_path="./dizoo/common/ppo/lunarlander_ppo.py", repo_id="OpenDILabCommunity/LunarLander-v2-PPO" ) diff --git a/dizoo/common/sac/lunarlander_sac.py b/dizoo/common/sac/lunarlander_sac.py index 1008d23fd9..16b7028e46 100644 --- a/dizoo/common/sac/lunarlander_sac.py +++ b/dizoo/common/sac/lunarlander_sac.py @@ -17,5 +17,6 @@ github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", installation_guide="pip3 install DI-engine[common_env,fast]", usage_file_path="./dizoo/common/sac/lunarlander_sac_download.py", + train_file_path="./dizoo/common/sac/lunarlander_sac.py", repo_id="OpenDILabCommunity/LunarLander-v2-SAC" ) diff --git a/dizoo/common/td3/lunarlander_td3.py b/dizoo/common/td3/lunarlander_td3.py index c97a61aa78..0d6f6bfb65 100644 --- a/dizoo/common/td3/lunarlander_td3.py +++ b/dizoo/common/td3/lunarlander_td3.py @@ -17,5 +17,6 @@ github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", 
installation_guide="pip3 install DI-engine[common_env,fast]", usage_file_path="./dizoo/common/td3/lunarlander_td3_download.py", + train_file_path="./dizoo/common/td3/lunarlander_td3.py", repo_id="OpenDILabCommunity/LunarLander-v2-TD3" ) From 3a0543776b9e89ed6479e752d59c090c02a615f8 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 28 Mar 2023 10:02:41 +0000 Subject: [PATCH 058/244] polish code --- ding/bonus/__init__.py | 1 + ding/bonus/config.py | 69 ++++- ding/bonus/ddpg.py | 248 ++++++++++++++++++ dizoo/common/ddpg/lunarlander_ddpg.py | 22 ++ .../common/ddpg/lunarlander_ddpg_download.py | 16 ++ dizoo/common/ppo/lunarlander_ppo.py | 2 +- dizoo/common/td3/lunarlander_td3.py | 2 +- 7 files changed, 356 insertions(+), 4 deletions(-) create mode 100644 ding/bonus/ddpg.py create mode 100644 dizoo/common/ddpg/lunarlander_ddpg.py create mode 100644 dizoo/common/ddpg/lunarlander_ddpg_download.py diff --git a/ding/bonus/__init__.py b/ding/bonus/__init__.py index af3d089c6e..cfe27b509e 100644 --- a/ding/bonus/__init__.py +++ b/ding/bonus/__init__.py @@ -1,5 +1,6 @@ from .ppof import PPOF from .td3 import TD3OffPolicyAgent +from .ddpg import DDPGOffPolicyAgent from .dqn import DQNOffpolicyAgent from .sac import SACOffPolicyAgent from .impala import IMPALAOffPolicyAgent diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 3a42e482ac..f1a6f609b2 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -4,7 +4,7 @@ from ding.envs import BaseEnv, DingEnvWrapper from ding.envs.env_wrappers import MaxAndSkipWrapper, WarpFrameWrapper, ScaledFloatFrameWrapper, FrameStackWrapper, \ EvalEpisodeReturnEnv, TransposeWrapper, TimeLimitWrapper, FlatObsWrapper, GymToGymnasiumWrapper -from ding.policy import PPOFPolicy, TD3Policy, SACPolicy, DQNPolicy, IMPALAPolicy +from ding.policy import PPOFPolicy, TD3Policy, DDPGPolicy, SACPolicy, DQNPolicy, IMPALAPolicy def get_instance_config(env: str, algorithm: str) -> EasyDict: @@ -161,7 +161,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: exp_name='LunarLanderContinuous-V2-TD3', seed=0, env=dict( - env_id='Hopper-v3', + env_id='LunarLanderContinuous-v2', collector_env_num=8, evaluator_env_num=8, n_evaluator_episode=8, @@ -200,6 +200,71 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ) else: raise KeyError("not supported env type: {}".format(env)) + elif algorithm == 'DDPG': + cfg = EasyDict({"policy": DDPGPolicy.default_config()}) + if env == 'hopper': + pass + elif env == 'lunarlander_continuous': + cfg.update( + dict( + exp_name='LunarLanderContinuous-V2-DDPG', + seed=0, + env=dict( + env_id='LunarLanderContinuous-v2', + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + act_scale=True, + stop_value=240, + ), + policy=dict( + cuda=True, + random_collect_size=0, + model=dict( + obs_shape=8, + action_shape=2, + twin_critic=True, + action_space='regression', + ), + learn=dict( + update_per_collect=2, + batch_size=128, + learning_rate_actor=0.001, + learning_rate_critic=0.001, + ignore_done=False, # TODO(pu) + # (int) When critic network updates once, how many times will actor network update. + # Delayed Policy Updates in original TD3 paper(https://arxiv.org/pdf/1802.09477.pdf). + # Default 1 for DDPG, 2 for TD3. + actor_update_freq=1, + # (bool) Whether to add noise on target network's action. + # Target Policy Smoothing Regularization in original TD3 paper(https://arxiv.org/pdf/1802.09477.pdf). + # Default True for TD3, False for DDPG. 
+ noise=False, + noise_sigma=0.1, + noise_range=dict( + min=-0.5, + max=0.5, + ), + ), + collect=dict( + n_sample=48, + noise_sigma=0.1, + collector=dict(collect_print_freq=1000, ), + ), + eval=dict(evaluator=dict(eval_freq=100, ), ), + other=dict(replay_buffer=dict(replay_buffer_size=20000, ), ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) + else: + raise KeyError("not supported env type: {}".format(env)) elif algorithm == 'SAC': cfg = EasyDict({"policy": SACPolicy.default_config()}) if env == 'hopper': diff --git a/ding/bonus/ddpg.py b/ding/bonus/ddpg.py new file mode 100644 index 0000000000..5a5516b4ff --- /dev/null +++ b/ding/bonus/ddpg.py @@ -0,0 +1,248 @@ +from dataclasses import dataclass +from typing import Optional, Union +from ditk import logging +from easydict import EasyDict +import os +import gym +import torch +import treetensor.torch as ttorch +import numpy as np +from ding.framework import task, OnlineRLContext +from ding.framework.middleware import CkptSaver, multistep_trainer, \ + wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, data_pusher, \ + OffPolicyLearner, final_ctx_saver +from ding.envs import BaseEnv, BaseEnvManagerV2, SubprocessEnvManagerV2 +from ding.policy import DDPGPolicy +from ding.utils import set_pkg_seed +from ding.config import Config, save_config_py, compile_config +from ding.model import QAC +from ding.data import DequeBuffer +from ding.bonus.config import get_instance_config, get_instance_env + + +@dataclass +class TrainingReturn: + ''' + Attributions + wandb_url: The weight & biases (wandb) project url of the trainning experiment. + ''' + wandb_url: str + + +@dataclass +class EvalReturn: + ''' + Attributions + eval_value: The mean of evaluation return. + eval_value_std: The standard deviation of evaluation return. + ''' + eval_value: np.float32 + eval_value_std: np.float32 + + +class DDPGOffPolicyAgent: + supported_env_list = [ + 'lunarlander_continuous', + ] + algorithm = 'DDPG' + + def __init__( + self, + env: Union[str, BaseEnv], + seed: int = 0, + exp_name: str = None, + model: Optional[torch.nn.Module] = None, + cfg: Optional[Union[EasyDict, dict, str]] = None, + policy_state_dict: str = None, + ) -> None: + if isinstance(env, str): + assert env in DDPGOffPolicyAgent.supported_env_list, "Please use supported envs: {}".format( + DDPGOffPolicyAgent.supported_env_list + ) + self.env = get_instance_env(env) + if cfg is None: + # 'It should be default env tuned config' + cfg = get_instance_config(env, algorithm=DDPGOffPolicyAgent.algorithm) + else: + assert isinstance(cfg, EasyDict), "Please use EasyDict as config data type." 
+ + if exp_name is not None: + cfg.exp_name = exp_name + self.cfg = compile_config(cfg, policy=DDPGPolicy) + self.exp_name = self.cfg.exp_name + + elif isinstance(env, BaseEnv): + self.cfg = compile_config(cfg, policy=DDPGPolicy) + raise NotImplementedError + else: + raise TypeError("not support env type: {}, only strings and instances of `BaseEnv` now".format(type(env))) + logging.getLogger().setLevel(logging.INFO) + self.seed = seed + set_pkg_seed(self.seed, use_cuda=self.cfg.policy.cuda) + if not os.path.exists(self.exp_name): + os.makedirs(self.exp_name) + save_config_py(self.cfg, os.path.join(self.exp_name, 'policy_config.py')) + if model is None: + model = QAC(**self.cfg.policy.model) + self.buffer_ = DequeBuffer(size=self.cfg.policy.other.replay_buffer.replay_buffer_size) + self.policy = DDPGPolicy(self.cfg.policy, model=model) + if policy_state_dict is not None: + self.policy.learn_mode.load_state_dict(policy_state_dict) + + def train( + self, + step: int = int(1e7), + collector_env_num: int = 4, + evaluator_env_num: int = 4, + n_iter_log_show: int = 500, + n_iter_save_ckpt: int = 1000, + context: Optional[str] = None, + debug: bool = False + ) -> TrainingReturn: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + logging.debug(self.policy._model) + # define env and policy + collector_env = self._setup_env_manager(collector_env_num, context, debug) + evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug) + + with task.start(ctx=OnlineRLContext()): + task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) + task.use( + StepCollector( + self.cfg, + self.policy.collect_mode, + collector_env, + random_collect_size=self.cfg.policy.random_collect_size + ) + ) + task.use(data_pusher(self.cfg, self.buffer_)) + task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_)) + task.use( + CkptSaver( + policy=self.policy, + save_dir=os.path.join(self.cfg["exp_name"], "model"), + train_freq=n_iter_save_ckpt + ) + ) + task.use( + wandb_online_logger( + metric_list=self.policy.monitor_vars(), + model=self.policy._model, + anonymous=True, + project_name=self.exp_name + ) + ) + task.use(termination_checker(max_env_step=step)) + task.use(final_ctx_saver(name=self.cfg["exp_name"])) + task.run() + + return TrainingReturn(wandb_url=task.ctx.wandb_url) + + def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> None: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + # define env and policy + env = self.env.clone() + env.seed(self.seed, dynamic_seed=False) + + if enable_save_replay and replay_save_path: + env.enable_save_replay(replay_path=replay_save_path) + elif enable_save_replay: + env.enable_save_replay(replay_path=os.path.join(self.exp_name, 'videos')) + else: + logging.warning('No video would be generated during the deploy.') + + def single_env_forward_wrapper(forward_fn, cuda=True): + + def _forward(obs): + # unsqueeze means add batch dim, i.e. (O, ) -> (1, O) + obs = ttorch.as_tensor(obs).unsqueeze(0) + if cuda and torch.cuda.is_available(): + obs = obs.cuda() + action = forward_fn(obs, mode='compute_actor')["action"] + # squeeze means delete batch dim, i.e. (1, A) -> (A, ) + action = action.squeeze(0).detach().cpu().numpy() + return action + + return _forward + + forward_fn = single_env_forward_wrapper(self.policy._model, self.cfg.policy.cuda) + + # main loop + return_ = 0. 
+ step = 0 + obs = env.reset() + while True: + action = forward_fn(obs) + obs, rew, done, info = env.step(action) + return_ += rew + step += 1 + if done: + break + logging.info(f'DDPG deploy is finished, final episode return with {step} steps is: {return_}') + + def collect_data( + self, + env_num: int = 8, + save_data_path: Optional[str] = None, + n_sample: Optional[int] = None, + n_episode: Optional[int] = None, + context: Optional[str] = None, + debug: bool = False + ) -> None: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + if n_episode is not None: + raise NotImplementedError + # define env and policy + env = self._setup_env_manager(env_num, context, debug) + + if save_data_path is None: + save_data_path = os.path.join(self.exp_name, 'demo_data') + + # main execution task + with task.start(ctx=OnlineRLContext()): + task.use( + StepCollector( + self.cfg, self.policy.collect_mode, env, random_collect_size=self.cfg.policy.random_collect_size + ) + ) + task.use(offline_data_saver(save_data_path, data_type='hdf5')) + task.run(max_step=1) + logging.info( + f'DDPG collecting is finished, more than {n_sample} samples are collected and saved in `{save_data_path}`' + ) + + def batch_evaluate( + self, + env_num: int = 4, + n_evaluator_episode: int = 4, + context: Optional[str] = None, + debug: bool = False + ) -> EvalReturn: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + # define env and policy + env = self._setup_env_manager(env_num, context, debug) + + evaluate_cfg = self.cfg + evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode + + # main execution task + with task.start(ctx=OnlineRLContext()): + task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, env)) + task.run(max_step=1) + + return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) + + def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: bool = False) -> BaseEnvManagerV2: + if debug: + env_cls = BaseEnvManagerV2 + manager_cfg = env_cls.default_config() + else: + env_cls = SubprocessEnvManagerV2 + manager_cfg = env_cls.default_config() + if context is not None: + manager_cfg.context = context + return env_cls([self.env.clone for _ in range(env_num)], manager_cfg) diff --git a/dizoo/common/ddpg/lunarlander_ddpg.py b/dizoo/common/ddpg/lunarlander_ddpg.py new file mode 100644 index 0000000000..c5ec50348b --- /dev/null +++ b/dizoo/common/ddpg/lunarlander_ddpg.py @@ -0,0 +1,22 @@ +from ding.bonus import DDPGOffPolicyAgent +from huggingface_ding import push_model_to_hub + +# Instantiate the agent +agent = DDPGOffPolicyAgent("lunarlander_continuous", exp_name="LunarLander-v2-DDPG") +# Train the agent +return_ = agent.train(step=int(4000000), collector_env_num=4, evaluator_env_num=4) +# Push model to huggingface hub +push_model_to_hub( + agent=agent, + env_name="OpenAI/Gym/Box2d", + task_name="LunarLander-v2", + algo_name="DDPG", + wandb_url=return_.wandb_url, + github_repo_url="https://github.com/opendilab/DI-engine", + github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/ddpg.html", + github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", + installation_guide="pip3 install DI-engine[common_env,fast]", + usage_file_path="./dizoo/common/ddpg/lunarlander_ddpg_download.py", + train_file_path="./dizoo/common/ddpg/lunarlander_ddpg.py", + repo_id="OpenDILabCommunity/LunarLander-v2-DDPG" +) diff --git a/dizoo/common/ddpg/lunarlander_ddpg_download.py 
b/dizoo/common/ddpg/lunarlander_ddpg_download.py new file mode 100644 index 0000000000..5458d5610f --- /dev/null +++ b/dizoo/common/ddpg/lunarlander_ddpg_download.py @@ -0,0 +1,16 @@ +from ding.bonus import DDPGOffPolicyAgent +from huggingface_ding import pull_model_from_hub + +# Pull model from Hugggingface hub +policy_state_dict, cfg = pull_model_from_hub(repo_id="OpenDILabCommunity/LunarLander-v2-DDPG") +# Instantiate the agent +agent = DDPGOffPolicyAgent( + env="lunarlander_continuous", + exp_name="LunarLander-v2-DDPG", + cfg=cfg.exp_config, + policy_state_dict=policy_state_dict +) +# Continue training +agent.train(step=5000) +# Render the new agent performance +agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/ppo/lunarlander_ppo.py b/dizoo/common/ppo/lunarlander_ppo.py index 0456f89226..ae29699a65 100644 --- a/dizoo/common/ppo/lunarlander_ppo.py +++ b/dizoo/common/ppo/lunarlander_ppo.py @@ -4,7 +4,7 @@ # Instantiate the agent agent = PPOF("lunarlander_discrete", exp_name="LunarLander-v2-PPO") # Train the agent -return_ = agent.train(step=int(200000), collector_env_num=4, evaluator_env_num=4) +return_ = agent.train(step=int(4000000), collector_env_num=4, evaluator_env_num=4) # Push model to huggingface hub push_model_to_hub( agent=agent, diff --git a/dizoo/common/td3/lunarlander_td3.py b/dizoo/common/td3/lunarlander_td3.py index 0d6f6bfb65..2d01d7717a 100644 --- a/dizoo/common/td3/lunarlander_td3.py +++ b/dizoo/common/td3/lunarlander_td3.py @@ -4,7 +4,7 @@ # Instantiate the agent agent = TD3OffPolicyAgent("lunarlander_continuous", exp_name="LunarLander-v2-TD3") # Train the agent -return_ = agent.train(step=int(200000), collector_env_num=4, evaluator_env_num=4) +return_ = agent.train(step=int(4000000), collector_env_num=4, evaluator_env_num=4) # Push model to huggingface hub push_model_to_hub( agent=agent, From 54b1a096825798818b63d86b8d9bdbcdb973fb9a Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 28 Mar 2023 10:24:01 +0000 Subject: [PATCH 059/244] fix ddpg bug --- ding/framework/middleware/functional/logger.py | 2 +- ding/policy/ddpg.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ding/framework/middleware/functional/logger.py b/ding/framework/middleware/functional/logger.py index 844e3e8cfb..50ebbbbd8e 100644 --- a/ding/framework/middleware/functional/logger.py +++ b/ding/framework/middleware/functional/logger.py @@ -198,7 +198,7 @@ def _plot(ctx: "OnlineRLContext"): if cfg.plot_logger: for metric in metric_list: - if metric in ctx.train_output[0]: + if len(ctx.train_output)>0 and metric in ctx.train_output[0]: metric_value_list = [] for item in ctx.train_output: if isinstance(item[metric], torch.Tensor): diff --git a/ding/policy/ddpg.py b/ding/policy/ddpg.py index 2887b7480d..02d7c6f6d1 100644 --- a/ding/policy/ddpg.py +++ b/ding/policy/ddpg.py @@ -450,3 +450,6 @@ def _monitor_vars_learn(self) -> List[str]: if self._twin_critic: ret += ['critic_twin_loss'] return ret + + def monitor_vars(self) -> List[str]: + return self._monitor_vars_learn() From 1caefff3e95bcd053a7652c6042967fde759e064 Mon Sep 17 00:00:00 2001 From: zhangpaipai <1124321458@qq.com> Date: Tue, 28 Mar 2023 22:58:34 +0800 Subject: [PATCH 060/244] merge nyz c51/dqn config and policy --- ding/bonus/c51.py | 40 +++++++++---------- ding/bonus/config.py | 23 +++++------ ding/policy/c51.py | 20 +++++++--- ding/policy/dqn.py | 2 +- .../config/lunarlander_c51_config.py | 19 +++++---- 5 files changed, 52 insertions(+), 52 deletions(-) diff --git a/ding/bonus/c51.py b/ding/bonus/c51.py 
index 1eb97fdd3f..ae2e25884e 100644 --- a/ding/bonus/c51.py +++ b/ding/bonus/c51.py @@ -3,9 +3,9 @@ from ditk import logging from easydict import EasyDict import os -import gym import torch import treetensor.torch as ttorch +import numpy as np from ding.framework import task, OnlineRLContext from ding.framework.middleware import CkptSaver, multistep_trainer, \ wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, data_pusher, \ @@ -15,6 +15,7 @@ from ding.utils import set_pkg_seed from ding.config import Config, save_config_py, compile_config from ding.model import C51DQN +from ding.model import model_wrap from ding.data import DequeBuffer from ding.bonus.config import get_instance_config, get_instance_env @@ -28,6 +29,17 @@ class TrainingReturn: wandb_url: str +@dataclass +class EvalReturn: + ''' + Attributions + eval_value: The mean of evaluation return. + eval_value_std: The standard deviation of evaluation return. + ''' + eval_value: np.float32 + eval_value_std: np.float32 + + class C51Agent: supported_env_list = [ 'lunarlander_discrete', @@ -146,36 +158,20 @@ def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, def single_env_forward_wrapper(forward_fn, cuda=True): + forward_fn = model_wrap(forward_fn, wrapper_name='argmax_sample').forward def _forward(obs): # unsqueeze means add batch dim, i.e. (O, ) -> (1, O) obs = ttorch.as_tensor(obs).unsqueeze(0) if cuda and torch.cuda.is_available(): obs = obs.cuda() - output = forward_fn(obs) - assert isinstance(output, dict), "model output must be dict, but find {}".format(type(output)) - logit = output['logit'] - assert isinstance(logit, torch.Tensor) or isinstance(logit, list) - if isinstance(logit, torch.Tensor): - logit = [logit] - if 'action_mask' in output: - mask = output['action_mask'] - if isinstance(mask, torch.Tensor): - mask = [mask] - logit = [l.sub_(1e8 * (1 - m)) for l, m in zip(logit, mask)] - action = [l.argmax(dim=-1) for l in logit] - if len(action) == 1: - action, logit = action[0], logit[0] - #forward_fn.eval() - #action = forward_fn(obs)["action"] - + action = forward_fn(obs)["action"] # squeeze means delete batch dim, i.e. (1, A) -> (A, ) action = action.squeeze(0).detach().cpu().numpy() return action return _forward - forward_fn = single_env_forward_wrapper(self.policy._model) - #forward_fn = single_env_forward_wrapper(self.policy._eval_model) + forward_fn = single_env_forward_wrapper(self.policy._model, self.cfg.policy.cuda) # main loop return_ = 0. 
@@ -228,7 +224,7 @@ def batch_evaluate( n_evaluator_episode: int = 4, context: Optional[str] = None, debug: bool = False - ) -> None: + ) -> EvalReturn: if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -242,6 +238,8 @@ def batch_evaluate( task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, env)) task.run(max_step=1) + return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) + def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: bool = False) -> BaseEnvManagerV2: if debug: env_cls = BaseEnvManagerV2 diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 5931171c60..8c8bafc2b6 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -4,11 +4,7 @@ from ding.envs import BaseEnv, DingEnvWrapper from ding.envs.env_wrappers import MaxAndSkipWrapper, WarpFrameWrapper, ScaledFloatFrameWrapper, FrameStackWrapper, \ EvalEpisodeReturnEnv, TransposeWrapper, TimeLimitWrapper, FlatObsWrapper, GymToGymnasiumWrapper -<<<<<<< HEAD -from ding.policy import PPOFPolicy, TD3Policy, C51Policy -======= -from ding.policy import PPOFPolicy, TD3Policy, SACPolicy, DQNPolicy, IMPALAPolicy ->>>>>>> 1973d01c940fec01980a051081afbdfcfafa8829 +from ding.policy import PPOFPolicy, TD3Policy, SACPolicy, DQNPolicy, IMPALAPolicy, C51Policy def get_instance_config(env: str, algorithm: str) -> EasyDict: @@ -440,25 +436,24 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ), policy=dict( cuda=False, - priority=True, model=dict( obs_shape=8, action_shape=4, - encoder_hidden_size_list=[128, 128, 64], - v_min=-10, - v_max=10, + encoder_hidden_size_list=[512, 64], + v_min=-30, + v_max=30, n_atom=51, ), - discount_factor=0.97, + discount_factor=0.99, nstep=3, learn=dict( - update_per_collect=3, + update_per_collect=10, batch_size=64, learning_rate=0.001, target_update_freq=100, ), collect=dict( - n_sample=80, + n_sample=64, unroll_len=1, ), other=dict( @@ -466,8 +461,8 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: type='exp', start=0.95, end=0.1, - decay=10000, - ), replay_buffer=dict(replay_buffer_size=20000, ) + decay=50000, + ), replay_buffer=dict(replay_buffer_size=100000, ) ), random_collect_size=0, ), diff --git a/ding/policy/c51.py b/ding/policy/c51.py index 8ca8493c02..e5cd42bf52 100644 --- a/ding/policy/c51.py +++ b/ding/policy/c51.py @@ -157,16 +157,20 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: self._learn_model.train() self._target_model.train() # Current q value (main model) - q_value = self._learn_model.forward(data['obs'])['distribution'] - logit = self._learn_model.forward(data['obs'])['logit'] + output = self._learn_model.forward(data['obs']) + q_value = output['logit'] + q_value_dist = output['distribution'] # Target q value with torch.no_grad(): - target_q_value = self._target_model.forward(data['next_obs'])['distribution'] + target_output = self._target_model.forward(data['next_obs']) + target_q_value_dist = target_output['distribution'] + target_q_value = target_output['logit'] # Max q value action (main model) target_q_action = self._learn_model.forward(data['next_obs'])['action'] data_n = dist_nstep_td_data( - q_value, target_q_value, data['action'], target_q_action, data['reward'], data['done'], data['weight'] + q_value_dist, target_q_value_dist, data['action'], target_q_action, data['reward'], data['done'], + data['weight'] ) value_gamma = data.get('value_gamma') loss, td_error_per_sample = dist_nstep_td_error( @@ -187,13 +191,17 @@ def 
_forward_learn(self, data: dict) -> Dict[str, Any]: # ============= self._target_model.update(self._learn_model.state_dict()) return { - 'logit': logit.mean().item(), 'cur_lr': self._optimizer.defaults['lr'], 'total_loss': loss.item(), + 'q_value': q_value.mean().item(), + 'target_q_value': target_q_value.mean().item(), 'priority': td_error_per_sample.abs().tolist(), # Only discrete action satisfying len(data['action'])==1 can return this and draw histogram on tensorboard. # '[histogram]action_distribution': data['action'], } + + def _monitor_vars_learn(self) -> List[str]: + return ['cur_lr', 'total_loss', 'q_value', 'target_q_value'] def _state_dict_learn(self) -> Dict[str, Any]: return { @@ -260,4 +268,4 @@ def _get_train_sample(self, data: list) -> Union[None, List[Any]]: return get_train_sample(data, self._unroll_len) def monitor_vars(self) -> List[str]: - return ['logit', 'cur_lr', 'total_loss', 'priority'] + return ['cur_lr', 'total_loss', 'q_value', 'target_q_value'] diff --git a/ding/policy/dqn.py b/ding/policy/dqn.py index 91db6a1840..dd8d2ea1f3 100644 --- a/ding/policy/dqn.py +++ b/ding/policy/dqn.py @@ -270,7 +270,7 @@ def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: } def _monitor_vars_learn(self) -> List[str]: - return ['cur_lr', 'total_loss', 'q_value'] + return ['cur_lr', 'total_loss', 'q_value', 'target_q_value'] def _state_dict_learn(self) -> Dict[str, Any]: """ diff --git a/dizoo/box2d/lunarlander/config/lunarlander_c51_config.py b/dizoo/box2d/lunarlander/config/lunarlander_c51_config.py index 8ca965dfb8..8a843f838c 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_c51_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_c51_config.py @@ -10,26 +10,25 @@ stop_value=200, ), policy=dict( - cuda=False, - priority=True, + cuda=True, model=dict( obs_shape=8, action_shape=4, - encoder_hidden_size_list=[128, 128, 64], - v_min=-10, - v_max=10, + encoder_hidden_size_list=[512, 64], + v_min=-30, + v_max=30, n_atom=51, ), - discount_factor=0.97, + discount_factor=0.99, nstep=3, learn=dict( - update_per_collect=3, + update_per_collect=10, batch_size=64, learning_rate=0.001, target_update_freq=100, ), collect=dict( - n_sample=80, + n_sample=64, unroll_len=1, ), other=dict( @@ -37,8 +36,8 @@ type='exp', start=0.95, end=0.1, - decay=10000, - ), replay_buffer=dict(replay_buffer_size=20000, ) + decay=50000, + ), replay_buffer=dict(replay_buffer_size=100000, ) ), ), ) From 6a8d53597d8b4b4ae942561c0fb795571b099c56 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 29 Mar 2023 03:50:42 +0000 Subject: [PATCH 061/244] fix config --- ding/bonus/config.py | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index f1a6f609b2..b6d7ac45c8 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -162,7 +162,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: seed=0, env=dict( env_id='LunarLanderContinuous-v2', - collector_env_num=8, + collector_env_num=4, evaluator_env_num=8, n_evaluator_episode=8, act_scale=True, @@ -170,7 +170,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ), policy=dict( cuda=True, - random_collect_size=25000, + random_collect_size=10000, model=dict( obs_shape=8, action_shape=2, @@ -180,14 +180,20 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: update_per_collect=256, batch_size=256, learning_rate_actor=3e-4, - learning_rate_critic=3e-4, + learning_rate_critic=1e-3, + noise=True, noise_sigma=0.1, + 
noise_range=dict( + min=-0.5, + max=0.5, + ), ), collect=dict( n_sample=256, noise_sigma=0.1, ), - other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), + eval=dict(evaluator=dict(eval_freq=1000, ), ), + other=dict(replay_buffer=dict(replay_buffer_size=100000, ), ), ), wandb_logger=dict( gradient_logger=True, @@ -314,7 +320,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: seed=0, env=dict( env_id='LunarLanderContinuous-v2', - collector_env_num=8, + collector_env_num=4, evaluator_env_num=8, act_scale=True, n_evaluator_episode=8, @@ -322,24 +328,24 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ), policy=dict( cuda=True, - random_collect_size=25000, + random_collect_size=10000, model=dict( obs_shape=8, action_shape=2, action_space='reparameterization', - actor_head_hidden_size=256, - critic_head_hidden_size=256, + twin_critic=True, ), learn=dict( update_per_collect=256, - batch_size=256, - learning_rate_actor=3e-4, - learning_rate_critic=3e-4, - reparameterization=True, - auto_alpha=False, + batch_size=128, + learning_rate_q=1e-3, + learning_rate_policy=3e-4, + learning_rate_alpha=3e-4, + auto_alpha=True, ), collect=dict(n_sample=256, ), - other=dict(replay_buffer=dict(replay_buffer_size=int(1e6), ), ), + eval=dict(evaluator=dict(eval_freq=1000, ), ), + other=dict(replay_buffer=dict(replay_buffer_size=int(1e5), ), ), ), wandb_logger=dict( gradient_logger=True, From 6fb3534699e6f45cb9197fdedee9f01634f47782 Mon Sep 17 00:00:00 2001 From: zhangpaipai <1124321458@qq.com> Date: Wed, 29 Mar 2023 13:32:09 +0800 Subject: [PATCH 062/244] remove mutistep_trainer --- ding/bonus/c51.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ding/bonus/c51.py b/ding/bonus/c51.py index ae2e25884e..0e7e696020 100644 --- a/ding/bonus/c51.py +++ b/ding/bonus/c51.py @@ -119,7 +119,6 @@ def train( ) task.use(nstep_reward_enhancer(self.cfg)) task.use(data_pusher(self.cfg, self.buffer_)) - task.use(multistep_trainer(self.policy, log_freq=n_iter_log_show)) task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_)) task.use( CkptSaver( From c54f220981e76e0aaf3159f69add2d6fe7699090 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 29 Mar 2023 08:04:17 +0000 Subject: [PATCH 063/244] fix bug --- ding/bonus/ppof.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py index 9bafd63a42..edd8087ef7 100644 --- a/ding/bonus/ppof.py +++ b/ding/bonus/ppof.py @@ -166,7 +166,7 @@ def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, else: logging.warning('No video would be generated during the deploy.') - forward_fn = single_env_forward_wrapper_ttorch(self.policy.eval, self.cfg.policy.cuda) + forward_fn = single_env_forward_wrapper_ttorch(self.policy.eval, self.cfg.cuda) # main loop return_ = 0. 
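For context on the one-line ppof.py fix in [PATCH 063/244] above: the PPOF bonus config stores its cuda flag at the top level (self.cfg.cuda) rather than under self.cfg.policy, which is presumably why the old lookup broke during deploy. The wrapped forward function returned by single_env_forward_wrapper_ttorch is then driven by a plain single-environment rollout; the sketch below only illustrates that usage pattern. The run_episode helper and the classic 4-tuple gym step API are assumptions for illustration, not DI-engine's actual deploy implementation.

# Hedged sketch: consuming a single-env forward wrapper such as
# forward_fn = single_env_forward_wrapper_ttorch(policy.eval, cfg.cuda)
import gym


def run_episode(env: gym.Env, forward_fn) -> float:
    # forward_fn maps one raw observation to one action; batching and
    # CPU/GPU placement are assumed to be handled inside the wrapper.
    obs = env.reset()
    done, episode_return = False, 0.
    while not done:
        action = forward_fn(obs)
        obs, reward, done, _ = env.step(action)  # classic gym 4-tuple step API assumed
        episode_return += float(reward)
    return episode_return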
From 557102eff3c46940d0546401b923f2e6a45ffaea Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 29 Mar 2023 12:39:31 +0000 Subject: [PATCH 064/244] polish code --- .../config/lunarlander_cont_sac_config.py | 4 +--- dizoo/common/ddpg/lunarlander_ddpg.py | 5 +++-- dizoo/common/ddpg/lunarlander_ddpg_deploy.py | 19 +++++++++++++++++++ dizoo/common/dqn/lunarlander_dqn.py | 5 +++-- dizoo/common/dqn/lunarlander_dqn_deploy.py | 19 +++++++++++++++++++ dizoo/common/dqn/lunarlander_dqn_download.py | 5 +---- dizoo/common/ppo/lunarlander_ppo.py | 5 +++-- dizoo/common/ppo/lunarlander_ppo_deploy.py | 16 ++++++++++++++++ dizoo/common/ppo/lunarlander_ppo_download.py | 2 +- dizoo/common/sac/lunarlander_sac.py | 5 +++-- dizoo/common/sac/lunarlander_sac_deploy.py | 16 ++++++++++++++++ dizoo/common/sac/lunarlander_sac_download.py | 5 ++++- dizoo/common/td3/lunarlander_td3.py | 5 +++-- dizoo/common/td3/lunarlander_td3_deploy.py | 19 +++++++++++++++++++ 14 files changed, 111 insertions(+), 19 deletions(-) create mode 100644 dizoo/common/ddpg/lunarlander_ddpg_deploy.py create mode 100644 dizoo/common/dqn/lunarlander_dqn_deploy.py create mode 100644 dizoo/common/ppo/lunarlander_ppo_deploy.py create mode 100644 dizoo/common/sac/lunarlander_sac_deploy.py create mode 100644 dizoo/common/td3/lunarlander_td3_deploy.py diff --git a/dizoo/box2d/lunarlander/config/lunarlander_cont_sac_config.py b/dizoo/box2d/lunarlander/config/lunarlander_cont_sac_config.py index 0e60fce608..f8a8ab47e7 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_cont_sac_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_cont_sac_config.py @@ -28,9 +28,7 @@ learning_rate_alpha=3e-4, auto_alpha=True, ), - collect=dict( - n_sample=256, - ), + collect=dict(n_sample=256, ), eval=dict(evaluator=dict(eval_freq=1000, ), ), other=dict(replay_buffer=dict(replay_buffer_size=int(1e5), ), ), ), diff --git a/dizoo/common/ddpg/lunarlander_ddpg.py b/dizoo/common/ddpg/lunarlander_ddpg.py index c5ec50348b..2a49bf9112 100644 --- a/dizoo/common/ddpg/lunarlander_ddpg.py +++ b/dizoo/common/ddpg/lunarlander_ddpg.py @@ -16,7 +16,8 @@ github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/ddpg.html", github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", installation_guide="pip3 install DI-engine[common_env,fast]", - usage_file_path="./dizoo/common/ddpg/lunarlander_ddpg_download.py", - train_file_path="./dizoo/common/ddpg/lunarlander_ddpg.py", + usage_file_by_git_clone="./dizoo/common/ddpg/lunarlander_ddpg_deploy.py", + usage_file_by_huggingface_ding="./dizoo/common/ddpg/lunarlander_ddpg_download.py", + train_file="./dizoo/common/ddpg/lunarlander_ddpg.py", repo_id="OpenDILabCommunity/LunarLander-v2-DDPG" ) diff --git a/dizoo/common/ddpg/lunarlander_ddpg_deploy.py b/dizoo/common/ddpg/lunarlander_ddpg_deploy.py new file mode 100644 index 0000000000..4b6260662d --- /dev/null +++ b/dizoo/common/ddpg/lunarlander_ddpg_deploy.py @@ -0,0 +1,19 @@ +from ding.bonus import DDPGOffPolicyAgent +from ding.config import Config +from easydict import EasyDict +import torch + +# Pull model from files which are git cloned from huggingface +policy_state_dict = torch.load("pytorch_model.bin", map_location=torch.device("cpu")) +cfg = EasyDict(Config.file_to_dict("policy_config.py")) +# Instantiate the agent +agent = DDPGOffPolicyAgent( + env="lunarlander_continuous", + exp_name="LunarLander-v2-DDPG", + cfg=cfg.exp_config, + policy_state_dict=policy_state_dict +) +# Continue training +agent.train(step=5000) +# 
Render the new agent performance +agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/dqn/lunarlander_dqn.py b/dizoo/common/dqn/lunarlander_dqn.py index ac411179d7..91d9f72db9 100644 --- a/dizoo/common/dqn/lunarlander_dqn.py +++ b/dizoo/common/dqn/lunarlander_dqn.py @@ -16,7 +16,8 @@ github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/dqn.html", github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", installation_guide="pip3 install DI-engine[common_env,fast]", - usage_file_path="./dizoo/common/dqn/lunarlander_dqn_download.py", - train_file_path="./dizoo/common/dqn/lunarlander_dqn.py", + usage_file_by_git_clone="./dizoo/common/dqn/lunarlander_dqn_deploy.py", + usage_file_by_huggingface_ding="./dizoo/common/dqn/lunarlander_dqn_download.py", + train_file="./dizoo/common/dqn/lunarlander_dqn.py", repo_id="OpenDILabCommunity/Lunarlander-v2-DQN" ) diff --git a/dizoo/common/dqn/lunarlander_dqn_deploy.py b/dizoo/common/dqn/lunarlander_dqn_deploy.py new file mode 100644 index 0000000000..f60e66f9c1 --- /dev/null +++ b/dizoo/common/dqn/lunarlander_dqn_deploy.py @@ -0,0 +1,19 @@ +from ding.bonus import DQNOffpolicyAgent +from ding.config import Config +from easydict import EasyDict +import torch + +# Pull model from files which are git cloned from huggingface +policy_state_dict = torch.load("pytorch_model.bin", map_location=torch.device("cpu")) +cfg = EasyDict(Config.file_to_dict("policy_config.py")) +# Instantiate the agent +agent = DQNOffpolicyAgent( + env="lunarlander_discrete", + exp_name="Lunarlander-v2-DQN-test", + cfg=cfg.exp_config, + policy_state_dict=policy_state_dict +) +# Continue training +agent.train(step=5000) +# Render the new agent performance +agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/dqn/lunarlander_dqn_download.py b/dizoo/common/dqn/lunarlander_dqn_download.py index 2ad25b5084..ff848f9ca3 100644 --- a/dizoo/common/dqn/lunarlander_dqn_download.py +++ b/dizoo/common/dqn/lunarlander_dqn_download.py @@ -5,10 +5,7 @@ policy_state_dict, cfg = pull_model_from_hub(repo_id="OpenDILabCommunity/Lunarlander-v2-DQN") # Instantiate the agent agent = DQNOffpolicyAgent( - env="lunarlander_discrete", - exp_name="Lunarlander-v2-DQN-test", - cfg=cfg.exp_config, - policy_state_dict=policy_state_dict + env="lunarlander_discrete", exp_name="Lunarlander-v2-DQN", cfg=cfg.exp_config, policy_state_dict=policy_state_dict ) # Continue training agent.train(step=5000) diff --git a/dizoo/common/ppo/lunarlander_ppo.py b/dizoo/common/ppo/lunarlander_ppo.py index ae29699a65..99a13c0f8d 100644 --- a/dizoo/common/ppo/lunarlander_ppo.py +++ b/dizoo/common/ppo/lunarlander_ppo.py @@ -16,7 +16,8 @@ github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/ppo.html", github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", installation_guide="pip3 install DI-engine[common_env,fast]", - usage_file_path="./dizoo/common/ppo/lunarlander_ppo_download.py", - train_file_path="./dizoo/common/ppo/lunarlander_ppo.py", + usage_file_by_git_clone="./dizoo/common/ppo/lunarlander_ppo_deploy.py", + usage_file_by_huggingface_ding="./dizoo/common/ppo/lunarlander_ppo_download.py", + train_file="./dizoo/common/ppo/lunarlander_ppo.py", repo_id="OpenDILabCommunity/LunarLander-v2-PPO" ) diff --git a/dizoo/common/ppo/lunarlander_ppo_deploy.py b/dizoo/common/ppo/lunarlander_ppo_deploy.py new file mode 100644 index 0000000000..87b266995d --- /dev/null +++ 
b/dizoo/common/ppo/lunarlander_ppo_deploy.py @@ -0,0 +1,16 @@ +from ding.bonus import PPOF +from ding.config import Config +from easydict import EasyDict +import torch + +# Pull model from files which are git cloned from huggingface +policy_state_dict = torch.load("pytorch_model.bin", map_location=torch.device("cpu")) +cfg = EasyDict(Config.file_to_dict("policy_config.py")) +# Instantiate the agent +agent = PPOF( + env="lunarlander_discrete", exp_name="lunarlander-ppo", cfg=cfg.exp_config, policy_state_dict=policy_state_dict +) +# Continue training +agent.train(step=5000) +# Render the new agent performance +agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/ppo/lunarlander_ppo_download.py b/dizoo/common/ppo/lunarlander_ppo_download.py index dd2ac3b4c6..fcf0ec3a03 100644 --- a/dizoo/common/ppo/lunarlander_ppo_download.py +++ b/dizoo/common/ppo/lunarlander_ppo_download.py @@ -5,7 +5,7 @@ policy_state_dict, cfg = pull_model_from_hub(repo_id="OpenDILabCommunity/LunarLander-v2-PPO") # Instantiate the agent agent = PPOF( - env="lunarlander_discrete", exp_name="lunarlander-ppo", cfg=cfg.exp_config, policy_state_dict=policy_state_dict + env="lunarlander_discrete", exp_name="LunarLander-v2-PPO", cfg=cfg.exp_config, policy_state_dict=policy_state_dict ) # Continue training agent.train(step=5000) diff --git a/dizoo/common/sac/lunarlander_sac.py b/dizoo/common/sac/lunarlander_sac.py index 16b7028e46..ca14e6104c 100644 --- a/dizoo/common/sac/lunarlander_sac.py +++ b/dizoo/common/sac/lunarlander_sac.py @@ -16,7 +16,8 @@ github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/sac.html", github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", installation_guide="pip3 install DI-engine[common_env,fast]", - usage_file_path="./dizoo/common/sac/lunarlander_sac_download.py", - train_file_path="./dizoo/common/sac/lunarlander_sac.py", + usage_file_by_git_clone="./dizoo/common/sac/lunarlander_sac_deploy.py", + usage_file_by_huggingface_ding="./dizoo/common/sac/lunarlander_sac_download.py", + train_file="./dizoo/common/sac/lunarlander_sac.py", repo_id="OpenDILabCommunity/LunarLander-v2-SAC" ) diff --git a/dizoo/common/sac/lunarlander_sac_deploy.py b/dizoo/common/sac/lunarlander_sac_deploy.py new file mode 100644 index 0000000000..63c585ea5e --- /dev/null +++ b/dizoo/common/sac/lunarlander_sac_deploy.py @@ -0,0 +1,16 @@ +from ding.bonus import SACOffPolicyAgent +from ding.config import Config +from easydict import EasyDict +import torch + +# Pull model from files which are git cloned from huggingface +policy_state_dict = torch.load("pytorch_model.bin", map_location=torch.device("cpu")) +cfg = EasyDict(Config.file_to_dict("policy_config.py")) +# Instantiate the agent +agent = SACOffPolicyAgent( + env="lunarlander_continuous", exp_name="lunarlander-sac", cfg=cfg.exp_config, policy_state_dict=policy_state_dict +) +# Continue training +agent.train(step=5000) +# Render the new agent performance +agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/sac/lunarlander_sac_download.py b/dizoo/common/sac/lunarlander_sac_download.py index 36863eb2fb..72fdf15dce 100644 --- a/dizoo/common/sac/lunarlander_sac_download.py +++ b/dizoo/common/sac/lunarlander_sac_download.py @@ -5,7 +5,10 @@ policy_state_dict, cfg = pull_model_from_hub(repo_id="OpenDILabCommunity/LunarLander-v2-SAC") # Instantiate the agent agent = SACOffPolicyAgent( - env="lunarlander_continuous", exp_name="lunarlander-sac", cfg=cfg.exp_config, 
policy_state_dict=policy_state_dict + env="lunarlander_continuous", + exp_name="LunarLander-v2-SAC", + cfg=cfg.exp_config, + policy_state_dict=policy_state_dict ) # Continue training agent.train(step=5000) diff --git a/dizoo/common/td3/lunarlander_td3.py b/dizoo/common/td3/lunarlander_td3.py index 2d01d7717a..1f4758a5f4 100644 --- a/dizoo/common/td3/lunarlander_td3.py +++ b/dizoo/common/td3/lunarlander_td3.py @@ -16,7 +16,8 @@ github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/td3.html", github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", installation_guide="pip3 install DI-engine[common_env,fast]", - usage_file_path="./dizoo/common/td3/lunarlander_td3_download.py", - train_file_path="./dizoo/common/td3/lunarlander_td3.py", + usage_file_by_git_clone="./dizoo/common/td3/lunarlander_td3_deploy.py", + usage_file_by_huggingface_ding="./dizoo/common/td3/lunarlander_td3_download.py", + train_file="./dizoo/common/td3/lunarlander_td3.py", repo_id="OpenDILabCommunity/LunarLander-v2-TD3" ) diff --git a/dizoo/common/td3/lunarlander_td3_deploy.py b/dizoo/common/td3/lunarlander_td3_deploy.py new file mode 100644 index 0000000000..5c62441137 --- /dev/null +++ b/dizoo/common/td3/lunarlander_td3_deploy.py @@ -0,0 +1,19 @@ +from ding.bonus import TD3OffPolicyAgent +from ding.config import Config +from easydict import EasyDict +import torch + +# Pull model from files which are git cloned from huggingface +policy_state_dict = torch.load("pytorch_model.bin", map_location=torch.device("cpu")) +cfg = EasyDict(Config.file_to_dict("policy_config.py")) +# Instantiate the agent +agent = TD3OffPolicyAgent( + env="lunarlander_continuous", + exp_name="LunarLander-v2-TD3", + cfg=cfg.exp_config, + policy_state_dict=policy_state_dict +) +# Continue training +agent.train(step=5000) +# Render the new agent performance +agent.deploy(enable_save_replay=True) From 95f995c8d29d0f37e1ce6211d02f5c3fa7f1fb3d Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 29 Mar 2023 14:06:27 +0000 Subject: [PATCH 065/244] polish code --- dizoo/common/dqn/lunarlander_dqn_deploy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dizoo/common/dqn/lunarlander_dqn_deploy.py b/dizoo/common/dqn/lunarlander_dqn_deploy.py index f60e66f9c1..f623227688 100644 --- a/dizoo/common/dqn/lunarlander_dqn_deploy.py +++ b/dizoo/common/dqn/lunarlander_dqn_deploy.py @@ -9,7 +9,7 @@ # Instantiate the agent agent = DQNOffpolicyAgent( env="lunarlander_discrete", - exp_name="Lunarlander-v2-DQN-test", + exp_name="Lunarlander-v2-DQN", cfg=cfg.exp_config, policy_state_dict=policy_state_dict ) From c5e9a526d9e84b8e9d28e18053a445d7059c9575 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 30 Mar 2023 06:15:09 +0000 Subject: [PATCH 066/244] polish code --- ding/bonus/__init__.py | 10 +++++----- ding/bonus/ddpg.py | 8 ++++---- ding/bonus/dqn.py | 8 ++++---- ding/bonus/impala.py | 8 ++++---- ding/bonus/sac.py | 8 ++++---- ding/bonus/td3.py | 8 ++++---- dizoo/common/ddpg/lunarlander_ddpg.py | 6 +++--- dizoo/common/ddpg/lunarlander_ddpg_deploy.py | 4 ++-- dizoo/common/ddpg/lunarlander_ddpg_download.py | 4 ++-- dizoo/common/dqn/lunarlander_dqn.py | 6 +++--- dizoo/common/dqn/lunarlander_dqn_deploy.py | 4 ++-- dizoo/common/dqn/lunarlander_dqn_download.py | 4 ++-- dizoo/common/ppo/lunarlander_ppo.py | 2 +- dizoo/common/sac/lunarlander_sac.py | 6 +++--- dizoo/common/sac/lunarlander_sac_deploy.py | 4 ++-- dizoo/common/sac/lunarlander_sac_download.py | 4 ++-- 
dizoo/common/td3/lunarlander_td3.py | 6 +++--- dizoo/common/td3/lunarlander_td3_deploy.py | 4 ++-- dizoo/common/td3/lunarlander_td3_download.py | 4 ++-- 19 files changed, 54 insertions(+), 54 deletions(-) diff --git a/ding/bonus/__init__.py b/ding/bonus/__init__.py index cfe27b509e..0f5d4b1f48 100644 --- a/ding/bonus/__init__.py +++ b/ding/bonus/__init__.py @@ -1,6 +1,6 @@ from .ppof import PPOF -from .td3 import TD3OffPolicyAgent -from .ddpg import DDPGOffPolicyAgent -from .dqn import DQNOffpolicyAgent -from .sac import SACOffPolicyAgent -from .impala import IMPALAOffPolicyAgent +from .td3 import TD3Agent +from .ddpg import DDPGAgent +from .dqn import DQNAgent +from .sac import SACAgent +from .impala import IMPALAAgent diff --git a/ding/bonus/ddpg.py b/ding/bonus/ddpg.py index 5a5516b4ff..72757c41a0 100644 --- a/ding/bonus/ddpg.py +++ b/ding/bonus/ddpg.py @@ -40,7 +40,7 @@ class EvalReturn: eval_value_std: np.float32 -class DDPGOffPolicyAgent: +class DDPGAgent: supported_env_list = [ 'lunarlander_continuous', ] @@ -56,13 +56,13 @@ def __init__( policy_state_dict: str = None, ) -> None: if isinstance(env, str): - assert env in DDPGOffPolicyAgent.supported_env_list, "Please use supported envs: {}".format( - DDPGOffPolicyAgent.supported_env_list + assert env in DDPGAgent.supported_env_list, "Please use supported envs: {}".format( + DDPGAgent.supported_env_list ) self.env = get_instance_env(env) if cfg is None: # 'It should be default env tuned config' - cfg = get_instance_config(env, algorithm=DDPGOffPolicyAgent.algorithm) + cfg = get_instance_config(env, algorithm=DDPGAgent.algorithm) else: assert isinstance(cfg, EasyDict), "Please use EasyDict as config data type." diff --git a/ding/bonus/dqn.py b/ding/bonus/dqn.py index 8faaeaf5b4..cbab3e5bc9 100644 --- a/ding/bonus/dqn.py +++ b/ding/bonus/dqn.py @@ -40,7 +40,7 @@ class EvalReturn: eval_value_std: np.float32 -class DQNOffpolicyAgent: +class DQNAgent: supported_env_list = [ 'lunarlander_discrete', ] @@ -56,13 +56,13 @@ def __init__( policy_state_dict: str = None, ) -> None: if isinstance(env, str): - assert env in DQNOffpolicyAgent.supported_env_list, "Please use supported envs: {}".format( - DQNOffpolicyAgent.supported_env_list + assert env in DQNAgent.supported_env_list, "Please use supported envs: {}".format( + DQNAgent.supported_env_list ) self.env = get_instance_env(env) if cfg is None: # 'It should be default env tuned config' - cfg = get_instance_config(env, algorithm=DQNOffpolicyAgent.algorithm) + cfg = get_instance_config(env, algorithm=DQNAgent.algorithm) else: assert isinstance(cfg, EasyDict), "Please use EasyDict as config data type." 
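The ding/bonus hunks in [PATCH 066/244] are pure renames (DQNOffpolicyAgent becomes DQNAgent, DDPGOffPolicyAgent becomes DDPGAgent, and so on); constructor arguments, the supported_env_list check and the EasyDict assertion stay unchanged. A minimal before/after sketch of caller code, reusing the lunarlander_discrete entry and the short train/deploy calls from the bundled demo scripts (the step count is illustrative):

# Before [PATCH 066/244]:
#   from ding.bonus import DQNOffpolicyAgent
#   agent = DQNOffpolicyAgent(env="lunarlander_discrete", exp_name="Lunarlander-v2-DQN")
# After the rename only the class name changes:
from ding.bonus import DQNAgent

agent = DQNAgent(env="lunarlander_discrete", exp_name="Lunarlander-v2-DQN")
agent.train(step=5000)                  # brief training run, as in the demo scripts
agent.deploy(enable_save_replay=True)   # record a replay of the trained policy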
diff --git a/ding/bonus/impala.py b/ding/bonus/impala.py index 875f4f4e0c..ea9e866c78 100644 --- a/ding/bonus/impala.py +++ b/ding/bonus/impala.py @@ -41,7 +41,7 @@ class EvalReturn: eval_value_std: np.float32 -class IMPALAOffPolicyAgent: +class IMPALAAgent: supported_env_list = [ 'SpaceInvaders', ] @@ -57,13 +57,13 @@ def __init__( policy_state_dict: str = None, ) -> None: if isinstance(env, str): - assert env in IMPALAOffPolicyAgent.supported_env_list, "Please use supported envs: {}".format( - IMPALAOffPolicyAgent.supported_env_list + assert env in IMPALAAgent.supported_env_list, "Please use supported envs: {}".format( + IMPALAAgent.supported_env_list ) self.env = get_instance_env(env) if cfg is None: # 'It should be default env tuned config' - cfg = get_instance_config(env, algorithm=IMPALAOffPolicyAgent.algorithm) + cfg = get_instance_config(env, algorithm=IMPALAAgent.algorithm) else: assert isinstance(cfg, EasyDict), "Please use EasyDict as config data type." diff --git a/ding/bonus/sac.py b/ding/bonus/sac.py index e1cdac9e0c..7aff4f3fd7 100644 --- a/ding/bonus/sac.py +++ b/ding/bonus/sac.py @@ -41,7 +41,7 @@ class EvalReturn: eval_value_std: np.float32 -class SACOffPolicyAgent: +class SACAgent: supported_env_list = [ 'hopper', 'lunarlander_continuous', @@ -58,13 +58,13 @@ def __init__( policy_state_dict: str = None, ) -> None: if isinstance(env, str): - assert env in SACOffPolicyAgent.supported_env_list, "Please use supported envs: {}".format( - SACOffPolicyAgent.supported_env_list + assert env in SACAgent.supported_env_list, "Please use supported envs: {}".format( + SACAgent.supported_env_list ) self.env = get_instance_env(env) if cfg is None: # 'It should be default env tuned config' - cfg = get_instance_config(env, algorithm=SACOffPolicyAgent.algorithm) + cfg = get_instance_config(env, algorithm=SACAgent.algorithm) else: assert isinstance(cfg, EasyDict), "Please use EasyDict as config data type." diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 8880121e28..79934b0268 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -40,7 +40,7 @@ class EvalReturn: eval_value_std: np.float32 -class TD3OffPolicyAgent: +class TD3Agent: supported_env_list = [ 'hopper', 'lunarlander_continuous', @@ -57,13 +57,13 @@ def __init__( policy_state_dict: str = None, ) -> None: if isinstance(env, str): - assert env in TD3OffPolicyAgent.supported_env_list, "Please use supported envs: {}".format( - TD3OffPolicyAgent.supported_env_list + assert env in TD3Agent.supported_env_list, "Please use supported envs: {}".format( + TD3Agent.supported_env_list ) self.env = get_instance_env(env) if cfg is None: # 'It should be default env tuned config' - cfg = get_instance_config(env, algorithm=TD3OffPolicyAgent.algorithm) + cfg = get_instance_config(env, algorithm=TD3Agent.algorithm) else: assert isinstance(cfg, EasyDict), "Please use EasyDict as config data type." 
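Each renamed wrapper above resolves its defaults through get_instance_config(env, algorithm=...) using the class-level algorithm attribute whenever no cfg is passed, so that attribute is what selects the tuned hyper-parameters. Below is a hedged sketch of fetching and adjusting that default config before construction; the batch_size override is purely illustrative and assumes get_instance_config keeps the signature shown in this series:

from ding.bonus import TD3Agent
from ding.bonus.config import get_instance_config

# Build the same EasyDict the agent would create for itself when cfg is None.
cfg = get_instance_config("lunarlander_continuous", algorithm=TD3Agent.algorithm)
cfg.policy.learn.batch_size = 128   # illustrative override of one tuned field
agent = TD3Agent(env="lunarlander_continuous", exp_name="LunarLander-v2-TD3", cfg=cfg)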
diff --git a/dizoo/common/ddpg/lunarlander_ddpg.py b/dizoo/common/ddpg/lunarlander_ddpg.py index 2a49bf9112..e550c5b721 100644 --- a/dizoo/common/ddpg/lunarlander_ddpg.py +++ b/dizoo/common/ddpg/lunarlander_ddpg.py @@ -1,8 +1,8 @@ -from ding.bonus import DDPGOffPolicyAgent +from ding.bonus import DDPGAgent from huggingface_ding import push_model_to_hub # Instantiate the agent -agent = DDPGOffPolicyAgent("lunarlander_continuous", exp_name="LunarLander-v2-DDPG") +agent = DDPGAgent("lunarlander_continuous", exp_name="LunarLander-v2-DDPG") # Train the agent return_ = agent.train(step=int(4000000), collector_env_num=4, evaluator_env_num=4) # Push model to huggingface hub @@ -15,7 +15,7 @@ github_repo_url="https://github.com/opendilab/DI-engine", github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/ddpg.html", github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", - installation_guide="pip3 install DI-engine[common_env,fast]", + installation_guide="pip3 install DI-engine[common_env]", usage_file_by_git_clone="./dizoo/common/ddpg/lunarlander_ddpg_deploy.py", usage_file_by_huggingface_ding="./dizoo/common/ddpg/lunarlander_ddpg_download.py", train_file="./dizoo/common/ddpg/lunarlander_ddpg.py", diff --git a/dizoo/common/ddpg/lunarlander_ddpg_deploy.py b/dizoo/common/ddpg/lunarlander_ddpg_deploy.py index 4b6260662d..ceeedaaee4 100644 --- a/dizoo/common/ddpg/lunarlander_ddpg_deploy.py +++ b/dizoo/common/ddpg/lunarlander_ddpg_deploy.py @@ -1,4 +1,4 @@ -from ding.bonus import DDPGOffPolicyAgent +from ding.bonus import DDPGAgent from ding.config import Config from easydict import EasyDict import torch @@ -7,7 +7,7 @@ policy_state_dict = torch.load("pytorch_model.bin", map_location=torch.device("cpu")) cfg = EasyDict(Config.file_to_dict("policy_config.py")) # Instantiate the agent -agent = DDPGOffPolicyAgent( +agent = DDPGAgent( env="lunarlander_continuous", exp_name="LunarLander-v2-DDPG", cfg=cfg.exp_config, diff --git a/dizoo/common/ddpg/lunarlander_ddpg_download.py b/dizoo/common/ddpg/lunarlander_ddpg_download.py index 5458d5610f..907d8e9f1f 100644 --- a/dizoo/common/ddpg/lunarlander_ddpg_download.py +++ b/dizoo/common/ddpg/lunarlander_ddpg_download.py @@ -1,10 +1,10 @@ -from ding.bonus import DDPGOffPolicyAgent +from ding.bonus import DDPGAgent from huggingface_ding import pull_model_from_hub # Pull model from Hugggingface hub policy_state_dict, cfg = pull_model_from_hub(repo_id="OpenDILabCommunity/LunarLander-v2-DDPG") # Instantiate the agent -agent = DDPGOffPolicyAgent( +agent = DDPGAgent( env="lunarlander_continuous", exp_name="LunarLander-v2-DDPG", cfg=cfg.exp_config, diff --git a/dizoo/common/dqn/lunarlander_dqn.py b/dizoo/common/dqn/lunarlander_dqn.py index 91d9f72db9..59428a482d 100644 --- a/dizoo/common/dqn/lunarlander_dqn.py +++ b/dizoo/common/dqn/lunarlander_dqn.py @@ -1,8 +1,8 @@ -from ding.bonus import DQNOffpolicyAgent +from ding.bonus import DQNAgent from huggingface_ding import push_model_to_hub # Instantiate the agent -agent = DQNOffpolicyAgent(env="lunarlander_discrete", exp_name="Lunarlander-v2-DQN") +agent = DQNAgent(env="lunarlander_discrete", exp_name="Lunarlander-v2-DQN") # Train the agent return_ = agent.train(step=int(4000000), collector_env_num=8, evaluator_env_num=8, debug=False) # Push model to huggingface hub @@ -15,7 +15,7 @@ github_repo_url="https://github.com/opendilab/DI-engine", github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/dqn.html", 
github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", - installation_guide="pip3 install DI-engine[common_env,fast]", + installation_guide="pip3 install DI-engine[common_env]", usage_file_by_git_clone="./dizoo/common/dqn/lunarlander_dqn_deploy.py", usage_file_by_huggingface_ding="./dizoo/common/dqn/lunarlander_dqn_download.py", train_file="./dizoo/common/dqn/lunarlander_dqn.py", diff --git a/dizoo/common/dqn/lunarlander_dqn_deploy.py b/dizoo/common/dqn/lunarlander_dqn_deploy.py index f623227688..485bb26181 100644 --- a/dizoo/common/dqn/lunarlander_dqn_deploy.py +++ b/dizoo/common/dqn/lunarlander_dqn_deploy.py @@ -1,4 +1,4 @@ -from ding.bonus import DQNOffpolicyAgent +from ding.bonus import DQNAgent from ding.config import Config from easydict import EasyDict import torch @@ -7,7 +7,7 @@ policy_state_dict = torch.load("pytorch_model.bin", map_location=torch.device("cpu")) cfg = EasyDict(Config.file_to_dict("policy_config.py")) # Instantiate the agent -agent = DQNOffpolicyAgent( +agent = DQNAgent( env="lunarlander_discrete", exp_name="Lunarlander-v2-DQN", cfg=cfg.exp_config, diff --git a/dizoo/common/dqn/lunarlander_dqn_download.py b/dizoo/common/dqn/lunarlander_dqn_download.py index ff848f9ca3..380b1e22d4 100644 --- a/dizoo/common/dqn/lunarlander_dqn_download.py +++ b/dizoo/common/dqn/lunarlander_dqn_download.py @@ -1,10 +1,10 @@ -from ding.bonus import DQNOffpolicyAgent +from ding.bonus import DQNAgent from huggingface_ding import pull_model_from_hub # Pull model from Hugggingface hub policy_state_dict, cfg = pull_model_from_hub(repo_id="OpenDILabCommunity/Lunarlander-v2-DQN") # Instantiate the agent -agent = DQNOffpolicyAgent( +agent = DQNAgent( env="lunarlander_discrete", exp_name="Lunarlander-v2-DQN", cfg=cfg.exp_config, policy_state_dict=policy_state_dict ) # Continue training diff --git a/dizoo/common/ppo/lunarlander_ppo.py b/dizoo/common/ppo/lunarlander_ppo.py index 99a13c0f8d..9c11750a24 100644 --- a/dizoo/common/ppo/lunarlander_ppo.py +++ b/dizoo/common/ppo/lunarlander_ppo.py @@ -15,7 +15,7 @@ github_repo_url="https://github.com/opendilab/DI-engine", github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/ppo.html", github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", - installation_guide="pip3 install DI-engine[common_env,fast]", + installation_guide="pip3 install DI-engine[common_env]", usage_file_by_git_clone="./dizoo/common/ppo/lunarlander_ppo_deploy.py", usage_file_by_huggingface_ding="./dizoo/common/ppo/lunarlander_ppo_download.py", train_file="./dizoo/common/ppo/lunarlander_ppo.py", diff --git a/dizoo/common/sac/lunarlander_sac.py b/dizoo/common/sac/lunarlander_sac.py index ca14e6104c..72a88026d2 100644 --- a/dizoo/common/sac/lunarlander_sac.py +++ b/dizoo/common/sac/lunarlander_sac.py @@ -1,8 +1,8 @@ -from ding.bonus import SACOffPolicyAgent +from ding.bonus import SACAgent from huggingface_ding import push_model_to_hub # Instantiate the agent -agent = SACOffPolicyAgent("lunarlander_continuous", exp_name="LunarLander-v2-SAC") +agent = SACAgent("lunarlander_continuous", exp_name="LunarLander-v2-SAC") # Train the agent return_ = agent.train(step=int(4000000), collector_env_num=8, evaluator_env_num=8) # Push model to huggingface hub @@ -15,7 +15,7 @@ github_repo_url="https://github.com/opendilab/DI-engine", github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/sac.html", 
github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", - installation_guide="pip3 install DI-engine[common_env,fast]", + installation_guide="pip3 install DI-engine[common_env]", usage_file_by_git_clone="./dizoo/common/sac/lunarlander_sac_deploy.py", usage_file_by_huggingface_ding="./dizoo/common/sac/lunarlander_sac_download.py", train_file="./dizoo/common/sac/lunarlander_sac.py", diff --git a/dizoo/common/sac/lunarlander_sac_deploy.py b/dizoo/common/sac/lunarlander_sac_deploy.py index 63c585ea5e..c9ec71331f 100644 --- a/dizoo/common/sac/lunarlander_sac_deploy.py +++ b/dizoo/common/sac/lunarlander_sac_deploy.py @@ -1,4 +1,4 @@ -from ding.bonus import SACOffPolicyAgent +from ding.bonus import SACAgent from ding.config import Config from easydict import EasyDict import torch @@ -7,7 +7,7 @@ policy_state_dict = torch.load("pytorch_model.bin", map_location=torch.device("cpu")) cfg = EasyDict(Config.file_to_dict("policy_config.py")) # Instantiate the agent -agent = SACOffPolicyAgent( +agent = SACAgent( env="lunarlander_continuous", exp_name="lunarlander-sac", cfg=cfg.exp_config, policy_state_dict=policy_state_dict ) # Continue training diff --git a/dizoo/common/sac/lunarlander_sac_download.py b/dizoo/common/sac/lunarlander_sac_download.py index 72fdf15dce..a9d3cbd000 100644 --- a/dizoo/common/sac/lunarlander_sac_download.py +++ b/dizoo/common/sac/lunarlander_sac_download.py @@ -1,10 +1,10 @@ -from ding.bonus import SACOffPolicyAgent +from ding.bonus import SACAgent from huggingface_ding import pull_model_from_hub # Pull model from Hugggingface hub policy_state_dict, cfg = pull_model_from_hub(repo_id="OpenDILabCommunity/LunarLander-v2-SAC") # Instantiate the agent -agent = SACOffPolicyAgent( +agent = SACAgent( env="lunarlander_continuous", exp_name="LunarLander-v2-SAC", cfg=cfg.exp_config, diff --git a/dizoo/common/td3/lunarlander_td3.py b/dizoo/common/td3/lunarlander_td3.py index 1f4758a5f4..41de88b1a9 100644 --- a/dizoo/common/td3/lunarlander_td3.py +++ b/dizoo/common/td3/lunarlander_td3.py @@ -1,8 +1,8 @@ -from ding.bonus import TD3OffPolicyAgent +from ding.bonus import TD3Agent from huggingface_ding import push_model_to_hub # Instantiate the agent -agent = TD3OffPolicyAgent("lunarlander_continuous", exp_name="LunarLander-v2-TD3") +agent = TD3Agent("lunarlander_continuous", exp_name="LunarLander-v2-TD3") # Train the agent return_ = agent.train(step=int(4000000), collector_env_num=4, evaluator_env_num=4) # Push model to huggingface hub @@ -15,7 +15,7 @@ github_repo_url="https://github.com/opendilab/DI-engine", github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/td3.html", github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", - installation_guide="pip3 install DI-engine[common_env,fast]", + installation_guide="pip3 install DI-engine[common_env]", usage_file_by_git_clone="./dizoo/common/td3/lunarlander_td3_deploy.py", usage_file_by_huggingface_ding="./dizoo/common/td3/lunarlander_td3_download.py", train_file="./dizoo/common/td3/lunarlander_td3.py", diff --git a/dizoo/common/td3/lunarlander_td3_deploy.py b/dizoo/common/td3/lunarlander_td3_deploy.py index 5c62441137..043f4b660b 100644 --- a/dizoo/common/td3/lunarlander_td3_deploy.py +++ b/dizoo/common/td3/lunarlander_td3_deploy.py @@ -1,4 +1,4 @@ -from ding.bonus import TD3OffPolicyAgent +from ding.bonus import TD3Agent from ding.config import Config from easydict import EasyDict import torch @@ -7,7 +7,7 @@ policy_state_dict = 
torch.load("pytorch_model.bin", map_location=torch.device("cpu")) cfg = EasyDict(Config.file_to_dict("policy_config.py")) # Instantiate the agent -agent = TD3OffPolicyAgent( +agent = TD3Agent( env="lunarlander_continuous", exp_name="LunarLander-v2-TD3", cfg=cfg.exp_config, diff --git a/dizoo/common/td3/lunarlander_td3_download.py b/dizoo/common/td3/lunarlander_td3_download.py index 2e0982f225..9809d4ddd4 100644 --- a/dizoo/common/td3/lunarlander_td3_download.py +++ b/dizoo/common/td3/lunarlander_td3_download.py @@ -1,10 +1,10 @@ -from ding.bonus import TD3OffPolicyAgent +from ding.bonus import TD3Agent from huggingface_ding import pull_model_from_hub # Pull model from Hugggingface hub policy_state_dict, cfg = pull_model_from_hub(repo_id="OpenDILabCommunity/LunarLander-v2-TD3") # Instantiate the agent -agent = TD3OffPolicyAgent( +agent = TD3Agent( env="lunarlander_continuous", exp_name="LunarLander-v2-TD3", cfg=cfg.exp_config, From 01b82c7038e759a0869104c895791eaeb23f53fd Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 31 Mar 2023 06:04:20 +0000 Subject: [PATCH 067/244] add Hopper demo --- ding/bonus/config.py | 47 ++++++++++++++++++++++- dizoo/common/ddpg/hopper_ddpg.py | 46 ++++++++++++++++++++++ dizoo/common/ddpg/hopper_ddpg_deploy.py | 16 ++++++++ dizoo/common/ddpg/hopper_ddpg_download.py | 16 ++++++++ dizoo/common/ppo/hopper_ppo.py | 46 ++++++++++++++++++++++ dizoo/common/ppo/hopper_ppo_deploy.py | 16 ++++++++ dizoo/common/ppo/hopper_ppo_download.py | 16 ++++++++ dizoo/common/sac/hopper_sac.py | 47 +++++++++++++++++++++++ dizoo/common/sac/hopper_sac_deploy.py | 16 ++++++++ dizoo/common/sac/hopper_sac_download.py | 16 ++++++++ dizoo/common/td3/hopper_td3.py | 47 +++++++++++++++++++++++ dizoo/common/td3/hopper_td3_deploy.py | 16 ++++++++ dizoo/common/td3/hopper_td3_download.py | 16 ++++++++ 13 files changed, 359 insertions(+), 2 deletions(-) create mode 100644 dizoo/common/ddpg/hopper_ddpg.py create mode 100644 dizoo/common/ddpg/hopper_ddpg_deploy.py create mode 100644 dizoo/common/ddpg/hopper_ddpg_download.py create mode 100644 dizoo/common/ppo/hopper_ppo.py create mode 100644 dizoo/common/ppo/hopper_ppo_deploy.py create mode 100644 dizoo/common/ppo/hopper_ppo_download.py create mode 100644 dizoo/common/sac/hopper_sac.py create mode 100644 dizoo/common/sac/hopper_sac_deploy.py create mode 100644 dizoo/common/sac/hopper_sac_download.py create mode 100644 dizoo/common/td3/hopper_td3.py create mode 100644 dizoo/common/td3/hopper_td3_deploy.py create mode 100644 dizoo/common/td3/hopper_td3_download.py diff --git a/ding/bonus/config.py b/ding/bonus/config.py index b6d7ac45c8..1b6e4adc02 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -124,7 +124,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: if env == 'hopper': cfg.update( dict( - exp_name='hopper_td3', + exp_name='Hopper-v3-TD3', seed=0, env=dict( env_id='Hopper-v3', @@ -209,7 +209,50 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: elif algorithm == 'DDPG': cfg = EasyDict({"policy": DDPGPolicy.default_config()}) if env == 'hopper': - pass + cfg.update( + dict( + exp_name='Hopper-v3-DDPG', + seed=0, + env=dict( + env_id='Hopper-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=1, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=6000, + ), + policy=dict( + cuda=True, + random_collect_size=25000, + model=dict( + obs_shape=11, + action_shape=3, + twin_critic=False, + actor_head_hidden_size=256, + critic_head_hidden_size=256, + 
action_space='regression', + ), + learn=dict( + update_per_collect=1, + batch_size=256, + learning_rate_actor=1e-3, + learning_rate_critic=1e-3, + ignore_done=False, + target_theta=0.005, + discount_factor=0.99, + actor_update_freq=1, + noise=False, + ), + collect=dict( + n_sample=1, + unroll_len=1, + noise_sigma=0.1, + ), + other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), + ) + ) + ) elif env == 'lunarlander_continuous': cfg.update( dict( diff --git a/dizoo/common/ddpg/hopper_ddpg.py b/dizoo/common/ddpg/hopper_ddpg.py new file mode 100644 index 0000000000..9c32b2a1f7 --- /dev/null +++ b/dizoo/common/ddpg/hopper_ddpg.py @@ -0,0 +1,46 @@ +from ding.bonus import DDPGAgent +from huggingface_ding import push_model_to_hub + +# Instantiate the agent +agent = DDPGAgent(env="hopper", exp_name="Hopper-v3-DDPG") +# Train the agent +return_ = agent.train(step=int(10000000), collector_env_num=4, evaluator_env_num=4, debug=False) +# Push model to huggingface hub +push_model_to_hub( + agent=agent, + env_name="OpenAI/Gym/MuJoCo", + task_name="Hopper-v3", + algo_name="DDPG", + wandb_url=return_.wandb_url, + github_repo_url="https://github.com/opendilab/DI-engine", + github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/ddpg.html", + github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/mujoco.html", + installation_guide= +''' +sudo apt update -y \ + && sudo apt install -y \ + build-essential \ + libgl1-mesa-dev \ + libgl1-mesa-glx \ + libglew-dev \ + libosmesa6-dev \ + libglfw3 \ + libglfw3-dev \ + libsdl2-dev \ + libsdl2-image-dev \ + libglm-dev \ + libfreetype6-dev \ + patchelf + +mkdir -p ~/.mujoco +wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz -O mujoco.tar.gz +tar -xf mujoco.tar.gz -C ~/.mujoco +echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin" >> ~/.bashrc +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin +pip3 install DI-engine[common_env] +''', + usage_file_by_git_clone="./dizoo/common/ddpg/hopper_ddpg_deploy.py", + usage_file_by_huggingface_ding="./dizoo/common/ddpg/hopper_ddpg_download.py", + train_file="./dizoo/common/ddpg/hopper_ddpg.py", + repo_id="OpenDILabCommunity/Hopper-v3-DDPG" +) diff --git a/dizoo/common/ddpg/hopper_ddpg_deploy.py b/dizoo/common/ddpg/hopper_ddpg_deploy.py new file mode 100644 index 0000000000..6044f6a5b2 --- /dev/null +++ b/dizoo/common/ddpg/hopper_ddpg_deploy.py @@ -0,0 +1,16 @@ +from ding.bonus import DDPGAgent +from ding.config import Config +from easydict import EasyDict +import torch + +# Pull model from files which are git cloned from huggingface +policy_state_dict = torch.load("pytorch_model.bin", map_location=torch.device("cpu")) +cfg = EasyDict(Config.file_to_dict("policy_config.py")) +# Instantiate the agent +agent = DDPGAgent( + env="hopper", exp_name="Hopper-v3-DDPG", cfg=cfg.exp_config, policy_state_dict=policy_state_dict +) +# Continue training +agent.train(step=5000) +# Render the new agent performance +agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/ddpg/hopper_ddpg_download.py b/dizoo/common/ddpg/hopper_ddpg_download.py new file mode 100644 index 0000000000..42208aec1a --- /dev/null +++ b/dizoo/common/ddpg/hopper_ddpg_download.py @@ -0,0 +1,16 @@ +from ding.bonus import DDPGAgent +from huggingface_ding import pull_model_from_hub + +# Pull model from Hugggingface hub +policy_state_dict, cfg = pull_model_from_hub(repo_id="OpenDILabCommunity/Hopper-v3-DDPG") +# Instantiate the 
agent +agent = DDPGAgent( + env="hopper", + exp_name="Hopper-v3-DDPG", + cfg=cfg.exp_config, + policy_state_dict=policy_state_dict +) +# Continue training +agent.train(step=5000) +# Render the new agent performance +agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/ppo/hopper_ppo.py b/dizoo/common/ppo/hopper_ppo.py new file mode 100644 index 0000000000..c8b57383ef --- /dev/null +++ b/dizoo/common/ppo/hopper_ppo.py @@ -0,0 +1,46 @@ +from ding.bonus import PPOF +from huggingface_ding import push_model_to_hub + +# Instantiate the agent +agent = PPOF(env="hopper", exp_name="Hopper-v3-PPO") +# Train the agent +return_ = agent.train(step=int(10000000), collector_env_num=4, evaluator_env_num=4, debug=False) +# Push model to huggingface hub +push_model_to_hub( + agent=agent, + env_name="OpenAI/Gym/MuJoCo", + task_name="Hopper-v3", + algo_name="PPO", + wandb_url=return_.wandb_url, + github_repo_url="https://github.com/opendilab/DI-engine", + github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/ppo.html", + github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/mujoco.html", + installation_guide= +''' +sudo apt update -y \ + && sudo apt install -y \ + build-essential \ + libgl1-mesa-dev \ + libgl1-mesa-glx \ + libglew-dev \ + libosmesa6-dev \ + libglfw3 \ + libglfw3-dev \ + libsdl2-dev \ + libsdl2-image-dev \ + libglm-dev \ + libfreetype6-dev \ + patchelf + +mkdir -p ~/.mujoco +wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz -O mujoco.tar.gz +tar -xf mujoco.tar.gz -C ~/.mujoco +echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin" >> ~/.bashrc +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin +pip3 install DI-engine[common_env] +''', + usage_file_by_git_clone="./dizoo/common/ppo/hopper_ppo_deploy.py", + usage_file_by_huggingface_ding="./dizoo/common/ppo/hopper_ppo_download.py", + train_file="./dizoo/common/ppo/hopper_ppo.py", + repo_id="OpenDILabCommunity/Hopper-v3-PPO" +) diff --git a/dizoo/common/ppo/hopper_ppo_deploy.py b/dizoo/common/ppo/hopper_ppo_deploy.py new file mode 100644 index 0000000000..afc094ec33 --- /dev/null +++ b/dizoo/common/ppo/hopper_ppo_deploy.py @@ -0,0 +1,16 @@ +from ding.bonus import PPOF +from ding.config import Config +from easydict import EasyDict +import torch + +# Pull model from files which are git cloned from huggingface +policy_state_dict = torch.load("pytorch_model.bin", map_location=torch.device("cpu")) +cfg = EasyDict(Config.file_to_dict("policy_config.py")) +# Instantiate the agent +agent = PPOF( + env="hopper", exp_name="Hopper-v3-PPO", cfg=cfg.exp_config, policy_state_dict=policy_state_dict +) +# Continue training +agent.train(step=5000) +# Render the new agent performance +agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/ppo/hopper_ppo_download.py b/dizoo/common/ppo/hopper_ppo_download.py new file mode 100644 index 0000000000..bb95f353e5 --- /dev/null +++ b/dizoo/common/ppo/hopper_ppo_download.py @@ -0,0 +1,16 @@ +from ding.bonus import PPOF +from huggingface_ding import pull_model_from_hub + +# Pull model from Hugggingface hub +policy_state_dict, cfg = pull_model_from_hub(repo_id="OpenDILabCommunity/Hopper-v3-PPO") +# Instantiate the agent +agent = PPOF( + env="hopper", + exp_name="Hopper-v3-PPO", + cfg=cfg.exp_config, + policy_state_dict=policy_state_dict +) +# Continue training +agent.train(step=5000) +# Render the new agent performance +agent.deploy(enable_save_replay=True) diff --git 
a/dizoo/common/sac/hopper_sac.py b/dizoo/common/sac/hopper_sac.py new file mode 100644 index 0000000000..9995b8e42c --- /dev/null +++ b/dizoo/common/sac/hopper_sac.py @@ -0,0 +1,47 @@ +from ding.bonus.sac import SACAgent +from huggingface_ding import push_model_to_hub + +# Instantiate the agent +agent = SACAgent(env="hopper", exp_name="Hopper-v3-SAC") +# Train the agent +return_ = agent.train(step=int(10000000), collector_env_num=4, evaluator_env_num=4, debug=False) +# Push model to huggingface hub +push_model_to_hub( + agent=agent, + env_name="OpenAI/Gym/MuJoCo", + task_name="Hopper-v3", + algo_name="SAC", + wandb_url=return_.wandb_url, + github_repo_url="https://github.com/opendilab/DI-engine", + github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/sac.html", + github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/mujoco.html", + installation_guide= +''' +sudo apt update -y \ + && sudo apt install -y \ + build-essential \ + libgl1-mesa-dev \ + libgl1-mesa-glx \ + libglew-dev \ + libosmesa6-dev \ + libglfw3 \ + libglfw3-dev \ + libsdl2-dev \ + libsdl2-image-dev \ + libglm-dev \ + libfreetype6-dev \ + patchelf + +mkdir -p ~/.mujoco +wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz -O mujoco.tar.gz +tar -xf mujoco.tar.gz -C ~/.mujoco +echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin" >> ~/.bashrc +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin +pip3 install DI-engine[common_env] +''', + usage_file_by_git_clone="./dizoo/common/sac/hopper_sac_deploy.py", + usage_file_by_huggingface_ding="./dizoo/common/sac/hopper_sac_download.py", + train_file="./dizoo/common/sac/hopper_sac.py", + repo_id="OpenDILabCommunity/Hopper-v3-SAC" +) + diff --git a/dizoo/common/sac/hopper_sac_deploy.py b/dizoo/common/sac/hopper_sac_deploy.py new file mode 100644 index 0000000000..46ef580274 --- /dev/null +++ b/dizoo/common/sac/hopper_sac_deploy.py @@ -0,0 +1,16 @@ +from ding.bonus import SACAgent +from ding.config import Config +from easydict import EasyDict +import torch + +# Pull model from files which are git cloned from huggingface +policy_state_dict = torch.load("pytorch_model.bin", map_location=torch.device("cpu")) +cfg = EasyDict(Config.file_to_dict("policy_config.py")) +# Instantiate the agent +agent = SACAgent( + env="hopper", exp_name="Hopper-v3-SAC", cfg=cfg.exp_config, policy_state_dict=policy_state_dict +) +# Continue training +agent.train(step=5000) +# Render the new agent performance +agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/sac/hopper_sac_download.py b/dizoo/common/sac/hopper_sac_download.py new file mode 100644 index 0000000000..59344c9d34 --- /dev/null +++ b/dizoo/common/sac/hopper_sac_download.py @@ -0,0 +1,16 @@ +from ding.bonus import SACAgent +from huggingface_ding import pull_model_from_hub + +# Pull model from Hugggingface hub +policy_state_dict, cfg = pull_model_from_hub(repo_id="OpenDILabCommunity/Hopper-v3-SAC") +# Instantiate the agent +agent = SACAgent( + env="hopper", + exp_name="Hopper-v3-SAC", + cfg=cfg.exp_config, + policy_state_dict=policy_state_dict +) +# Continue training +agent.train(step=5000) +# Render the new agent performance +agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/td3/hopper_td3.py b/dizoo/common/td3/hopper_td3.py new file mode 100644 index 0000000000..2cda317fdf --- /dev/null +++ b/dizoo/common/td3/hopper_td3.py @@ -0,0 +1,47 @@ +from ding.bonus import TD3Agent +from 
huggingface_ding import push_model_to_hub + +# Instantiate the agent +agent = TD3Agent(env="hopper", exp_name="Hopper-v3-TD3") +# Train the agent +return_ = agent.train(step=int(10000000), collector_env_num=4, evaluator_env_num=4, debug=False) +# Push model to huggingface hub +push_model_to_hub( + agent=agent, + env_name="OpenAI/Gym/MuJoCo", + task_name="Hopper-v3", + algo_name="TD3", + wandb_url=return_.wandb_url, + github_repo_url="https://github.com/opendilab/DI-engine", + github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/td3.html", + github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/mujoco.html", + installation_guide= +''' +sudo apt update -y \ + && sudo apt install -y \ + build-essential \ + libgl1-mesa-dev \ + libgl1-mesa-glx \ + libglew-dev \ + libosmesa6-dev \ + libglfw3 \ + libglfw3-dev \ + libsdl2-dev \ + libsdl2-image-dev \ + libglm-dev \ + libfreetype6-dev \ + patchelf + +mkdir -p ~/.mujoco +wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz -O mujoco.tar.gz +tar -xf mujoco.tar.gz -C ~/.mujoco +echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin" >> ~/.bashrc +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin +pip3 install DI-engine[common_env] +''', + usage_file_by_git_clone="./dizoo/common/td3/hopper_td3_deploy.py", + usage_file_by_huggingface_ding="./dizoo/common/td3/hopper_td3_download.py", + train_file="./dizoo/common/td3/hopper_td3.py", + repo_id="OpenDILabCommunity/Hopper-v3-TD3" +) + diff --git a/dizoo/common/td3/hopper_td3_deploy.py b/dizoo/common/td3/hopper_td3_deploy.py new file mode 100644 index 0000000000..7618135d9c --- /dev/null +++ b/dizoo/common/td3/hopper_td3_deploy.py @@ -0,0 +1,16 @@ +from ding.bonus import SACAgent +from ding.config import Config +from easydict import EasyDict +import torch + +# Pull model from files which are git cloned from huggingface +policy_state_dict = torch.load("pytorch_model.bin", map_location=torch.device("cpu")) +cfg = EasyDict(Config.file_to_dict("policy_config.py")) +# Instantiate the agent +agent = SACAgent( + env="hopper", exp_name="Hopper-v3-TD3", cfg=cfg.exp_config, policy_state_dict=policy_state_dict +) +# Continue training +agent.train(step=5000) +# Render the new agent performance +agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/td3/hopper_td3_download.py b/dizoo/common/td3/hopper_td3_download.py new file mode 100644 index 0000000000..280e325eb6 --- /dev/null +++ b/dizoo/common/td3/hopper_td3_download.py @@ -0,0 +1,16 @@ +from ding.bonus import TD3Agent +from huggingface_ding import pull_model_from_hub + +# Pull model from Hugggingface hub +policy_state_dict, cfg = pull_model_from_hub(repo_id="OpenDILabCommunity/Hopper-v3-TD3") +# Instantiate the agent +agent = TD3Agent( + env="hopper", + exp_name="Hopper-v3-TD3", + cfg=cfg.exp_config, + policy_state_dict=policy_state_dict +) +# Continue training +agent.train(step=5000) +# Render the new agent performance +agent.deploy(enable_save_replay=True) From 0d600709a7f4a52e728a3da3b48c44c7c4830d6c Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 31 Mar 2023 06:22:47 +0000 Subject: [PATCH 068/244] polish code --- ding/bonus/ddpg.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ding/bonus/ddpg.py b/ding/bonus/ddpg.py index 72757c41a0..eeac990e70 100644 --- a/ding/bonus/ddpg.py +++ b/ding/bonus/ddpg.py @@ -42,6 +42,7 @@ class EvalReturn: class DDPGAgent: supported_env_list = [ + 'hopper', 'lunarlander_continuous', ] 
algorithm = 'DDPG' From 3f3fb68180c8de472b99b10c18a059cda1989a17 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Mon, 3 Apr 2023 07:28:00 +0000 Subject: [PATCH 069/244] add property best --- ding/bonus/ddpg.py | 11 ++++++++++- ding/bonus/dqn.py | 11 ++++++++++- ding/bonus/impala.py | 12 +++++++++++- ding/bonus/ppof.py | 11 ++++++++++- ding/bonus/sac.py | 11 ++++++++++- ding/bonus/td3.py | 11 ++++++++++- dizoo/common/ddpg/hopper_ddpg.py | 2 +- dizoo/common/ddpg/lunarlander_ddpg.py | 2 +- dizoo/common/dqn/lunarlander_dqn.py | 2 +- dizoo/common/ppo/hopper_ppo.py | 2 +- dizoo/common/ppo/lunarlander_ppo.py | 2 +- dizoo/common/sac/hopper_sac.py | 2 +- dizoo/common/sac/lunarlander_sac.py | 2 +- dizoo/common/td3/hopper_td3.py | 2 +- dizoo/common/td3/lunarlander_td3.py | 2 +- 15 files changed, 70 insertions(+), 15 deletions(-) diff --git a/ding/bonus/ddpg.py b/ding/bonus/ddpg.py index eeac990e70..8ef0908b99 100644 --- a/ding/bonus/ddpg.py +++ b/ding/bonus/ddpg.py @@ -89,6 +89,7 @@ def __init__( self.policy = DDPGPolicy(self.cfg.policy, model=model) if policy_state_dict is not None: self.policy.learn_mode.load_state_dict(policy_state_dict) + self.model_save_dir=os.path.join(self.cfg["exp_name"], "model") def train( self, @@ -122,7 +123,7 @@ def train( task.use( CkptSaver( policy=self.policy, - save_dir=os.path.join(self.cfg["exp_name"], "model"), + save_dir=self.model_save_dir, train_freq=n_iter_save_ckpt ) ) @@ -247,3 +248,11 @@ def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: if context is not None: manager_cfg.context = context return env_cls([self.env.clone for _ in range(env_num)], manager_cfg) + + @property + def best(self): + best_model_file_path=os.path.join(self.model_save_dir, "eval.pth.tar") + if os.path.exists(best_model_file_path): + policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu")) + self.policy.learn_mode.load_state_dict(policy_state_dict) + return self diff --git a/ding/bonus/dqn.py b/ding/bonus/dqn.py index cbab3e5bc9..4ae8be7329 100644 --- a/ding/bonus/dqn.py +++ b/ding/bonus/dqn.py @@ -88,6 +88,7 @@ def __init__( self.policy = DQNPolicy(self.cfg.policy, model=model) if policy_state_dict is not None: self.policy.learn_mode.load_state_dict(policy_state_dict) + self.model_save_dir=os.path.join(self.cfg["exp_name"], "model") def train( self, @@ -123,7 +124,7 @@ def train( task.use( CkptSaver( policy=self.policy, - save_dir=os.path.join(self.cfg["exp_name"], "model"), + save_dir=self.model_save_dir, train_freq=n_iter_save_ckpt ) ) @@ -250,3 +251,11 @@ def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: if context is not None: manager_cfg.context = context return env_cls([self.env.clone for _ in range(env_num)], manager_cfg) + + @property + def best(self): + best_model_file_path=os.path.join(self.model_save_dir, "eval.pth.tar") + if os.path.exists(best_model_file_path): + policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu")) + self.policy.learn_mode.load_state_dict(policy_state_dict) + return self diff --git a/ding/bonus/impala.py b/ding/bonus/impala.py index ea9e866c78..7fb73abc89 100644 --- a/ding/bonus/impala.py +++ b/ding/bonus/impala.py @@ -89,6 +89,7 @@ def __init__( self.policy = IMPALAPolicy(self.cfg.policy, model=model) if policy_state_dict is not None: self.policy.learn_mode.load_state_dict(policy_state_dict) + self.model_save_dir=os.path.join(self.cfg["exp_name"], "model") def train( self, @@ -122,7 +123,7 @@ def train( task.use( CkptSaver( 
policy=self.policy, - save_dir=os.path.join(self.cfg["exp_name"], "model"), + save_dir=self.model_save_dir, train_freq=n_iter_save_ckpt ) ) @@ -249,3 +250,12 @@ def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: if context is not None: manager_cfg.context = context return env_cls([self.env.clone for _ in range(env_num)], manager_cfg) + + @property + def best(self): + best_model_file_path=os.path.join(self.model_save_dir, "eval.pth.tar") + if os.path.exists(best_model_file_path): + policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu")) + self.policy.learn_mode.load_state_dict(policy_state_dict) + return self + diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py index edd8087ef7..6b61ba3ac9 100644 --- a/ding/bonus/ppof.py +++ b/ding/bonus/ppof.py @@ -110,6 +110,7 @@ def __init__( self.policy = PPOFPolicy(self.cfg, model=model) if policy_state_dict is not None: self.policy.load_state_dict(policy_state_dict) + self.model_save_dir=os.path.join(self.cfg["exp_name"], "model") def train( self, @@ -138,7 +139,7 @@ def train( task.use(PPOFStepCollector(self.seed, self.policy, collector_env, self.cfg.n_sample)) task.use(ppof_adv_estimator(self.policy)) task.use(multistep_trainer(self.policy, log_freq=n_iter_log_show)) - task.use(CkptSaver(self.policy, save_dir=self.exp_name, train_freq=n_iter_save_ckpt)) + task.use(CkptSaver(self.policy, save_dir=self.model_save_dir, train_freq=n_iter_save_ckpt)) task.use( wandb_online_logger( metric_list=self.policy.monitor_vars(), @@ -249,3 +250,11 @@ def _setup_env_manager( if context is not None: manager_cfg.context = context return env_cls([partial(self.env.clone, caller) for _ in range(env_num)], manager_cfg) + + @property + def best(self): + best_model_file_path=os.path.join(self.model_save_dir, "eval.pth.tar") + if os.path.exists(best_model_file_path): + policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu")) + self.policy.learn_mode.load_state_dict(policy_state_dict) + return self diff --git a/ding/bonus/sac.py b/ding/bonus/sac.py index 7aff4f3fd7..868dab7645 100644 --- a/ding/bonus/sac.py +++ b/ding/bonus/sac.py @@ -90,6 +90,7 @@ def __init__( self.policy = SACPolicy(self.cfg.policy, model=model) if policy_state_dict is not None: self.policy.learn_mode.load_state_dict(policy_state_dict) + self.model_save_dir=os.path.join(self.cfg["exp_name"], "model") def train( self, @@ -123,7 +124,7 @@ def train( task.use( CkptSaver( policy=self.policy, - save_dir=os.path.join(self.cfg["exp_name"], "model"), + save_dir=self.model_save_dir, train_freq=n_iter_save_ckpt ) ) @@ -249,3 +250,11 @@ def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: if context is not None: manager_cfg.context = context return env_cls([self.env.clone for _ in range(env_num)], manager_cfg) + + @property + def best(self): + best_model_file_path=os.path.join(self.model_save_dir, "eval.pth.tar") + if os.path.exists(best_model_file_path): + policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu")) + self.policy.learn_mode.load_state_dict(policy_state_dict) + return self diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 79934b0268..48063b0128 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -89,6 +89,7 @@ def __init__( self.policy = TD3Policy(self.cfg.policy, model=model) if policy_state_dict is not None: self.policy.learn_mode.load_state_dict(policy_state_dict) + self.model_save_dir=os.path.join(self.cfg["exp_name"], "model") def train( 
self, @@ -122,7 +123,7 @@ def train( task.use( CkptSaver( policy=self.policy, - save_dir=os.path.join(self.cfg["exp_name"], "model"), + save_dir=self.model_save_dir, train_freq=n_iter_save_ckpt ) ) @@ -247,3 +248,11 @@ def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: if context is not None: manager_cfg.context = context return env_cls([self.env.clone for _ in range(env_num)], manager_cfg) + + @property + def best(self): + best_model_file_path=os.path.join(self.model_save_dir, "eval.pth.tar") + if os.path.exists(best_model_file_path): + policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu")) + self.policy.learn_mode.load_state_dict(policy_state_dict) + return self diff --git a/dizoo/common/ddpg/hopper_ddpg.py b/dizoo/common/ddpg/hopper_ddpg.py index 9c32b2a1f7..687dce3b6a 100644 --- a/dizoo/common/ddpg/hopper_ddpg.py +++ b/dizoo/common/ddpg/hopper_ddpg.py @@ -7,7 +7,7 @@ return_ = agent.train(step=int(10000000), collector_env_num=4, evaluator_env_num=4, debug=False) # Push model to huggingface hub push_model_to_hub( - agent=agent, + agent=agent.best, env_name="OpenAI/Gym/MuJoCo", task_name="Hopper-v3", algo_name="DDPG", diff --git a/dizoo/common/ddpg/lunarlander_ddpg.py b/dizoo/common/ddpg/lunarlander_ddpg.py index e550c5b721..edf07f9649 100644 --- a/dizoo/common/ddpg/lunarlander_ddpg.py +++ b/dizoo/common/ddpg/lunarlander_ddpg.py @@ -7,7 +7,7 @@ return_ = agent.train(step=int(4000000), collector_env_num=4, evaluator_env_num=4) # Push model to huggingface hub push_model_to_hub( - agent=agent, + agent=agent.best, env_name="OpenAI/Gym/Box2d", task_name="LunarLander-v2", algo_name="DDPG", diff --git a/dizoo/common/dqn/lunarlander_dqn.py b/dizoo/common/dqn/lunarlander_dqn.py index 59428a482d..11139c6abc 100644 --- a/dizoo/common/dqn/lunarlander_dqn.py +++ b/dizoo/common/dqn/lunarlander_dqn.py @@ -7,7 +7,7 @@ return_ = agent.train(step=int(4000000), collector_env_num=8, evaluator_env_num=8, debug=False) # Push model to huggingface hub push_model_to_hub( - agent=agent, + agent=agent.best, env_name="OpenAI/Gym/Box2d", task_name="LunarLander-v2", algo_name="DQN", diff --git a/dizoo/common/ppo/hopper_ppo.py b/dizoo/common/ppo/hopper_ppo.py index c8b57383ef..d23bac5a0a 100644 --- a/dizoo/common/ppo/hopper_ppo.py +++ b/dizoo/common/ppo/hopper_ppo.py @@ -7,7 +7,7 @@ return_ = agent.train(step=int(10000000), collector_env_num=4, evaluator_env_num=4, debug=False) # Push model to huggingface hub push_model_to_hub( - agent=agent, + agent=agent.best, env_name="OpenAI/Gym/MuJoCo", task_name="Hopper-v3", algo_name="PPO", diff --git a/dizoo/common/ppo/lunarlander_ppo.py b/dizoo/common/ppo/lunarlander_ppo.py index 9c11750a24..56c81b5e65 100644 --- a/dizoo/common/ppo/lunarlander_ppo.py +++ b/dizoo/common/ppo/lunarlander_ppo.py @@ -7,7 +7,7 @@ return_ = agent.train(step=int(4000000), collector_env_num=4, evaluator_env_num=4) # Push model to huggingface hub push_model_to_hub( - agent=agent, + agent=agent.best, env_name="OpenAI/Gym/Box2d", task_name="LunarLander-v2", algo_name="PPO", diff --git a/dizoo/common/sac/hopper_sac.py b/dizoo/common/sac/hopper_sac.py index 9995b8e42c..0979f4ceee 100644 --- a/dizoo/common/sac/hopper_sac.py +++ b/dizoo/common/sac/hopper_sac.py @@ -7,7 +7,7 @@ return_ = agent.train(step=int(10000000), collector_env_num=4, evaluator_env_num=4, debug=False) # Push model to huggingface hub push_model_to_hub( - agent=agent, + agent=agent.best, env_name="OpenAI/Gym/MuJoCo", task_name="Hopper-v3", algo_name="SAC", diff --git 
a/dizoo/common/sac/lunarlander_sac.py b/dizoo/common/sac/lunarlander_sac.py index 72a88026d2..f3aa3a6809 100644 --- a/dizoo/common/sac/lunarlander_sac.py +++ b/dizoo/common/sac/lunarlander_sac.py @@ -7,7 +7,7 @@ return_ = agent.train(step=int(4000000), collector_env_num=8, evaluator_env_num=8) # Push model to huggingface hub push_model_to_hub( - agent=agent, + agent=agent.best, env_name="OpenAI/Gym/Box2d", task_name="LunarLander-v2", algo_name="SAC", diff --git a/dizoo/common/td3/hopper_td3.py b/dizoo/common/td3/hopper_td3.py index 2cda317fdf..ce90ac796e 100644 --- a/dizoo/common/td3/hopper_td3.py +++ b/dizoo/common/td3/hopper_td3.py @@ -7,7 +7,7 @@ return_ = agent.train(step=int(10000000), collector_env_num=4, evaluator_env_num=4, debug=False) # Push model to huggingface hub push_model_to_hub( - agent=agent, + agent=agent.best, env_name="OpenAI/Gym/MuJoCo", task_name="Hopper-v3", algo_name="TD3", diff --git a/dizoo/common/td3/lunarlander_td3.py b/dizoo/common/td3/lunarlander_td3.py index 41de88b1a9..c9cdd0b0b4 100644 --- a/dizoo/common/td3/lunarlander_td3.py +++ b/dizoo/common/td3/lunarlander_td3.py @@ -7,7 +7,7 @@ return_ = agent.train(step=int(4000000), collector_env_num=4, evaluator_env_num=4) # Push model to huggingface hub push_model_to_hub( - agent=agent, + agent=agent.best, env_name="OpenAI/Gym/Box2d", task_name="LunarLander-v2", algo_name="TD3", From dc5aa8c5b5da63874b99a1f31d54743d59b9e65f Mon Sep 17 00:00:00 2001 From: zjowowen Date: Mon, 3 Apr 2023 09:45:46 +0000 Subject: [PATCH 070/244] add a2c pipeline --- ding/bonus/__init__.py | 1 + ding/bonus/a2c.py | 255 ++++++++++++++++++ ding/bonus/config.py | 44 ++- .../framework/middleware/functional/logger.py | 7 +- .../middleware/functional/trainer.py | 5 +- ding/policy/a2c.py | 3 + dizoo/common/a2c/lunarlander_a2c.py | 23 ++ dizoo/common/a2c/lunarlander_a2c_deploy.py | 19 ++ dizoo/common/a2c/lunarlander_a2c_download.py | 13 + 9 files changed, 366 insertions(+), 4 deletions(-) create mode 100644 dizoo/common/a2c/lunarlander_a2c.py create mode 100644 dizoo/common/a2c/lunarlander_a2c_deploy.py create mode 100644 dizoo/common/a2c/lunarlander_a2c_download.py diff --git a/ding/bonus/__init__.py b/ding/bonus/__init__.py index 0f5d4b1f48..4b86b99361 100644 --- a/ding/bonus/__init__.py +++ b/ding/bonus/__init__.py @@ -1,3 +1,4 @@ +from .a2c import A2CAgent from .ppof import PPOF from .td3 import TD3Agent from .ddpg import DDPGAgent diff --git a/ding/bonus/a2c.py b/ding/bonus/a2c.py index e69de29bb2..3f41735203 100644 --- a/ding/bonus/a2c.py +++ b/ding/bonus/a2c.py @@ -0,0 +1,255 @@ +from dataclasses import dataclass +from typing import Optional, Union +from ditk import logging +from easydict import EasyDict +import os +import gym +import torch +import treetensor.torch as ttorch +import numpy as np +from ding.framework import task, OnlineRLContext +from ding.framework.middleware import CkptSaver, trainer, \ + wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, \ + gae_estimator, final_ctx_saver +from ding.envs import BaseEnv, BaseEnvManagerV2, SubprocessEnvManagerV2 +from ding.policy import A2CPolicy +from ding.utils import set_pkg_seed +from ding.config import Config, save_config_py, compile_config +from ding.model import VAC +from ding.bonus.config import get_instance_config, get_instance_env + + +@dataclass +class TrainingReturn: + ''' + Attributions + wandb_url: The weight & biases (wandb) project url of the trainning experiment. 
+ ''' + wandb_url: str + + +@dataclass +class EvalReturn: + ''' + Attributions + eval_value: The mean of evaluation return. + eval_value_std: The standard deviation of evaluation return. + ''' + eval_value: np.float32 + eval_value_std: np.float32 + + +class A2CAgent: + supported_env_list = [ + 'lunarlander_discrete', + ] + algorithm = 'A2C' + + def __init__( + self, + env: Union[str, BaseEnv], + seed: int = 0, + exp_name: str = None, + model: Optional[torch.nn.Module] = None, + cfg: Optional[Union[EasyDict, dict, str]] = None, + policy_state_dict: str = None, + ) -> None: + if isinstance(env, str): + assert env in A2CAgent.supported_env_list, "Please use supported envs: {}".format( + A2CAgent.supported_env_list + ) + self.env = get_instance_env(env) + if cfg is None: + # 'It should be default env tuned config' + cfg = get_instance_config(env, algorithm=A2CAgent.algorithm) + else: + assert isinstance(cfg, EasyDict), "Please use EasyDict as config data type." + + if exp_name is not None: + cfg.exp_name = exp_name + self.cfg = compile_config(cfg, policy=A2CPolicy) + self.exp_name = self.cfg.exp_name + + elif isinstance(env, BaseEnv): + self.cfg = compile_config(cfg, policy=A2CPolicy) + raise NotImplementedError + else: + raise TypeError("not support env type: {}, only strings and instances of `BaseEnv` now".format(type(env))) + logging.getLogger().setLevel(logging.INFO) + self.seed = seed + set_pkg_seed(self.seed, use_cuda=self.cfg.policy.cuda) + if not os.path.exists(self.exp_name): + os.makedirs(self.exp_name) + save_config_py(self.cfg, os.path.join(self.exp_name, 'policy_config.py')) + if model is None: + model = VAC(**self.cfg.policy.model) + self.policy = A2CPolicy(self.cfg.policy, model=model) + if policy_state_dict is not None: + self.policy.learn_mode.load_state_dict(policy_state_dict) + self.model_save_dir=os.path.join(self.cfg["exp_name"], "model") + self.device=self.policy._device + + def train( + self, + step: int = int(1e7), + collector_env_num: int = 4, + evaluator_env_num: int = 4, + n_iter_log_show: int = 500, + n_iter_save_ckpt: int = 1000, + context: Optional[str] = None, + debug: bool = False + ) -> TrainingReturn: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + logging.debug(self.policy._model) + # define env and policy + collector_env = self._setup_env_manager(collector_env_num, context, debug) + evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug) + + with task.start(ctx=OnlineRLContext()): + task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) + task.use( + StepCollector( + self.cfg, + self.policy.collect_mode, + collector_env + ) + ) + task.use(gae_estimator(self.cfg, self.policy.collect_mode)) + task.use(trainer(self.cfg, self.policy.learn_mode, self.device)) + task.use( + CkptSaver( + policy=self.policy, + save_dir=self.model_save_dir, + train_freq=n_iter_save_ckpt + ) + ) + task.use( + wandb_online_logger( + metric_list=self.policy.monitor_vars(), + model=self.policy._model, + anonymous=True, + project_name=self.exp_name + ) + ) + task.use(termination_checker(max_env_step=step)) + task.use(final_ctx_saver(name=self.cfg["exp_name"])) + task.run() + + return TrainingReturn(wandb_url=task.ctx.wandb_url) + + def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> None: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + # define env and policy + env = self.env.clone() + env.seed(self.seed, dynamic_seed=False) + + if enable_save_replay and 
replay_save_path: + env.enable_save_replay(replay_path=replay_save_path) + elif enable_save_replay: + env.enable_save_replay(replay_path=os.path.join(self.exp_name, 'videos')) + else: + logging.warning('No video would be generated during the deploy.') + + def single_env_forward_wrapper(forward_fn, cuda=True): + + def _forward(obs): + # unsqueeze means add batch dim, i.e. (O, ) -> (1, O) + obs = ttorch.as_tensor(obs).unsqueeze(0) + if cuda and torch.cuda.is_available(): + obs = obs.cuda() + action = forward_fn(obs, mode='compute_actor')["action"] + # squeeze means delete batch dim, i.e. (1, A) -> (A, ) + action = action.squeeze(0).detach().cpu().numpy() + return action + + return _forward + + forward_fn = single_env_forward_wrapper(self.policy._model, self.cfg.policy.cuda) + + # main loop + return_ = 0. + step = 0 + obs = env.reset() + while True: + action = forward_fn(obs) + obs, rew, done, info = env.step(action) + return_ += rew + step += 1 + if done: + break + logging.info(f'A2C deploy is finished, final episode return with {step} steps is: {return_}') + + def collect_data( + self, + env_num: int = 8, + save_data_path: Optional[str] = None, + n_sample: Optional[int] = None, + n_episode: Optional[int] = None, + context: Optional[str] = None, + debug: bool = False + ) -> None: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + if n_episode is not None: + raise NotImplementedError + # define env and policy + env = self._setup_env_manager(env_num, context, debug) + + if save_data_path is None: + save_data_path = os.path.join(self.exp_name, 'demo_data') + + # main execution task + with task.start(ctx=OnlineRLContext()): + task.use( + StepCollector( + self.cfg, self.policy.collect_mode, env, random_collect_size=self.cfg.policy.random_collect_size + ) + ) + task.use(offline_data_saver(save_data_path, data_type='hdf5')) + task.run(max_step=1) + logging.info( + f'A2C collecting is finished, more than {n_sample} samples are collected and saved in `{save_data_path}`' + ) + + def batch_evaluate( + self, + env_num: int = 4, + n_evaluator_episode: int = 4, + context: Optional[str] = None, + debug: bool = False + ) -> EvalReturn: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + # define env and policy + env = self._setup_env_manager(env_num, context, debug) + + evaluate_cfg = self.cfg + evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode + + # main execution task + with task.start(ctx=OnlineRLContext()): + task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, env)) + task.run(max_step=1) + + return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) + + def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: bool = False) -> BaseEnvManagerV2: + if debug: + env_cls = BaseEnvManagerV2 + manager_cfg = env_cls.default_config() + else: + env_cls = SubprocessEnvManagerV2 + manager_cfg = env_cls.default_config() + if context is not None: + manager_cfg.context = context + return env_cls([self.env.clone for _ in range(env_num)], manager_cfg) + + @property + def best(self): + best_model_file_path=os.path.join(self.model_save_dir, "eval.pth.tar") + if os.path.exists(best_model_file_path): + policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu")) + self.policy.learn_mode.load_state_dict(policy_state_dict) + return self diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 1b6e4adc02..8c49807522 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -4,7 +4,7 @@ from 
ding.envs import BaseEnv, DingEnvWrapper from ding.envs.env_wrappers import MaxAndSkipWrapper, WarpFrameWrapper, ScaledFloatFrameWrapper, FrameStackWrapper, \ EvalEpisodeReturnEnv, TransposeWrapper, TimeLimitWrapper, FlatObsWrapper, GymToGymnasiumWrapper -from ding.policy import PPOFPolicy, TD3Policy, DDPGPolicy, SACPolicy, DQNPolicy, IMPALAPolicy +from ding.policy import PPOFPolicy, A2CPolicy ,TD3Policy, DDPGPolicy, SACPolicy, DQNPolicy, IMPALAPolicy def get_instance_config(env: str, algorithm: str) -> EasyDict: @@ -119,6 +119,48 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: cfg.learning_rate = 3e-4 else: raise KeyError("not supported env type: {}".format(env)) + elif algorithm == 'A2C': + cfg = EasyDict({"policy": A2CPolicy.default_config()}) + if env == 'lunarlander_discrete': + cfg.update( + dict( + exp_name='LunarLander-v2-A2C', + env=dict( + collector_env_num=8, + evaluator_env_num=8, + env_id='LunarLander-v2', + n_evaluator_episode=8, + stop_value=240, + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=8, + action_shape=4, + ), + learn=dict( + batch_size=160, + learning_rate=3e-4, + entropy_weight=0.001, + adv_norm=True, + ), + collect=dict( + n_sample=320, + discount_factor=0.99, + gae_lambda=0.95, + ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) + else: + raise KeyError("not supported env type: {}".format(env)) elif algorithm == 'TD3': cfg = EasyDict({"policy": TD3Policy.default_config()}) if env == 'hopper': diff --git a/ding/framework/middleware/functional/logger.py b/ding/framework/middleware/functional/logger.py index 50ebbbbd8e..97e1ee1206 100644 --- a/ding/framework/middleware/functional/logger.py +++ b/ding/framework/middleware/functional/logger.py @@ -198,7 +198,12 @@ def _plot(ctx: "OnlineRLContext"): if cfg.plot_logger: for metric in metric_list: - if len(ctx.train_output)>0 and metric in ctx.train_output[0]: + if isinstance(ctx.train_output, Dict) and metric in ctx.train_output: + if isinstance(ctx.train_output[metric], torch.Tensor): + info_for_logging.update({metric: ctx.train_output[metric].cpu().detach().numpy()}) + else: + info_for_logging.update({metric: ctx.train_output[metric]}) + elif isinstance(ctx.train_output, List) and len(ctx.train_output)>0 and metric in ctx.train_output[0]: metric_value_list = [] for item in ctx.train_output: if isinstance(item[metric], torch.Tensor): diff --git a/ding/framework/middleware/functional/trainer.py b/ding/framework/middleware/functional/trainer.py index 1ea7bbe190..82187704e5 100644 --- a/ding/framework/middleware/functional/trainer.py +++ b/ding/framework/middleware/functional/trainer.py @@ -6,7 +6,7 @@ from ding.framework import task, OfflineRLContext, OnlineRLContext -def trainer(cfg: EasyDict, policy: Policy) -> Callable: +def trainer(cfg: EasyDict, policy: Policy, device:str='cpu') -> Callable: """ Overview: The middleware that executes a single training process. 
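# A minimal wiring sketch for the `device` argument added to trainer() above,
# assuming `cfg` and `policy` are already constructed; in this commit,
# ding/bonus/a2c.py passes the policy's own device (self.policy._device) in the
# same position, and the cuda check below is only an illustrative alternative.
import torch
from ding.framework import task
from ding.framework.middleware import trainer

device = 'cuda' if cfg.policy.cuda and torch.cuda.is_available() else 'cpu'
task.use(trainer(cfg, policy.learn_mode, device))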
@@ -28,7 +28,8 @@ def _train(ctx: Union["OnlineRLContext", "OfflineRLContext"]): if ctx.train_data is None: return - train_output = policy.forward(ctx.train_data) + data = ctx.train_data.to(device) + train_output = policy.forward(data) #if ctx.train_iter % cfg.policy.learn.learner.hook.log_show_after_iter == 0: if True: if isinstance(ctx, OnlineRLContext): diff --git a/ding/policy/a2c.py b/ding/policy/a2c.py index a6b8c41c6a..f80917e3dc 100644 --- a/ding/policy/a2c.py +++ b/ding/policy/a2c.py @@ -274,3 +274,6 @@ def _forward_eval(self, data: dict) -> dict: def _monitor_vars_learn(self) -> List[str]: return super()._monitor_vars_learn() + ['policy_loss', 'value_loss', 'entropy_loss', 'adv_abs_max', 'grad_norm'] + + def monitor_vars(self) -> List[str]: + return self._monitor_vars_learn() \ No newline at end of file diff --git a/dizoo/common/a2c/lunarlander_a2c.py b/dizoo/common/a2c/lunarlander_a2c.py new file mode 100644 index 0000000000..dba7b42f03 --- /dev/null +++ b/dizoo/common/a2c/lunarlander_a2c.py @@ -0,0 +1,23 @@ +from ding.bonus import A2CAgent +from huggingface_ding import push_model_to_hub + +# Instantiate the agent +agent = A2CAgent(env="lunarlander_discrete", exp_name="Lunarlander-v2-A2C") +# Train the agent +return_ = agent.train(step=int(4000000), collector_env_num=8, evaluator_env_num=8, debug=False) +# Push model to huggingface hub +push_model_to_hub( + agent=agent.best, + env_name="OpenAI/Gym/Box2d", + task_name="LunarLander-v2", + algo_name="A2C", + wandb_url=return_.wandb_url, + github_repo_url="https://github.com/opendilab/DI-engine", + github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/a2c.html", + github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", + installation_guide="pip3 install DI-engine[common_env]", + usage_file_by_git_clone="./dizoo/common/a2c/lunarlander_a2c_deploy.py", + usage_file_by_huggingface_ding="./dizoo/common/a2c/lunarlander_a2c_download.py", + train_file="./dizoo/common/a2c/lunarlander_a2c.py", + repo_id="OpenDILabCommunity/Lunarlander-v2-A2C" +) diff --git a/dizoo/common/a2c/lunarlander_a2c_deploy.py b/dizoo/common/a2c/lunarlander_a2c_deploy.py new file mode 100644 index 0000000000..92d0f31fb2 --- /dev/null +++ b/dizoo/common/a2c/lunarlander_a2c_deploy.py @@ -0,0 +1,19 @@ +from ding.bonus import A2CAgent +from ding.config import Config +from easydict import EasyDict +import torch + +# Pull model from files which are git cloned from huggingface +policy_state_dict = torch.load("pytorch_model.bin", map_location=torch.device("cpu")) +cfg = EasyDict(Config.file_to_dict("policy_config.py")) +# Instantiate the agent +agent = A2CAgent( + env="lunarlander_discrete", + exp_name="Lunarlander-v2-A2C", + cfg=cfg.exp_config, + policy_state_dict=policy_state_dict +) +# Continue training +agent.train(step=5000) +# Render the new agent performance +agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/a2c/lunarlander_a2c_download.py b/dizoo/common/a2c/lunarlander_a2c_download.py new file mode 100644 index 0000000000..7e70a6f3e1 --- /dev/null +++ b/dizoo/common/a2c/lunarlander_a2c_download.py @@ -0,0 +1,13 @@ +from ding.bonus import A2CAgent +from huggingface_ding import pull_model_from_hub + +# Pull model from Hugggingface hub +policy_state_dict, cfg = pull_model_from_hub(repo_id="OpenDILabCommunity/Lunarlander-v2-A2C") +# Instantiate the agent +agent = A2CAgent( + env="lunarlander_discrete", exp_name="Lunarlander-v2-A2C", cfg=cfg.exp_config, policy_state_dict=policy_state_dict +) +# 
Continue training +agent.train(step=5000) +# Render the new agent performance +agent.deploy(enable_save_replay=True) From ccb2fcfd56ad465deb8e07f6db55bf432e818f61 Mon Sep 17 00:00:00 2001 From: zhangpaipai <1124321458@qq.com> Date: Wed, 5 Apr 2023 17:48:24 +0800 Subject: [PATCH 071/244] add sac halfcheetah+walker2d --- ding/bonus/config.py | 122 +++++++++++++++++++++++++++++++++++++++++++ ding/bonus/sac.py | 2 + 2 files changed, 124 insertions(+) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 8c8bafc2b6..8ef8f0a677 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -242,6 +242,116 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ), ) ) + elif env == 'HalfCheetah': + cfg.update( + dict( + exp_name='HalfCheetah-v3-SAC', + seed=0, + env=dict( + env_id='HalfCheetah-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=1, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=12000, + ), + policy=dict( + cuda=False, + random_collect_size=10000, + model=dict( + obs_shape=17, + action_shape=6, + twin_critic=True, + action_space='reparameterization', + actor_head_hidden_size=256, + critic_head_hidden_size=256, + ), + learn=dict( + update_per_collect=1, + batch_size=256, + learning_rate_q=1e-3, + learning_rate_policy=1e-3, + learning_rate_alpha=3e-4, + ignore_done=True, + target_theta=0.005, + discount_factor=0.99, + alpha=0.2, + reparameterization=True, + auto_alpha=False, + ), + collect=dict( + n_sample=1, + unroll_len=1, + ), + command=dict(), + eval=dict(), + other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) + elif env == 'Walker2d': + cfg.update( + dict( + exp_name='Walker2d-v3-SAC', + seed=0, + env=dict( + env_id='Walker2d-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=1, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=6000, + ), + policy=dict( + cuda=True, + random_collect_size=10000, + model=dict( + obs_shape=17, + action_shape=6, + twin_critic=True, + action_space='reparameterization', + actor_head_hidden_size=256, + critic_head_hidden_size=256, + ), + learn=dict( + update_per_collect=1, + batch_size=256, + learning_rate_q=1e-3, + learning_rate_policy=1e-3, + learning_rate_alpha=3e-4, + ignore_done=False, + target_theta=0.005, + discount_factor=0.99, + alpha=0.2, + reparameterization=True, + auto_alpha=False, + ), + collect=dict( + n_sample=1, + unroll_len=1, + ), + command=dict(), + eval=dict(), + other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) elif env == 'lunarlander_continuous': cfg.update( dict( @@ -561,6 +671,18 @@ def get_instance_env(env: str) -> BaseEnv: env_wrapper='mujoco_default', ) return DingEnvWrapper(gym.make('Hopper-v3'), cfg=cfg) + elif env == 'HalfCheetah': + cfg = EasyDict( + env_id='HalfCheetah-v3', + env_wrapper='mujoco_default', + ) + return DingEnvWrapper(gym.make('HalfCheetah-v3'), cfg=cfg) + elif env == 'Walker2d': + cfg = EasyDict( + env_id='Walker2d-v3', + env_wrapper='mujoco_default', + ) + return DingEnvWrapper(gym.make('Walker2d-v3'), cfg=cfg) elif env == "SpaceInvaders": cfg = EasyDict({ 'env_id': "SpaceInvaders-v4", diff --git a/ding/bonus/sac.py b/ding/bonus/sac.py index 
e1cdac9e0c..ad62319f1a 100644 --- a/ding/bonus/sac.py +++ b/ding/bonus/sac.py @@ -44,6 +44,8 @@ class EvalReturn: class SACOffPolicyAgent: supported_env_list = [ 'hopper', + 'HalfCheetah', + 'Walker2d', 'lunarlander_continuous', ] algorithm = 'SAC' From c937f3b86d64c0164063ff8e34f4400974108cbf Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 6 Apr 2023 04:55:16 +0000 Subject: [PATCH 072/244] fix a2c pipeline bug --- ding/bonus/a2c.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ding/bonus/a2c.py b/ding/bonus/a2c.py index 3f41735203..ed9931907c 100644 --- a/ding/bonus/a2c.py +++ b/ding/bonus/a2c.py @@ -16,6 +16,7 @@ from ding.utils import set_pkg_seed from ding.config import Config, save_config_py, compile_config from ding.model import VAC +from ding.model import model_wrap from ding.bonus.config import get_instance_config, get_instance_env @@ -154,6 +155,8 @@ def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, def single_env_forward_wrapper(forward_fn, cuda=True): + forward_fn = model_wrap(forward_fn, wrapper_name='argmax_sample').forward + def _forward(obs): # unsqueeze means add batch dim, i.e. (O, ) -> (1, O) obs = ttorch.as_tensor(obs).unsqueeze(0) From 27ff4255983168ae9d76b8800d9823eca8851464 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 6 Apr 2023 05:05:50 +0000 Subject: [PATCH 073/244] fix pipeline bug --- ding/bonus/a2c.py | 3 +-- ding/framework/middleware/functional/trainer.py | 5 ++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/ding/bonus/a2c.py b/ding/bonus/a2c.py index ed9931907c..b16d779051 100644 --- a/ding/bonus/a2c.py +++ b/ding/bonus/a2c.py @@ -88,7 +88,6 @@ def __init__( if policy_state_dict is not None: self.policy.learn_mode.load_state_dict(policy_state_dict) self.model_save_dir=os.path.join(self.cfg["exp_name"], "model") - self.device=self.policy._device def train( self, @@ -117,7 +116,7 @@ def train( ) ) task.use(gae_estimator(self.cfg, self.policy.collect_mode)) - task.use(trainer(self.cfg, self.policy.learn_mode, self.device)) + task.use(trainer(self.cfg, self.policy.learn_mode)) task.use( CkptSaver( policy=self.policy, diff --git a/ding/framework/middleware/functional/trainer.py b/ding/framework/middleware/functional/trainer.py index 82187704e5..1ea7bbe190 100644 --- a/ding/framework/middleware/functional/trainer.py +++ b/ding/framework/middleware/functional/trainer.py @@ -6,7 +6,7 @@ from ding.framework import task, OfflineRLContext, OnlineRLContext -def trainer(cfg: EasyDict, policy: Policy, device:str='cpu') -> Callable: +def trainer(cfg: EasyDict, policy: Policy) -> Callable: """ Overview: The middleware that executes a single training process. 
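# After the revert above, trainer() again takes only (cfg, policy); a caller
# sketch assuming `cfg` and `policy` are already built, with device placement
# left to the policy itself (matching the ding/bonus/a2c.py change in this
# commit):
from ding.framework import task
from ding.framework.middleware import trainer

task.use(trainer(cfg, policy.learn_mode))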
@@ -28,8 +28,7 @@ def _train(ctx: Union["OnlineRLContext", "OfflineRLContext"]): if ctx.train_data is None: return - data = ctx.train_data.to(device) - train_output = policy.forward(data) + train_output = policy.forward(ctx.train_data) #if ctx.train_iter % cfg.policy.learn.learner.hook.log_show_after_iter == 0: if True: if isinstance(ctx, OnlineRLContext): From 02bc7f043e0506af7af005c7184a46f5c871384d Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 6 Apr 2023 05:11:38 +0000 Subject: [PATCH 074/244] fix bug --- ding/bonus/a2c.py | 2 +- ding/bonus/ddpg.py | 2 +- ding/bonus/dqn.py | 2 +- ding/bonus/impala.py | 2 +- ding/bonus/ppof.py | 2 +- ding/bonus/sac.py | 2 +- ding/bonus/td3.py | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ding/bonus/a2c.py b/ding/bonus/a2c.py index b16d779051..34ccd6152c 100644 --- a/ding/bonus/a2c.py +++ b/ding/bonus/a2c.py @@ -87,7 +87,7 @@ def __init__( self.policy = A2CPolicy(self.cfg.policy, model=model) if policy_state_dict is not None: self.policy.learn_mode.load_state_dict(policy_state_dict) - self.model_save_dir=os.path.join(self.cfg["exp_name"], "model") + self.model_save_dir=os.path.join(self.exp_name, "model") def train( self, diff --git a/ding/bonus/ddpg.py b/ding/bonus/ddpg.py index 8ef0908b99..5305f7d765 100644 --- a/ding/bonus/ddpg.py +++ b/ding/bonus/ddpg.py @@ -89,7 +89,7 @@ def __init__( self.policy = DDPGPolicy(self.cfg.policy, model=model) if policy_state_dict is not None: self.policy.learn_mode.load_state_dict(policy_state_dict) - self.model_save_dir=os.path.join(self.cfg["exp_name"], "model") + self.model_save_dir=os.path.join(self.exp_name, "model") def train( self, diff --git a/ding/bonus/dqn.py b/ding/bonus/dqn.py index 4ae8be7329..d78c88146e 100644 --- a/ding/bonus/dqn.py +++ b/ding/bonus/dqn.py @@ -88,7 +88,7 @@ def __init__( self.policy = DQNPolicy(self.cfg.policy, model=model) if policy_state_dict is not None: self.policy.learn_mode.load_state_dict(policy_state_dict) - self.model_save_dir=os.path.join(self.cfg["exp_name"], "model") + self.model_save_dir=os.path.join(self.exp_name, "model") def train( self, diff --git a/ding/bonus/impala.py b/ding/bonus/impala.py index 7fb73abc89..c3426965c6 100644 --- a/ding/bonus/impala.py +++ b/ding/bonus/impala.py @@ -89,7 +89,7 @@ def __init__( self.policy = IMPALAPolicy(self.cfg.policy, model=model) if policy_state_dict is not None: self.policy.learn_mode.load_state_dict(policy_state_dict) - self.model_save_dir=os.path.join(self.cfg["exp_name"], "model") + self.model_save_dir=os.path.join(self.exp_name, "model") def train( self, diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py index 6b61ba3ac9..6b53355e28 100644 --- a/ding/bonus/ppof.py +++ b/ding/bonus/ppof.py @@ -110,7 +110,7 @@ def __init__( self.policy = PPOFPolicy(self.cfg, model=model) if policy_state_dict is not None: self.policy.load_state_dict(policy_state_dict) - self.model_save_dir=os.path.join(self.cfg["exp_name"], "model") + self.model_save_dir=os.path.join(self.exp_name, "model") def train( self, diff --git a/ding/bonus/sac.py b/ding/bonus/sac.py index 868dab7645..fbe6f426e7 100644 --- a/ding/bonus/sac.py +++ b/ding/bonus/sac.py @@ -90,7 +90,7 @@ def __init__( self.policy = SACPolicy(self.cfg.policy, model=model) if policy_state_dict is not None: self.policy.learn_mode.load_state_dict(policy_state_dict) - self.model_save_dir=os.path.join(self.cfg["exp_name"], "model") + self.model_save_dir=os.path.join(self.exp_name, "model") def train( self, diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 
48063b0128..441063f9d7 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -89,7 +89,7 @@ def __init__( self.policy = TD3Policy(self.cfg.policy, model=model) if policy_state_dict is not None: self.policy.learn_mode.load_state_dict(policy_state_dict) - self.model_save_dir=os.path.join(self.cfg["exp_name"], "model") + self.model_save_dir=os.path.join(self.exp_name, "model") def train( self, From 6fb854f532806e8bd3abf5ef98400df719d4c0b4 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 6 Apr 2023 06:16:43 +0000 Subject: [PATCH 075/244] change config --- dizoo/common/a2c/lunarlander_a2c.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dizoo/common/a2c/lunarlander_a2c.py b/dizoo/common/a2c/lunarlander_a2c.py index dba7b42f03..174ce3b929 100644 --- a/dizoo/common/a2c/lunarlander_a2c.py +++ b/dizoo/common/a2c/lunarlander_a2c.py @@ -4,7 +4,7 @@ # Instantiate the agent agent = A2CAgent(env="lunarlander_discrete", exp_name="Lunarlander-v2-A2C") # Train the agent -return_ = agent.train(step=int(4000000), collector_env_num=8, evaluator_env_num=8, debug=False) +return_ = agent.train(step=int(20000000), collector_env_num=8, evaluator_env_num=8, debug=False) # Push model to huggingface hub push_model_to_hub( agent=agent.best, From a76408cda7c01cbbb833be4415a615009363648d Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 7 Apr 2023 06:47:39 +0000 Subject: [PATCH 076/244] remove IMPALA pipeline --- ding/bonus/__init__.py | 1 - ding/bonus/config.py | 73 ------------ ding/bonus/impala.py | 261 ----------------------------------------- 3 files changed, 335 deletions(-) delete mode 100644 ding/bonus/impala.py diff --git a/ding/bonus/__init__.py b/ding/bonus/__init__.py index 4b86b99361..43e1dd8790 100644 --- a/ding/bonus/__init__.py +++ b/ding/bonus/__init__.py @@ -4,4 +4,3 @@ from .ddpg import DDPGAgent from .dqn import DQNAgent from .sac import SACAgent -from .impala import IMPALAAgent diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 8c49807522..fa1fca248c 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -503,79 +503,6 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ) else: raise KeyError("not supported env type: {}".format(env)) - elif algorithm == 'IMPALA': - cfg = EasyDict({"policy": IMPALAPolicy.default_config()}) - if env == 'SpaceInvaders': - cfg.update( - dict( - exp_name='SpaceInvaders-v4-IMPALA', - seed=0, - env=dict( - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=10000000000, - env_id='SpaceInvaders-v4', - frame_stack=4, - manager=dict(shared_memory=False, ) - ), - policy=dict( - cuda=True, - #unroll_len=32, - random_collect_size=500, - model=dict( - obs_shape=[4, 84, 84], - action_shape=6, - encoder_hidden_size_list=[128, 128, 256, 512], - critic_head_hidden_size=512, - critic_head_layer_num=3, - actor_head_hidden_size=512, - actor_head_layer_num=3, - ), - learn=dict( - # (int) collect n_sample data, train model update_per_collect times - # here we follow impala serial pipeline - update_per_collect=3, # update_per_collect show be in [1, 10] - # (int) the number of data for a train iteration - batch_size=128, - grad_clip_type='clip_norm', - clip_value=5, - learning_rate=0.0003, - # (float) loss weight of the value network, the weight of policy network is set to 1 - value_weight=0.5, - # (float) loss weight of the entropy regularization, the weight of policy network is set to 1 - entropy_weight=0.01, - # (float) discount factor for future reward, defaults int [0, 1] - discount_factor=0.99, 
- # (float) additional discounting parameter - lambda_=0.95, - # (float) clip ratio of importance weights - rho_clip_ratio=1.0, - # (float) clip ratio of importance weights - c_clip_ratio=1.0, - # (float) clip ratio of importance sampling - rho_pg_clip_ratio=1.0, - ), - collect=dict( - unroll_len=32, - # (int) collect n_sample data, train model n_iteration times - n_sample=16, - collector=dict(collect_print_freq=1000, ), - ), - eval=dict(evaluator=dict(eval_freq=5000, )), - other=dict(replay_buffer=dict(replay_buffer_size=10000, ), ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - else: - raise KeyError("not supported env type: {}".format(env)) else: raise KeyError("not supported algorithm type: {}".format(algorithm)) diff --git a/ding/bonus/impala.py b/ding/bonus/impala.py deleted file mode 100644 index c3426965c6..0000000000 --- a/ding/bonus/impala.py +++ /dev/null @@ -1,261 +0,0 @@ -from dataclasses import dataclass -from typing import Optional, Union -from ditk import logging -from easydict import EasyDict -import os -import gym -import torch -import treetensor.torch as ttorch -import numpy as np -from ding.framework import task, OnlineRLContext -from ding.framework.middleware import CkptSaver, multistep_trainer, \ - wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, data_pusher, \ - OffPolicyLearner, final_ctx_saver, eps_greedy_handler, nstep_reward_enhancer, epoch_timer -from ding.envs import BaseEnv, BaseEnvManagerV2, SubprocessEnvManagerV2 -from ding.policy import IMPALAPolicy -from ding.utils import set_pkg_seed -from ding.config import Config, save_config_py, compile_config -from ding.model import VAC -from ding.model import model_wrap -from ding.data import DequeBuffer -from ding.bonus.config import get_instance_config, get_instance_env - - -@dataclass -class TrainingReturn: - ''' - Attributions - wandb_url: The weight & biases (wandb) project url of the trainning experiment. - ''' - wandb_url: str - - -@dataclass -class EvalReturn: - ''' - Attributions - eval_value: The mean of evaluation return. - eval_value_std: The standard deviation of evaluation return. - ''' - eval_value: np.float32 - eval_value_std: np.float32 - - -class IMPALAAgent: - supported_env_list = [ - 'SpaceInvaders', - ] - algorithm = 'IMPALA' - - def __init__( - self, - env: Union[str, BaseEnv], - seed: int = 0, - exp_name: str = None, - model: Optional[torch.nn.Module] = None, - cfg: Optional[Union[EasyDict, dict, str]] = None, - policy_state_dict: str = None, - ) -> None: - if isinstance(env, str): - assert env in IMPALAAgent.supported_env_list, "Please use supported envs: {}".format( - IMPALAAgent.supported_env_list - ) - self.env = get_instance_env(env) - if cfg is None: - # 'It should be default env tuned config' - cfg = get_instance_config(env, algorithm=IMPALAAgent.algorithm) - else: - assert isinstance(cfg, EasyDict), "Please use EasyDict as config data type." 
- - if exp_name is not None: - cfg.exp_name = exp_name - self.cfg = compile_config(cfg, policy=IMPALAPolicy) - self.exp_name = self.cfg.exp_name - - elif isinstance(env, BaseEnv): - self.cfg = compile_config(cfg, policy=IMPALAPolicy) - raise NotImplementedError - else: - raise TypeError("not support env type: {}, only strings and instances of `BaseEnv` now".format(type(env))) - logging.getLogger().setLevel(logging.INFO) - self.seed = seed - set_pkg_seed(self.seed, use_cuda=self.cfg.policy.cuda) - if not os.path.exists(self.exp_name): - os.makedirs(self.exp_name) - save_config_py(self.cfg, os.path.join(self.exp_name, 'policy_config.py')) - if model is None: - model = VAC(**self.cfg.policy.model) - self.buffer_ = DequeBuffer(size=self.cfg.policy.other.replay_buffer.replay_buffer_size) - self.policy = IMPALAPolicy(self.cfg.policy, model=model) - if policy_state_dict is not None: - self.policy.learn_mode.load_state_dict(policy_state_dict) - self.model_save_dir=os.path.join(self.exp_name, "model") - - def train( - self, - step: int = int(1e7), - collector_env_num: int = 4, - evaluator_env_num: int = 4, - n_iter_log_show: int = 500, - n_iter_save_ckpt: int = 1000, - context: Optional[str] = None, - debug: bool = False - ) -> TrainingReturn: - if debug: - logging.getLogger().setLevel(logging.DEBUG) - logging.debug(self.policy._model) - # define env and policy - collector_env = self._setup_env_manager(collector_env_num, context, debug) - evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug) - - with task.start(ctx=OnlineRLContext()): - task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) - task.use( - StepCollector( - self.cfg, - self.policy.collect_mode, - collector_env, - random_collect_size=self.cfg.policy.random_collect_size - ) - ) - task.use(data_pusher(self.cfg, self.buffer_, group_by_env=True)) - task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_)) - task.use( - CkptSaver( - policy=self.policy, - save_dir=self.model_save_dir, - train_freq=n_iter_save_ckpt - ) - ) - task.use(epoch_timer()) - task.use( - wandb_online_logger( - metric_list=self.policy.monitor_vars(), - model=self.policy._model, - anonymous=True, - project_name=self.exp_name - ) - ) - task.use(termination_checker(max_env_step=step)) - task.use(final_ctx_saver(name=self.cfg["exp_name"])) - task.run() - - return TrainingReturn(wandb_url=task.ctx.wandb_url) - - def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> None: - if debug: - logging.getLogger().setLevel(logging.DEBUG) - # define env and policy - env = self.env.clone() - env.seed(self.seed, dynamic_seed=False) - - if enable_save_replay and replay_save_path: - env.enable_save_replay(replay_path=replay_save_path) - elif enable_save_replay: - env.enable_save_replay(replay_path=os.path.join(self.exp_name, 'videos')) - else: - logging.warning('No video would be generated during the deploy.') - - def single_env_forward_wrapper(forward_fn, cuda=True): - - forward_fn = model_wrap(forward_fn, wrapper_name='base').forward - - def _forward(obs): - # unsqueeze means add batch dim, i.e. 
(O, ) -> (1, O) - obs = ttorch.as_tensor(obs).unsqueeze(0) - if cuda and torch.cuda.is_available(): - obs = obs.cuda() - (mu, sigma) = forward_fn(obs, mode='compute_actor')['logit'] - action = torch.tanh(mu).detach().cpu().numpy()[0] # deterministic_eval - return action - - return _forward - - forward_fn = single_env_forward_wrapper(self.policy._model, self.cfg.policy.cuda) - - # main loop - return_ = 0. - step = 0 - obs = env.reset() - while True: - action = forward_fn(obs) - obs, rew, done, info = env.step(action) - return_ += rew - step += 1 - if done: - break - logging.info(f'IMPALA deploy is finished, final episode return with {step} steps is: {return_}') - - def collect_data( - self, - env_num: int = 8, - save_data_path: Optional[str] = None, - n_sample: Optional[int] = None, - n_episode: Optional[int] = None, - context: Optional[str] = None, - debug: bool = False - ) -> None: - if debug: - logging.getLogger().setLevel(logging.DEBUG) - if n_episode is not None: - raise NotImplementedError - # define env and policy - env = self._setup_env_manager(env_num, context, debug) - - if save_data_path is None: - save_data_path = os.path.join(self.exp_name, 'demo_data') - - # main execution task - with task.start(ctx=OnlineRLContext()): - task.use( - StepCollector( - self.cfg, self.policy.collect_mode, env, random_collect_size=self.cfg.policy.random_collect_size - ) - ) - task.use(offline_data_saver(save_data_path, data_type='hdf5')) - task.run(max_step=1) - logging.info( - f'IMPALA collecting is finished, more than {n_sample} samples are collected and saved in `{save_data_path}`' - ) - - def batch_evaluate( - self, - env_num: int = 4, - n_evaluator_episode: int = 4, - context: Optional[str] = None, - debug: bool = False - ) -> EvalReturn: - if debug: - logging.getLogger().setLevel(logging.DEBUG) - # define env and policy - env = self._setup_env_manager(env_num, context, debug) - - evaluate_cfg = self.cfg - evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode - - # main execution task - with task.start(ctx=OnlineRLContext()): - task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, env)) - task.run(max_step=1) - - return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) - - def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: bool = False) -> BaseEnvManagerV2: - if debug: - env_cls = BaseEnvManagerV2 - manager_cfg = env_cls.default_config() - else: - env_cls = SubprocessEnvManagerV2 - manager_cfg = env_cls.default_config() - if context is not None: - manager_cfg.context = context - return env_cls([self.env.clone for _ in range(env_num)], manager_cfg) - - @property - def best(self): - best_model_file_path=os.path.join(self.model_save_dir, "eval.pth.tar") - if os.path.exists(best_model_file_path): - policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu")) - self.policy.learn_mode.load_state_dict(policy_state_dict) - return self - From fd7f92296d2bc5b662b4823f95b52659d75243a2 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 7 Apr 2023 06:53:35 +0000 Subject: [PATCH 077/244] format code --- ding/bonus/a2c.py | 20 ++++--------------- ding/bonus/config.py | 2 +- ding/bonus/ddpg.py | 12 +++-------- ding/bonus/dqn.py | 12 +++-------- ding/bonus/ppof.py | 4 ++-- ding/bonus/sac.py | 12 +++-------- ding/bonus/td3.py | 12 +++-------- .../framework/middleware/functional/logger.py | 2 +- dizoo/common/a2c/lunarlander_a2c_deploy.py | 5 +---- dizoo/common/ddpg/hopper_ddpg.py | 3 +-- 
dizoo/common/ddpg/hopper_ddpg_deploy.py | 4 +--- dizoo/common/ddpg/hopper_ddpg_download.py | 7 +------ dizoo/common/dqn/lunarlander_dqn_deploy.py | 5 +---- dizoo/common/ppo/hopper_ppo.py | 3 +-- dizoo/common/ppo/hopper_ppo_deploy.py | 4 +--- dizoo/common/ppo/hopper_ppo_download.py | 7 +------ dizoo/common/sac/hopper_sac.py | 4 +--- dizoo/common/sac/hopper_sac_deploy.py | 4 +--- dizoo/common/sac/hopper_sac_download.py | 7 +------ dizoo/common/td3/hopper_td3.py | 4 +--- dizoo/common/td3/hopper_td3_deploy.py | 4 +--- dizoo/common/td3/hopper_td3_download.py | 7 +------ 22 files changed, 34 insertions(+), 110 deletions(-) diff --git a/ding/bonus/a2c.py b/ding/bonus/a2c.py index 34ccd6152c..bab5b97655 100644 --- a/ding/bonus/a2c.py +++ b/ding/bonus/a2c.py @@ -87,7 +87,7 @@ def __init__( self.policy = A2CPolicy(self.cfg.policy, model=model) if policy_state_dict is not None: self.policy.learn_mode.load_state_dict(policy_state_dict) - self.model_save_dir=os.path.join(self.exp_name, "model") + self.model_save_dir = os.path.join(self.exp_name, "model") def train( self, @@ -108,22 +108,10 @@ def train( with task.start(ctx=OnlineRLContext()): task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) - task.use( - StepCollector( - self.cfg, - self.policy.collect_mode, - collector_env - ) - ) + task.use(StepCollector(self.cfg, self.policy.collect_mode, collector_env)) task.use(gae_estimator(self.cfg, self.policy.collect_mode)) task.use(trainer(self.cfg, self.policy.learn_mode)) - task.use( - CkptSaver( - policy=self.policy, - save_dir=self.model_save_dir, - train_freq=n_iter_save_ckpt - ) - ) + task.use(CkptSaver(policy=self.policy, save_dir=self.model_save_dir, train_freq=n_iter_save_ckpt)) task.use( wandb_online_logger( metric_list=self.policy.monitor_vars(), @@ -250,7 +238,7 @@ def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: @property def best(self): - best_model_file_path=os.path.join(self.model_save_dir, "eval.pth.tar") + best_model_file_path = os.path.join(self.model_save_dir, "eval.pth.tar") if os.path.exists(best_model_file_path): policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu")) self.policy.learn_mode.load_state_dict(policy_state_dict) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index fa1fca248c..415db966df 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -4,7 +4,7 @@ from ding.envs import BaseEnv, DingEnvWrapper from ding.envs.env_wrappers import MaxAndSkipWrapper, WarpFrameWrapper, ScaledFloatFrameWrapper, FrameStackWrapper, \ EvalEpisodeReturnEnv, TransposeWrapper, TimeLimitWrapper, FlatObsWrapper, GymToGymnasiumWrapper -from ding.policy import PPOFPolicy, A2CPolicy ,TD3Policy, DDPGPolicy, SACPolicy, DQNPolicy, IMPALAPolicy +from ding.policy import PPOFPolicy, A2CPolicy, TD3Policy, DDPGPolicy, SACPolicy, DQNPolicy, IMPALAPolicy def get_instance_config(env: str, algorithm: str) -> EasyDict: diff --git a/ding/bonus/ddpg.py b/ding/bonus/ddpg.py index 5305f7d765..c3e60404e7 100644 --- a/ding/bonus/ddpg.py +++ b/ding/bonus/ddpg.py @@ -89,7 +89,7 @@ def __init__( self.policy = DDPGPolicy(self.cfg.policy, model=model) if policy_state_dict is not None: self.policy.learn_mode.load_state_dict(policy_state_dict) - self.model_save_dir=os.path.join(self.exp_name, "model") + self.model_save_dir = os.path.join(self.exp_name, "model") def train( self, @@ -120,13 +120,7 @@ def train( ) task.use(data_pusher(self.cfg, self.buffer_)) task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, 
self.buffer_)) - task.use( - CkptSaver( - policy=self.policy, - save_dir=self.model_save_dir, - train_freq=n_iter_save_ckpt - ) - ) + task.use(CkptSaver(policy=self.policy, save_dir=self.model_save_dir, train_freq=n_iter_save_ckpt)) task.use( wandb_online_logger( metric_list=self.policy.monitor_vars(), @@ -251,7 +245,7 @@ def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: @property def best(self): - best_model_file_path=os.path.join(self.model_save_dir, "eval.pth.tar") + best_model_file_path = os.path.join(self.model_save_dir, "eval.pth.tar") if os.path.exists(best_model_file_path): policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu")) self.policy.learn_mode.load_state_dict(policy_state_dict) diff --git a/ding/bonus/dqn.py b/ding/bonus/dqn.py index d78c88146e..4e4bb17feb 100644 --- a/ding/bonus/dqn.py +++ b/ding/bonus/dqn.py @@ -88,7 +88,7 @@ def __init__( self.policy = DQNPolicy(self.cfg.policy, model=model) if policy_state_dict is not None: self.policy.learn_mode.load_state_dict(policy_state_dict) - self.model_save_dir=os.path.join(self.exp_name, "model") + self.model_save_dir = os.path.join(self.exp_name, "model") def train( self, @@ -121,13 +121,7 @@ def train( task.use(nstep_reward_enhancer(self.cfg)) task.use(data_pusher(self.cfg, self.buffer_)) task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_)) - task.use( - CkptSaver( - policy=self.policy, - save_dir=self.model_save_dir, - train_freq=n_iter_save_ckpt - ) - ) + task.use(CkptSaver(policy=self.policy, save_dir=self.model_save_dir, train_freq=n_iter_save_ckpt)) task.use( wandb_online_logger( metric_list=self.policy.monitor_vars(), @@ -254,7 +248,7 @@ def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: @property def best(self): - best_model_file_path=os.path.join(self.model_save_dir, "eval.pth.tar") + best_model_file_path = os.path.join(self.model_save_dir, "eval.pth.tar") if os.path.exists(best_model_file_path): policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu")) self.policy.learn_mode.load_state_dict(policy_state_dict) diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py index e072339da5..71f342aba6 100644 --- a/ding/bonus/ppof.py +++ b/ding/bonus/ppof.py @@ -125,7 +125,7 @@ def __init__( self.policy = PPOFPolicy(self.cfg, model=model) if policy_state_dict is not None: self.policy.load_state_dict(policy_state_dict) - self.model_save_dir=os.path.join(self.exp_name, "model") + self.model_save_dir = os.path.join(self.exp_name, "model") def train( self, @@ -268,7 +268,7 @@ def _setup_env_manager( @property def best(self): - best_model_file_path=os.path.join(self.model_save_dir, "eval.pth.tar") + best_model_file_path = os.path.join(self.model_save_dir, "eval.pth.tar") if os.path.exists(best_model_file_path): policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu")) self.policy.learn_mode.load_state_dict(policy_state_dict) diff --git a/ding/bonus/sac.py b/ding/bonus/sac.py index fbe6f426e7..880cab0ec2 100644 --- a/ding/bonus/sac.py +++ b/ding/bonus/sac.py @@ -90,7 +90,7 @@ def __init__( self.policy = SACPolicy(self.cfg.policy, model=model) if policy_state_dict is not None: self.policy.learn_mode.load_state_dict(policy_state_dict) - self.model_save_dir=os.path.join(self.exp_name, "model") + self.model_save_dir = os.path.join(self.exp_name, "model") def train( self, @@ -121,13 +121,7 @@ def train( ) task.use(data_pusher(self.cfg, self.buffer_)) 
task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_)) - task.use( - CkptSaver( - policy=self.policy, - save_dir=self.model_save_dir, - train_freq=n_iter_save_ckpt - ) - ) + task.use(CkptSaver(policy=self.policy, save_dir=self.model_save_dir, train_freq=n_iter_save_ckpt)) task.use( wandb_online_logger( metric_list=self.policy.monitor_vars(), @@ -253,7 +247,7 @@ def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: @property def best(self): - best_model_file_path=os.path.join(self.model_save_dir, "eval.pth.tar") + best_model_file_path = os.path.join(self.model_save_dir, "eval.pth.tar") if os.path.exists(best_model_file_path): policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu")) self.policy.learn_mode.load_state_dict(policy_state_dict) diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 441063f9d7..31ca557088 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -89,7 +89,7 @@ def __init__( self.policy = TD3Policy(self.cfg.policy, model=model) if policy_state_dict is not None: self.policy.learn_mode.load_state_dict(policy_state_dict) - self.model_save_dir=os.path.join(self.exp_name, "model") + self.model_save_dir = os.path.join(self.exp_name, "model") def train( self, @@ -120,13 +120,7 @@ def train( ) task.use(data_pusher(self.cfg, self.buffer_)) task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_)) - task.use( - CkptSaver( - policy=self.policy, - save_dir=self.model_save_dir, - train_freq=n_iter_save_ckpt - ) - ) + task.use(CkptSaver(policy=self.policy, save_dir=self.model_save_dir, train_freq=n_iter_save_ckpt)) task.use( wandb_online_logger( metric_list=self.policy.monitor_vars(), @@ -251,7 +245,7 @@ def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: @property def best(self): - best_model_file_path=os.path.join(self.model_save_dir, "eval.pth.tar") + best_model_file_path = os.path.join(self.model_save_dir, "eval.pth.tar") if os.path.exists(best_model_file_path): policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu")) self.policy.learn_mode.load_state_dict(policy_state_dict) diff --git a/ding/framework/middleware/functional/logger.py b/ding/framework/middleware/functional/logger.py index 97e1ee1206..c770d74c6a 100644 --- a/ding/framework/middleware/functional/logger.py +++ b/ding/framework/middleware/functional/logger.py @@ -203,7 +203,7 @@ def _plot(ctx: "OnlineRLContext"): info_for_logging.update({metric: ctx.train_output[metric].cpu().detach().numpy()}) else: info_for_logging.update({metric: ctx.train_output[metric]}) - elif isinstance(ctx.train_output, List) and len(ctx.train_output)>0 and metric in ctx.train_output[0]: + elif isinstance(ctx.train_output, List) and len(ctx.train_output) > 0 and metric in ctx.train_output[0]: metric_value_list = [] for item in ctx.train_output: if isinstance(item[metric], torch.Tensor): diff --git a/dizoo/common/a2c/lunarlander_a2c_deploy.py b/dizoo/common/a2c/lunarlander_a2c_deploy.py index 92d0f31fb2..e2dda00828 100644 --- a/dizoo/common/a2c/lunarlander_a2c_deploy.py +++ b/dizoo/common/a2c/lunarlander_a2c_deploy.py @@ -8,10 +8,7 @@ cfg = EasyDict(Config.file_to_dict("policy_config.py")) # Instantiate the agent agent = A2CAgent( - env="lunarlander_discrete", - exp_name="Lunarlander-v2-A2C", - cfg=cfg.exp_config, - policy_state_dict=policy_state_dict + env="lunarlander_discrete", exp_name="Lunarlander-v2-A2C", cfg=cfg.exp_config, policy_state_dict=policy_state_dict ) # Continue training 
agent.train(step=5000) diff --git a/dizoo/common/ddpg/hopper_ddpg.py b/dizoo/common/ddpg/hopper_ddpg.py index 687dce3b6a..d99ef146f4 100644 --- a/dizoo/common/ddpg/hopper_ddpg.py +++ b/dizoo/common/ddpg/hopper_ddpg.py @@ -15,8 +15,7 @@ github_repo_url="https://github.com/opendilab/DI-engine", github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/ddpg.html", github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/mujoco.html", - installation_guide= -''' + installation_guide=''' sudo apt update -y \ && sudo apt install -y \ build-essential \ diff --git a/dizoo/common/ddpg/hopper_ddpg_deploy.py b/dizoo/common/ddpg/hopper_ddpg_deploy.py index 6044f6a5b2..ee0e46ded0 100644 --- a/dizoo/common/ddpg/hopper_ddpg_deploy.py +++ b/dizoo/common/ddpg/hopper_ddpg_deploy.py @@ -7,9 +7,7 @@ policy_state_dict = torch.load("pytorch_model.bin", map_location=torch.device("cpu")) cfg = EasyDict(Config.file_to_dict("policy_config.py")) # Instantiate the agent -agent = DDPGAgent( - env="hopper", exp_name="Hopper-v3-DDPG", cfg=cfg.exp_config, policy_state_dict=policy_state_dict -) +agent = DDPGAgent(env="hopper", exp_name="Hopper-v3-DDPG", cfg=cfg.exp_config, policy_state_dict=policy_state_dict) # Continue training agent.train(step=5000) # Render the new agent performance diff --git a/dizoo/common/ddpg/hopper_ddpg_download.py b/dizoo/common/ddpg/hopper_ddpg_download.py index 42208aec1a..ec474dd980 100644 --- a/dizoo/common/ddpg/hopper_ddpg_download.py +++ b/dizoo/common/ddpg/hopper_ddpg_download.py @@ -4,12 +4,7 @@ # Pull model from Hugggingface hub policy_state_dict, cfg = pull_model_from_hub(repo_id="OpenDILabCommunity/Hopper-v3-DDPG") # Instantiate the agent -agent = DDPGAgent( - env="hopper", - exp_name="Hopper-v3-DDPG", - cfg=cfg.exp_config, - policy_state_dict=policy_state_dict -) +agent = DDPGAgent(env="hopper", exp_name="Hopper-v3-DDPG", cfg=cfg.exp_config, policy_state_dict=policy_state_dict) # Continue training agent.train(step=5000) # Render the new agent performance diff --git a/dizoo/common/dqn/lunarlander_dqn_deploy.py b/dizoo/common/dqn/lunarlander_dqn_deploy.py index 485bb26181..0947dd68f5 100644 --- a/dizoo/common/dqn/lunarlander_dqn_deploy.py +++ b/dizoo/common/dqn/lunarlander_dqn_deploy.py @@ -8,10 +8,7 @@ cfg = EasyDict(Config.file_to_dict("policy_config.py")) # Instantiate the agent agent = DQNAgent( - env="lunarlander_discrete", - exp_name="Lunarlander-v2-DQN", - cfg=cfg.exp_config, - policy_state_dict=policy_state_dict + env="lunarlander_discrete", exp_name="Lunarlander-v2-DQN", cfg=cfg.exp_config, policy_state_dict=policy_state_dict ) # Continue training agent.train(step=5000) diff --git a/dizoo/common/ppo/hopper_ppo.py b/dizoo/common/ppo/hopper_ppo.py index d23bac5a0a..16da7ad484 100644 --- a/dizoo/common/ppo/hopper_ppo.py +++ b/dizoo/common/ppo/hopper_ppo.py @@ -15,8 +15,7 @@ github_repo_url="https://github.com/opendilab/DI-engine", github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/ppo.html", github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/mujoco.html", - installation_guide= -''' + installation_guide=''' sudo apt update -y \ && sudo apt install -y \ build-essential \ diff --git a/dizoo/common/ppo/hopper_ppo_deploy.py b/dizoo/common/ppo/hopper_ppo_deploy.py index afc094ec33..670f644e86 100644 --- a/dizoo/common/ppo/hopper_ppo_deploy.py +++ b/dizoo/common/ppo/hopper_ppo_deploy.py @@ -7,9 +7,7 @@ policy_state_dict = torch.load("pytorch_model.bin", map_location=torch.device("cpu")) cfg 
= EasyDict(Config.file_to_dict("policy_config.py")) # Instantiate the agent -agent = PPOF( - env="hopper", exp_name="Hopper-v3-PPO", cfg=cfg.exp_config, policy_state_dict=policy_state_dict -) +agent = PPOF(env="hopper", exp_name="Hopper-v3-PPO", cfg=cfg.exp_config, policy_state_dict=policy_state_dict) # Continue training agent.train(step=5000) # Render the new agent performance diff --git a/dizoo/common/ppo/hopper_ppo_download.py b/dizoo/common/ppo/hopper_ppo_download.py index bb95f353e5..824fc49428 100644 --- a/dizoo/common/ppo/hopper_ppo_download.py +++ b/dizoo/common/ppo/hopper_ppo_download.py @@ -4,12 +4,7 @@ # Pull model from Hugggingface hub policy_state_dict, cfg = pull_model_from_hub(repo_id="OpenDILabCommunity/Hopper-v3-PPO") # Instantiate the agent -agent = PPOF( - env="hopper", - exp_name="Hopper-v3-PPO", - cfg=cfg.exp_config, - policy_state_dict=policy_state_dict -) +agent = PPOF(env="hopper", exp_name="Hopper-v3-PPO", cfg=cfg.exp_config, policy_state_dict=policy_state_dict) # Continue training agent.train(step=5000) # Render the new agent performance diff --git a/dizoo/common/sac/hopper_sac.py b/dizoo/common/sac/hopper_sac.py index 0979f4ceee..af1379b57d 100644 --- a/dizoo/common/sac/hopper_sac.py +++ b/dizoo/common/sac/hopper_sac.py @@ -15,8 +15,7 @@ github_repo_url="https://github.com/opendilab/DI-engine", github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/sac.html", github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/mujoco.html", - installation_guide= -''' + installation_guide=''' sudo apt update -y \ && sudo apt install -y \ build-essential \ @@ -44,4 +43,3 @@ train_file="./dizoo/common/sac/hopper_sac.py", repo_id="OpenDILabCommunity/Hopper-v3-SAC" ) - diff --git a/dizoo/common/sac/hopper_sac_deploy.py b/dizoo/common/sac/hopper_sac_deploy.py index 46ef580274..d3cdaf4fd0 100644 --- a/dizoo/common/sac/hopper_sac_deploy.py +++ b/dizoo/common/sac/hopper_sac_deploy.py @@ -7,9 +7,7 @@ policy_state_dict = torch.load("pytorch_model.bin", map_location=torch.device("cpu")) cfg = EasyDict(Config.file_to_dict("policy_config.py")) # Instantiate the agent -agent = SACAgent( - env="hopper", exp_name="Hopper-v3-SAC", cfg=cfg.exp_config, policy_state_dict=policy_state_dict -) +agent = SACAgent(env="hopper", exp_name="Hopper-v3-SAC", cfg=cfg.exp_config, policy_state_dict=policy_state_dict) # Continue training agent.train(step=5000) # Render the new agent performance diff --git a/dizoo/common/sac/hopper_sac_download.py b/dizoo/common/sac/hopper_sac_download.py index 59344c9d34..a6ac910b6d 100644 --- a/dizoo/common/sac/hopper_sac_download.py +++ b/dizoo/common/sac/hopper_sac_download.py @@ -4,12 +4,7 @@ # Pull model from Hugggingface hub policy_state_dict, cfg = pull_model_from_hub(repo_id="OpenDILabCommunity/Hopper-v3-SAC") # Instantiate the agent -agent = SACAgent( - env="hopper", - exp_name="Hopper-v3-SAC", - cfg=cfg.exp_config, - policy_state_dict=policy_state_dict -) +agent = SACAgent(env="hopper", exp_name="Hopper-v3-SAC", cfg=cfg.exp_config, policy_state_dict=policy_state_dict) # Continue training agent.train(step=5000) # Render the new agent performance diff --git a/dizoo/common/td3/hopper_td3.py b/dizoo/common/td3/hopper_td3.py index ce90ac796e..552f79d125 100644 --- a/dizoo/common/td3/hopper_td3.py +++ b/dizoo/common/td3/hopper_td3.py @@ -15,8 +15,7 @@ github_repo_url="https://github.com/opendilab/DI-engine", github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/td3.html", 
github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/mujoco.html", - installation_guide= -''' + installation_guide=''' sudo apt update -y \ && sudo apt install -y \ build-essential \ @@ -44,4 +43,3 @@ train_file="./dizoo/common/td3/hopper_td3.py", repo_id="OpenDILabCommunity/Hopper-v3-TD3" ) - diff --git a/dizoo/common/td3/hopper_td3_deploy.py b/dizoo/common/td3/hopper_td3_deploy.py index 7618135d9c..4ce16d6870 100644 --- a/dizoo/common/td3/hopper_td3_deploy.py +++ b/dizoo/common/td3/hopper_td3_deploy.py @@ -7,9 +7,7 @@ policy_state_dict = torch.load("pytorch_model.bin", map_location=torch.device("cpu")) cfg = EasyDict(Config.file_to_dict("policy_config.py")) # Instantiate the agent -agent = SACAgent( - env="hopper", exp_name="Hopper-v3-TD3", cfg=cfg.exp_config, policy_state_dict=policy_state_dict -) +agent = SACAgent(env="hopper", exp_name="Hopper-v3-TD3", cfg=cfg.exp_config, policy_state_dict=policy_state_dict) # Continue training agent.train(step=5000) # Render the new agent performance diff --git a/dizoo/common/td3/hopper_td3_download.py b/dizoo/common/td3/hopper_td3_download.py index 280e325eb6..3a0f3d90f8 100644 --- a/dizoo/common/td3/hopper_td3_download.py +++ b/dizoo/common/td3/hopper_td3_download.py @@ -4,12 +4,7 @@ # Pull model from Hugggingface hub policy_state_dict, cfg = pull_model_from_hub(repo_id="OpenDILabCommunity/Hopper-v3-TD3") # Instantiate the agent -agent = TD3Agent( - env="hopper", - exp_name="Hopper-v3-TD3", - cfg=cfg.exp_config, - policy_state_dict=policy_state_dict -) +agent = TD3Agent(env="hopper", exp_name="Hopper-v3-TD3", cfg=cfg.exp_config, policy_state_dict=policy_state_dict) # Continue training agent.train(step=5000) # Render the new agent performance From 70009aefd2d96f1ec8c62876b509915d6c655cc2 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 7 Apr 2023 07:00:21 +0000 Subject: [PATCH 078/244] polish code --- ding/framework/middleware/functional/evaluator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ding/framework/middleware/functional/evaluator.py b/ding/framework/middleware/functional/evaluator.py index 03e6c9720a..7aa654afe6 100644 --- a/ding/framework/middleware/functional/evaluator.py +++ b/ding/framework/middleware/functional/evaluator.py @@ -316,6 +316,7 @@ def interaction_evaluator_ttorch( Arguments: - policy (:obj:`Policy`): The policy to be evaluated. - env (:obj:`BaseEnvManager`): The env for the evaluation. + - render (:obj:`bool`): Whether to render env images and policy logits. 
""" if task.router.is_active and not task.has_role(task.role.EVALUATOR): return task.void() From 772c354fe6b2e5c46193438cd6ab1156f3ae59de Mon Sep 17 00:00:00 2001 From: zhangpaipai <1124321458@qq.com> Date: Mon, 10 Apr 2023 11:31:04 +0800 Subject: [PATCH 079/244] polish c51 and add ddpg halfcheetah walker2d --- ding/bonus/c51.py | 15 ++++-- ding/bonus/config.py | 107 ++++++++++++++++++++++++++++++++++++++++++- ding/bonus/ddpg.py | 2 + 3 files changed, 118 insertions(+), 6 deletions(-) diff --git a/ding/bonus/c51.py b/ding/bonus/c51.py index 0e7e696020..b513b11c17 100644 --- a/ding/bonus/c51.py +++ b/ding/bonus/c51.py @@ -88,13 +88,13 @@ def __init__( self.policy = C51Policy(self.cfg.policy, model=model) if policy_state_dict is not None: self.policy.learn_mode.load_state_dict(policy_state_dict) + self.model_save_dir=os.path.join(self.cfg["exp_name"], "model") def train( self, step: int = int(1e7), collector_env_num: int = 4, evaluator_env_num: int = 4, - n_iter_log_show: int = 500, n_iter_save_ckpt: int = 1000, context: Optional[str] = None, debug: bool = False @@ -113,8 +113,7 @@ def train( StepCollector( self.cfg, self.policy.collect_mode, - collector_env, - random_collect_size=self.cfg.policy.random_collect_size + collector_env ) ) task.use(nstep_reward_enhancer(self.cfg)) @@ -123,7 +122,7 @@ def train( task.use( CkptSaver( policy=self.policy, - save_dir=os.path.join(self.cfg["exp_name"], "model"), + save_dir=self.model_save_dir, train_freq=n_iter_save_ckpt ) ) @@ -249,3 +248,11 @@ def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: if context is not None: manager_cfg.context = context return env_cls([self.env.clone for _ in range(env_num)], manager_cfg) + + @property + def best(self): + best_model_file_path=os.path.join(self.model_save_dir, "eval.pth.tar") + if os.path.exists(best_model_file_path): + policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu")) + self.policy.learn_mode.load_state_dict(policy_state_dict) + return self diff --git a/ding/bonus/config.py b/ding/bonus/config.py index ed000c81bb..9c2412c383 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -295,6 +295,110 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ) ) ) + elif env == 'HalfCheetah': + cfg.update( + dict( + exp_name='HalfCheetah-v3-DDPG', + seed=0, + env=dict( + env_id='HalfCheetah-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=1, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=11000, + ), + policy=dict( + cuda=True, + random_collect_size=25000, + model=dict( + obs_shape=17, + action_shape=6, + twin_critic=False, + actor_head_hidden_size=256, + critic_head_hidden_size=256, + action_space='regression', + ), + learn=dict( + update_per_collect=1, + batch_size=256, + learning_rate_actor=1e-3, + learning_rate_critic=1e-3, + ignore_done=True, + target_theta=0.005, + discount_factor=0.99, + actor_update_freq=1, + noise=False, + ), + collect=dict( + n_sample=1, + unroll_len=1, + noise_sigma=0.1, + ), + other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) + elif env == 'Walker2d': + cfg.update( + dict( + exp_name='Walker2d-v3-DDPG', + seed=0, + env=dict( + env_id='Walker2d-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=1, + evaluator_env_num=8, + 
n_evaluator_episode=8, + stop_value=6000, + ), + policy=dict( + cuda=True, + random_collect_size=25000, + model=dict( + obs_shape=17, + action_shape=6, + twin_critic=False, + actor_head_hidden_size=256, + critic_head_hidden_size=256, + action_space='regression', + ), + learn=dict( + update_per_collect=1, + batch_size=256, + learning_rate_actor=1e-3, + learning_rate_critic=1e-3, + ignore_done=False, + target_theta=0.005, + discount_factor=0.99, + actor_update_freq=1, + noise=False, + ), + collect=dict( + n_sample=1, + unroll_len=1, + noise_sigma=0.1, + ), + other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) elif env == 'lunarlander_continuous': cfg.update( dict( @@ -687,7 +791,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: else: raise KeyError("not supported env type: {}".format(env)) elif algorithm == 'C51': - cfg = EasyDict({"policy":C51Policy.default_config()}) + cfg = EasyDict({"policy": C51Policy.default_config()}) if env == 'lunarlander_discrete': cfg.update( dict( @@ -730,7 +834,6 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: decay=50000, ), replay_buffer=dict(replay_buffer_size=100000, ) ), - random_collect_size=0, ), wandb_logger=dict( gradient_logger=True, diff --git a/ding/bonus/ddpg.py b/ding/bonus/ddpg.py index 8ef0908b99..c379416a24 100644 --- a/ding/bonus/ddpg.py +++ b/ding/bonus/ddpg.py @@ -43,6 +43,8 @@ class EvalReturn: class DDPGAgent: supported_env_list = [ 'hopper', + 'HalfCheetah', + 'Walker2d', 'lunarlander_continuous', ] algorithm = 'DDPG' From 12d629187a8003c3dea7b24f8ec25bc053b6bca2 Mon Sep 17 00:00:00 2001 From: zhangpaipai <1124321458@qq.com> Date: Tue, 11 Apr 2023 13:24:13 +0800 Subject: [PATCH 080/244] add dizoo/common for zjow to review --- ding/bonus/config.py | 114 ++++++++++++++++++++++++++ ding/bonus/td3.py | 2 + dizoo/common/c51/lunarlander_c51.py | 23 ++++++ dizoo/common/ddpg/halfcheetah_ddpg.py | 47 +++++++++++ dizoo/common/ddpg/walker2d_ddpg.py | 47 +++++++++++ dizoo/common/sac/halfcheetah_sac.py | 47 +++++++++++ dizoo/common/sac/walker2d_sac.py | 47 +++++++++++ dizoo/common/td3/halfcheetah_td3.py | 47 +++++++++++ dizoo/common/td3/walker2d_td3.py | 47 +++++++++++ 9 files changed, 421 insertions(+) create mode 100644 dizoo/common/c51/lunarlander_c51.py create mode 100644 dizoo/common/ddpg/halfcheetah_ddpg.py create mode 100644 dizoo/common/ddpg/walker2d_ddpg.py create mode 100644 dizoo/common/sac/halfcheetah_sac.py create mode 100644 dizoo/common/sac/walker2d_sac.py create mode 100644 dizoo/common/td3/halfcheetah_td3.py create mode 100644 dizoo/common/td3/walker2d_td3.py diff --git a/ding/bonus/config.py b/ding/bonus/config.py index feeea47db5..e63e7b11d6 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -197,6 +197,120 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ), ) ) + elif env == 'HalfCheetah': + cfg.update( + dict( + exp_name='HalfCheetah-v3-TD3', + seed=0, + env=dict( + env_id='HalfCheetah-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=1, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=11000, + ), + policy=dict( + cuda=True, + random_collect_size=25000, + model=dict( + obs_shape=17, + action_shape=6, + twin_critic=True, + actor_head_hidden_size=256, + critic_head_hidden_size=256, + action_space='regression', + ), + learn=dict( + update_per_collect=1, 
+ batch_size=256, + learning_rate_actor=1e-3, + learning_rate_critic=1e-3, + ignore_done=True, + target_theta=0.005, + discount_factor=0.99, + actor_update_freq=2, + noise=True, + noise_sigma=0.2, + noise_range=dict( + min=-0.5, + max=0.5, + ), + ), + collect=dict( + n_sample=1, + unroll_len=1, + noise_sigma=0.1, + ), + other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) + elif env == 'Walker2d': + cfg.update( + dict( + exp_name='Walker2d-v3-TD3', + seed=0, + env=dict( + env_id='Walker2d-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=1, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=6000, + ), + policy=dict( + cuda=True, + random_collect_size=25000, + model=dict( + obs_shape=17, + action_shape=6, + twin_critic=True, + actor_head_hidden_size=256, + critic_head_hidden_size=256, + action_space='regression', + ), + learn=dict( + update_per_collect=1, + batch_size=256, + learning_rate_actor=1e-3, + learning_rate_critic=1e-3, + ignore_done=False, + target_theta=0.005, + discount_factor=0.99, + actor_update_freq=2, + noise=True, + noise_sigma=0.2, + noise_range=dict( + min=-0.5, + max=0.5, + ), + ), + collect=dict( + n_sample=1, + unroll_len=1, + noise_sigma=0.1, + ), + other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) elif env == 'lunarlander_continuous': cfg.update( dict( diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 31ca557088..d570c17f71 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -43,6 +43,8 @@ class EvalReturn: class TD3Agent: supported_env_list = [ 'hopper', + 'HalfCheetah', + 'Walker2d', 'lunarlander_continuous', ] algorithm = 'TD3' diff --git a/dizoo/common/c51/lunarlander_c51.py b/dizoo/common/c51/lunarlander_c51.py new file mode 100644 index 0000000000..07d1442ff5 --- /dev/null +++ b/dizoo/common/c51/lunarlander_c51.py @@ -0,0 +1,23 @@ +from ding.bonus import C51Agent +from huggingface_ding import push_model_to_hub + +# Instantiate the agent +agent = C51Agent("lunarlander_discrete", exp_name="LunarLander-v2-C51") +# Train the agent +return_ = agent.train(step=200000) +# Push model to huggingface hub +""" push_model_to_hub( + agent=agent.best, + env_name="OpenAI/Gym/Box2d", + task_name="LunarLander-v2", + algo_name="DDPG", + wandb_url=return_.wandb_url, + github_repo_url="https://github.com/opendilab/DI-engine", + github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/ddpg.html", + github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", + installation_guide="pip3 install DI-engine[common_env]", + usage_file_by_git_clone="./dizoo/common/ddpg/lunarlander_ddpg_deploy.py", + usage_file_by_huggingface_ding="./dizoo/common/ddpg/lunarlander_ddpg_download.py", + train_file="./dizoo/common/ddpg/lunarlander_ddpg.py", + repo_id="OpenDILabCommunity/LunarLander-v2-DDPG" +) """ \ No newline at end of file diff --git a/dizoo/common/ddpg/halfcheetah_ddpg.py b/dizoo/common/ddpg/halfcheetah_ddpg.py new file mode 100644 index 0000000000..573b706df3 --- /dev/null +++ b/dizoo/common/ddpg/halfcheetah_ddpg.py @@ -0,0 +1,47 @@ +from ding.bonus import DDPGAgent +from huggingface_ding import push_model_to_hub + +# Instantiate the agent +agent 
= DDPGAgent(env="HalfCheetah", exp_name="HalfCheetah-v3-DDPG") +# Train the agent +return_ = agent.train(step=1000000) +# Render the new agent performance +agent.deploy(enable_save_replay=True) +# Push model to huggingface hub +""" push_model_to_hub( + agent=agent.best, + env_name="OpenAI/Gym/MuJoCo", + task_name="Hopper-v3", + algo_name="DDPG", + wandb_url=return_.wandb_url, + github_repo_url="https://github.com/opendilab/DI-engine", + github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/ddpg.html", + github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/mujoco.html", + installation_guide=''' +sudo apt update -y \ + && sudo apt install -y \ + build-essential \ + libgl1-mesa-dev \ + libgl1-mesa-glx \ + libglew-dev \ + libosmesa6-dev \ + libglfw3 \ + libglfw3-dev \ + libsdl2-dev \ + libsdl2-image-dev \ + libglm-dev \ + libfreetype6-dev \ + patchelf + +mkdir -p ~/.mujoco +wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz -O mujoco.tar.gz +tar -xf mujoco.tar.gz -C ~/.mujoco +echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin" >> ~/.bashrc +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin +pip3 install DI-engine[common_env] +''', + usage_file_by_git_clone="./dizoo/common/ddpg/hopper_ddpg_deploy.py", + usage_file_by_huggingface_ding="./dizoo/common/ddpg/hopper_ddpg_download.py", + train_file="./dizoo/common/ddpg/hopper_ddpg.py", + repo_id="OpenDILabCommunity/Hopper-v3-DDPG" +) """ diff --git a/dizoo/common/ddpg/walker2d_ddpg.py b/dizoo/common/ddpg/walker2d_ddpg.py new file mode 100644 index 0000000000..ec1044df0c --- /dev/null +++ b/dizoo/common/ddpg/walker2d_ddpg.py @@ -0,0 +1,47 @@ +from ding.bonus import DDPGAgent +from huggingface_ding import push_model_to_hub + +# Instantiate the agent +agent = DDPGAgent(env="Walker2d", exp_name="Walker2d-v3-DDPG") +# Train the agent +return_ = agent.train(step=1000000) +# Render the new agent performance +agent.deploy(enable_save_replay=True) +# Push model to huggingface hub +""" push_model_to_hub( + agent=agent.best, + env_name="OpenAI/Gym/MuJoCo", + task_name="Hopper-v3", + algo_name="DDPG", + wandb_url=return_.wandb_url, + github_repo_url="https://github.com/opendilab/DI-engine", + github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/ddpg.html", + github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/mujoco.html", + installation_guide=''' +sudo apt update -y \ + && sudo apt install -y \ + build-essential \ + libgl1-mesa-dev \ + libgl1-mesa-glx \ + libglew-dev \ + libosmesa6-dev \ + libglfw3 \ + libglfw3-dev \ + libsdl2-dev \ + libsdl2-image-dev \ + libglm-dev \ + libfreetype6-dev \ + patchelf + +mkdir -p ~/.mujoco +wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz -O mujoco.tar.gz +tar -xf mujoco.tar.gz -C ~/.mujoco +echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin" >> ~/.bashrc +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin +pip3 install DI-engine[common_env] +''', + usage_file_by_git_clone="./dizoo/common/ddpg/hopper_ddpg_deploy.py", + usage_file_by_huggingface_ding="./dizoo/common/ddpg/hopper_ddpg_download.py", + train_file="./dizoo/common/ddpg/hopper_ddpg.py", + repo_id="OpenDILabCommunity/Hopper-v3-DDPG" +) """ \ No newline at end of file diff --git a/dizoo/common/sac/halfcheetah_sac.py b/dizoo/common/sac/halfcheetah_sac.py new file mode 100644 index 
0000000000..df8f0c87f4 --- /dev/null +++ b/dizoo/common/sac/halfcheetah_sac.py @@ -0,0 +1,47 @@ +from ding.bonus.sac import SACAgent +from huggingface_ding import push_model_to_hub + +# Instantiate the agent +agent = SACAgent(env="HalfCheetah", exp_name="HalfCheetah-v3-SAC") +# Train the agent +return_ = agent.train(step=1000000) +# Render the new agent performance +agent.deploy(enable_save_replay=True) +# Push model to huggingface hub +""" push_model_to_hub( + agent=agent.best, + env_name="OpenAI/Gym/MuJoCo", + task_name="Hopper-v3", + algo_name="SAC", + wandb_url=return_.wandb_url, + github_repo_url="https://github.com/opendilab/DI-engine", + github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/sac.html", + github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/mujoco.html", + installation_guide=''' +sudo apt update -y \ + && sudo apt install -y \ + build-essential \ + libgl1-mesa-dev \ + libgl1-mesa-glx \ + libglew-dev \ + libosmesa6-dev \ + libglfw3 \ + libglfw3-dev \ + libsdl2-dev \ + libsdl2-image-dev \ + libglm-dev \ + libfreetype6-dev \ + patchelf + +mkdir -p ~/.mujoco +wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz -O mujoco.tar.gz +tar -xf mujoco.tar.gz -C ~/.mujoco +echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin" >> ~/.bashrc +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin +pip3 install DI-engine[common_env] +''', + usage_file_by_git_clone="./dizoo/common/sac/hopper_sac_deploy.py", + usage_file_by_huggingface_ding="./dizoo/common/sac/hopper_sac_download.py", + train_file="./dizoo/common/sac/hopper_sac.py", + repo_id="OpenDILabCommunity/Hopper-v3-SAC" +) """ \ No newline at end of file diff --git a/dizoo/common/sac/walker2d_sac.py b/dizoo/common/sac/walker2d_sac.py new file mode 100644 index 0000000000..0406394e57 --- /dev/null +++ b/dizoo/common/sac/walker2d_sac.py @@ -0,0 +1,47 @@ +from ding.bonus.sac import SACAgent +from huggingface_ding import push_model_to_hub + +# Instantiate the agent +agent = SACAgent(env="Walker2d", exp_name="Walker2d-v3-SAC") +# Train the agent +return_ = agent.train(step=1000000) +# Render the new agent performance +agent.deploy(enable_save_replay=True) +# Push model to huggingface hub +""" push_model_to_hub( + agent=agent.best, + env_name="OpenAI/Gym/MuJoCo", + task_name="Hopper-v3", + algo_name="SAC", + wandb_url=return_.wandb_url, + github_repo_url="https://github.com/opendilab/DI-engine", + github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/sac.html", + github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/mujoco.html", + installation_guide=''' +sudo apt update -y \ + && sudo apt install -y \ + build-essential \ + libgl1-mesa-dev \ + libgl1-mesa-glx \ + libglew-dev \ + libosmesa6-dev \ + libglfw3 \ + libglfw3-dev \ + libsdl2-dev \ + libsdl2-image-dev \ + libglm-dev \ + libfreetype6-dev \ + patchelf + +mkdir -p ~/.mujoco +wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz -O mujoco.tar.gz +tar -xf mujoco.tar.gz -C ~/.mujoco +echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin" >> ~/.bashrc +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin +pip3 install DI-engine[common_env] +''', + usage_file_by_git_clone="./dizoo/common/sac/hopper_sac_deploy.py", + usage_file_by_huggingface_ding="./dizoo/common/sac/hopper_sac_download.py", + 
train_file="./dizoo/common/sac/hopper_sac.py", + repo_id="OpenDILabCommunity/Hopper-v3-SAC" +) """ \ No newline at end of file diff --git a/dizoo/common/td3/halfcheetah_td3.py b/dizoo/common/td3/halfcheetah_td3.py new file mode 100644 index 0000000000..f521320fb6 --- /dev/null +++ b/dizoo/common/td3/halfcheetah_td3.py @@ -0,0 +1,47 @@ +from ding.bonus import TD3Agent +from huggingface_ding import push_model_to_hub + +# Instantiate the agent +agent = TD3Agent(env="HalfCheetah", exp_name="HalfCheetah-v3-TD3") +# Train the agent +return_ = agent.train(step=1000000) +# Render the new agent performance +agent.deploy(enable_save_replay=True) +# Push model to huggingface hub +""" push_model_to_hub( + agent=agent.best, + env_name="OpenAI/Gym/MuJoCo", + task_name="Hopper-v3", + algo_name="TD3", + wandb_url=return_.wandb_url, + github_repo_url="https://github.com/opendilab/DI-engine", + github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/td3.html", + github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/mujoco.html", + installation_guide=''' +sudo apt update -y \ + && sudo apt install -y \ + build-essential \ + libgl1-mesa-dev \ + libgl1-mesa-glx \ + libglew-dev \ + libosmesa6-dev \ + libglfw3 \ + libglfw3-dev \ + libsdl2-dev \ + libsdl2-image-dev \ + libglm-dev \ + libfreetype6-dev \ + patchelf + +mkdir -p ~/.mujoco +wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz -O mujoco.tar.gz +tar -xf mujoco.tar.gz -C ~/.mujoco +echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin" >> ~/.bashrc +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin +pip3 install DI-engine[common_env] +''', + usage_file_by_git_clone="./dizoo/common/td3/hopper_td3_deploy.py", + usage_file_by_huggingface_ding="./dizoo/common/td3/hopper_td3_download.py", + train_file="./dizoo/common/td3/hopper_td3.py", + repo_id="OpenDILabCommunity/Hopper-v3-TD3" +) """ \ No newline at end of file diff --git a/dizoo/common/td3/walker2d_td3.py b/dizoo/common/td3/walker2d_td3.py new file mode 100644 index 0000000000..be17109d23 --- /dev/null +++ b/dizoo/common/td3/walker2d_td3.py @@ -0,0 +1,47 @@ +from ding.bonus import TD3Agent +from huggingface_ding import push_model_to_hub + +# Instantiate the agent +agent = TD3Agent(env="Walker2d", exp_name="Walker2d-v3-TD3") +# Train the agent +return_ = agent.train(step=1000000) +# Render the new agent performance +agent.deploy(enable_save_replay=True) +# Push model to huggingface hub +""" push_model_to_hub( + agent=agent.best, + env_name="OpenAI/Gym/MuJoCo", + task_name="Hopper-v3", + algo_name="TD3", + wandb_url=return_.wandb_url, + github_repo_url="https://github.com/opendilab/DI-engine", + github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/td3.html", + github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/mujoco.html", + installation_guide=''' +sudo apt update -y \ + && sudo apt install -y \ + build-essential \ + libgl1-mesa-dev \ + libgl1-mesa-glx \ + libglew-dev \ + libosmesa6-dev \ + libglfw3 \ + libglfw3-dev \ + libsdl2-dev \ + libsdl2-image-dev \ + libglm-dev \ + libfreetype6-dev \ + patchelf + +mkdir -p ~/.mujoco +wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz -O mujoco.tar.gz +tar -xf mujoco.tar.gz -C ~/.mujoco +echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin" >> ~/.bashrc +export 
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin +pip3 install DI-engine[common_env] +''', + usage_file_by_git_clone="./dizoo/common/td3/hopper_td3_deploy.py", + usage_file_by_huggingface_ding="./dizoo/common/td3/hopper_td3_download.py", + train_file="./dizoo/common/td3/hopper_td3.py", + repo_id="OpenDILabCommunity/Hopper-v3-TD3" +) """ \ No newline at end of file From fec830ac71588ef46028f20e7458fe25284d86fc Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 11 Apr 2023 05:30:22 +0000 Subject: [PATCH 081/244] fix agent best method --- ding/bonus/a2c.py | 4 ++-- ding/bonus/ddpg.py | 4 ++-- ding/bonus/dqn.py | 4 ++-- ding/bonus/ppof.py | 2 +- ding/bonus/sac.py | 2 +- ding/bonus/td3.py | 4 ++-- ding/framework/middleware/ckpt_handler.py | 2 +- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/ding/bonus/a2c.py b/ding/bonus/a2c.py index bab5b97655..4c0a21133f 100644 --- a/ding/bonus/a2c.py +++ b/ding/bonus/a2c.py @@ -238,8 +238,8 @@ def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: @property def best(self): - best_model_file_path = os.path.join(self.model_save_dir, "eval.pth.tar") + best_model_file_path = os.path.join(self.model_save_dir, "ckpt", "eval.pth.tar") if os.path.exists(best_model_file_path): policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu")) self.policy.learn_mode.load_state_dict(policy_state_dict) - return self + return self \ No newline at end of file diff --git a/ding/bonus/ddpg.py b/ding/bonus/ddpg.py index c3e60404e7..344a31079e 100644 --- a/ding/bonus/ddpg.py +++ b/ding/bonus/ddpg.py @@ -245,8 +245,8 @@ def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: @property def best(self): - best_model_file_path = os.path.join(self.model_save_dir, "eval.pth.tar") + best_model_file_path = os.path.join(self.model_save_dir, "ckpt", "eval.pth.tar") if os.path.exists(best_model_file_path): policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu")) self.policy.learn_mode.load_state_dict(policy_state_dict) - return self + return self \ No newline at end of file diff --git a/ding/bonus/dqn.py b/ding/bonus/dqn.py index 4e4bb17feb..a93b2461b0 100644 --- a/ding/bonus/dqn.py +++ b/ding/bonus/dqn.py @@ -248,8 +248,8 @@ def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: @property def best(self): - best_model_file_path = os.path.join(self.model_save_dir, "eval.pth.tar") + best_model_file_path = os.path.join(self.model_save_dir, "ckpt", "eval.pth.tar") if os.path.exists(best_model_file_path): policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu")) self.policy.learn_mode.load_state_dict(policy_state_dict) - return self + return self \ No newline at end of file diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py index 71f342aba6..f98b6df54f 100644 --- a/ding/bonus/ppof.py +++ b/ding/bonus/ppof.py @@ -268,7 +268,7 @@ def _setup_env_manager( @property def best(self): - best_model_file_path = os.path.join(self.model_save_dir, "eval.pth.tar") + best_model_file_path = os.path.join(self.model_save_dir, "ckpt", "eval.pth.tar") if os.path.exists(best_model_file_path): policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu")) self.policy.learn_mode.load_state_dict(policy_state_dict) diff --git a/ding/bonus/sac.py b/ding/bonus/sac.py index 880cab0ec2..d09d27ba57 100644 --- a/ding/bonus/sac.py +++ b/ding/bonus/sac.py @@ -247,7 +247,7 @@ def 
_setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: @property def best(self): - best_model_file_path = os.path.join(self.model_save_dir, "eval.pth.tar") + best_model_file_path = os.path.join(self.model_save_dir, "ckpt", "eval.pth.tar") if os.path.exists(best_model_file_path): policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu")) self.policy.learn_mode.load_state_dict(policy_state_dict) diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 31ca557088..7491c54a56 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -245,8 +245,8 @@ def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: @property def best(self): - best_model_file_path = os.path.join(self.model_save_dir, "eval.pth.tar") + best_model_file_path = os.path.join(self.model_save_dir, "ckpt", "eval.pth.tar") if os.path.exists(best_model_file_path): policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu")) self.policy.learn_mode.load_state_dict(policy_state_dict) - return self + return self \ No newline at end of file diff --git a/ding/framework/middleware/ckpt_handler.py b/ding/framework/middleware/ckpt_handler.py index 3502d9c50f..65a14c88f9 100644 --- a/ding/framework/middleware/ckpt_handler.py +++ b/ding/framework/middleware/ckpt_handler.py @@ -63,7 +63,7 @@ def __call__(self, ctx: Union["OnlineRLContext", "OfflineRLContext"]) -> None: # best episode return so far if ctx.eval_value is not None and ctx.eval_value > self.max_eval_value: - save_file("{}/eval.pth.tar".format(self.prefix), self.policy.eval_mode.state_dict()) + save_file("{}/eval.pth.tar".format(self.prefix), self.policy.learn_mode.state_dict()) self.max_eval_value = ctx.eval_value # finish From 8f523e7ff4d1e3470a0abada195cfb0c760d6022 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 11 Apr 2023 09:24:43 +0000 Subject: [PATCH 082/244] reset dizoo --- dizoo/atari/entry/spaceinvaders_dqn_eval.py | 5 +- dizoo/atari/example/atari_dqn_dist_ddp.py | 1 + .../carracing/config/carracing_dqn_config.py | 9 +- dizoo/box2d/carracing/envs/carracing_env.py | 1 + .../carracing/envs/test_carracing_env.py | 10 +- .../config/lunarlander_cont_sac_config.py | 4 +- .../config/lunarlander_cont_td3_config.py | 4 +- .../cartpole/config/cartpole_bc_config.py | 2 +- .../config/mtcar_rainbow_config.py | 95 +++++++++---------- .../mountain_car/envs/__init__.py | 2 +- .../pendulum/config/pendulum_ibc_config.py | 13 +-- .../pendulum/config/pendulum_td3_bc_config.py | 2 +- .../pendulum/entry/pendulum_dqn_eval.py | 5 +- dizoo/common/a2c/lunarlander_a2c.py | 23 ----- dizoo/common/a2c/lunarlander_a2c_deploy.py | 16 ---- dizoo/common/a2c/lunarlander_a2c_download.py | 13 --- dizoo/common/ddpg/hopper_ddpg.py | 45 --------- dizoo/common/ddpg/hopper_ddpg_deploy.py | 14 --- dizoo/common/ddpg/hopper_ddpg_download.py | 11 --- dizoo/common/ddpg/lunarlander_ddpg.py | 23 ----- dizoo/common/ddpg/lunarlander_ddpg_deploy.py | 19 ---- .../common/ddpg/lunarlander_ddpg_download.py | 16 ---- dizoo/common/dqn/lunarlander_dqn.py | 23 ----- dizoo/common/dqn/lunarlander_dqn_deploy.py | 16 ---- dizoo/common/dqn/lunarlander_dqn_download.py | 13 --- dizoo/common/ppo/hopper_ppo.py | 45 --------- dizoo/common/ppo/hopper_ppo_deploy.py | 14 --- dizoo/common/ppo/hopper_ppo_download.py | 11 --- dizoo/common/ppo/lunarlander_ppo.py | 23 ----- dizoo/common/ppo/lunarlander_ppo_deploy.py | 16 ---- dizoo/common/ppo/lunarlander_ppo_download.py | 13 --- dizoo/common/sac/hopper_sac.py | 45 --------- 
dizoo/common/sac/hopper_sac_deploy.py | 14 --- dizoo/common/sac/hopper_sac_download.py | 11 --- dizoo/common/sac/lunarlander_sac.py | 23 ----- dizoo/common/sac/lunarlander_sac_deploy.py | 16 ---- dizoo/common/sac/lunarlander_sac_download.py | 16 ---- dizoo/common/td3/hopper_td3.py | 45 --------- dizoo/common/td3/hopper_td3_deploy.py | 14 --- dizoo/common/td3/hopper_td3_download.py | 11 --- dizoo/common/td3/lunarlander_td3.py | 23 ----- dizoo/common/td3/lunarlander_td3_deploy.py | 19 ---- dizoo/common/td3/lunarlander_td3_download.py | 16 ---- .../config/halfcheetah_expert_td3bc_config.py | 2 +- .../halfcheetah_medium_expert_td3bc_config.py | 2 +- .../halfcheetah_medium_replay_td3bc_config.py | 2 +- .../config/halfcheetah_medium_td3bc_config.py | 2 +- .../config/halfcheetah_random_td3bc_config.py | 2 +- .../d4rl/config/hopper_expert_td3bc_config.py | 2 +- .../config/hopper_medium_expert_bc_config.py | 6 +- .../hopper_medium_expert_ibc_ar_config.py | 14 ++- .../config/hopper_medium_expert_ibc_config.py | 14 ++- .../hopper_medium_expert_ibc_mcmc_config.py | 14 ++- .../hopper_medium_expert_td3bc_config.py | 2 +- .../hopper_medium_replay_td3bc_config.py | 2 +- .../d4rl/config/hopper_medium_td3bc_config.py | 2 +- .../d4rl/config/hopper_random_td3bc_config.py | 2 +- .../d4rl/config/kitchen_complete_bc_config.py | 8 +- .../config/kitchen_complete_ibc_ar_config.py | 14 ++- .../config/kitchen_complete_ibc_config.py | 14 ++- .../kitchen_complete_ibc_mcmc_config.py | 14 ++- dizoo/d4rl/config/pen_human_bc_config.py | 6 +- dizoo/d4rl/config/pen_human_ibc_ar_config.py | 14 ++- dizoo/d4rl/config/pen_human_ibc_config.py | 14 ++- .../d4rl/config/pen_human_ibc_mcmc_config.py | 14 ++- .../config/walker2d_expert_td3bc_config.py | 2 +- .../walker2d_medium_expert_td3bc_config.py | 2 +- .../walker2d_medium_replay_td3bc_config.py | 2 +- .../config/walker2d_medium_td3bc_config.py | 2 +- .../config/walker2d_random_td3bc_config.py | 2 +- dizoo/d4rl/entry/d4rl_cql_main.py | 2 +- dizoo/d4rl/entry/d4rl_td3_bc_main.py | 2 +- dizoo/dmc2gym/config/dmc2gym_ppo_config.py | 1 + dizoo/dmc2gym/entry/dmc2gym_sac_pixel_main.py | 33 +++---- dizoo/dmc2gym/entry/dmc2gym_sac_state_main.py | 33 +++---- dizoo/dmc2gym/envs/dmc2gym_env.py | 2 - dizoo/dmc2gym/envs/test_dmc2gym_env.py | 1 + .../evogym/envs/test/visualize_simple_env.py | 1 + .../config/stocks_dqn_config.py | 6 +- .../worker/trading_serial_evaluator.py | 26 ++--- .../envs/gym-hybrid/gym_hybrid/__init__.py | 3 +- dizoo/gym_hybrid/envs/gym-hybrid/setup.py | 9 +- .../envs/gym-hybrid/tests/moving.py | 1 + dizoo/gym_hybrid/envs/test_gym_hybrid_env.py | 12 +-- .../entry/imagenet_res18_config.py | 4 +- dizoo/league_demo/league_demo_collector.py | 14 +-- dizoo/maze/entry/maze_bc_main.py | 14 ++- dizoo/minigrid/utils/eval.py | 10 +- dizoo/mujoco/config/halfcheetah_bdq_config.py | 7 +- dizoo/mujoco/config/hopper_bdq_config.py | 6 +- dizoo/mujoco/envs/mujoco_wrappers.py | 8 +- .../config/ant_mappo_config.py | 1 + .../config/ant_masac_config.py | 4 +- .../config/ptz_simple_spread_madqn_config.py | 8 +- dizoo/rocket/entry/rocket_hover_ppo_main.py | 6 +- dizoo/rocket/entry/rocket_landing_ppo_main.py | 8 +- dizoo/rocket/envs/test_rocket_env.py | 6 +- dizoo/smac/config/smac_3s5z_madqn_config.py | 12 ++- .../config/smac_3s5zvs3s6z_madqn_config.py | 12 ++- dizoo/smac/config/smac_5m6m_madqn_config.py | 11 ++- dizoo/smac/config/smac_8m9m_madqn_config.py | 11 ++- dizoo/smac/config/smac_MMM2_madqn_config.py | 12 ++- dizoo/smac/config/smac_MMM_madqn_config.py | 12 ++- dizoo/smac/utils/eval.py | 10 
+- 104 files changed, 354 insertions(+), 890 deletions(-) delete mode 100644 dizoo/common/a2c/lunarlander_a2c.py delete mode 100644 dizoo/common/a2c/lunarlander_a2c_deploy.py delete mode 100644 dizoo/common/a2c/lunarlander_a2c_download.py delete mode 100644 dizoo/common/ddpg/hopper_ddpg.py delete mode 100644 dizoo/common/ddpg/hopper_ddpg_deploy.py delete mode 100644 dizoo/common/ddpg/hopper_ddpg_download.py delete mode 100644 dizoo/common/ddpg/lunarlander_ddpg.py delete mode 100644 dizoo/common/ddpg/lunarlander_ddpg_deploy.py delete mode 100644 dizoo/common/ddpg/lunarlander_ddpg_download.py delete mode 100644 dizoo/common/dqn/lunarlander_dqn.py delete mode 100644 dizoo/common/dqn/lunarlander_dqn_deploy.py delete mode 100644 dizoo/common/dqn/lunarlander_dqn_download.py delete mode 100644 dizoo/common/ppo/hopper_ppo.py delete mode 100644 dizoo/common/ppo/hopper_ppo_deploy.py delete mode 100644 dizoo/common/ppo/hopper_ppo_download.py delete mode 100644 dizoo/common/ppo/lunarlander_ppo.py delete mode 100644 dizoo/common/ppo/lunarlander_ppo_deploy.py delete mode 100644 dizoo/common/ppo/lunarlander_ppo_download.py delete mode 100644 dizoo/common/sac/hopper_sac.py delete mode 100644 dizoo/common/sac/hopper_sac_deploy.py delete mode 100644 dizoo/common/sac/hopper_sac_download.py delete mode 100644 dizoo/common/sac/lunarlander_sac.py delete mode 100644 dizoo/common/sac/lunarlander_sac_deploy.py delete mode 100644 dizoo/common/sac/lunarlander_sac_download.py delete mode 100644 dizoo/common/td3/hopper_td3.py delete mode 100644 dizoo/common/td3/hopper_td3_deploy.py delete mode 100644 dizoo/common/td3/hopper_td3_download.py delete mode 100644 dizoo/common/td3/lunarlander_td3.py delete mode 100644 dizoo/common/td3/lunarlander_td3_deploy.py delete mode 100644 dizoo/common/td3/lunarlander_td3_download.py diff --git a/dizoo/atari/entry/spaceinvaders_dqn_eval.py b/dizoo/atari/entry/spaceinvaders_dqn_eval.py index 35e15a578c..d8bfde290d 100644 --- a/dizoo/atari/entry/spaceinvaders_dqn_eval.py +++ b/dizoo/atari/entry/spaceinvaders_dqn_eval.py @@ -15,9 +15,8 @@ from ding.rl_utils import get_epsilon_greedy_fn from dizoo.atari.config.serial.spaceinvaders.spaceinvaders_dqn_config import main_config, create_config - def main(rl_cfg, seed=0): - main_cfg, create_cfg = rl_cfg + main_cfg, create_cfg =rl_cfg cfg = compile_config( main_cfg, BaseEnvManager, @@ -57,4 +56,4 @@ def main(rl_cfg, seed=0): if __name__ == "__main__": - main(rl_cfg=(main_config, create_config), seed=0) + main(rl_cfg=(main_config, create_config),seed=0) diff --git a/dizoo/atari/example/atari_dqn_dist_ddp.py b/dizoo/atari/example/atari_dqn_dist_ddp.py index 5dbfc4e65c..f194c326bc 100644 --- a/dizoo/atari/example/atari_dqn_dist_ddp.py +++ b/dizoo/atari/example/atari_dqn_dist_ddp.py @@ -14,6 +14,7 @@ from dizoo.atari.envs.atari_env import AtariEnv from dizoo.atari.config.serial.pong.pong_dqn_config import main_config, create_config + logging.getLogger().setLevel(logging.INFO) main_config.exp_name = 'pong_dqn_seed0_ditask_dist_ddp' diff --git a/dizoo/box2d/carracing/config/carracing_dqn_config.py b/dizoo/box2d/carracing/config/carracing_dqn_config.py index 1792056a83..31dd42fca8 100644 --- a/dizoo/box2d/carracing/config/carracing_dqn_config.py +++ b/dizoo/box2d/carracing/config/carracing_dqn_config.py @@ -29,14 +29,17 @@ learning_rate=0.0001, target_update_freq=100, ), - collect=dict(n_sample=64, ), + collect=dict( + n_sample=64, + ), other=dict( eps=dict( type='exp', start=0.95, end=0.1, decay=50000, - ), 
replay_buffer=dict(replay_buffer_size=100000, ) + ), + replay_buffer=dict(replay_buffer_size=100000, ) ), ), ) @@ -57,4 +60,4 @@ if __name__ == "__main__": # or you can enter `ding -m serial -c carracing_dqn_config.py -s 0` from ding.entry import serial_pipeline - serial_pipeline([main_config, create_config], seed=0) + serial_pipeline([main_config, create_config], seed=0) \ No newline at end of file diff --git a/dizoo/box2d/carracing/envs/carracing_env.py b/dizoo/box2d/carracing/envs/carracing_env.py index 60ebaa97d1..39b82a2502 100644 --- a/dizoo/box2d/carracing/envs/carracing_env.py +++ b/dizoo/box2d/carracing/envs/carracing_env.py @@ -2,6 +2,7 @@ import copy import os + import gym import numpy as np from easydict import EasyDict diff --git a/dizoo/box2d/carracing/envs/test_carracing_env.py b/dizoo/box2d/carracing/envs/test_carracing_env.py index 47a5fa4638..7eb4a75039 100644 --- a/dizoo/box2d/carracing/envs/test_carracing_env.py +++ b/dizoo/box2d/carracing/envs/test_carracing_env.py @@ -5,7 +5,15 @@ @pytest.mark.envtest -@pytest.mark.parametrize('cfg', [EasyDict({'env_id': 'CarRacing-v2', 'continuous': False, 'act_scale': False})]) +@pytest.mark.parametrize( + 'cfg', [ + EasyDict({ + 'env_id': 'CarRacing-v2', + 'continuous': False, + 'act_scale': False + }) + ] +) class TestCarRacing: def test_naive(self, cfg): diff --git a/dizoo/box2d/lunarlander/config/lunarlander_cont_sac_config.py b/dizoo/box2d/lunarlander/config/lunarlander_cont_sac_config.py index f8a8ab47e7..0e60fce608 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_cont_sac_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_cont_sac_config.py @@ -28,7 +28,9 @@ learning_rate_alpha=3e-4, auto_alpha=True, ), - collect=dict(n_sample=256, ), + collect=dict( + n_sample=256, + ), eval=dict(evaluator=dict(eval_freq=1000, ), ), other=dict(replay_buffer=dict(replay_buffer_size=int(1e5), ), ), ), diff --git a/dizoo/box2d/lunarlander/config/lunarlander_cont_td3_config.py b/dizoo/box2d/lunarlander/config/lunarlander_cont_td3_config.py index 273c933bc8..d95932f237 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_cont_td3_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_cont_td3_config.py @@ -12,7 +12,7 @@ stop_value=200, ), policy=dict( - cuda=True, + cuda=False, random_collect_size=10000, model=dict( obs_shape=8, @@ -22,7 +22,7 @@ ), learn=dict( update_per_collect=256, - batch_size=256, + batch_size=128, learning_rate_actor=3e-4, learning_rate_critic=1e-3, actor_update_freq=2, diff --git a/dizoo/classic_control/cartpole/config/cartpole_bc_config.py b/dizoo/classic_control/cartpole/config/cartpole_bc_config.py index b1975718f3..8315e934fe 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_bc_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_bc_config.py @@ -20,7 +20,7 @@ batch_size=64, learning_rate=0.01, learner=dict(hook=dict(save_ckpt_after_iter=1000)), - train_epoch=20, + train_epoch = 20, ), eval=dict(evaluator=dict(eval_freq=40, )) ), diff --git a/dizoo/classic_control/mountain_car/config/mtcar_rainbow_config.py b/dizoo/classic_control/mountain_car/config/mtcar_rainbow_config.py index b293d44494..c6c4fb4db0 100644 --- a/dizoo/classic_control/mountain_car/config/mtcar_rainbow_config.py +++ b/dizoo/classic_control/mountain_car/config/mtcar_rainbow_config.py @@ -1,63 +1,58 @@ from easydict import EasyDict # DI-Engine uses EasyDict for configuration, by convention -mtcar_rainbow_config = EasyDict( - dict( - exp_name='mtcar_rainbow_seed0', - env=dict( - collector_env_num=8, - 
evaluator_env_num=5, - n_evaluator_episode=5, - stop_value=195, +mtcar_rainbow_config = EasyDict(dict( + exp_name='mtcar_rainbow_seed0', + env=dict( + collector_env_num=8, + evaluator_env_num=5, + n_evaluator_episode=5, + stop_value=195, + ), + policy=dict( + cuda=False, + priority=True, + discount_factor=0.97, + nstep=3, + model=dict( + obs_shape=2, + action_shape=3, + encoder_hidden_size_list=[128, 128, 64], ), - policy=dict( - cuda=False, - priority=True, - discount_factor=0.97, - nstep=3, - model=dict( - obs_shape=2, - action_shape=3, - encoder_hidden_size_list=[128, 128, 64], - ), - learn=dict( - update_per_collect=3, - batch_size=64, - learning_rate=0.001, - target_update_freq=100, - ), - collect=dict( - n_sample=80, - unroll_len=1, - ), - other=dict( - eps=dict( - type='exp', - start=0.95, - end=0.1, - decay=10000, - ), - replay_buffer=dict(replay_buffer_size=20000, ) - ), + learn=dict( + update_per_collect=3, + batch_size=64, + learning_rate=0.001, + target_update_freq=100, ), - ) -) + collect=dict( + n_sample=80, + unroll_len=1, + ), + other=dict( + eps=dict( + type='exp', + start=0.95, + end=0.1, + decay=10000, + ), replay_buffer=dict(replay_buffer_size=20000, ) + ), + ), +)) main_config = mtcar_rainbow_config -mtcar_rainbow_create_config = EasyDict( - dict( - env=dict( - type='mountain_car', - import_names=['dizoo.classic_control.mountain_car.envs.mtcar_env'], - ), - env_manager=dict(type='base'), - policy=dict(type='rainbow'), - ) -) +mtcar_rainbow_create_config = EasyDict(dict( + env=dict( + type='mountain_car', + import_names=['dizoo.classic_control.mountain_car.envs.mtcar_env'], + ), + env_manager=dict(type='base'), + policy=dict(type='rainbow'), +)) create_config = mtcar_rainbow_create_config if __name__ == "__main__": from ding.entry import serial_pipeline - serial_pipeline((main_config, create_config), seed=0) + serial_pipeline((main_config, create_config), seed=0) \ No newline at end of file diff --git a/dizoo/classic_control/mountain_car/envs/__init__.py b/dizoo/classic_control/mountain_car/envs/__init__.py index 9e8ca86d5f..19f7eaf1cc 100644 --- a/dizoo/classic_control/mountain_car/envs/__init__.py +++ b/dizoo/classic_control/mountain_car/envs/__init__.py @@ -1 +1 @@ -from .mtcar_env import MountainCarEnv +from .mtcar_env import MountainCarEnv \ No newline at end of file diff --git a/dizoo/classic_control/pendulum/config/pendulum_ibc_config.py b/dizoo/classic_control/pendulum/config/pendulum_ibc_config.py index 7c56f283fe..247fdad045 100644 --- a/dizoo/classic_control/pendulum/config/pendulum_ibc_config.py +++ b/dizoo/classic_control/pendulum/config/pendulum_ibc_config.py @@ -13,15 +13,16 @@ ), policy=dict( cuda=cuda, - model=dict(obs_shape=3, action_shape=1, stochastic_optim=dict( - type='mcmc', - cuda=cuda, - )), + model=dict( + obs_shape=3, + action_shape=1, + stochastic_optim=dict(type='mcmc', cuda=cuda,) + ), learn=dict( multi_gpu=multi_gpu, train_epoch=15, batch_size=256, - optim=dict(learning_rate=1e-5, ), + optim=dict(learning_rate=1e-5,), learner=dict(hook=dict(log_show_after_iter=1000)), ), collect=dict( @@ -29,7 +30,7 @@ data_path='./pendulum_sac_data_generation/expert_demos.hdf5', collector_logit=False, ), - eval=dict(evaluator=dict(eval_freq=-1, )), + eval=dict(evaluator=dict(eval_freq=-1,)), ), ) pendulum_ibc_config = EasyDict(main_config) diff --git a/dizoo/classic_control/pendulum/config/pendulum_td3_bc_config.py b/dizoo/classic_control/pendulum/config/pendulum_td3_bc_config.py index 8583fc6ada..82a44f034e 100644 --- 
a/dizoo/classic_control/pendulum/config/pendulum_td3_bc_config.py +++ b/dizoo/classic_control/pendulum/config/pendulum_td3_bc_config.py @@ -6,7 +6,7 @@ collector_env_num=8, evaluator_env_num=5, norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), # (bool) Scale output action into legal range. diff --git a/dizoo/classic_control/pendulum/entry/pendulum_dqn_eval.py b/dizoo/classic_control/pendulum/entry/pendulum_dqn_eval.py index fb80ad42ad..a5a7b9ab32 100644 --- a/dizoo/classic_control/pendulum/entry/pendulum_dqn_eval.py +++ b/dizoo/classic_control/pendulum/entry/pendulum_dqn_eval.py @@ -15,9 +15,8 @@ from ding.rl_utils import get_epsilon_greedy_fn from dizoo.classic_control.pendulum.config.pendulum_dqn_config import main_config, create_config - def main(rl_cfg, seed=0): - main_cfg, create_cfg = rl_cfg + main_cfg, create_cfg =rl_cfg cfg = compile_config( main_cfg, BaseEnvManager, @@ -57,4 +56,4 @@ def main(rl_cfg, seed=0): if __name__ == "__main__": - main(rl_cfg=(main_config, create_config), seed=0) + main(rl_cfg=(main_config, create_config),seed=0) diff --git a/dizoo/common/a2c/lunarlander_a2c.py b/dizoo/common/a2c/lunarlander_a2c.py deleted file mode 100644 index 174ce3b929..0000000000 --- a/dizoo/common/a2c/lunarlander_a2c.py +++ /dev/null @@ -1,23 +0,0 @@ -from ding.bonus import A2CAgent -from huggingface_ding import push_model_to_hub - -# Instantiate the agent -agent = A2CAgent(env="lunarlander_discrete", exp_name="Lunarlander-v2-A2C") -# Train the agent -return_ = agent.train(step=int(20000000), collector_env_num=8, evaluator_env_num=8, debug=False) -# Push model to huggingface hub -push_model_to_hub( - agent=agent.best, - env_name="OpenAI/Gym/Box2d", - task_name="LunarLander-v2", - algo_name="A2C", - wandb_url=return_.wandb_url, - github_repo_url="https://github.com/opendilab/DI-engine", - github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/a2c.html", - github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", - installation_guide="pip3 install DI-engine[common_env]", - usage_file_by_git_clone="./dizoo/common/a2c/lunarlander_a2c_deploy.py", - usage_file_by_huggingface_ding="./dizoo/common/a2c/lunarlander_a2c_download.py", - train_file="./dizoo/common/a2c/lunarlander_a2c.py", - repo_id="OpenDILabCommunity/Lunarlander-v2-A2C" -) diff --git a/dizoo/common/a2c/lunarlander_a2c_deploy.py b/dizoo/common/a2c/lunarlander_a2c_deploy.py deleted file mode 100644 index e2dda00828..0000000000 --- a/dizoo/common/a2c/lunarlander_a2c_deploy.py +++ /dev/null @@ -1,16 +0,0 @@ -from ding.bonus import A2CAgent -from ding.config import Config -from easydict import EasyDict -import torch - -# Pull model from files which are git cloned from huggingface -policy_state_dict = torch.load("pytorch_model.bin", map_location=torch.device("cpu")) -cfg = EasyDict(Config.file_to_dict("policy_config.py")) -# Instantiate the agent -agent = A2CAgent( - env="lunarlander_discrete", exp_name="Lunarlander-v2-A2C", cfg=cfg.exp_config, policy_state_dict=policy_state_dict -) -# Continue training -agent.train(step=5000) -# Render the new agent performance -agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/a2c/lunarlander_a2c_download.py b/dizoo/common/a2c/lunarlander_a2c_download.py deleted file mode 100644 index 7e70a6f3e1..0000000000 --- a/dizoo/common/a2c/lunarlander_a2c_download.py +++ /dev/null @@ -1,13 +0,0 @@ -from ding.bonus import A2CAgent -from huggingface_ding import pull_model_from_hub - -# 
Pull model from Hugggingface hub -policy_state_dict, cfg = pull_model_from_hub(repo_id="OpenDILabCommunity/Lunarlander-v2-A2C") -# Instantiate the agent -agent = A2CAgent( - env="lunarlander_discrete", exp_name="Lunarlander-v2-A2C", cfg=cfg.exp_config, policy_state_dict=policy_state_dict -) -# Continue training -agent.train(step=5000) -# Render the new agent performance -agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/ddpg/hopper_ddpg.py b/dizoo/common/ddpg/hopper_ddpg.py deleted file mode 100644 index d99ef146f4..0000000000 --- a/dizoo/common/ddpg/hopper_ddpg.py +++ /dev/null @@ -1,45 +0,0 @@ -from ding.bonus import DDPGAgent -from huggingface_ding import push_model_to_hub - -# Instantiate the agent -agent = DDPGAgent(env="hopper", exp_name="Hopper-v3-DDPG") -# Train the agent -return_ = agent.train(step=int(10000000), collector_env_num=4, evaluator_env_num=4, debug=False) -# Push model to huggingface hub -push_model_to_hub( - agent=agent.best, - env_name="OpenAI/Gym/MuJoCo", - task_name="Hopper-v3", - algo_name="DDPG", - wandb_url=return_.wandb_url, - github_repo_url="https://github.com/opendilab/DI-engine", - github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/ddpg.html", - github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/mujoco.html", - installation_guide=''' -sudo apt update -y \ - && sudo apt install -y \ - build-essential \ - libgl1-mesa-dev \ - libgl1-mesa-glx \ - libglew-dev \ - libosmesa6-dev \ - libglfw3 \ - libglfw3-dev \ - libsdl2-dev \ - libsdl2-image-dev \ - libglm-dev \ - libfreetype6-dev \ - patchelf - -mkdir -p ~/.mujoco -wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz -O mujoco.tar.gz -tar -xf mujoco.tar.gz -C ~/.mujoco -echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin" >> ~/.bashrc -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin -pip3 install DI-engine[common_env] -''', - usage_file_by_git_clone="./dizoo/common/ddpg/hopper_ddpg_deploy.py", - usage_file_by_huggingface_ding="./dizoo/common/ddpg/hopper_ddpg_download.py", - train_file="./dizoo/common/ddpg/hopper_ddpg.py", - repo_id="OpenDILabCommunity/Hopper-v3-DDPG" -) diff --git a/dizoo/common/ddpg/hopper_ddpg_deploy.py b/dizoo/common/ddpg/hopper_ddpg_deploy.py deleted file mode 100644 index ee0e46ded0..0000000000 --- a/dizoo/common/ddpg/hopper_ddpg_deploy.py +++ /dev/null @@ -1,14 +0,0 @@ -from ding.bonus import DDPGAgent -from ding.config import Config -from easydict import EasyDict -import torch - -# Pull model from files which are git cloned from huggingface -policy_state_dict = torch.load("pytorch_model.bin", map_location=torch.device("cpu")) -cfg = EasyDict(Config.file_to_dict("policy_config.py")) -# Instantiate the agent -agent = DDPGAgent(env="hopper", exp_name="Hopper-v3-DDPG", cfg=cfg.exp_config, policy_state_dict=policy_state_dict) -# Continue training -agent.train(step=5000) -# Render the new agent performance -agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/ddpg/hopper_ddpg_download.py b/dizoo/common/ddpg/hopper_ddpg_download.py deleted file mode 100644 index ec474dd980..0000000000 --- a/dizoo/common/ddpg/hopper_ddpg_download.py +++ /dev/null @@ -1,11 +0,0 @@ -from ding.bonus import DDPGAgent -from huggingface_ding import pull_model_from_hub - -# Pull model from Hugggingface hub -policy_state_dict, cfg = pull_model_from_hub(repo_id="OpenDILabCommunity/Hopper-v3-DDPG") -# Instantiate the agent -agent = DDPGAgent(env="hopper", 
exp_name="Hopper-v3-DDPG", cfg=cfg.exp_config, policy_state_dict=policy_state_dict) -# Continue training -agent.train(step=5000) -# Render the new agent performance -agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/ddpg/lunarlander_ddpg.py b/dizoo/common/ddpg/lunarlander_ddpg.py deleted file mode 100644 index edf07f9649..0000000000 --- a/dizoo/common/ddpg/lunarlander_ddpg.py +++ /dev/null @@ -1,23 +0,0 @@ -from ding.bonus import DDPGAgent -from huggingface_ding import push_model_to_hub - -# Instantiate the agent -agent = DDPGAgent("lunarlander_continuous", exp_name="LunarLander-v2-DDPG") -# Train the agent -return_ = agent.train(step=int(4000000), collector_env_num=4, evaluator_env_num=4) -# Push model to huggingface hub -push_model_to_hub( - agent=agent.best, - env_name="OpenAI/Gym/Box2d", - task_name="LunarLander-v2", - algo_name="DDPG", - wandb_url=return_.wandb_url, - github_repo_url="https://github.com/opendilab/DI-engine", - github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/ddpg.html", - github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", - installation_guide="pip3 install DI-engine[common_env]", - usage_file_by_git_clone="./dizoo/common/ddpg/lunarlander_ddpg_deploy.py", - usage_file_by_huggingface_ding="./dizoo/common/ddpg/lunarlander_ddpg_download.py", - train_file="./dizoo/common/ddpg/lunarlander_ddpg.py", - repo_id="OpenDILabCommunity/LunarLander-v2-DDPG" -) diff --git a/dizoo/common/ddpg/lunarlander_ddpg_deploy.py b/dizoo/common/ddpg/lunarlander_ddpg_deploy.py deleted file mode 100644 index ceeedaaee4..0000000000 --- a/dizoo/common/ddpg/lunarlander_ddpg_deploy.py +++ /dev/null @@ -1,19 +0,0 @@ -from ding.bonus import DDPGAgent -from ding.config import Config -from easydict import EasyDict -import torch - -# Pull model from files which are git cloned from huggingface -policy_state_dict = torch.load("pytorch_model.bin", map_location=torch.device("cpu")) -cfg = EasyDict(Config.file_to_dict("policy_config.py")) -# Instantiate the agent -agent = DDPGAgent( - env="lunarlander_continuous", - exp_name="LunarLander-v2-DDPG", - cfg=cfg.exp_config, - policy_state_dict=policy_state_dict -) -# Continue training -agent.train(step=5000) -# Render the new agent performance -agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/ddpg/lunarlander_ddpg_download.py b/dizoo/common/ddpg/lunarlander_ddpg_download.py deleted file mode 100644 index 907d8e9f1f..0000000000 --- a/dizoo/common/ddpg/lunarlander_ddpg_download.py +++ /dev/null @@ -1,16 +0,0 @@ -from ding.bonus import DDPGAgent -from huggingface_ding import pull_model_from_hub - -# Pull model from Hugggingface hub -policy_state_dict, cfg = pull_model_from_hub(repo_id="OpenDILabCommunity/LunarLander-v2-DDPG") -# Instantiate the agent -agent = DDPGAgent( - env="lunarlander_continuous", - exp_name="LunarLander-v2-DDPG", - cfg=cfg.exp_config, - policy_state_dict=policy_state_dict -) -# Continue training -agent.train(step=5000) -# Render the new agent performance -agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/dqn/lunarlander_dqn.py b/dizoo/common/dqn/lunarlander_dqn.py deleted file mode 100644 index 11139c6abc..0000000000 --- a/dizoo/common/dqn/lunarlander_dqn.py +++ /dev/null @@ -1,23 +0,0 @@ -from ding.bonus import DQNAgent -from huggingface_ding import push_model_to_hub - -# Instantiate the agent -agent = DQNAgent(env="lunarlander_discrete", exp_name="Lunarlander-v2-DQN") -# Train the agent -return_ = agent.train(step=int(4000000), 
collector_env_num=8, evaluator_env_num=8, debug=False) -# Push model to huggingface hub -push_model_to_hub( - agent=agent.best, - env_name="OpenAI/Gym/Box2d", - task_name="LunarLander-v2", - algo_name="DQN", - wandb_url=return_.wandb_url, - github_repo_url="https://github.com/opendilab/DI-engine", - github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/dqn.html", - github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", - installation_guide="pip3 install DI-engine[common_env]", - usage_file_by_git_clone="./dizoo/common/dqn/lunarlander_dqn_deploy.py", - usage_file_by_huggingface_ding="./dizoo/common/dqn/lunarlander_dqn_download.py", - train_file="./dizoo/common/dqn/lunarlander_dqn.py", - repo_id="OpenDILabCommunity/Lunarlander-v2-DQN" -) diff --git a/dizoo/common/dqn/lunarlander_dqn_deploy.py b/dizoo/common/dqn/lunarlander_dqn_deploy.py deleted file mode 100644 index 0947dd68f5..0000000000 --- a/dizoo/common/dqn/lunarlander_dqn_deploy.py +++ /dev/null @@ -1,16 +0,0 @@ -from ding.bonus import DQNAgent -from ding.config import Config -from easydict import EasyDict -import torch - -# Pull model from files which are git cloned from huggingface -policy_state_dict = torch.load("pytorch_model.bin", map_location=torch.device("cpu")) -cfg = EasyDict(Config.file_to_dict("policy_config.py")) -# Instantiate the agent -agent = DQNAgent( - env="lunarlander_discrete", exp_name="Lunarlander-v2-DQN", cfg=cfg.exp_config, policy_state_dict=policy_state_dict -) -# Continue training -agent.train(step=5000) -# Render the new agent performance -agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/dqn/lunarlander_dqn_download.py b/dizoo/common/dqn/lunarlander_dqn_download.py deleted file mode 100644 index 380b1e22d4..0000000000 --- a/dizoo/common/dqn/lunarlander_dqn_download.py +++ /dev/null @@ -1,13 +0,0 @@ -from ding.bonus import DQNAgent -from huggingface_ding import pull_model_from_hub - -# Pull model from Hugggingface hub -policy_state_dict, cfg = pull_model_from_hub(repo_id="OpenDILabCommunity/Lunarlander-v2-DQN") -# Instantiate the agent -agent = DQNAgent( - env="lunarlander_discrete", exp_name="Lunarlander-v2-DQN", cfg=cfg.exp_config, policy_state_dict=policy_state_dict -) -# Continue training -agent.train(step=5000) -# Render the new agent performance -agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/ppo/hopper_ppo.py b/dizoo/common/ppo/hopper_ppo.py deleted file mode 100644 index 16da7ad484..0000000000 --- a/dizoo/common/ppo/hopper_ppo.py +++ /dev/null @@ -1,45 +0,0 @@ -from ding.bonus import PPOF -from huggingface_ding import push_model_to_hub - -# Instantiate the agent -agent = PPOF(env="hopper", exp_name="Hopper-v3-PPO") -# Train the agent -return_ = agent.train(step=int(10000000), collector_env_num=4, evaluator_env_num=4, debug=False) -# Push model to huggingface hub -push_model_to_hub( - agent=agent.best, - env_name="OpenAI/Gym/MuJoCo", - task_name="Hopper-v3", - algo_name="PPO", - wandb_url=return_.wandb_url, - github_repo_url="https://github.com/opendilab/DI-engine", - github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/ppo.html", - github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/mujoco.html", - installation_guide=''' -sudo apt update -y \ - && sudo apt install -y \ - build-essential \ - libgl1-mesa-dev \ - libgl1-mesa-glx \ - libglew-dev \ - libosmesa6-dev \ - libglfw3 \ - libglfw3-dev \ - libsdl2-dev \ - libsdl2-image-dev \ - libglm-dev \ - 
libfreetype6-dev \ - patchelf - -mkdir -p ~/.mujoco -wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz -O mujoco.tar.gz -tar -xf mujoco.tar.gz -C ~/.mujoco -echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin" >> ~/.bashrc -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin -pip3 install DI-engine[common_env] -''', - usage_file_by_git_clone="./dizoo/common/ppo/hopper_ppo_deploy.py", - usage_file_by_huggingface_ding="./dizoo/common/ppo/hopper_ppo_download.py", - train_file="./dizoo/common/ppo/hopper_ppo.py", - repo_id="OpenDILabCommunity/Hopper-v3-PPO" -) diff --git a/dizoo/common/ppo/hopper_ppo_deploy.py b/dizoo/common/ppo/hopper_ppo_deploy.py deleted file mode 100644 index 670f644e86..0000000000 --- a/dizoo/common/ppo/hopper_ppo_deploy.py +++ /dev/null @@ -1,14 +0,0 @@ -from ding.bonus import PPOF -from ding.config import Config -from easydict import EasyDict -import torch - -# Pull model from files which are git cloned from huggingface -policy_state_dict = torch.load("pytorch_model.bin", map_location=torch.device("cpu")) -cfg = EasyDict(Config.file_to_dict("policy_config.py")) -# Instantiate the agent -agent = PPOF(env="hopper", exp_name="Hopper-v3-PPO", cfg=cfg.exp_config, policy_state_dict=policy_state_dict) -# Continue training -agent.train(step=5000) -# Render the new agent performance -agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/ppo/hopper_ppo_download.py b/dizoo/common/ppo/hopper_ppo_download.py deleted file mode 100644 index 824fc49428..0000000000 --- a/dizoo/common/ppo/hopper_ppo_download.py +++ /dev/null @@ -1,11 +0,0 @@ -from ding.bonus import PPOF -from huggingface_ding import pull_model_from_hub - -# Pull model from Hugggingface hub -policy_state_dict, cfg = pull_model_from_hub(repo_id="OpenDILabCommunity/Hopper-v3-PPO") -# Instantiate the agent -agent = PPOF(env="hopper", exp_name="Hopper-v3-PPO", cfg=cfg.exp_config, policy_state_dict=policy_state_dict) -# Continue training -agent.train(step=5000) -# Render the new agent performance -agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/ppo/lunarlander_ppo.py b/dizoo/common/ppo/lunarlander_ppo.py deleted file mode 100644 index 56c81b5e65..0000000000 --- a/dizoo/common/ppo/lunarlander_ppo.py +++ /dev/null @@ -1,23 +0,0 @@ -from ding.bonus import PPOF -from huggingface_ding import push_model_to_hub - -# Instantiate the agent -agent = PPOF("lunarlander_discrete", exp_name="LunarLander-v2-PPO") -# Train the agent -return_ = agent.train(step=int(4000000), collector_env_num=4, evaluator_env_num=4) -# Push model to huggingface hub -push_model_to_hub( - agent=agent.best, - env_name="OpenAI/Gym/Box2d", - task_name="LunarLander-v2", - algo_name="PPO", - wandb_url=return_.wandb_url, - github_repo_url="https://github.com/opendilab/DI-engine", - github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/ppo.html", - github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", - installation_guide="pip3 install DI-engine[common_env]", - usage_file_by_git_clone="./dizoo/common/ppo/lunarlander_ppo_deploy.py", - usage_file_by_huggingface_ding="./dizoo/common/ppo/lunarlander_ppo_download.py", - train_file="./dizoo/common/ppo/lunarlander_ppo.py", - repo_id="OpenDILabCommunity/LunarLander-v2-PPO" -) diff --git a/dizoo/common/ppo/lunarlander_ppo_deploy.py b/dizoo/common/ppo/lunarlander_ppo_deploy.py deleted file mode 100644 index 87b266995d..0000000000 --- 
a/dizoo/common/ppo/lunarlander_ppo_deploy.py +++ /dev/null @@ -1,16 +0,0 @@ -from ding.bonus import PPOF -from ding.config import Config -from easydict import EasyDict -import torch - -# Pull model from files which are git cloned from huggingface -policy_state_dict = torch.load("pytorch_model.bin", map_location=torch.device("cpu")) -cfg = EasyDict(Config.file_to_dict("policy_config.py")) -# Instantiate the agent -agent = PPOF( - env="lunarlander_discrete", exp_name="lunarlander-ppo", cfg=cfg.exp_config, policy_state_dict=policy_state_dict -) -# Continue training -agent.train(step=5000) -# Render the new agent performance -agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/ppo/lunarlander_ppo_download.py b/dizoo/common/ppo/lunarlander_ppo_download.py deleted file mode 100644 index fcf0ec3a03..0000000000 --- a/dizoo/common/ppo/lunarlander_ppo_download.py +++ /dev/null @@ -1,13 +0,0 @@ -from ding.bonus import PPOF -from huggingface_ding import pull_model_from_hub - -# Pull model from Hugggingface hub -policy_state_dict, cfg = pull_model_from_hub(repo_id="OpenDILabCommunity/LunarLander-v2-PPO") -# Instantiate the agent -agent = PPOF( - env="lunarlander_discrete", exp_name="LunarLander-v2-PPO", cfg=cfg.exp_config, policy_state_dict=policy_state_dict -) -# Continue training -agent.train(step=5000) -# Render the new agent performance -agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/sac/hopper_sac.py b/dizoo/common/sac/hopper_sac.py deleted file mode 100644 index af1379b57d..0000000000 --- a/dizoo/common/sac/hopper_sac.py +++ /dev/null @@ -1,45 +0,0 @@ -from ding.bonus.sac import SACAgent -from huggingface_ding import push_model_to_hub - -# Instantiate the agent -agent = SACAgent(env="hopper", exp_name="Hopper-v3-SAC") -# Train the agent -return_ = agent.train(step=int(10000000), collector_env_num=4, evaluator_env_num=4, debug=False) -# Push model to huggingface hub -push_model_to_hub( - agent=agent.best, - env_name="OpenAI/Gym/MuJoCo", - task_name="Hopper-v3", - algo_name="SAC", - wandb_url=return_.wandb_url, - github_repo_url="https://github.com/opendilab/DI-engine", - github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/sac.html", - github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/mujoco.html", - installation_guide=''' -sudo apt update -y \ - && sudo apt install -y \ - build-essential \ - libgl1-mesa-dev \ - libgl1-mesa-glx \ - libglew-dev \ - libosmesa6-dev \ - libglfw3 \ - libglfw3-dev \ - libsdl2-dev \ - libsdl2-image-dev \ - libglm-dev \ - libfreetype6-dev \ - patchelf - -mkdir -p ~/.mujoco -wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz -O mujoco.tar.gz -tar -xf mujoco.tar.gz -C ~/.mujoco -echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin" >> ~/.bashrc -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin -pip3 install DI-engine[common_env] -''', - usage_file_by_git_clone="./dizoo/common/sac/hopper_sac_deploy.py", - usage_file_by_huggingface_ding="./dizoo/common/sac/hopper_sac_download.py", - train_file="./dizoo/common/sac/hopper_sac.py", - repo_id="OpenDILabCommunity/Hopper-v3-SAC" -) diff --git a/dizoo/common/sac/hopper_sac_deploy.py b/dizoo/common/sac/hopper_sac_deploy.py deleted file mode 100644 index d3cdaf4fd0..0000000000 --- a/dizoo/common/sac/hopper_sac_deploy.py +++ /dev/null @@ -1,14 +0,0 @@ -from ding.bonus import SACAgent -from ding.config import Config -from easydict import EasyDict -import torch - -# 
Pull model from files which are git cloned from huggingface -policy_state_dict = torch.load("pytorch_model.bin", map_location=torch.device("cpu")) -cfg = EasyDict(Config.file_to_dict("policy_config.py")) -# Instantiate the agent -agent = SACAgent(env="hopper", exp_name="Hopper-v3-SAC", cfg=cfg.exp_config, policy_state_dict=policy_state_dict) -# Continue training -agent.train(step=5000) -# Render the new agent performance -agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/sac/hopper_sac_download.py b/dizoo/common/sac/hopper_sac_download.py deleted file mode 100644 index a6ac910b6d..0000000000 --- a/dizoo/common/sac/hopper_sac_download.py +++ /dev/null @@ -1,11 +0,0 @@ -from ding.bonus import SACAgent -from huggingface_ding import pull_model_from_hub - -# Pull model from Hugggingface hub -policy_state_dict, cfg = pull_model_from_hub(repo_id="OpenDILabCommunity/Hopper-v3-SAC") -# Instantiate the agent -agent = SACAgent(env="hopper", exp_name="Hopper-v3-SAC", cfg=cfg.exp_config, policy_state_dict=policy_state_dict) -# Continue training -agent.train(step=5000) -# Render the new agent performance -agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/sac/lunarlander_sac.py b/dizoo/common/sac/lunarlander_sac.py deleted file mode 100644 index f3aa3a6809..0000000000 --- a/dizoo/common/sac/lunarlander_sac.py +++ /dev/null @@ -1,23 +0,0 @@ -from ding.bonus import SACAgent -from huggingface_ding import push_model_to_hub - -# Instantiate the agent -agent = SACAgent("lunarlander_continuous", exp_name="LunarLander-v2-SAC") -# Train the agent -return_ = agent.train(step=int(4000000), collector_env_num=8, evaluator_env_num=8) -# Push model to huggingface hub -push_model_to_hub( - agent=agent.best, - env_name="OpenAI/Gym/Box2d", - task_name="LunarLander-v2", - algo_name="SAC", - wandb_url=return_.wandb_url, - github_repo_url="https://github.com/opendilab/DI-engine", - github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/sac.html", - github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", - installation_guide="pip3 install DI-engine[common_env]", - usage_file_by_git_clone="./dizoo/common/sac/lunarlander_sac_deploy.py", - usage_file_by_huggingface_ding="./dizoo/common/sac/lunarlander_sac_download.py", - train_file="./dizoo/common/sac/lunarlander_sac.py", - repo_id="OpenDILabCommunity/LunarLander-v2-SAC" -) diff --git a/dizoo/common/sac/lunarlander_sac_deploy.py b/dizoo/common/sac/lunarlander_sac_deploy.py deleted file mode 100644 index c9ec71331f..0000000000 --- a/dizoo/common/sac/lunarlander_sac_deploy.py +++ /dev/null @@ -1,16 +0,0 @@ -from ding.bonus import SACAgent -from ding.config import Config -from easydict import EasyDict -import torch - -# Pull model from files which are git cloned from huggingface -policy_state_dict = torch.load("pytorch_model.bin", map_location=torch.device("cpu")) -cfg = EasyDict(Config.file_to_dict("policy_config.py")) -# Instantiate the agent -agent = SACAgent( - env="lunarlander_continuous", exp_name="lunarlander-sac", cfg=cfg.exp_config, policy_state_dict=policy_state_dict -) -# Continue training -agent.train(step=5000) -# Render the new agent performance -agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/sac/lunarlander_sac_download.py b/dizoo/common/sac/lunarlander_sac_download.py deleted file mode 100644 index a9d3cbd000..0000000000 --- a/dizoo/common/sac/lunarlander_sac_download.py +++ /dev/null @@ -1,16 +0,0 @@ -from ding.bonus import SACAgent -from huggingface_ding 
import pull_model_from_hub - -# Pull model from Hugggingface hub -policy_state_dict, cfg = pull_model_from_hub(repo_id="OpenDILabCommunity/LunarLander-v2-SAC") -# Instantiate the agent -agent = SACAgent( - env="lunarlander_continuous", - exp_name="LunarLander-v2-SAC", - cfg=cfg.exp_config, - policy_state_dict=policy_state_dict -) -# Continue training -agent.train(step=5000) -# Render the new agent performance -agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/td3/hopper_td3.py b/dizoo/common/td3/hopper_td3.py deleted file mode 100644 index 552f79d125..0000000000 --- a/dizoo/common/td3/hopper_td3.py +++ /dev/null @@ -1,45 +0,0 @@ -from ding.bonus import TD3Agent -from huggingface_ding import push_model_to_hub - -# Instantiate the agent -agent = TD3Agent(env="hopper", exp_name="Hopper-v3-TD3") -# Train the agent -return_ = agent.train(step=int(10000000), collector_env_num=4, evaluator_env_num=4, debug=False) -# Push model to huggingface hub -push_model_to_hub( - agent=agent.best, - env_name="OpenAI/Gym/MuJoCo", - task_name="Hopper-v3", - algo_name="TD3", - wandb_url=return_.wandb_url, - github_repo_url="https://github.com/opendilab/DI-engine", - github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/td3.html", - github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/mujoco.html", - installation_guide=''' -sudo apt update -y \ - && sudo apt install -y \ - build-essential \ - libgl1-mesa-dev \ - libgl1-mesa-glx \ - libglew-dev \ - libosmesa6-dev \ - libglfw3 \ - libglfw3-dev \ - libsdl2-dev \ - libsdl2-image-dev \ - libglm-dev \ - libfreetype6-dev \ - patchelf - -mkdir -p ~/.mujoco -wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz -O mujoco.tar.gz -tar -xf mujoco.tar.gz -C ~/.mujoco -echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin" >> ~/.bashrc -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin -pip3 install DI-engine[common_env] -''', - usage_file_by_git_clone="./dizoo/common/td3/hopper_td3_deploy.py", - usage_file_by_huggingface_ding="./dizoo/common/td3/hopper_td3_download.py", - train_file="./dizoo/common/td3/hopper_td3.py", - repo_id="OpenDILabCommunity/Hopper-v3-TD3" -) diff --git a/dizoo/common/td3/hopper_td3_deploy.py b/dizoo/common/td3/hopper_td3_deploy.py deleted file mode 100644 index 4ce16d6870..0000000000 --- a/dizoo/common/td3/hopper_td3_deploy.py +++ /dev/null @@ -1,14 +0,0 @@ -from ding.bonus import SACAgent -from ding.config import Config -from easydict import EasyDict -import torch - -# Pull model from files which are git cloned from huggingface -policy_state_dict = torch.load("pytorch_model.bin", map_location=torch.device("cpu")) -cfg = EasyDict(Config.file_to_dict("policy_config.py")) -# Instantiate the agent -agent = SACAgent(env="hopper", exp_name="Hopper-v3-TD3", cfg=cfg.exp_config, policy_state_dict=policy_state_dict) -# Continue training -agent.train(step=5000) -# Render the new agent performance -agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/td3/hopper_td3_download.py b/dizoo/common/td3/hopper_td3_download.py deleted file mode 100644 index 3a0f3d90f8..0000000000 --- a/dizoo/common/td3/hopper_td3_download.py +++ /dev/null @@ -1,11 +0,0 @@ -from ding.bonus import TD3Agent -from huggingface_ding import pull_model_from_hub - -# Pull model from Hugggingface hub -policy_state_dict, cfg = pull_model_from_hub(repo_id="OpenDILabCommunity/Hopper-v3-TD3") -# Instantiate the agent -agent = 
TD3Agent(env="hopper", exp_name="Hopper-v3-TD3", cfg=cfg.exp_config, policy_state_dict=policy_state_dict) -# Continue training -agent.train(step=5000) -# Render the new agent performance -agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/td3/lunarlander_td3.py b/dizoo/common/td3/lunarlander_td3.py deleted file mode 100644 index c9cdd0b0b4..0000000000 --- a/dizoo/common/td3/lunarlander_td3.py +++ /dev/null @@ -1,23 +0,0 @@ -from ding.bonus import TD3Agent -from huggingface_ding import push_model_to_hub - -# Instantiate the agent -agent = TD3Agent("lunarlander_continuous", exp_name="LunarLander-v2-TD3") -# Train the agent -return_ = agent.train(step=int(4000000), collector_env_num=4, evaluator_env_num=4) -# Push model to huggingface hub -push_model_to_hub( - agent=agent.best, - env_name="OpenAI/Gym/Box2d", - task_name="LunarLander-v2", - algo_name="TD3", - wandb_url=return_.wandb_url, - github_repo_url="https://github.com/opendilab/DI-engine", - github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/td3.html", - github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", - installation_guide="pip3 install DI-engine[common_env]", - usage_file_by_git_clone="./dizoo/common/td3/lunarlander_td3_deploy.py", - usage_file_by_huggingface_ding="./dizoo/common/td3/lunarlander_td3_download.py", - train_file="./dizoo/common/td3/lunarlander_td3.py", - repo_id="OpenDILabCommunity/LunarLander-v2-TD3" -) diff --git a/dizoo/common/td3/lunarlander_td3_deploy.py b/dizoo/common/td3/lunarlander_td3_deploy.py deleted file mode 100644 index 043f4b660b..0000000000 --- a/dizoo/common/td3/lunarlander_td3_deploy.py +++ /dev/null @@ -1,19 +0,0 @@ -from ding.bonus import TD3Agent -from ding.config import Config -from easydict import EasyDict -import torch - -# Pull model from files which are git cloned from huggingface -policy_state_dict = torch.load("pytorch_model.bin", map_location=torch.device("cpu")) -cfg = EasyDict(Config.file_to_dict("policy_config.py")) -# Instantiate the agent -agent = TD3Agent( - env="lunarlander_continuous", - exp_name="LunarLander-v2-TD3", - cfg=cfg.exp_config, - policy_state_dict=policy_state_dict -) -# Continue training -agent.train(step=5000) -# Render the new agent performance -agent.deploy(enable_save_replay=True) diff --git a/dizoo/common/td3/lunarlander_td3_download.py b/dizoo/common/td3/lunarlander_td3_download.py deleted file mode 100644 index 9809d4ddd4..0000000000 --- a/dizoo/common/td3/lunarlander_td3_download.py +++ /dev/null @@ -1,16 +0,0 @@ -from ding.bonus import TD3Agent -from huggingface_ding import pull_model_from_hub - -# Pull model from Hugggingface hub -policy_state_dict, cfg = pull_model_from_hub(repo_id="OpenDILabCommunity/LunarLander-v2-TD3") -# Instantiate the agent -agent = TD3Agent( - env="lunarlander_continuous", - exp_name="LunarLander-v2-TD3", - cfg=cfg.exp_config, - policy_state_dict=policy_state_dict -) -# Continue training -agent.train(step=5000) -# Render the new agent performance -agent.deploy(enable_save_replay=True) diff --git a/dizoo/d4rl/config/halfcheetah_expert_td3bc_config.py b/dizoo/d4rl/config/halfcheetah_expert_td3bc_config.py index 77b24abcf1..e798cf66e3 100644 --- a/dizoo/d4rl/config/halfcheetah_expert_td3bc_config.py +++ b/dizoo/d4rl/config/halfcheetah_expert_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='walker2d-expert-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git 
a/dizoo/d4rl/config/halfcheetah_medium_expert_td3bc_config.py b/dizoo/d4rl/config/halfcheetah_medium_expert_td3bc_config.py index d93ba6f445..8d25289131 100644 --- a/dizoo/d4rl/config/halfcheetah_medium_expert_td3bc_config.py +++ b/dizoo/d4rl/config/halfcheetah_medium_expert_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='halfcheetah-medium-expert-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/halfcheetah_medium_replay_td3bc_config.py b/dizoo/d4rl/config/halfcheetah_medium_replay_td3bc_config.py index cbc35370f4..3561f320fb 100644 --- a/dizoo/d4rl/config/halfcheetah_medium_replay_td3bc_config.py +++ b/dizoo/d4rl/config/halfcheetah_medium_replay_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='halfcheetah-medium-replay-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/halfcheetah_medium_td3bc_config.py b/dizoo/d4rl/config/halfcheetah_medium_td3bc_config.py index 38f78689ea..ef6e2d3f40 100644 --- a/dizoo/d4rl/config/halfcheetah_medium_td3bc_config.py +++ b/dizoo/d4rl/config/halfcheetah_medium_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='halfcheetah-medium-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/halfcheetah_random_td3bc_config.py b/dizoo/d4rl/config/halfcheetah_random_td3bc_config.py index dec3f2edc1..dbe94d1a24 100644 --- a/dizoo/d4rl/config/halfcheetah_random_td3bc_config.py +++ b/dizoo/d4rl/config/halfcheetah_random_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='halfcheetah-random-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/hopper_expert_td3bc_config.py b/dizoo/d4rl/config/hopper_expert_td3bc_config.py index 776366ba0c..b0874a0018 100644 --- a/dizoo/d4rl/config/hopper_expert_td3bc_config.py +++ b/dizoo/d4rl/config/hopper_expert_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='hopper-expert-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/hopper_medium_expert_bc_config.py b/dizoo/d4rl/config/hopper_medium_expert_bc_config.py index 348361dd2d..e04bd28069 100644 --- a/dizoo/d4rl/config/hopper_medium_expert_bc_config.py +++ b/dizoo/d4rl/config/hopper_medium_expert_bc_config.py @@ -8,7 +8,7 @@ env=dict( env_id='hopper-medium-expert-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), evaluator_env_num=8, @@ -38,7 +38,7 @@ data_type='d4rl', data_path=None, ), - eval=dict(evaluator=dict(eval_freq=-1, )), + eval=dict(evaluator=dict(eval_freq=-1,)), ), ) main_config = EasyDict(main_config) @@ -48,7 +48,7 @@ type='d4rl', import_names=['dizoo.d4rl.envs.d4rl_env'], ), - env_manager=dict(type='base', ), + env_manager=dict(type='base',), policy=dict( type='bc', import_names=['ding.policy.bc'], diff --git a/dizoo/d4rl/config/hopper_medium_expert_ibc_ar_config.py b/dizoo/d4rl/config/hopper_medium_expert_ibc_ar_config.py index 5d1090dc77..061b8b53a6 100644 --- a/dizoo/d4rl/config/hopper_medium_expert_ibc_ar_config.py +++ b/dizoo/d4rl/config/hopper_medium_expert_ibc_ar_config.py @@ -8,7 +8,7 @@ env=dict( env_id='hopper-medium-expert-v0', norm_obs=dict( - use_norm=True, + use_norm=True, 
offline_stats=dict(use_offline_stats=True, ), ), evaluator_env_num=8, @@ -18,19 +18,23 @@ ), policy=dict( cuda=cuda, - model=dict(obs_shape=11, action_shape=3, stochastic_optim=dict(type='ardfo', )), + model=dict( + obs_shape=11, + action_shape=3, + stochastic_optim=dict(type='ardfo',) + ), learn=dict( multi_gpu=multi_gpu, train_epoch=15, batch_size=256, - optim=dict(learning_rate=1e-5, ), + optim=dict(learning_rate=1e-5,), learner=dict(hook=dict(log_show_after_iter=1000)), ), collect=dict( data_type='d4rl', data_path=None, ), - eval=dict(evaluator=dict(eval_freq=-1, )), + eval=dict(evaluator=dict(eval_freq=-1,)), ), ) main_config = EasyDict(main_config) @@ -40,7 +44,7 @@ type='d4rl', import_names=['dizoo.d4rl.envs.d4rl_env'], ), - env_manager=dict(type='base', ), + env_manager=dict(type='base',), policy=dict( type='ibc', import_names=['ding.policy.ibc'], diff --git a/dizoo/d4rl/config/hopper_medium_expert_ibc_config.py b/dizoo/d4rl/config/hopper_medium_expert_ibc_config.py index 0f040970e6..e7a72984b6 100644 --- a/dizoo/d4rl/config/hopper_medium_expert_ibc_config.py +++ b/dizoo/d4rl/config/hopper_medium_expert_ibc_config.py @@ -8,7 +8,7 @@ env=dict( env_id='hopper-medium-expert-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), evaluator_env_num=8, @@ -18,19 +18,23 @@ ), policy=dict( cuda=cuda, - model=dict(obs_shape=11, action_shape=3, stochastic_optim=dict(type='dfo', )), + model=dict( + obs_shape=11, + action_shape=3, + stochastic_optim=dict(type='dfo',) + ), learn=dict( multi_gpu=multi_gpu, train_epoch=15, batch_size=256, - optim=dict(learning_rate=1e-5, ), + optim=dict(learning_rate=1e-5,), learner=dict(hook=dict(log_show_after_iter=1000)), ), collect=dict( data_type='d4rl', data_path=None, ), - eval=dict(evaluator=dict(eval_freq=-1, )), + eval=dict(evaluator=dict(eval_freq=-1,)), ), ) main_config = EasyDict(main_config) @@ -40,7 +44,7 @@ type='d4rl', import_names=['dizoo.d4rl.envs.d4rl_env'], ), - env_manager=dict(type='base', ), + env_manager=dict(type='base',), policy=dict( type='ibc', import_names=['ding.policy.ibc'], diff --git a/dizoo/d4rl/config/hopper_medium_expert_ibc_mcmc_config.py b/dizoo/d4rl/config/hopper_medium_expert_ibc_mcmc_config.py index 478e0c5d44..e5f6f3dbb1 100644 --- a/dizoo/d4rl/config/hopper_medium_expert_ibc_mcmc_config.py +++ b/dizoo/d4rl/config/hopper_medium_expert_ibc_mcmc_config.py @@ -8,7 +8,7 @@ env=dict( env_id='hopper-medium-expert-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), evaluator_env_num=8, @@ -18,19 +18,23 @@ ), policy=dict( cuda=cuda, - model=dict(obs_shape=11, action_shape=3, stochastic_optim=dict(type='mcmc', )), + model=dict( + obs_shape=11, + action_shape=3, + stochastic_optim=dict(type='mcmc',) + ), learn=dict( multi_gpu=multi_gpu, train_epoch=15, batch_size=256, - optim=dict(learning_rate=1e-5, ), + optim=dict(learning_rate=1e-5,), learner=dict(hook=dict(log_show_after_iter=1000)), ), collect=dict( data_type='d4rl', data_path=None, ), - eval=dict(evaluator=dict(eval_freq=-1, )), + eval=dict(evaluator=dict(eval_freq=-1,)), ), ) main_config = EasyDict(main_config) @@ -40,7 +44,7 @@ type='d4rl', import_names=['dizoo.d4rl.envs.d4rl_env'], ), - env_manager=dict(type='base', ), + env_manager=dict(type='base',), policy=dict( type='ibc', import_names=['ding.policy.ibc'], diff --git a/dizoo/d4rl/config/hopper_medium_expert_td3bc_config.py b/dizoo/d4rl/config/hopper_medium_expert_td3bc_config.py index 16212d4518..19531debad 100644 --- 
a/dizoo/d4rl/config/hopper_medium_expert_td3bc_config.py +++ b/dizoo/d4rl/config/hopper_medium_expert_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='hopper-medium-expert-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/hopper_medium_replay_td3bc_config.py b/dizoo/d4rl/config/hopper_medium_replay_td3bc_config.py index 87bc42721f..8f754781db 100644 --- a/dizoo/d4rl/config/hopper_medium_replay_td3bc_config.py +++ b/dizoo/d4rl/config/hopper_medium_replay_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='hopper-medium-replay-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/hopper_medium_td3bc_config.py b/dizoo/d4rl/config/hopper_medium_td3bc_config.py index 15ed2b9073..cbf5fcce19 100644 --- a/dizoo/d4rl/config/hopper_medium_td3bc_config.py +++ b/dizoo/d4rl/config/hopper_medium_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='hopper-medium-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/hopper_random_td3bc_config.py b/dizoo/d4rl/config/hopper_random_td3bc_config.py index 0f1127f16a..8cf796b5fb 100644 --- a/dizoo/d4rl/config/hopper_random_td3bc_config.py +++ b/dizoo/d4rl/config/hopper_random_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='hopper-random-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/kitchen_complete_bc_config.py b/dizoo/d4rl/config/kitchen_complete_bc_config.py index 413696993d..7160885da3 100644 --- a/dizoo/d4rl/config/kitchen_complete_bc_config.py +++ b/dizoo/d4rl/config/kitchen_complete_bc_config.py @@ -8,7 +8,7 @@ env=dict( env_id='kitchen-complete-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), evaluator_env_num=8, @@ -19,7 +19,7 @@ policy=dict( cuda=cuda, continuous=True, - loss_type='mse_loss', + loss_type='mse_loss', model=dict( obs_shape=60, action_shape=9, @@ -38,7 +38,7 @@ data_type='d4rl', data_path=None, ), - eval=dict(evaluator=dict(eval_freq=1000, )), + eval=dict(evaluator=dict(eval_freq=1000,)), ), ) main_config = EasyDict(main_config) @@ -48,7 +48,7 @@ type='d4rl', import_names=['dizoo.d4rl.envs.d4rl_env'], ), - env_manager=dict(type='base', ), + env_manager=dict(type='base',), policy=dict( type='bc', import_names=['ding.policy.bc'], diff --git a/dizoo/d4rl/config/kitchen_complete_ibc_ar_config.py b/dizoo/d4rl/config/kitchen_complete_ibc_ar_config.py index bbb7198af0..403dc52eff 100644 --- a/dizoo/d4rl/config/kitchen_complete_ibc_ar_config.py +++ b/dizoo/d4rl/config/kitchen_complete_ibc_ar_config.py @@ -8,7 +8,7 @@ env=dict( env_id='kitchen-complete-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), evaluator_env_num=8, @@ -18,19 +18,23 @@ ), policy=dict( cuda=cuda, - model=dict(obs_shape=60, action_shape=9, stochastic_optim=dict(type='ardfo', )), + model=dict( + obs_shape=60, + action_shape=9, + stochastic_optim=dict(type='ardfo',) + ), learn=dict( multi_gpu=multi_gpu, train_epoch=1000, batch_size=256, - optim=dict(learning_rate=1e-5, ), + optim=dict(learning_rate=1e-5,), learner=dict(hook=dict(log_show_after_iter=100)), ), collect=dict( data_type='d4rl', data_path=None, ), - eval=dict(evaluator=dict(eval_freq=1000, )), + 
eval=dict(evaluator=dict(eval_freq=1000,)), ), ) main_config = EasyDict(main_config) @@ -40,7 +44,7 @@ type='d4rl', import_names=['dizoo.d4rl.envs.d4rl_env'], ), - env_manager=dict(type='base', ), + env_manager=dict(type='base',), policy=dict( type='ibc', import_names=['ding.policy.ibc'], diff --git a/dizoo/d4rl/config/kitchen_complete_ibc_config.py b/dizoo/d4rl/config/kitchen_complete_ibc_config.py index 1606cb7792..5c02f04a81 100644 --- a/dizoo/d4rl/config/kitchen_complete_ibc_config.py +++ b/dizoo/d4rl/config/kitchen_complete_ibc_config.py @@ -8,7 +8,7 @@ env=dict( env_id='kitchen-complete-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), evaluator_env_num=8, @@ -18,19 +18,23 @@ ), policy=dict( cuda=cuda, - model=dict(obs_shape=60, action_shape=9, stochastic_optim=dict(type='dfo', )), + model=dict( + obs_shape=60, + action_shape=9, + stochastic_optim=dict(type='dfo',) + ), learn=dict( multi_gpu=multi_gpu, train_epoch=1000, batch_size=256, - optim=dict(learning_rate=1e-5, ), + optim=dict(learning_rate=1e-5,), learner=dict(hook=dict(log_show_after_iter=100)), ), collect=dict( data_type='d4rl', data_path=None, ), - eval=dict(evaluator=dict(eval_freq=1000, )), + eval=dict(evaluator=dict(eval_freq=1000,)), ), ) main_config = EasyDict(main_config) @@ -40,7 +44,7 @@ type='d4rl', import_names=['dizoo.d4rl.envs.d4rl_env'], ), - env_manager=dict(type='base', ), + env_manager=dict(type='base',), policy=dict( type='ibc', import_names=['ding.policy.ibc'], diff --git a/dizoo/d4rl/config/kitchen_complete_ibc_mcmc_config.py b/dizoo/d4rl/config/kitchen_complete_ibc_mcmc_config.py index 14924d5257..d93c5eb737 100644 --- a/dizoo/d4rl/config/kitchen_complete_ibc_mcmc_config.py +++ b/dizoo/d4rl/config/kitchen_complete_ibc_mcmc_config.py @@ -8,7 +8,7 @@ env=dict( env_id='kitchen-complete-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), evaluator_env_num=8, @@ -18,19 +18,23 @@ ), policy=dict( cuda=cuda, - model=dict(obs_shape=60, action_shape=9, stochastic_optim=dict(type='mcmc', )), + model=dict( + obs_shape=60, + action_shape=9, + stochastic_optim=dict(type='mcmc',) + ), learn=dict( multi_gpu=multi_gpu, train_epoch=1000, batch_size=256, - optim=dict(learning_rate=1e-5, ), + optim=dict(learning_rate=1e-5,), learner=dict(hook=dict(log_show_after_iter=100)), ), collect=dict( data_type='d4rl', data_path=None, ), - eval=dict(evaluator=dict(eval_freq=1000, )), + eval=dict(evaluator=dict(eval_freq=1000,)), ), ) main_config = EasyDict(main_config) @@ -40,7 +44,7 @@ type='d4rl', import_names=['dizoo.d4rl.envs.d4rl_env'], ), - env_manager=dict(type='base', ), + env_manager=dict(type='base',), policy=dict( type='ibc', import_names=['ding.policy.ibc'], diff --git a/dizoo/d4rl/config/pen_human_bc_config.py b/dizoo/d4rl/config/pen_human_bc_config.py index 215b706ffc..6779ffd934 100644 --- a/dizoo/d4rl/config/pen_human_bc_config.py +++ b/dizoo/d4rl/config/pen_human_bc_config.py @@ -8,7 +8,7 @@ env=dict( env_id='pen-human-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), evaluator_env_num=8, @@ -38,7 +38,7 @@ data_type='d4rl', data_path=None, ), - eval=dict(evaluator=dict(eval_freq=1000, )), + eval=dict(evaluator=dict(eval_freq=1000,)), ), ) main_config = EasyDict(main_config) @@ -48,7 +48,7 @@ type='d4rl', import_names=['dizoo.d4rl.envs.d4rl_env'], ), - env_manager=dict(type='base', ), + env_manager=dict(type='base',), policy=dict( type='bc', 
import_names=['ding.policy.bc'], diff --git a/dizoo/d4rl/config/pen_human_ibc_ar_config.py b/dizoo/d4rl/config/pen_human_ibc_ar_config.py index 4f59733fd5..b75e3b9f11 100644 --- a/dizoo/d4rl/config/pen_human_ibc_ar_config.py +++ b/dizoo/d4rl/config/pen_human_ibc_ar_config.py @@ -8,7 +8,7 @@ env=dict( env_id='pen-human-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), evaluator_env_num=8, @@ -19,20 +19,24 @@ policy=dict( cuda=cuda, model=dict( - obs_shape=45, action_shape=24, hidden_size=128, hidden_layer_num=4, stochastic_optim=dict(type='ardfo', ) + obs_shape=45, + action_shape=24, + hidden_size=128, + hidden_layer_num=4, + stochastic_optim=dict(type='ardfo',) ), learn=dict( multi_gpu=multi_gpu, train_epoch=1000, batch_size=256, - optim=dict(learning_rate=1e-5, ), + optim=dict(learning_rate=1e-5,), learner=dict(hook=dict(log_show_after_iter=100)), ), collect=dict( data_type='d4rl', data_path=None, ), - eval=dict(evaluator=dict(eval_freq=1000, )), + eval=dict(evaluator=dict(eval_freq=1000,)), ), ) main_config = EasyDict(main_config) @@ -42,7 +46,7 @@ type='d4rl', import_names=['dizoo.d4rl.envs.d4rl_env'], ), - env_manager=dict(type='base', ), + env_manager=dict(type='base',), policy=dict( type='ibc', import_names=['ding.policy.ibc'], diff --git a/dizoo/d4rl/config/pen_human_ibc_config.py b/dizoo/d4rl/config/pen_human_ibc_config.py index 9ed4f6d17b..207487d921 100644 --- a/dizoo/d4rl/config/pen_human_ibc_config.py +++ b/dizoo/d4rl/config/pen_human_ibc_config.py @@ -8,7 +8,7 @@ env=dict( env_id='pen-human-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), evaluator_env_num=8, @@ -18,19 +18,23 @@ ), policy=dict( cuda=cuda, - model=dict(obs_shape=45, action_shape=24, stochastic_optim=dict(type='dfo', )), + model=dict( + obs_shape=45, + action_shape=24, + stochastic_optim=dict(type='dfo',) + ), learn=dict( multi_gpu=multi_gpu, train_epoch=1000, batch_size=256, - optim=dict(learning_rate=1e-5, ), + optim=dict(learning_rate=1e-5,), learner=dict(hook=dict(log_show_after_iter=100)), ), collect=dict( data_type='d4rl', data_path=None, ), - eval=dict(evaluator=dict(eval_freq=1000, )), + eval=dict(evaluator=dict(eval_freq=1000,)), ), ) main_config = EasyDict(main_config) @@ -40,7 +44,7 @@ type='d4rl', import_names=['dizoo.d4rl.envs.d4rl_env'], ), - env_manager=dict(type='base', ), + env_manager=dict(type='base',), policy=dict( type='ibc', import_names=['ding.policy.ibc'], diff --git a/dizoo/d4rl/config/pen_human_ibc_mcmc_config.py b/dizoo/d4rl/config/pen_human_ibc_mcmc_config.py index 4dd6b37f90..cee0f631fd 100644 --- a/dizoo/d4rl/config/pen_human_ibc_mcmc_config.py +++ b/dizoo/d4rl/config/pen_human_ibc_mcmc_config.py @@ -8,7 +8,7 @@ env=dict( env_id='pen-human-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), evaluator_env_num=8, @@ -18,19 +18,23 @@ ), policy=dict( cuda=cuda, - model=dict(obs_shape=45, action_shape=24, stochastic_optim=dict(type='mcmc', )), + model=dict( + obs_shape=45, + action_shape=24, + stochastic_optim=dict(type='mcmc',) + ), learn=dict( multi_gpu=multi_gpu, train_epoch=1000, batch_size=256, - optim=dict(learning_rate=1e-5, ), + optim=dict(learning_rate=1e-5,), learner=dict(hook=dict(log_show_after_iter=100)), ), collect=dict( data_type='d4rl', data_path=None, ), - eval=dict(evaluator=dict(eval_freq=1000, )), + eval=dict(evaluator=dict(eval_freq=1000,)), ), ) main_config = EasyDict(main_config) @@ -40,7 +44,7 @@ type='d4rl', 
import_names=['dizoo.d4rl.envs.d4rl_env'], ), - env_manager=dict(type='base', ), + env_manager=dict(type='base',), policy=dict( type='ibc', import_names=['ding.policy.ibc'], diff --git a/dizoo/d4rl/config/walker2d_expert_td3bc_config.py b/dizoo/d4rl/config/walker2d_expert_td3bc_config.py index f5530b7bfd..c12d58b230 100644 --- a/dizoo/d4rl/config/walker2d_expert_td3bc_config.py +++ b/dizoo/d4rl/config/walker2d_expert_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='halfcheetah-expert-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/walker2d_medium_expert_td3bc_config.py b/dizoo/d4rl/config/walker2d_medium_expert_td3bc_config.py index d85ddc134e..2aed878dd8 100644 --- a/dizoo/d4rl/config/walker2d_medium_expert_td3bc_config.py +++ b/dizoo/d4rl/config/walker2d_medium_expert_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='walker2d-medium-expert-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/walker2d_medium_replay_td3bc_config.py b/dizoo/d4rl/config/walker2d_medium_replay_td3bc_config.py index e997e7d8aa..67cc95a1c2 100644 --- a/dizoo/d4rl/config/walker2d_medium_replay_td3bc_config.py +++ b/dizoo/d4rl/config/walker2d_medium_replay_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='walker2d-medium-replay-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/walker2d_medium_td3bc_config.py b/dizoo/d4rl/config/walker2d_medium_td3bc_config.py index 619dc62a8d..dc76b5c012 100644 --- a/dizoo/d4rl/config/walker2d_medium_td3bc_config.py +++ b/dizoo/d4rl/config/walker2d_medium_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='walker2d-medium-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/config/walker2d_random_td3bc_config.py b/dizoo/d4rl/config/walker2d_random_td3bc_config.py index fe915b65bc..f252c14dbd 100644 --- a/dizoo/d4rl/config/walker2d_random_td3bc_config.py +++ b/dizoo/d4rl/config/walker2d_random_td3bc_config.py @@ -7,7 +7,7 @@ env=dict( env_id='walker2d-random-v0', norm_obs=dict( - use_norm=True, + use_norm=True, offline_stats=dict(use_offline_stats=True, ), ), collector_env_num=1, diff --git a/dizoo/d4rl/entry/d4rl_cql_main.py b/dizoo/d4rl/entry/d4rl_cql_main.py index 7a8934a90a..9315a3644d 100644 --- a/dizoo/d4rl/entry/d4rl_cql_main.py +++ b/dizoo/d4rl/entry/d4rl_cql_main.py @@ -5,7 +5,7 @@ def train(args): # launch from anywhere - config = Path(__file__).absolute().parent.parent / 'config' / args.config + config = Path(__file__).absolute().parent.parent / 'config' / args.config config = read_config(str(config)) config[0].exp_name = config[0].exp_name.replace('0', str(args.seed)) serial_pipeline_offline(config, seed=args.seed) diff --git a/dizoo/d4rl/entry/d4rl_td3_bc_main.py b/dizoo/d4rl/entry/d4rl_td3_bc_main.py index b25bf904a5..bdf945978f 100644 --- a/dizoo/d4rl/entry/d4rl_td3_bc_main.py +++ b/dizoo/d4rl/entry/d4rl_td3_bc_main.py @@ -5,7 +5,7 @@ def train(args): # launch from anywhere - config = Path(__file__).absolute().parent.parent / 'config' / args.config + config = Path(__file__).absolute().parent.parent / 'config' / args.config config = read_config(str(config)) config[0].exp_name = config[0].exp_name.replace('0', str(args.seed)) serial_pipeline_offline(config, 
seed=args.seed) diff --git a/dizoo/dmc2gym/config/dmc2gym_ppo_config.py b/dizoo/dmc2gym/config/dmc2gym_ppo_config.py index 207b398e63..4f48633c5f 100644 --- a/dizoo/dmc2gym/config/dmc2gym_ppo_config.py +++ b/dizoo/dmc2gym/config/dmc2gym_ppo_config.py @@ -1,5 +1,6 @@ from easydict import EasyDict + cartpole_balance_ppo_config = dict( exp_name='dmc2gym_cartpole_balance_ppo', env=dict( diff --git a/dizoo/dmc2gym/entry/dmc2gym_sac_pixel_main.py b/dizoo/dmc2gym/entry/dmc2gym_sac_pixel_main.py index 1f6eb2abb5..60a83921ef 100644 --- a/dizoo/dmc2gym/entry/dmc2gym_sac_pixel_main.py +++ b/dizoo/dmc2gym/entry/dmc2gym_sac_pixel_main.py @@ -15,7 +15,6 @@ from dizoo.dmc2gym.envs.dmc2gym_env import DMC2GymEnv from dizoo.dmc2gym.config.dmc2gym_sac_pixel_config import main_config, create_config - def main(): logging.getLogger().setLevel(logging.INFO) main_config.exp_name = 'dmc2gym_sac_pixel_seed0' @@ -24,8 +23,8 @@ def main(): num_seed = 1 for seed_i in range(num_seed): - tb_logger = SummaryWriter(os.path.join('./{}/log/'.format(cfg.exp_name), 'seed' + str(seed_i))) - + tb_logger = SummaryWriter(os.path.join('./{}/log/'.format(cfg.exp_name), 'seed'+str(seed_i))) + with task.start(async_mode=False, ctx=OnlineRLContext()): collector_env = BaseEnvManagerV2( env_fn=[lambda: DMC2GymEnv(cfg.env) for _ in range(cfg.env.collector_env_num)], cfg=cfg.env.manager @@ -43,20 +42,16 @@ def main(): def _add_scalar(ctx): if ctx.eval_value != -np.inf: - tb_logger.add_scalar('evaluator_step/reward', ctx.eval_value, global_step=ctx.env_step) + tb_logger.add_scalar('evaluator_step/reward', ctx.eval_value, global_step= ctx.env_step) collector_rewards = [ctx.trajectories[i]['reward'] for i in range(len(ctx.trajectories))] collector_mean_reward = sum(collector_rewards) / len(ctx.trajectories) # collector_max_reward = max(collector_rewards) # collector_min_reward = min(collector_rewards) - tb_logger.add_scalar('collecter_step/mean_reward', collector_mean_reward, global_step=ctx.env_step) + tb_logger.add_scalar('collecter_step/mean_reward', collector_mean_reward, global_step= ctx.env_step) # tb_logger.add_scalar('collecter_step/max_reward', collector_max_reward, global_step= ctx.env_step) # tb_logger.add_scalar('collecter_step/min_reward', collector_min_reward, global_step= ctx.env_step) - tb_logger.add_scalar( - 'collecter_step/avg_env_step_per_episode', - ctx.env_step / ctx.env_episode, - global_step=ctx.env_step - ) - + tb_logger.add_scalar('collecter_step/avg_env_step_per_episode', ctx.env_step/ctx.env_episode, global_step= ctx.env_step) + def _add_train_scalar(ctx): len_train = len(ctx.train_output) cur_lr_q_avg = sum([ctx.train_output[i]['cur_lr_q'] for i in range(len_train)]) / len_train @@ -64,17 +59,15 @@ def _add_train_scalar(ctx): critic_loss_avg = sum([ctx.train_output[i]['critic_loss'] for i in range(len_train)]) / len_train policy_loss_avg = sum([ctx.train_output[i]['policy_loss'] for i in range(len_train)]) / len_train total_loss_avg = sum([ctx.train_output[i]['total_loss'] for i in range(len_train)]) / len_train - tb_logger.add_scalar('learner_step/cur_lr_q_avg', cur_lr_q_avg, global_step=ctx.env_step) - tb_logger.add_scalar('learner_step/cur_lr_p_avg', cur_lr_p_avg, global_step=ctx.env_step) - tb_logger.add_scalar('learner_step/critic_loss_avg', critic_loss_avg, global_step=ctx.env_step) - tb_logger.add_scalar('learner_step/policy_loss_avg', policy_loss_avg, global_step=ctx.env_step) - tb_logger.add_scalar('learner_step/total_loss_avg', total_loss_avg, global_step=ctx.env_step) - + 
tb_logger.add_scalar('learner_step/cur_lr_q_avg', cur_lr_q_avg, global_step= ctx.env_step) + tb_logger.add_scalar('learner_step/cur_lr_p_avg', cur_lr_p_avg, global_step= ctx.env_step) + tb_logger.add_scalar('learner_step/critic_loss_avg', critic_loss_avg, global_step= ctx.env_step) + tb_logger.add_scalar('learner_step/policy_loss_avg', policy_loss_avg, global_step= ctx.env_step) + tb_logger.add_scalar('learner_step/total_loss_avg', total_loss_avg, global_step= ctx.env_step) + task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) task.use( - StepCollector( - cfg, policy.collect_mode, collector_env, random_collect_size=cfg.policy.random_collect_size - ) + StepCollector(cfg, policy.collect_mode, collector_env, random_collect_size=cfg.policy.random_collect_size) ) task.use(_add_scalar) task.use(data_pusher(cfg, buffer_)) diff --git a/dizoo/dmc2gym/entry/dmc2gym_sac_state_main.py b/dizoo/dmc2gym/entry/dmc2gym_sac_state_main.py index 7e6cf920f5..6bc7036352 100644 --- a/dizoo/dmc2gym/entry/dmc2gym_sac_state_main.py +++ b/dizoo/dmc2gym/entry/dmc2gym_sac_state_main.py @@ -15,7 +15,6 @@ from tensorboardX import SummaryWriter import os - def main(): logging.getLogger().setLevel(logging.INFO) main_config.exp_name = 'dmc2gym_sac_state_nseed_5M' @@ -24,8 +23,8 @@ def main(): num_seed = 4 for seed_i in range(num_seed): - tb_logger = SummaryWriter(os.path.join('./{}/log/'.format(cfg.exp_name), 'seed' + str(seed_i))) - + tb_logger = SummaryWriter(os.path.join('./{}/log/'.format(cfg.exp_name), 'seed'+str(seed_i))) + with task.start(async_mode=False, ctx=OnlineRLContext()): collector_env = BaseEnvManagerV2( env_fn=[lambda: DMC2GymEnv(cfg.env) for _ in range(cfg.env.collector_env_num)], cfg=cfg.env.manager @@ -42,20 +41,16 @@ def main(): def _add_scalar(ctx): if ctx.eval_value != -np.inf: - tb_logger.add_scalar('evaluator_step/reward', ctx.eval_value, global_step=ctx.env_step) + tb_logger.add_scalar('evaluator_step/reward', ctx.eval_value, global_step= ctx.env_step) collector_rewards = [ctx.trajectories[i]['reward'] for i in range(len(ctx.trajectories))] collector_mean_reward = sum(collector_rewards) / len(ctx.trajectories) # collector_max_reward = max(collector_rewards) # collector_min_reward = min(collector_rewards) - tb_logger.add_scalar('collecter_step/mean_reward', collector_mean_reward, global_step=ctx.env_step) + tb_logger.add_scalar('collecter_step/mean_reward', collector_mean_reward, global_step= ctx.env_step) # tb_logger.add_scalar('collecter_step/max_reward', collector_max_reward, global_step= ctx.env_step) # tb_logger.add_scalar('collecter_step/min_reward', collector_min_reward, global_step= ctx.env_step) - tb_logger.add_scalar( - 'collecter_step/avg_env_step_per_episode', - ctx.env_step / ctx.env_episode, - global_step=ctx.env_step - ) - + tb_logger.add_scalar('collecter_step/avg_env_step_per_episode', ctx.env_step/ctx.env_episode, global_step= ctx.env_step) + def _add_train_scalar(ctx): len_train = len(ctx.train_output) cur_lr_q_avg = sum([ctx.train_output[i]['cur_lr_q'] for i in range(len_train)]) / len_train @@ -63,17 +58,15 @@ def _add_train_scalar(ctx): critic_loss_avg = sum([ctx.train_output[i]['critic_loss'] for i in range(len_train)]) / len_train policy_loss_avg = sum([ctx.train_output[i]['policy_loss'] for i in range(len_train)]) / len_train total_loss_avg = sum([ctx.train_output[i]['total_loss'] for i in range(len_train)]) / len_train - tb_logger.add_scalar('learner_step/cur_lr_q_avg', cur_lr_q_avg, global_step=ctx.env_step) - 
tb_logger.add_scalar('learner_step/cur_lr_p_avg', cur_lr_p_avg, global_step=ctx.env_step) - tb_logger.add_scalar('learner_step/critic_loss_avg', critic_loss_avg, global_step=ctx.env_step) - tb_logger.add_scalar('learner_step/policy_loss_avg', policy_loss_avg, global_step=ctx.env_step) - tb_logger.add_scalar('learner_step/total_loss_avg', total_loss_avg, global_step=ctx.env_step) - + tb_logger.add_scalar('learner_step/cur_lr_q_avg', cur_lr_q_avg, global_step= ctx.env_step) + tb_logger.add_scalar('learner_step/cur_lr_p_avg', cur_lr_p_avg, global_step= ctx.env_step) + tb_logger.add_scalar('learner_step/critic_loss_avg', critic_loss_avg, global_step= ctx.env_step) + tb_logger.add_scalar('learner_step/policy_loss_avg', policy_loss_avg, global_step= ctx.env_step) + tb_logger.add_scalar('learner_step/total_loss_avg', total_loss_avg, global_step= ctx.env_step) + task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) task.use( - StepCollector( - cfg, policy.collect_mode, collector_env, random_collect_size=cfg.policy.random_collect_size - ) + StepCollector(cfg, policy.collect_mode, collector_env, random_collect_size=cfg.policy.random_collect_size) ) task.use(_add_scalar) task.use(data_pusher(cfg, buffer_)) diff --git a/dizoo/dmc2gym/envs/dmc2gym_env.py b/dizoo/dmc2gym/envs/dmc2gym_env.py index 14c70b6f44..9e97629897 100644 --- a/dizoo/dmc2gym/envs/dmc2gym_env.py +++ b/dizoo/dmc2gym/envs/dmc2gym_env.py @@ -10,7 +10,6 @@ def dmc2gym_observation_space(dim, minimum=-np.inf, maximum=np.inf, dtype=np.float32) -> Callable: - def observation_space(from_pixels=True, height=84, width=84, channels_first=True) -> Box: if from_pixels: shape = [3, height, width] if channels_first else [height, width, 3] @@ -30,7 +29,6 @@ def dmc2gym_action_space(dim, minimum=-1, maximum=1, dtype=np.float32) -> Box: def dmc2gym_reward_space(minimum=0, maximum=1, dtype=np.float32) -> Callable: - def reward_space(frame_skip=1) -> Box: return Box( np.repeat(minimum * frame_skip, 1).astype(dtype), diff --git a/dizoo/dmc2gym/envs/test_dmc2gym_env.py b/dizoo/dmc2gym/envs/test_dmc2gym_env.py index 5245a7a86a..94e6d9e9a7 100644 --- a/dizoo/dmc2gym/envs/test_dmc2gym_env.py +++ b/dizoo/dmc2gym/envs/test_dmc2gym_env.py @@ -47,3 +47,4 @@ def test_naive(self): assert timestep.reward <= env.reward_space.high print(env.observation_space, env.action_space, env.reward_space) env.close() + diff --git a/dizoo/evogym/envs/test/visualize_simple_env.py b/dizoo/evogym/envs/test/visualize_simple_env.py index 2203209fbe..cde80b725c 100644 --- a/dizoo/evogym/envs/test/visualize_simple_env.py +++ b/dizoo/evogym/envs/test/visualize_simple_env.py @@ -7,6 +7,7 @@ from dizoo.evogym.envs.viewer import DingEvoViewer from evogym.sim import EvoSim + if __name__ == '__main__': gym.logger.set_level(gym.logger.DEBUG) # create a random robot diff --git a/dizoo/gym_anytrading/config/stocks_dqn_config.py b/dizoo/gym_anytrading/config/stocks_dqn_config.py index c05a1f5974..c16ab0a5a5 100644 --- a/dizoo/gym_anytrading/config/stocks_dqn_config.py +++ b/dizoo/gym_anytrading/config/stocks_dqn_config.py @@ -78,11 +78,13 @@ import_names=['dizoo.gym_anytrading.envs.stocks_env'], ), env_manager=dict(type='base'), - policy=dict(type='dqn', ), + policy=dict( + type='dqn', + ), evaluator=dict( type='trading_interaction', import_names=['dizoo.gym_anytrading.worker'], - ), + ), ) stocks_dqn_create_config = EasyDict(stocks_dqn_create_config) create_config = stocks_dqn_create_config diff --git a/dizoo/gym_anytrading/worker/trading_serial_evaluator.py 
b/dizoo/gym_anytrading/worker/trading_serial_evaluator.py index d2fa4d22d1..9c7749f722 100644 --- a/dizoo/gym_anytrading/worker/trading_serial_evaluator.py +++ b/dizoo/gym_anytrading/worker/trading_serial_evaluator.py @@ -32,13 +32,13 @@ class TradingSerialEvaluator(InteractionSerialEvaluator): ) def __init__( - self, - cfg: dict, - env: BaseEnvManager = None, - policy: namedtuple = None, - tb_logger: 'SummaryWriter' = None, # noqa - exp_name: Optional[str] = 'default_experiment', - instance_name: Optional[str] = 'evaluator', + self, + cfg: dict, + env: BaseEnvManager = None, + policy: namedtuple = None, + tb_logger: 'SummaryWriter' = None, # noqa + exp_name: Optional[str] = 'default_experiment', + instance_name: Optional[str] = 'evaluator', ) -> None: """ Overview: @@ -49,12 +49,12 @@ def __init__( super().__init__(cfg, env, policy, tb_logger, exp_name, instance_name) def eval( - self, - save_ckpt_fn: Callable = None, - train_iter: int = -1, - envstep: int = -1, - n_episode: Optional[int] = None, - force_render: bool = False, + self, + save_ckpt_fn: Callable = None, + train_iter: int = -1, + envstep: int = -1, + n_episode: Optional[int] = None, + force_render: bool = False, ) -> Tuple[bool, dict]: ''' Overview: diff --git a/dizoo/gym_hybrid/envs/gym-hybrid/gym_hybrid/__init__.py b/dizoo/gym_hybrid/envs/gym-hybrid/gym_hybrid/__init__.py index 89cb5d7764..aa9f5bdf37 100644 --- a/dizoo/gym_hybrid/envs/gym-hybrid/gym_hybrid/__init__.py +++ b/dizoo/gym_hybrid/envs/gym-hybrid/gym_hybrid/__init__.py @@ -3,6 +3,7 @@ from gym_hybrid.environments import SlidingEnv from gym_hybrid.environments import HardMoveEnv + register( id='Moving-v0', entry_point='gym_hybrid:MovingEnv', @@ -14,4 +15,4 @@ register( id='HardMove-v0', entry_point='gym_hybrid:HardMoveEnv', -) +) \ No newline at end of file diff --git a/dizoo/gym_hybrid/envs/gym-hybrid/setup.py b/dizoo/gym_hybrid/envs/gym-hybrid/setup.py index 248ccb4535..af82deb670 100644 --- a/dizoo/gym_hybrid/envs/gym-hybrid/setup.py +++ b/dizoo/gym_hybrid/envs/gym-hybrid/setup.py @@ -1,8 +1,7 @@ from setuptools import setup -setup( - name='gym_hybrid', - version='0.0.2', # original gym_hybrid version='0.0.1' - packages=['gym_hybrid'], - install_requires=['gym', 'numpy'], +setup(name='gym_hybrid', + version='0.0.2', # original gym_hybrid version='0.0.1' + packages=['gym_hybrid'], + install_requires=['gym', 'numpy'], ) diff --git a/dizoo/gym_hybrid/envs/gym-hybrid/tests/moving.py b/dizoo/gym_hybrid/envs/gym-hybrid/tests/moving.py index 52315decd9..dbc230c0d7 100644 --- a/dizoo/gym_hybrid/envs/gym-hybrid/tests/moving.py +++ b/dizoo/gym_hybrid/envs/gym-hybrid/tests/moving.py @@ -2,6 +2,7 @@ import gym import gym_hybrid + if __name__ == '__main__': env = gym.make('Moving-v0') env.reset() diff --git a/dizoo/gym_hybrid/envs/test_gym_hybrid_env.py b/dizoo/gym_hybrid/envs/test_gym_hybrid_env.py index 896987f33f..7a7bc10006 100644 --- a/dizoo/gym_hybrid/envs/test_gym_hybrid_env.py +++ b/dizoo/gym_hybrid/envs/test_gym_hybrid_env.py @@ -8,17 +8,7 @@ class TestGymHybridEnv: def test_naive(self): - env = GymHybridEnv( - EasyDict( - { - 'env_id': 'Moving-v0', - 'act_scale': False, - 'save_replay_gif': False, - 'replay_path_gif': None, - 'replay_path': None - } - ) - ) + env = GymHybridEnv(EasyDict({'env_id': 'Moving-v0', 'act_scale': False, 'save_replay_gif': False, 'replay_path_gif': None, 'replay_path': None})) env.enable_save_replay('./video') env.seed(314, dynamic_seed=False) assert env._seed == 314 diff --git a/dizoo/image_classification/entry/imagenet_res18_config.py 
b/dizoo/image_classification/entry/imagenet_res18_config.py index bd4f473dd6..970ea4f2fd 100644 --- a/dizoo/image_classification/entry/imagenet_res18_config.py +++ b/dizoo/image_classification/entry/imagenet_res18_config.py @@ -27,7 +27,9 @@ learn_data_path='/mnt/lustre/share/images/train', eval_data_path='/mnt/lustre/share/images/val', ), - eval=dict(batch_size=32, evaluator=dict(eval_freq=1, stop_value=dict(loss=0.5, acc1=75.0, acc5=95.0))), + eval=dict( + batch_size=32, evaluator=dict(eval_freq=1, stop_value=dict(loss=0.5, acc1=75.0, acc5=95.0)) + ), ), env=dict(), ) diff --git a/dizoo/league_demo/league_demo_collector.py b/dizoo/league_demo/league_demo_collector.py index ce7985a6dc..211e15b5e8 100644 --- a/dizoo/league_demo/league_demo_collector.py +++ b/dizoo/league_demo/league_demo_collector.py @@ -25,13 +25,13 @@ class LeagueDemoCollector(ISerialCollector): config = dict(deepcopy_obs=False, transform_obs=False, collect_print_freq=100, get_train_sample=False) def __init__( - self, - cfg: EasyDict, - env: BaseEnvManager = None, - policy: List[namedtuple] = None, - tb_logger: 'SummaryWriter' = None, # noqa - exp_name: Optional[str] = 'default_experiment', - instance_name: Optional[str] = 'collector' + self, + cfg: EasyDict, + env: BaseEnvManager = None, + policy: List[namedtuple] = None, + tb_logger: 'SummaryWriter' = None, # noqa + exp_name: Optional[str] = 'default_experiment', + instance_name: Optional[str] = 'collector' ) -> None: """ Overview: diff --git a/dizoo/maze/entry/maze_bc_main.py b/dizoo/maze/entry/maze_bc_main.py index 3a42d4e921..efd9b6d2a8 100644 --- a/dizoo/maze/entry/maze_bc_main.py +++ b/dizoo/maze/entry/maze_bc_main.py @@ -61,7 +61,9 @@ def get_vi_sequence(env, observation): cur_x, cur_y = start_x, start_y while cur_x != target_location[0] or cur_y != target_location[1]: act = vi_sequence[-1][cur_x, cur_y] - track_back.append((torch.FloatTensor(env.process_states([cur_x, cur_y], env.get_maze_map())), act)) + track_back.append(( + torch.FloatTensor(env.process_states([cur_x, cur_y], env.get_maze_map())), + act)) if act == 0: cur_x += 1 elif act == 1: @@ -87,7 +89,6 @@ def __len__(self): def load_bc_dataset(train_seeds=1, test_seeds=1, batch_size=32): - def load_env(seed): ccc = easydict.EasyDict({'size': 16}) e = Maze(ccc) @@ -110,8 +111,13 @@ def load_env(seed): data += track_back - train_data = BCDataset(data_train) - test_data = BCDataset(data_test) + + train_data = BCDataset( + data_train + ) + test_data = BCDataset( + data_test + ) train_dataset = DataLoader(train_data, batch_size=batch_size, shuffle=True) test_dataset = DataLoader(test_data, batch_size=batch_size, shuffle=True) diff --git a/dizoo/minigrid/utils/eval.py b/dizoo/minigrid/utils/eval.py index e3c6acb9fb..e8e4f728fa 100644 --- a/dizoo/minigrid/utils/eval.py +++ b/dizoo/minigrid/utils/eval.py @@ -8,11 +8,11 @@ def eval( - input_cfg: Union[str, Tuple[dict, dict]], - seed: int = 0, - model: Optional[torch.nn.Module] = None, - state_dict: Optional[dict] = None, - replay_path: Optional[str] = './video', + input_cfg: Union[str, Tuple[dict, dict]], + seed: int = 0, + model: Optional[torch.nn.Module] = None, + state_dict: Optional[dict] = None, + replay_path: Optional[str] = './video', ) -> float: r""" Overview: diff --git a/dizoo/mujoco/config/halfcheetah_bdq_config.py b/dizoo/mujoco/config/halfcheetah_bdq_config.py index 25fb65ba35..145bf8062e 100644 --- a/dizoo/mujoco/config/halfcheetah_bdq_config.py +++ b/dizoo/mujoco/config/halfcheetah_bdq_config.py @@ -22,6 +22,7 @@ action_bins_per_branch=2, # mean 
the action shape is 6, 2 discrete actions for each action dimension encoder_hidden_size_list=[256, 256, 128], ), + learn=dict( batch_size=512, learning_rate=3e-4, @@ -64,8 +65,4 @@ if __name__ == "__main__": # or you can enter `ding -m serial_onpolicy -c halfcheetah_onbdq_config.py -s 0` from ding.entry import serial_pipeline - serial_pipeline( - (main_config, create_config), - seed=0, - max_env_step=10000000, - ) + serial_pipeline((main_config, create_config), seed=0, max_env_step=10000000,) \ No newline at end of file diff --git a/dizoo/mujoco/config/hopper_bdq_config.py b/dizoo/mujoco/config/hopper_bdq_config.py index 34dbe21664..de08da2a7a 100644 --- a/dizoo/mujoco/config/hopper_bdq_config.py +++ b/dizoo/mujoco/config/hopper_bdq_config.py @@ -68,8 +68,4 @@ if __name__ == "__main__": # or you can enter `ding -m serial_onpolicy -c hopper_bdq_config.py -s 0` from ding.entry import serial_pipeline - serial_pipeline( - [main_config, create_config], - seed=0, - max_env_step=10000000, - ) + serial_pipeline([main_config, create_config], seed=0, max_env_step=10000000,) diff --git a/dizoo/mujoco/envs/mujoco_wrappers.py b/dizoo/mujoco/envs/mujoco_wrappers.py index d99819783c..8fc19cd503 100644 --- a/dizoo/mujoco/envs/mujoco_wrappers.py +++ b/dizoo/mujoco/envs/mujoco_wrappers.py @@ -6,10 +6,10 @@ def wrap_mujoco( - env_id, - norm_obs: Dict = dict(use_norm=False, ), - norm_reward: Dict = dict(use_norm=False, ), - delay_reward_step: int = 1 + env_id, + norm_obs: Dict = dict(use_norm=False, ), + norm_reward: Dict = dict(use_norm=False, ), + delay_reward_step: int = 1 ) -> gym.Env: r""" Overview: diff --git a/dizoo/multiagent_mujoco/config/ant_mappo_config.py b/dizoo/multiagent_mujoco/config/ant_mappo_config.py index d11c31be8d..f221fa7c0f 100644 --- a/dizoo/multiagent_mujoco/config/ant_mappo_config.py +++ b/dizoo/multiagent_mujoco/config/ant_mappo_config.py @@ -75,6 +75,7 @@ ) create_config = EasyDict(create_config) + if __name__ == '__main__': from ding.entry import serial_pipeline_onpolicy serial_pipeline_onpolicy((main_config, create_config), seed=0, max_env_step=int(1e7)) diff --git a/dizoo/multiagent_mujoco/config/ant_masac_config.py b/dizoo/multiagent_mujoco/config/ant_masac_config.py index 9316b095c0..1f04efe8b7 100644 --- a/dizoo/multiagent_mujoco/config/ant_masac_config.py +++ b/dizoo/multiagent_mujoco/config/ant_masac_config.py @@ -34,7 +34,9 @@ target_theta=0.005, discount_factor=0.99, ), - collect=dict(n_sample=400, ), + collect=dict( + n_sample=400, + ), eval=dict(evaluator=dict(eval_freq=500, )), other=dict(replay_buffer=dict(replay_buffer_size=100000, ), ), ), diff --git a/dizoo/petting_zoo/config/ptz_simple_spread_madqn_config.py b/dizoo/petting_zoo/config/ptz_simple_spread_madqn_config.py index 8ddb636abf..b7db69abbe 100644 --- a/dizoo/petting_zoo/config/ptz_simple_spread_madqn_config.py +++ b/dizoo/petting_zoo/config/ptz_simple_spread_madqn_config.py @@ -41,7 +41,9 @@ discount_factor=0.95, ), collect=dict( - collector=dict(get_train_sample=True, ), + collector=dict( + get_train_sample=True, + ), n_episode=32, unroll_len=10, env_num=collector_env_num, @@ -58,7 +60,9 @@ end=0.05, decay=10000, ), - replay_buffer=dict(replay_buffer_size=15000, ), + replay_buffer=dict( + replay_buffer_size=15000, + ), ), ), ) diff --git a/dizoo/rocket/entry/rocket_hover_ppo_main.py b/dizoo/rocket/entry/rocket_hover_ppo_main.py index 13f5714483..2539ff12d3 100644 --- a/dizoo/rocket/entry/rocket_hover_ppo_main.py +++ b/dizoo/rocket/entry/rocket_hover_ppo_main.py @@ -30,10 +30,12 @@ def main(): tb_logger = 
SummaryWriter(os.path.join('./{}/log/'.format(cfg.exp_name), 'seed' + str(seed_i))) with task.start(async_mode=False, ctx=OnlineRLContext()): collector_env = BaseEnvManagerV2( - env_fn=[lambda: RocketEnv(cfg.env) for _ in range(cfg.env.collector_env_num)], cfg=cfg.env.manager + env_fn=[lambda: RocketEnv(cfg.env) for _ in range(cfg.env.collector_env_num)], + cfg=cfg.env.manager ) evaluator_env = BaseEnvManagerV2( - env_fn=[lambda: RocketEnv(cfg.env) for _ in range(cfg.env.evaluator_env_num)], cfg=cfg.env.manager + env_fn=[lambda: RocketEnv(cfg.env) for _ in range(cfg.env.evaluator_env_num)], + cfg=cfg.env.manager ) # evaluator_env.enable_save_replay() diff --git a/dizoo/rocket/entry/rocket_landing_ppo_main.py b/dizoo/rocket/entry/rocket_landing_ppo_main.py index bf8ebb5162..cc83242ce5 100644 --- a/dizoo/rocket/entry/rocket_landing_ppo_main.py +++ b/dizoo/rocket/entry/rocket_landing_ppo_main.py @@ -27,13 +27,15 @@ def main(): cfg = compile_config(main_config, create_cfg=create_config, auto=True) num_seed = 4 for seed_i in range(num_seed): - tb_logger = SummaryWriter(os.path.join('./{}/log/'.format(cfg.exp_name), 'seed' + str(seed_i))) + tb_logger = SummaryWriter(os.path.join('./{}/log/'.format(cfg.exp_name), 'seed'+str(seed_i))) with task.start(async_mode=False, ctx=OnlineRLContext()): collector_env = BaseEnvManagerV2( - env_fn=[lambda: RocketEnv(cfg.env) for _ in range(cfg.env.collector_env_num)], cfg=cfg.env.manager + env_fn=[lambda: RocketEnv(cfg.env) for _ in range(cfg.env.collector_env_num)], + cfg=cfg.env.manager ) evaluator_env = BaseEnvManagerV2( - env_fn=[lambda: RocketEnv(cfg.env) for _ in range(cfg.env.evaluator_env_num)], cfg=cfg.env.manager + env_fn=[lambda: RocketEnv(cfg.env) for _ in range(cfg.env.evaluator_env_num)], + cfg=cfg.env.manager ) # evaluator_env.enable_save_replay() diff --git a/dizoo/rocket/envs/test_rocket_env.py b/dizoo/rocket/envs/test_rocket_env.py index a8bf030fe7..e19d2879c1 100644 --- a/dizoo/rocket/envs/test_rocket_env.py +++ b/dizoo/rocket/envs/test_rocket_env.py @@ -12,7 +12,7 @@ def test_hover(self): env.seed(314, dynamic_seed=False) assert env._seed == 314 obs = env.reset() - assert obs.shape == (8, ) + assert obs.shape == (8,) for _ in range(5): env.reset() np.random.seed(314) @@ -28,8 +28,8 @@ def test_hover(self): print('timestep', timestep, '\n') assert isinstance(timestep.obs, np.ndarray) assert isinstance(timestep.done, bool) - assert timestep.obs.shape == (8, ) - assert timestep.reward.shape == (1, ) + assert timestep.obs.shape == (8,) + assert timestep.reward.shape == (1,) assert timestep.reward >= env.reward_space.low assert timestep.reward <= env.reward_space.high print(env.observation_space, env.action_space, env.reward_space) diff --git a/dizoo/smac/config/smac_3s5z_madqn_config.py b/dizoo/smac/config/smac_3s5z_madqn_config.py index 5e771baf09..c15dfcd655 100644 --- a/dizoo/smac/config/smac_3s5z_madqn_config.py +++ b/dizoo/smac/config/smac_3s5z_madqn_config.py @@ -18,7 +18,9 @@ stop_value=0.999, n_evaluator_episode=32, special_global_state=True, - manager=dict(shared_memory=False, ), + manager=dict( + shared_memory=False, + ), ), policy=dict( nstep=1, @@ -39,7 +41,9 @@ discount_factor=0.95, ), collect=dict( - collector=dict(get_train_sample=True, ), + collector=dict( + get_train_sample=True, + ), n_episode=32, unroll_len=10, env_num=collector_env_num, @@ -52,7 +56,9 @@ end=0.05, decay=10000, ), - replay_buffer=dict(replay_buffer_size=15000, ), + replay_buffer=dict( + replay_buffer_size=15000, + ), ), ), ) diff --git 
a/dizoo/smac/config/smac_3s5zvs3s6z_madqn_config.py b/dizoo/smac/config/smac_3s5zvs3s6z_madqn_config.py index 438025241f..23c215b63c 100644 --- a/dizoo/smac/config/smac_3s5zvs3s6z_madqn_config.py +++ b/dizoo/smac/config/smac_3s5zvs3s6z_madqn_config.py @@ -18,7 +18,9 @@ stop_value=0.999, n_evaluator_episode=32, special_global_state=True, - manager=dict(shared_memory=False, ), + manager=dict( + shared_memory=False, + ), ), policy=dict( nstep=3, @@ -39,7 +41,9 @@ discount_factor=0.95, ), collect=dict( - collector=dict(get_train_sample=True, ), + collector=dict( + get_train_sample=True, + ), n_episode=32, unroll_len=10, env_num=collector_env_num, @@ -52,7 +56,9 @@ end=0.05, decay=100000, ), - replay_buffer=dict(replay_buffer_size=30000, ), + replay_buffer=dict( + replay_buffer_size=30000, + ), ), ), ) diff --git a/dizoo/smac/config/smac_5m6m_madqn_config.py b/dizoo/smac/config/smac_5m6m_madqn_config.py index d05bb23dcb..0aa0497712 100644 --- a/dizoo/smac/config/smac_5m6m_madqn_config.py +++ b/dizoo/smac/config/smac_5m6m_madqn_config.py @@ -27,7 +27,7 @@ obs_shape=72, global_obs_shape=152, action_shape=12, - hidden_size_list=[256, 256], + hidden_size_list=[256,256], ), learn=dict( update_per_collect=40, @@ -38,7 +38,9 @@ discount_factor=0.95, ), collect=dict( - collector=dict(get_train_sample=True, ), + collector=dict( + get_train_sample=True, + ), n_episode=32, unroll_len=10, env_num=collector_env_num, @@ -51,7 +53,9 @@ end=0.05, decay=50000, ), - replay_buffer=dict(replay_buffer_size=50000, ), + replay_buffer=dict( + replay_buffer_size=50000, + ), ), ), ) @@ -83,6 +87,7 @@ def train(args): train(args) + def train(args): config = [main_config, create_config] serial_pipeline(config, seed=args.seed, max_env_step=1e7) diff --git a/dizoo/smac/config/smac_8m9m_madqn_config.py b/dizoo/smac/config/smac_8m9m_madqn_config.py index 672330df24..ccf9153a14 100644 --- a/dizoo/smac/config/smac_8m9m_madqn_config.py +++ b/dizoo/smac/config/smac_8m9m_madqn_config.py @@ -27,7 +27,7 @@ obs_shape=108, global_obs_shape=263, action_shape=15, - hidden_size_list=[256, 256], + hidden_size_list=[256,256], ), learn=dict( update_per_collect=40, @@ -38,7 +38,9 @@ discount_factor=0.95, ), collect=dict( - collector=dict(get_train_sample=True, ), + collector=dict( + get_train_sample=True, + ), n_episode=32, unroll_len=20, env_num=collector_env_num, @@ -51,7 +53,9 @@ end=0.05, decay=50000, ), - replay_buffer=dict(replay_buffer_size=20000, ), + replay_buffer=dict( + replay_buffer_size=20000, + ), ), ), ) @@ -83,6 +87,7 @@ def train(args): train(args) + def train(args): config = [main_config, create_config] serial_pipeline(config, seed=args.seed, max_env_step=1e7) diff --git a/dizoo/smac/config/smac_MMM2_madqn_config.py b/dizoo/smac/config/smac_MMM2_madqn_config.py index fe8e96501c..60e3123dc4 100644 --- a/dizoo/smac/config/smac_MMM2_madqn_config.py +++ b/dizoo/smac/config/smac_MMM2_madqn_config.py @@ -18,7 +18,9 @@ stop_value=0.999, n_evaluator_episode=32, special_global_state=True, - manager=dict(shared_memory=False, ), + manager=dict( + shared_memory=False, + ), ), policy=dict( nstep=1, @@ -39,7 +41,9 @@ discount_factor=0.95, ), collect=dict( - collector=dict(get_train_sample=True, ), + collector=dict( + get_train_sample=True, + ), n_episode=32, unroll_len=20, env_num=collector_env_num, @@ -52,7 +56,9 @@ end=0.05, decay=100000, ), - replay_buffer=dict(replay_buffer_size=30000, ), + replay_buffer=dict( + replay_buffer_size=30000, + ), ), ), ) diff --git a/dizoo/smac/config/smac_MMM_madqn_config.py 
b/dizoo/smac/config/smac_MMM_madqn_config.py index 892f1f5217..1d9a6abeaf 100644 --- a/dizoo/smac/config/smac_MMM_madqn_config.py +++ b/dizoo/smac/config/smac_MMM_madqn_config.py @@ -18,7 +18,9 @@ stop_value=0.999, n_evaluator_episode=32, special_global_state=True, - manager=dict(shared_memory=False, ), + manager=dict( + shared_memory=False, + ), ), policy=dict( nstep=1, @@ -39,7 +41,9 @@ discount_factor=0.95, ), collect=dict( - collector=dict(get_train_sample=True, ), + collector=dict( + get_train_sample=True, + ), n_episode=32, unroll_len=10, env_num=collector_env_num, @@ -52,7 +56,9 @@ end=0.05, decay=10000, ), - replay_buffer=dict(replay_buffer_size=15000, ), + replay_buffer=dict( + replay_buffer_size=15000, + ), ), ), ) diff --git a/dizoo/smac/utils/eval.py b/dizoo/smac/utils/eval.py index 1e112e84a7..6d683a8ace 100644 --- a/dizoo/smac/utils/eval.py +++ b/dizoo/smac/utils/eval.py @@ -10,11 +10,11 @@ def eval( - input_cfg: Union[str, Tuple[dict, dict]], - seed: int = 0, - env_setting: Optional[List[Any]] = None, - model: Optional[torch.nn.Module] = None, - state_dict: Optional[dict] = None, + input_cfg: Union[str, Tuple[dict, dict]], + seed: int = 0, + env_setting: Optional[List[Any]] = None, + model: Optional[torch.nn.Module] = None, + state_dict: Optional[dict] = None, ) -> float: r""" Overview: From 0f5015e3e132ee8dff252a67df4eae4c9f406436 Mon Sep 17 00:00:00 2001 From: zhangpaipai <1124321458@qq.com> Date: Tue, 11 Apr 2023 19:13:01 +0800 Subject: [PATCH 083/244] delete common --- dizoo/common/c51/lunarlander_c51.py | 23 -------------- halfcheetah_ddpg.py | 47 +++++++++++++++++++++++++++++ halfcheetah_sac.py | 0 halfcheetah_td3.py | 0 walker2d_ddpg.py | 47 +++++++++++++++++++++++++++++ walker2d_sac.py | 0 walker2d_td3.py | 0 7 files changed, 94 insertions(+), 23 deletions(-) delete mode 100644 dizoo/common/c51/lunarlander_c51.py create mode 100644 halfcheetah_ddpg.py create mode 100644 halfcheetah_sac.py create mode 100644 halfcheetah_td3.py create mode 100644 walker2d_ddpg.py create mode 100644 walker2d_sac.py create mode 100644 walker2d_td3.py diff --git a/dizoo/common/c51/lunarlander_c51.py b/dizoo/common/c51/lunarlander_c51.py deleted file mode 100644 index 07d1442ff5..0000000000 --- a/dizoo/common/c51/lunarlander_c51.py +++ /dev/null @@ -1,23 +0,0 @@ -from ding.bonus import C51Agent -from huggingface_ding import push_model_to_hub - -# Instantiate the agent -agent = C51Agent("lunarlander_discrete", exp_name="LunarLander-v2-C51") -# Train the agent -return_ = agent.train(step=200000) -# Push model to huggingface hub -""" push_model_to_hub( - agent=agent.best, - env_name="OpenAI/Gym/Box2d", - task_name="LunarLander-v2", - algo_name="DDPG", - wandb_url=return_.wandb_url, - github_repo_url="https://github.com/opendilab/DI-engine", - github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/ddpg.html", - github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/lunarlander.html", - installation_guide="pip3 install DI-engine[common_env]", - usage_file_by_git_clone="./dizoo/common/ddpg/lunarlander_ddpg_deploy.py", - usage_file_by_huggingface_ding="./dizoo/common/ddpg/lunarlander_ddpg_download.py", - train_file="./dizoo/common/ddpg/lunarlander_ddpg.py", - repo_id="OpenDILabCommunity/LunarLander-v2-DDPG" -) """ \ No newline at end of file diff --git a/halfcheetah_ddpg.py b/halfcheetah_ddpg.py new file mode 100644 index 0000000000..573b706df3 --- /dev/null +++ b/halfcheetah_ddpg.py @@ -0,0 +1,47 @@ +from ding.bonus import DDPGAgent +from 
huggingface_ding import push_model_to_hub + +# Instantiate the agent +agent = DDPGAgent(env="HalfCheetah", exp_name="HalfCheetah-v3-DDPG") +# Train the agent +return_ = agent.train(step=1000000) +# Render the new agent performance +agent.deploy(enable_save_replay=True) +# Push model to huggingface hub +""" push_model_to_hub( + agent=agent.best, + env_name="OpenAI/Gym/MuJoCo", + task_name="Hopper-v3", + algo_name="DDPG", + wandb_url=return_.wandb_url, + github_repo_url="https://github.com/opendilab/DI-engine", + github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/ddpg.html", + github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/mujoco.html", + installation_guide=''' +sudo apt update -y \ + && sudo apt install -y \ + build-essential \ + libgl1-mesa-dev \ + libgl1-mesa-glx \ + libglew-dev \ + libosmesa6-dev \ + libglfw3 \ + libglfw3-dev \ + libsdl2-dev \ + libsdl2-image-dev \ + libglm-dev \ + libfreetype6-dev \ + patchelf + +mkdir -p ~/.mujoco +wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz -O mujoco.tar.gz +tar -xf mujoco.tar.gz -C ~/.mujoco +echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin" >> ~/.bashrc +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin +pip3 install DI-engine[common_env] +''', + usage_file_by_git_clone="./dizoo/common/ddpg/hopper_ddpg_deploy.py", + usage_file_by_huggingface_ding="./dizoo/common/ddpg/hopper_ddpg_download.py", + train_file="./dizoo/common/ddpg/hopper_ddpg.py", + repo_id="OpenDILabCommunity/Hopper-v3-DDPG" +) """ diff --git a/halfcheetah_sac.py b/halfcheetah_sac.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/halfcheetah_td3.py b/halfcheetah_td3.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/walker2d_ddpg.py b/walker2d_ddpg.py new file mode 100644 index 0000000000..ec1044df0c --- /dev/null +++ b/walker2d_ddpg.py @@ -0,0 +1,47 @@ +from ding.bonus import DDPGAgent +from huggingface_ding import push_model_to_hub + +# Instantiate the agent +agent = DDPGAgent(env="Walker2d", exp_name="Walker2d-v3-DDPG") +# Train the agent +return_ = agent.train(step=1000000) +# Render the new agent performance +agent.deploy(enable_save_replay=True) +# Push model to huggingface hub +""" push_model_to_hub( + agent=agent.best, + env_name="OpenAI/Gym/MuJoCo", + task_name="Hopper-v3", + algo_name="DDPG", + wandb_url=return_.wandb_url, + github_repo_url="https://github.com/opendilab/DI-engine", + github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/ddpg.html", + github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/mujoco.html", + installation_guide=''' +sudo apt update -y \ + && sudo apt install -y \ + build-essential \ + libgl1-mesa-dev \ + libgl1-mesa-glx \ + libglew-dev \ + libosmesa6-dev \ + libglfw3 \ + libglfw3-dev \ + libsdl2-dev \ + libsdl2-image-dev \ + libglm-dev \ + libfreetype6-dev \ + patchelf + +mkdir -p ~/.mujoco +wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz -O mujoco.tar.gz +tar -xf mujoco.tar.gz -C ~/.mujoco +echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin" >> ~/.bashrc +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin +pip3 install DI-engine[common_env] +''', + usage_file_by_git_clone="./dizoo/common/ddpg/hopper_ddpg_deploy.py", + usage_file_by_huggingface_ding="./dizoo/common/ddpg/hopper_ddpg_download.py", + 
train_file="./dizoo/common/ddpg/hopper_ddpg.py", + repo_id="OpenDILabCommunity/Hopper-v3-DDPG" +) """ \ No newline at end of file diff --git a/walker2d_sac.py b/walker2d_sac.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/walker2d_td3.py b/walker2d_td3.py new file mode 100644 index 0000000000..e69de29bb2 From d5cdb1e080075c00398662cca3ef14f08078294a Mon Sep 17 00:00:00 2001 From: zhangpaipai <1124321458@qq.com> Date: Wed, 12 Apr 2023 14:36:37 +0800 Subject: [PATCH 084/244] polish for zjow to review --- dizoo/common/ddpg/halfcheetah_ddpg.py | 47 --------------------------- dizoo/common/ddpg/walker2d_ddpg.py | 47 --------------------------- dizoo/common/sac/halfcheetah_sac.py | 47 --------------------------- dizoo/common/sac/walker2d_sac.py | 47 --------------------------- dizoo/common/td3/halfcheetah_td3.py | 47 --------------------------- dizoo/common/td3/walker2d_td3.py | 47 --------------------------- halfcheetah_ddpg.py | 47 --------------------------- halfcheetah_sac.py | 0 halfcheetah_td3.py | 0 walker2d_ddpg.py | 47 --------------------------- walker2d_sac.py | 0 walker2d_td3.py | 0 12 files changed, 376 deletions(-) delete mode 100644 dizoo/common/ddpg/halfcheetah_ddpg.py delete mode 100644 dizoo/common/ddpg/walker2d_ddpg.py delete mode 100644 dizoo/common/sac/halfcheetah_sac.py delete mode 100644 dizoo/common/sac/walker2d_sac.py delete mode 100644 dizoo/common/td3/halfcheetah_td3.py delete mode 100644 dizoo/common/td3/walker2d_td3.py delete mode 100644 halfcheetah_ddpg.py delete mode 100644 halfcheetah_sac.py delete mode 100644 halfcheetah_td3.py delete mode 100644 walker2d_ddpg.py delete mode 100644 walker2d_sac.py delete mode 100644 walker2d_td3.py diff --git a/dizoo/common/ddpg/halfcheetah_ddpg.py b/dizoo/common/ddpg/halfcheetah_ddpg.py deleted file mode 100644 index 573b706df3..0000000000 --- a/dizoo/common/ddpg/halfcheetah_ddpg.py +++ /dev/null @@ -1,47 +0,0 @@ -from ding.bonus import DDPGAgent -from huggingface_ding import push_model_to_hub - -# Instantiate the agent -agent = DDPGAgent(env="HalfCheetah", exp_name="HalfCheetah-v3-DDPG") -# Train the agent -return_ = agent.train(step=1000000) -# Render the new agent performance -agent.deploy(enable_save_replay=True) -# Push model to huggingface hub -""" push_model_to_hub( - agent=agent.best, - env_name="OpenAI/Gym/MuJoCo", - task_name="Hopper-v3", - algo_name="DDPG", - wandb_url=return_.wandb_url, - github_repo_url="https://github.com/opendilab/DI-engine", - github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/ddpg.html", - github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/mujoco.html", - installation_guide=''' -sudo apt update -y \ - && sudo apt install -y \ - build-essential \ - libgl1-mesa-dev \ - libgl1-mesa-glx \ - libglew-dev \ - libosmesa6-dev \ - libglfw3 \ - libglfw3-dev \ - libsdl2-dev \ - libsdl2-image-dev \ - libglm-dev \ - libfreetype6-dev \ - patchelf - -mkdir -p ~/.mujoco -wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz -O mujoco.tar.gz -tar -xf mujoco.tar.gz -C ~/.mujoco -echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin" >> ~/.bashrc -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin -pip3 install DI-engine[common_env] -''', - usage_file_by_git_clone="./dizoo/common/ddpg/hopper_ddpg_deploy.py", - usage_file_by_huggingface_ding="./dizoo/common/ddpg/hopper_ddpg_download.py", - train_file="./dizoo/common/ddpg/hopper_ddpg.py", - 
repo_id="OpenDILabCommunity/Hopper-v3-DDPG" -) """ diff --git a/dizoo/common/ddpg/walker2d_ddpg.py b/dizoo/common/ddpg/walker2d_ddpg.py deleted file mode 100644 index ec1044df0c..0000000000 --- a/dizoo/common/ddpg/walker2d_ddpg.py +++ /dev/null @@ -1,47 +0,0 @@ -from ding.bonus import DDPGAgent -from huggingface_ding import push_model_to_hub - -# Instantiate the agent -agent = DDPGAgent(env="Walker2d", exp_name="Walker2d-v3-DDPG") -# Train the agent -return_ = agent.train(step=1000000) -# Render the new agent performance -agent.deploy(enable_save_replay=True) -# Push model to huggingface hub -""" push_model_to_hub( - agent=agent.best, - env_name="OpenAI/Gym/MuJoCo", - task_name="Hopper-v3", - algo_name="DDPG", - wandb_url=return_.wandb_url, - github_repo_url="https://github.com/opendilab/DI-engine", - github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/ddpg.html", - github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/mujoco.html", - installation_guide=''' -sudo apt update -y \ - && sudo apt install -y \ - build-essential \ - libgl1-mesa-dev \ - libgl1-mesa-glx \ - libglew-dev \ - libosmesa6-dev \ - libglfw3 \ - libglfw3-dev \ - libsdl2-dev \ - libsdl2-image-dev \ - libglm-dev \ - libfreetype6-dev \ - patchelf - -mkdir -p ~/.mujoco -wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz -O mujoco.tar.gz -tar -xf mujoco.tar.gz -C ~/.mujoco -echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin" >> ~/.bashrc -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin -pip3 install DI-engine[common_env] -''', - usage_file_by_git_clone="./dizoo/common/ddpg/hopper_ddpg_deploy.py", - usage_file_by_huggingface_ding="./dizoo/common/ddpg/hopper_ddpg_download.py", - train_file="./dizoo/common/ddpg/hopper_ddpg.py", - repo_id="OpenDILabCommunity/Hopper-v3-DDPG" -) """ \ No newline at end of file diff --git a/dizoo/common/sac/halfcheetah_sac.py b/dizoo/common/sac/halfcheetah_sac.py deleted file mode 100644 index df8f0c87f4..0000000000 --- a/dizoo/common/sac/halfcheetah_sac.py +++ /dev/null @@ -1,47 +0,0 @@ -from ding.bonus.sac import SACAgent -from huggingface_ding import push_model_to_hub - -# Instantiate the agent -agent = SACAgent(env="HalfCheetah", exp_name="HalfCheetah-v3-SAC") -# Train the agent -return_ = agent.train(step=1000000) -# Render the new agent performance -agent.deploy(enable_save_replay=True) -# Push model to huggingface hub -""" push_model_to_hub( - agent=agent.best, - env_name="OpenAI/Gym/MuJoCo", - task_name="Hopper-v3", - algo_name="SAC", - wandb_url=return_.wandb_url, - github_repo_url="https://github.com/opendilab/DI-engine", - github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/sac.html", - github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/mujoco.html", - installation_guide=''' -sudo apt update -y \ - && sudo apt install -y \ - build-essential \ - libgl1-mesa-dev \ - libgl1-mesa-glx \ - libglew-dev \ - libosmesa6-dev \ - libglfw3 \ - libglfw3-dev \ - libsdl2-dev \ - libsdl2-image-dev \ - libglm-dev \ - libfreetype6-dev \ - patchelf - -mkdir -p ~/.mujoco -wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz -O mujoco.tar.gz -tar -xf mujoco.tar.gz -C ~/.mujoco -echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin" >> ~/.bashrc -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin -pip3 install DI-engine[common_env] 
-''', - usage_file_by_git_clone="./dizoo/common/sac/hopper_sac_deploy.py", - usage_file_by_huggingface_ding="./dizoo/common/sac/hopper_sac_download.py", - train_file="./dizoo/common/sac/hopper_sac.py", - repo_id="OpenDILabCommunity/Hopper-v3-SAC" -) """ \ No newline at end of file diff --git a/dizoo/common/sac/walker2d_sac.py b/dizoo/common/sac/walker2d_sac.py deleted file mode 100644 index 0406394e57..0000000000 --- a/dizoo/common/sac/walker2d_sac.py +++ /dev/null @@ -1,47 +0,0 @@ -from ding.bonus.sac import SACAgent -from huggingface_ding import push_model_to_hub - -# Instantiate the agent -agent = SACAgent(env="Walker2d", exp_name="Walker2d-v3-SAC") -# Train the agent -return_ = agent.train(step=1000000) -# Render the new agent performance -agent.deploy(enable_save_replay=True) -# Push model to huggingface hub -""" push_model_to_hub( - agent=agent.best, - env_name="OpenAI/Gym/MuJoCo", - task_name="Hopper-v3", - algo_name="SAC", - wandb_url=return_.wandb_url, - github_repo_url="https://github.com/opendilab/DI-engine", - github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/sac.html", - github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/mujoco.html", - installation_guide=''' -sudo apt update -y \ - && sudo apt install -y \ - build-essential \ - libgl1-mesa-dev \ - libgl1-mesa-glx \ - libglew-dev \ - libosmesa6-dev \ - libglfw3 \ - libglfw3-dev \ - libsdl2-dev \ - libsdl2-image-dev \ - libglm-dev \ - libfreetype6-dev \ - patchelf - -mkdir -p ~/.mujoco -wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz -O mujoco.tar.gz -tar -xf mujoco.tar.gz -C ~/.mujoco -echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin" >> ~/.bashrc -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin -pip3 install DI-engine[common_env] -''', - usage_file_by_git_clone="./dizoo/common/sac/hopper_sac_deploy.py", - usage_file_by_huggingface_ding="./dizoo/common/sac/hopper_sac_download.py", - train_file="./dizoo/common/sac/hopper_sac.py", - repo_id="OpenDILabCommunity/Hopper-v3-SAC" -) """ \ No newline at end of file diff --git a/dizoo/common/td3/halfcheetah_td3.py b/dizoo/common/td3/halfcheetah_td3.py deleted file mode 100644 index f521320fb6..0000000000 --- a/dizoo/common/td3/halfcheetah_td3.py +++ /dev/null @@ -1,47 +0,0 @@ -from ding.bonus import TD3Agent -from huggingface_ding import push_model_to_hub - -# Instantiate the agent -agent = TD3Agent(env="HalfCheetah", exp_name="HalfCheetah-v3-TD3") -# Train the agent -return_ = agent.train(step=1000000) -# Render the new agent performance -agent.deploy(enable_save_replay=True) -# Push model to huggingface hub -""" push_model_to_hub( - agent=agent.best, - env_name="OpenAI/Gym/MuJoCo", - task_name="Hopper-v3", - algo_name="TD3", - wandb_url=return_.wandb_url, - github_repo_url="https://github.com/opendilab/DI-engine", - github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/td3.html", - github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/mujoco.html", - installation_guide=''' -sudo apt update -y \ - && sudo apt install -y \ - build-essential \ - libgl1-mesa-dev \ - libgl1-mesa-glx \ - libglew-dev \ - libosmesa6-dev \ - libglfw3 \ - libglfw3-dev \ - libsdl2-dev \ - libsdl2-image-dev \ - libglm-dev \ - libfreetype6-dev \ - patchelf - -mkdir -p ~/.mujoco -wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz -O mujoco.tar.gz -tar -xf mujoco.tar.gz -C ~/.mujoco -echo "export 
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin" >> ~/.bashrc -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin -pip3 install DI-engine[common_env] -''', - usage_file_by_git_clone="./dizoo/common/td3/hopper_td3_deploy.py", - usage_file_by_huggingface_ding="./dizoo/common/td3/hopper_td3_download.py", - train_file="./dizoo/common/td3/hopper_td3.py", - repo_id="OpenDILabCommunity/Hopper-v3-TD3" -) """ \ No newline at end of file diff --git a/dizoo/common/td3/walker2d_td3.py b/dizoo/common/td3/walker2d_td3.py deleted file mode 100644 index be17109d23..0000000000 --- a/dizoo/common/td3/walker2d_td3.py +++ /dev/null @@ -1,47 +0,0 @@ -from ding.bonus import TD3Agent -from huggingface_ding import push_model_to_hub - -# Instantiate the agent -agent = TD3Agent(env="Walker2d", exp_name="Walker2d-v3-TD3") -# Train the agent -return_ = agent.train(step=1000000) -# Render the new agent performance -agent.deploy(enable_save_replay=True) -# Push model to huggingface hub -""" push_model_to_hub( - agent=agent.best, - env_name="OpenAI/Gym/MuJoCo", - task_name="Hopper-v3", - algo_name="TD3", - wandb_url=return_.wandb_url, - github_repo_url="https://github.com/opendilab/DI-engine", - github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/td3.html", - github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/mujoco.html", - installation_guide=''' -sudo apt update -y \ - && sudo apt install -y \ - build-essential \ - libgl1-mesa-dev \ - libgl1-mesa-glx \ - libglew-dev \ - libosmesa6-dev \ - libglfw3 \ - libglfw3-dev \ - libsdl2-dev \ - libsdl2-image-dev \ - libglm-dev \ - libfreetype6-dev \ - patchelf - -mkdir -p ~/.mujoco -wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz -O mujoco.tar.gz -tar -xf mujoco.tar.gz -C ~/.mujoco -echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin" >> ~/.bashrc -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin -pip3 install DI-engine[common_env] -''', - usage_file_by_git_clone="./dizoo/common/td3/hopper_td3_deploy.py", - usage_file_by_huggingface_ding="./dizoo/common/td3/hopper_td3_download.py", - train_file="./dizoo/common/td3/hopper_td3.py", - repo_id="OpenDILabCommunity/Hopper-v3-TD3" -) """ \ No newline at end of file diff --git a/halfcheetah_ddpg.py b/halfcheetah_ddpg.py deleted file mode 100644 index 573b706df3..0000000000 --- a/halfcheetah_ddpg.py +++ /dev/null @@ -1,47 +0,0 @@ -from ding.bonus import DDPGAgent -from huggingface_ding import push_model_to_hub - -# Instantiate the agent -agent = DDPGAgent(env="HalfCheetah", exp_name="HalfCheetah-v3-DDPG") -# Train the agent -return_ = agent.train(step=1000000) -# Render the new agent performance -agent.deploy(enable_save_replay=True) -# Push model to huggingface hub -""" push_model_to_hub( - agent=agent.best, - env_name="OpenAI/Gym/MuJoCo", - task_name="Hopper-v3", - algo_name="DDPG", - wandb_url=return_.wandb_url, - github_repo_url="https://github.com/opendilab/DI-engine", - github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/ddpg.html", - github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/mujoco.html", - installation_guide=''' -sudo apt update -y \ - && sudo apt install -y \ - build-essential \ - libgl1-mesa-dev \ - libgl1-mesa-glx \ - libglew-dev \ - libosmesa6-dev \ - libglfw3 \ - libglfw3-dev \ - libsdl2-dev \ - libsdl2-image-dev \ - libglm-dev \ - libfreetype6-dev 
\ - patchelf - -mkdir -p ~/.mujoco -wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz -O mujoco.tar.gz -tar -xf mujoco.tar.gz -C ~/.mujoco -echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin" >> ~/.bashrc -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin -pip3 install DI-engine[common_env] -''', - usage_file_by_git_clone="./dizoo/common/ddpg/hopper_ddpg_deploy.py", - usage_file_by_huggingface_ding="./dizoo/common/ddpg/hopper_ddpg_download.py", - train_file="./dizoo/common/ddpg/hopper_ddpg.py", - repo_id="OpenDILabCommunity/Hopper-v3-DDPG" -) """ diff --git a/halfcheetah_sac.py b/halfcheetah_sac.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/halfcheetah_td3.py b/halfcheetah_td3.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/walker2d_ddpg.py b/walker2d_ddpg.py deleted file mode 100644 index ec1044df0c..0000000000 --- a/walker2d_ddpg.py +++ /dev/null @@ -1,47 +0,0 @@ -from ding.bonus import DDPGAgent -from huggingface_ding import push_model_to_hub - -# Instantiate the agent -agent = DDPGAgent(env="Walker2d", exp_name="Walker2d-v3-DDPG") -# Train the agent -return_ = agent.train(step=1000000) -# Render the new agent performance -agent.deploy(enable_save_replay=True) -# Push model to huggingface hub -""" push_model_to_hub( - agent=agent.best, - env_name="OpenAI/Gym/MuJoCo", - task_name="Hopper-v3", - algo_name="DDPG", - wandb_url=return_.wandb_url, - github_repo_url="https://github.com/opendilab/DI-engine", - github_doc_model_url="https://di-engine-docs.readthedocs.io/en/latest/12_policies/ddpg.html", - github_doc_env_url="https://di-engine-docs.readthedocs.io/en/latest/13_envs/mujoco.html", - installation_guide=''' -sudo apt update -y \ - && sudo apt install -y \ - build-essential \ - libgl1-mesa-dev \ - libgl1-mesa-glx \ - libglew-dev \ - libosmesa6-dev \ - libglfw3 \ - libglfw3-dev \ - libsdl2-dev \ - libsdl2-image-dev \ - libglm-dev \ - libfreetype6-dev \ - patchelf - -mkdir -p ~/.mujoco -wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz -O mujoco.tar.gz -tar -xf mujoco.tar.gz -C ~/.mujoco -echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin" >> ~/.bashrc -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.mujoco/mjpro210/bin:~/.mujoco/mujoco210/bin -pip3 install DI-engine[common_env] -''', - usage_file_by_git_clone="./dizoo/common/ddpg/hopper_ddpg_deploy.py", - usage_file_by_huggingface_ding="./dizoo/common/ddpg/hopper_ddpg_download.py", - train_file="./dizoo/common/ddpg/hopper_ddpg.py", - repo_id="OpenDILabCommunity/Hopper-v3-DDPG" -) """ \ No newline at end of file diff --git a/walker2d_sac.py b/walker2d_sac.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/walker2d_td3.py b/walker2d_td3.py deleted file mode 100644 index e69de29bb2..0000000000 From d69b165cd5ff97fb3a1a9793f7de81828592449b Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 13 Apr 2023 03:06:52 +0000 Subject: [PATCH 085/244] polish code --- ding/bonus/a2c.py | 2 +- ding/bonus/ddpg.py | 2 +- ding/bonus/dqn.py | 2 +- ding/bonus/ppof.py | 2 +- ding/bonus/sac.py | 2 +- ding/bonus/td3.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ding/bonus/a2c.py b/ding/bonus/a2c.py index 4c0a21133f..f458d3f44e 100644 --- a/ding/bonus/a2c.py +++ b/ding/bonus/a2c.py @@ -121,7 +121,7 @@ def train( ) ) task.use(termination_checker(max_env_step=step)) - task.use(final_ctx_saver(name=self.cfg["exp_name"])) + 
task.use(final_ctx_saver(name=self.exp_name)) task.run() return TrainingReturn(wandb_url=task.ctx.wandb_url) diff --git a/ding/bonus/ddpg.py b/ding/bonus/ddpg.py index 344a31079e..ca2557cc03 100644 --- a/ding/bonus/ddpg.py +++ b/ding/bonus/ddpg.py @@ -130,7 +130,7 @@ def train( ) ) task.use(termination_checker(max_env_step=step)) - task.use(final_ctx_saver(name=self.cfg["exp_name"])) + task.use(final_ctx_saver(name=self.exp_name)) task.run() return TrainingReturn(wandb_url=task.ctx.wandb_url) diff --git a/ding/bonus/dqn.py b/ding/bonus/dqn.py index a93b2461b0..7466b72064 100644 --- a/ding/bonus/dqn.py +++ b/ding/bonus/dqn.py @@ -131,7 +131,7 @@ def train( ) ) task.use(termination_checker(max_env_step=step)) - task.use(final_ctx_saver(name=self.cfg["exp_name"])) + task.use(final_ctx_saver(name=self.exp_name)) task.run() return TrainingReturn(wandb_url=task.ctx.wandb_url) diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py index f98b6df54f..d74cabf552 100644 --- a/ding/bonus/ppof.py +++ b/ding/bonus/ppof.py @@ -221,7 +221,7 @@ def collect_data( task.use(offline_data_saver(save_data_path, data_type='hdf5')) task.run(max_step=1) logging.info( - 'PPOF collecting is finished, more than {n_sample} samples are collected and saved in `{save_data_path}`' + f'PPOF collecting is finished, more than {n_sample} samples are collected and saved in `{save_data_path}`' ) def batch_evaluate( diff --git a/ding/bonus/sac.py b/ding/bonus/sac.py index d09d27ba57..d30cf81a0a 100644 --- a/ding/bonus/sac.py +++ b/ding/bonus/sac.py @@ -131,7 +131,7 @@ def train( ) ) task.use(termination_checker(max_env_step=step)) - task.use(final_ctx_saver(name=self.cfg["exp_name"])) + task.use(final_ctx_saver(name=self.exp_name)) task.run() return TrainingReturn(wandb_url=task.ctx.wandb_url) diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 7491c54a56..7bef5a2c8b 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -130,7 +130,7 @@ def train( ) ) task.use(termination_checker(max_env_step=step)) - task.use(final_ctx_saver(name=self.cfg["exp_name"])) + task.use(final_ctx_saver(name=self.exp_name)) task.run() return TrainingReturn(wandb_url=task.ctx.wandb_url) From b95f340cb89e7c9ccf546b8635d0ee501de74221 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 13 Apr 2023 04:34:53 +0000 Subject: [PATCH 086/244] polish code --- ding/bonus/a2c.py | 30 +++++------------------ ding/bonus/common.py | 21 ++++++++++++++++ ding/bonus/ddpg.py | 30 +++++------------------ ding/bonus/dqn.py | 30 +++++------------------ ding/bonus/ppof.py | 28 ++++----------------- ding/bonus/sac.py | 28 ++++----------------- ding/bonus/td3.py | 30 +++++------------------ ding/framework/middleware/ckpt_handler.py | 5 +++- 8 files changed, 59 insertions(+), 143 deletions(-) create mode 100644 ding/bonus/common.py diff --git a/ding/bonus/a2c.py b/ding/bonus/a2c.py index f458d3f44e..d55d173142 100644 --- a/ding/bonus/a2c.py +++ b/ding/bonus/a2c.py @@ -18,26 +18,7 @@ from ding.model import VAC from ding.model import model_wrap from ding.bonus.config import get_instance_config, get_instance_env - - -@dataclass -class TrainingReturn: - ''' - Attributions - wandb_url: The weight & biases (wandb) project url of the trainning experiment. - ''' - wandb_url: str - - -@dataclass -class EvalReturn: - ''' - Attributions - eval_value: The mean of evaluation return. - eval_value_std: The standard deviation of evaluation return. 
- ''' - eval_value: np.float32 - eval_value_std: np.float32 +from ding.bonus.common import TrainingReturn, EvalReturn class A2CAgent: @@ -87,7 +68,7 @@ def __init__( self.policy = A2CPolicy(self.cfg.policy, model=model) if policy_state_dict is not None: self.policy.learn_mode.load_state_dict(policy_state_dict) - self.model_save_dir = os.path.join(self.exp_name, "model") + self.checkpoint_save_dir = os.path.join(self.exp_name, "ckpt") def train( self, @@ -111,7 +92,7 @@ def train( task.use(StepCollector(self.cfg, self.policy.collect_mode, collector_env)) task.use(gae_estimator(self.cfg, self.policy.collect_mode)) task.use(trainer(self.cfg, self.policy.learn_mode)) - task.use(CkptSaver(policy=self.policy, save_dir=self.model_save_dir, train_freq=n_iter_save_ckpt)) + task.use(CkptSaver(policy=self.policy, save_dir=self.checkpoint_save_dir, train_freq=n_iter_save_ckpt)) task.use( wandb_online_logger( metric_list=self.policy.monitor_vars(), @@ -238,8 +219,9 @@ def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: @property def best(self): - best_model_file_path = os.path.join(self.model_save_dir, "ckpt", "eval.pth.tar") + best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") + # Load best model if it exists if os.path.exists(best_model_file_path): policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu")) self.policy.learn_mode.load_state_dict(policy_state_dict) - return self \ No newline at end of file + return self diff --git a/ding/bonus/common.py b/ding/bonus/common.py new file mode 100644 index 0000000000..f3ba3a7ffb --- /dev/null +++ b/ding/bonus/common.py @@ -0,0 +1,21 @@ +from dataclasses import dataclass + + +@dataclass +class TrainingReturn: + ''' + Attributions + wandb_url: The weight & biases (wandb) project url of the trainning experiment. + ''' + wandb_url: str + + +@dataclass +class EvalReturn: + ''' + Attributions + eval_value: The mean of evaluation return. + eval_value_std: The standard deviation of evaluation return. + ''' + eval_value: np.float32 + eval_value_std: np.float32 diff --git a/ding/bonus/ddpg.py b/ding/bonus/ddpg.py index ca2557cc03..7a3db2a232 100644 --- a/ding/bonus/ddpg.py +++ b/ding/bonus/ddpg.py @@ -18,26 +18,7 @@ from ding.model import QAC from ding.data import DequeBuffer from ding.bonus.config import get_instance_config, get_instance_env - - -@dataclass -class TrainingReturn: - ''' - Attributions - wandb_url: The weight & biases (wandb) project url of the trainning experiment. - ''' - wandb_url: str - - -@dataclass -class EvalReturn: - ''' - Attributions - eval_value: The mean of evaluation return. - eval_value_std: The standard deviation of evaluation return. 
- ''' - eval_value: np.float32 - eval_value_std: np.float32 +from ding.bonus.common import TrainingReturn, EvalReturn class DDPGAgent: @@ -89,7 +70,7 @@ def __init__( self.policy = DDPGPolicy(self.cfg.policy, model=model) if policy_state_dict is not None: self.policy.learn_mode.load_state_dict(policy_state_dict) - self.model_save_dir = os.path.join(self.exp_name, "model") + self.checkpoint_save_dir = os.path.join(self.exp_name, "ckpt") def train( self, @@ -120,7 +101,7 @@ def train( ) task.use(data_pusher(self.cfg, self.buffer_)) task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_)) - task.use(CkptSaver(policy=self.policy, save_dir=self.model_save_dir, train_freq=n_iter_save_ckpt)) + task.use(CkptSaver(policy=self.policy, save_dir=self.checkpoint_save_dir, train_freq=n_iter_save_ckpt)) task.use( wandb_online_logger( metric_list=self.policy.monitor_vars(), @@ -245,8 +226,9 @@ def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: @property def best(self): - best_model_file_path = os.path.join(self.model_save_dir, "ckpt", "eval.pth.tar") + best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") + # Load best model if it exists if os.path.exists(best_model_file_path): policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu")) self.policy.learn_mode.load_state_dict(policy_state_dict) - return self \ No newline at end of file + return self diff --git a/ding/bonus/dqn.py b/ding/bonus/dqn.py index 7466b72064..ff991dacc3 100644 --- a/ding/bonus/dqn.py +++ b/ding/bonus/dqn.py @@ -18,26 +18,7 @@ from ding.model import model_wrap from ding.data import DequeBuffer from ding.bonus.config import get_instance_config, get_instance_env - - -@dataclass -class TrainingReturn: - ''' - Attributions - wandb_url: The weight & biases (wandb) project url of the trainning experiment. - ''' - wandb_url: str - - -@dataclass -class EvalReturn: - ''' - Attributions - eval_value: The mean of evaluation return. - eval_value_std: The standard deviation of evaluation return. 
- ''' - eval_value: np.float32 - eval_value_std: np.float32 +from ding.bonus.common import TrainingReturn, EvalReturn class DQNAgent: @@ -88,7 +69,7 @@ def __init__( self.policy = DQNPolicy(self.cfg.policy, model=model) if policy_state_dict is not None: self.policy.learn_mode.load_state_dict(policy_state_dict) - self.model_save_dir = os.path.join(self.exp_name, "model") + self.checkpoint_save_dir = os.path.join(self.exp_name, "ckpt") def train( self, @@ -121,7 +102,7 @@ def train( task.use(nstep_reward_enhancer(self.cfg)) task.use(data_pusher(self.cfg, self.buffer_)) task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_)) - task.use(CkptSaver(policy=self.policy, save_dir=self.model_save_dir, train_freq=n_iter_save_ckpt)) + task.use(CkptSaver(policy=self.policy, save_dir=self.checkpoint_save_dir, train_freq=n_iter_save_ckpt)) task.use( wandb_online_logger( metric_list=self.policy.monitor_vars(), @@ -248,8 +229,9 @@ def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: @property def best(self): - best_model_file_path = os.path.join(self.model_save_dir, "ckpt", "eval.pth.tar") + best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") + # Load best model if it exists if os.path.exists(best_model_file_path): policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu")) self.policy.learn_mode.load_state_dict(policy_state_dict) - return self \ No newline at end of file + return self diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py index d74cabf552..cb66750d4c 100644 --- a/ding/bonus/ppof.py +++ b/ding/bonus/ppof.py @@ -17,26 +17,7 @@ from ding.config import save_config_py from .model import PPOFModel from .config import get_instance_config, get_instance_env, get_hybrid_shape - - -@dataclass -class TrainingReturn: - ''' - Attributions - wandb_url: The weight & biases (wandb) project url of the trainning experiment. - ''' - wandb_url: str - - -@dataclass -class EvalReturn: - ''' - Attributions - eval_value: The mean of evaluation return. - eval_value_std: The standard deviation of evaluation return. 
- ''' - eval_value: np.float32 - eval_value_std: np.float32 +from ding.bonus.common import TrainingReturn, EvalReturn class PPOF: @@ -125,7 +106,7 @@ def __init__( self.policy = PPOFPolicy(self.cfg, model=model) if policy_state_dict is not None: self.policy.load_state_dict(policy_state_dict) - self.model_save_dir = os.path.join(self.exp_name, "model") + self.checkpoint_save_dir = os.path.join(self.exp_name, "ckpt") def train( self, @@ -154,7 +135,7 @@ def train( task.use(PPOFStepCollector(self.seed, self.policy, collector_env, self.cfg.n_sample)) task.use(ppof_adv_estimator(self.policy)) task.use(multistep_trainer(self.policy, log_freq=n_iter_log_show)) - task.use(CkptSaver(self.policy, save_dir=self.model_save_dir, train_freq=n_iter_save_ckpt)) + task.use(CkptSaver(self.policy, save_dir=self.checkpoint_save_dir, train_freq=n_iter_save_ckpt)) task.use( wandb_online_logger( metric_list=self.policy.monitor_vars(), @@ -268,7 +249,8 @@ def _setup_env_manager( @property def best(self): - best_model_file_path = os.path.join(self.model_save_dir, "ckpt", "eval.pth.tar") + best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") + # Load best model if it exists if os.path.exists(best_model_file_path): policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu")) self.policy.learn_mode.load_state_dict(policy_state_dict) diff --git a/ding/bonus/sac.py b/ding/bonus/sac.py index d30cf81a0a..4b47e7c5a7 100644 --- a/ding/bonus/sac.py +++ b/ding/bonus/sac.py @@ -19,26 +19,7 @@ from ding.model import model_wrap from ding.data import DequeBuffer from ding.bonus.config import get_instance_config, get_instance_env - - -@dataclass -class TrainingReturn: - ''' - Attributions - wandb_url: The weight & biases (wandb) project url of the trainning experiment. - ''' - wandb_url: str - - -@dataclass -class EvalReturn: - ''' - Attributions - eval_value: The mean of evaluation return. - eval_value_std: The standard deviation of evaluation return. 
- ''' - eval_value: np.float32 - eval_value_std: np.float32 +from ding.bonus.common import TrainingReturn, EvalReturn class SACAgent: @@ -90,7 +71,7 @@ def __init__( self.policy = SACPolicy(self.cfg.policy, model=model) if policy_state_dict is not None: self.policy.learn_mode.load_state_dict(policy_state_dict) - self.model_save_dir = os.path.join(self.exp_name, "model") + self.checkpoint_save_dir = os.path.join(self.exp_name, "ckpt") def train( self, @@ -121,7 +102,7 @@ def train( ) task.use(data_pusher(self.cfg, self.buffer_)) task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_)) - task.use(CkptSaver(policy=self.policy, save_dir=self.model_save_dir, train_freq=n_iter_save_ckpt)) + task.use(CkptSaver(policy=self.policy, save_dir=self.checkpoint_save_dir, train_freq=n_iter_save_ckpt)) task.use( wandb_online_logger( metric_list=self.policy.monitor_vars(), @@ -247,7 +228,8 @@ def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: @property def best(self): - best_model_file_path = os.path.join(self.model_save_dir, "ckpt", "eval.pth.tar") + best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") + # Load best model if it exists if os.path.exists(best_model_file_path): policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu")) self.policy.learn_mode.load_state_dict(policy_state_dict) diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 7bef5a2c8b..cc9e2bfc1b 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -18,26 +18,7 @@ from ding.model import QAC from ding.data import DequeBuffer from ding.bonus.config import get_instance_config, get_instance_env - - -@dataclass -class TrainingReturn: - ''' - Attributions - wandb_url: The weight & biases (wandb) project url of the trainning experiment. - ''' - wandb_url: str - - -@dataclass -class EvalReturn: - ''' - Attributions - eval_value: The mean of evaluation return. - eval_value_std: The standard deviation of evaluation return. 
- ''' - eval_value: np.float32 - eval_value_std: np.float32 +from ding.bonus.common import TrainingReturn, EvalReturn class TD3Agent: @@ -89,7 +70,7 @@ def __init__( self.policy = TD3Policy(self.cfg.policy, model=model) if policy_state_dict is not None: self.policy.learn_mode.load_state_dict(policy_state_dict) - self.model_save_dir = os.path.join(self.exp_name, "model") + self.model_save_dir = os.path.join(self.exp_name, "ckpt") def train( self, @@ -120,7 +101,7 @@ def train( ) task.use(data_pusher(self.cfg, self.buffer_)) task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_)) - task.use(CkptSaver(policy=self.policy, save_dir=self.model_save_dir, train_freq=n_iter_save_ckpt)) + task.use(CkptSaver(policy=self.policy, save_dir=self.checkpoint_save_dir, train_freq=n_iter_save_ckpt)) task.use( wandb_online_logger( metric_list=self.policy.monitor_vars(), @@ -245,8 +226,9 @@ def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: @property def best(self): - best_model_file_path = os.path.join(self.model_save_dir, "ckpt", "eval.pth.tar") + best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") + # Load best model if it exists if os.path.exists(best_model_file_path): policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu")) self.policy.learn_mode.load_state_dict(policy_state_dict) - return self \ No newline at end of file + return self diff --git a/ding/framework/middleware/ckpt_handler.py b/ding/framework/middleware/ckpt_handler.py index 65a14c88f9..ca75f16618 100644 --- a/ding/framework/middleware/ckpt_handler.py +++ b/ding/framework/middleware/ckpt_handler.py @@ -34,7 +34,10 @@ def __init__(self, policy: Policy, save_dir: str, train_freq: Optional[int] = No """ self.policy = policy self.train_freq = train_freq - self.prefix = '{}/ckpt'.format(save_dir) + if str(os.path.basename(os.path.normpath(save_dir))) != "ckpt": + self.prefix = '{}/ckpt'.format(os.path.normpath(save_dir)) + else: + self.prefix = '{}/'.format(os.path.normpath(save_dir)) if not os.path.exists(self.prefix): os.makedirs(self.prefix) self.last_save_iter = 0 From b6be6777667e3fe0adac021cd9d582317461c65f Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 13 Apr 2023 11:42:27 +0000 Subject: [PATCH 087/244] fix bug --- ding/bonus/a2c.py | 1 - ding/bonus/common.py | 1 + ding/bonus/ddpg.py | 1 - ding/bonus/dqn.py | 1 - ding/bonus/ppof.py | 1 - ding/bonus/sac.py | 1 - ding/bonus/td3.py | 1 - 7 files changed, 1 insertion(+), 6 deletions(-) diff --git a/ding/bonus/a2c.py b/ding/bonus/a2c.py index d55d173142..bc0c8cba43 100644 --- a/ding/bonus/a2c.py +++ b/ding/bonus/a2c.py @@ -6,7 +6,6 @@ import gym import torch import treetensor.torch as ttorch -import numpy as np from ding.framework import task, OnlineRLContext from ding.framework.middleware import CkptSaver, trainer, \ wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, \ diff --git a/ding/bonus/common.py b/ding/bonus/common.py index f3ba3a7ffb..1d4ddfc711 100644 --- a/ding/bonus/common.py +++ b/ding/bonus/common.py @@ -1,4 +1,5 @@ from dataclasses import dataclass +import numpy as np @dataclass diff --git a/ding/bonus/ddpg.py b/ding/bonus/ddpg.py index 7a3db2a232..a74e70eaa6 100644 --- a/ding/bonus/ddpg.py +++ b/ding/bonus/ddpg.py @@ -6,7 +6,6 @@ import gym import torch import treetensor.torch as ttorch -import numpy as np from ding.framework import task, OnlineRLContext from ding.framework.middleware import CkptSaver, multistep_trainer, \ 
wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, data_pusher, \ diff --git a/ding/bonus/dqn.py b/ding/bonus/dqn.py index ff991dacc3..b48ca770f7 100644 --- a/ding/bonus/dqn.py +++ b/ding/bonus/dqn.py @@ -5,7 +5,6 @@ import os import torch import treetensor.torch as ttorch -import numpy as np from ding.framework import task, OnlineRLContext from ding.framework.middleware import CkptSaver, multistep_trainer, \ wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, data_pusher, \ diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py index cb66750d4c..1c079323f5 100644 --- a/ding/bonus/ppof.py +++ b/ding/bonus/ppof.py @@ -7,7 +7,6 @@ import gym import gymnasium import torch -import numpy as np from ding.framework import task, OnlineRLContext from ding.framework.middleware import interaction_evaluator_ttorch, PPOFStepCollector, multistep_trainer, CkptSaver, \ wandb_online_logger, offline_data_saver, termination_checker, ppof_adv_estimator diff --git a/ding/bonus/sac.py b/ding/bonus/sac.py index 4b47e7c5a7..1491e094c7 100644 --- a/ding/bonus/sac.py +++ b/ding/bonus/sac.py @@ -6,7 +6,6 @@ import gym import torch import treetensor.torch as ttorch -import numpy as np from ding.framework import task, OnlineRLContext from ding.framework.middleware import CkptSaver, multistep_trainer, \ wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, data_pusher, \ diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index cc9e2bfc1b..d121d138e6 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -6,7 +6,6 @@ import gym import torch import treetensor.torch as ttorch -import numpy as np from ding.framework import task, OnlineRLContext from ding.framework.middleware import CkptSaver, multistep_trainer, \ wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, data_pusher, \ From 516780b4197e6cb4322e1652e5b829e751df6b5c Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 13 Apr 2023 16:31:50 +0000 Subject: [PATCH 088/244] fix bug --- ding/bonus/td3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index d121d138e6..167af523b6 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -69,7 +69,7 @@ def __init__( self.policy = TD3Policy(self.cfg.policy, model=model) if policy_state_dict is not None: self.policy.learn_mode.load_state_dict(policy_state_dict) - self.model_save_dir = os.path.join(self.exp_name, "ckpt") + self.checkpoint_save_dir = os.path.join(self.exp_name, "ckpt") def train( self, From 93008aad6db550c8b84141553da600ca64571931 Mon Sep 17 00:00:00 2001 From: zhangpaipai <1124321458@qq.com> Date: Fri, 14 Apr 2023 17:40:15 +0800 Subject: [PATCH 089/244] polish c51 --- ding/bonus/c51.py | 33 +++++++-------------------------- 1 file changed, 7 insertions(+), 26 deletions(-) diff --git a/ding/bonus/c51.py b/ding/bonus/c51.py index b513b11c17..b91b735d97 100644 --- a/ding/bonus/c51.py +++ b/ding/bonus/c51.py @@ -5,7 +5,6 @@ import os import torch import treetensor.torch as ttorch -import numpy as np from ding.framework import task, OnlineRLContext from ding.framework.middleware import CkptSaver, multistep_trainer, \ wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, data_pusher, \ @@ -13,31 +12,12 @@ from ding.envs import BaseEnv, BaseEnvManagerV2, SubprocessEnvManagerV2 from ding.policy import C51Policy from ding.utils 
import set_pkg_seed -from ding.config import Config, save_config_py, compile_config +from ding.config import save_config_py, compile_config from ding.model import C51DQN from ding.model import model_wrap from ding.data import DequeBuffer from ding.bonus.config import get_instance_config, get_instance_env - - -@dataclass -class TrainingReturn: - ''' - Attributions - wandb_url: The weight & biases (wandb) project url of the trainning experiment. - ''' - wandb_url: str - - -@dataclass -class EvalReturn: - ''' - Attributions - eval_value: The mean of evaluation return. - eval_value_std: The standard deviation of evaluation return. - ''' - eval_value: np.float32 - eval_value_std: np.float32 +from ding.bonus.common import TrainingReturn, EvalReturn class C51Agent: @@ -88,7 +68,7 @@ def __init__( self.policy = C51Policy(self.cfg.policy, model=model) if policy_state_dict is not None: self.policy.learn_mode.load_state_dict(policy_state_dict) - self.model_save_dir=os.path.join(self.cfg["exp_name"], "model") + self.checkpoint_save_dir = os.path.join(self.exp_name, "ckpt") def train( self, @@ -122,7 +102,7 @@ def train( task.use( CkptSaver( policy=self.policy, - save_dir=self.model_save_dir, + save_dir=self.checkpoint_save_dir, train_freq=n_iter_save_ckpt ) ) @@ -135,7 +115,7 @@ def train( ) ) task.use(termination_checker(max_env_step=step)) - task.use(final_ctx_saver(name=self.cfg["exp_name"])) + task.use(final_ctx_saver(name=self.exp_name)) task.run() return TrainingReturn(wandb_url=task.ctx.wandb_url) @@ -251,7 +231,8 @@ def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: @property def best(self): - best_model_file_path=os.path.join(self.model_save_dir, "eval.pth.tar") + best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") + # Load best model if it exists if os.path.exists(best_model_file_path): policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu")) self.policy.learn_mode.load_state_dict(policy_state_dict) From c7f5ad6d4018e98c4fdf2a9b2a8b80b43122bc5d Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 27 Apr 2023 17:38:05 +0000 Subject: [PATCH 090/244] add pg agent --- ding/bonus/__init__.py | 1 + ding/bonus/config.py | 46 +++- ding/bonus/pg.py | 223 ++++++++++++++++++ .../middleware/functional/__init__.py | 2 +- .../functional/advantage_estimator.py | 13 + .../framework/middleware/functional/logger.py | 58 +++-- .../middleware/functional/trainer.py | 11 +- ding/policy/pg.py | 24 +- ding/utils/default_helper.py | 2 +- 9 files changed, 348 insertions(+), 32 deletions(-) create mode 100644 ding/bonus/pg.py diff --git a/ding/bonus/__init__.py b/ding/bonus/__init__.py index 43e1dd8790..d343e14ac7 100644 --- a/ding/bonus/__init__.py +++ b/ding/bonus/__init__.py @@ -4,3 +4,4 @@ from .ddpg import DDPGAgent from .dqn import DQNAgent from .sac import SACAgent +from .pg import PGAgent diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 415db966df..2dfa64ebe6 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -4,7 +4,8 @@ from ding.envs import BaseEnv, DingEnvWrapper from ding.envs.env_wrappers import MaxAndSkipWrapper, WarpFrameWrapper, ScaledFloatFrameWrapper, FrameStackWrapper, \ EvalEpisodeReturnEnv, TransposeWrapper, TimeLimitWrapper, FlatObsWrapper, GymToGymnasiumWrapper -from ding.policy import PPOFPolicy, A2CPolicy, TD3Policy, DDPGPolicy, SACPolicy, DQNPolicy, IMPALAPolicy +from ding.policy import PPOFPolicy, A2CPolicy, TD3Policy, DDPGPolicy, SACPolicy, DQNPolicy, IMPALAPolicy, \ + PGPolicy 
def get_instance_config(env: str, algorithm: str) -> EasyDict: @@ -12,6 +13,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: cfg = PPOFPolicy.default_config() if env == 'lunarlander_discrete': cfg.n_sample = 400 + cfg.value_norm = 'popart' elif env == 'lunarlander_continuous': cfg.action_space = 'continuous' cfg.n_sample = 400 @@ -161,6 +163,48 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ) else: raise KeyError("not supported env type: {}".format(env)) + elif algorithm == 'PG': + cfg = EasyDict({"policy": PGPolicy.default_config()}) + if env == 'lunarlander_discrete': + cfg.update( + dict( + exp_name='LunarLander-v2-PG', + env=dict( + collector_env_num=8, + evaluator_env_num=8, + env_id='LunarLander-v2', + n_evaluator_episode=8, + stop_value=240, + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=8, + action_shape=4, + ), + learn=dict( + batch_size=320, + learning_rate=3e-4, + entropy_weight=0.001, + grad_norm=0.5, + ), + collect=dict( + n_episode=8, + discount_factor=0.99, + ), + eval=dict(evaluator=dict(eval_freq=1000, ), ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) + else: + raise KeyError("not supported env type: {}".format(env)) elif algorithm == 'TD3': cfg = EasyDict({"policy": TD3Policy.default_config()}) if env == 'hopper': diff --git a/ding/bonus/pg.py b/ding/bonus/pg.py new file mode 100644 index 0000000000..749581eb62 --- /dev/null +++ b/ding/bonus/pg.py @@ -0,0 +1,223 @@ +from typing import Optional, Union +from ditk import logging +from easydict import EasyDict +import os +import torch +import treetensor.torch as ttorch +from ding.framework import task, OnlineRLContext +from ding.framework.middleware import CkptSaver, trainer, \ + wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, \ + pg_estimator, final_ctx_saver, EpisodeCollector +from ding.envs import BaseEnv, BaseEnvManagerV2, SubprocessEnvManagerV2 +from ding.policy import PGPolicy +from ding.utils import set_pkg_seed +from ding.config import Config, save_config_py, compile_config +from ding.model import PG +from ding.model import model_wrap +from ding.bonus.config import get_instance_config, get_instance_env +from ding.bonus.common import TrainingReturn, EvalReturn + + +class PGAgent: + supported_env_list = [ + 'lunarlander_discrete', + ] + algorithm = 'PG' + + def __init__( + self, + env: Union[str, BaseEnv], + seed: int = 0, + exp_name: str = None, + model: Optional[torch.nn.Module] = None, + cfg: Optional[Union[EasyDict, dict, str]] = None, + policy_state_dict: str = None, + ) -> None: + if isinstance(env, str): + assert env in PGAgent.supported_env_list, "Please use supported envs: {}".format(PGAgent.supported_env_list) + self.env = get_instance_env(env) + if cfg is None: + # 'It should be default env tuned config' + cfg = get_instance_config(env, algorithm=PGAgent.algorithm) + else: + assert isinstance(cfg, EasyDict), "Please use EasyDict as config data type." 
+ + if exp_name is not None: + cfg.exp_name = exp_name + self.cfg = compile_config(cfg, policy=PGPolicy) + self.exp_name = self.cfg.exp_name + + elif isinstance(env, BaseEnv): + self.cfg = compile_config(cfg, policy=PGPolicy) + raise NotImplementedError + else: + raise TypeError("not support env type: {}, only strings and instances of `BaseEnv` now".format(type(env))) + logging.getLogger().setLevel(logging.INFO) + self.seed = seed + set_pkg_seed(self.seed, use_cuda=self.cfg.policy.cuda) + if not os.path.exists(self.exp_name): + os.makedirs(self.exp_name) + save_config_py(self.cfg, os.path.join(self.exp_name, 'policy_config.py')) + if model is None: + model = PG(**self.cfg.policy.model) + self.policy = PGPolicy(self.cfg.policy, model=model) + if policy_state_dict is not None: + self.policy.learn_mode.load_state_dict(policy_state_dict) + self.checkpoint_save_dir = os.path.join(self.exp_name, "ckpt") + + def train( + self, + step: int = int(1e7), + collector_env_num: int = 4, + evaluator_env_num: int = 4, + n_iter_log_show: int = 500, + n_iter_save_ckpt: int = 1000, + context: Optional[str] = None, + debug: bool = False + ) -> TrainingReturn: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + logging.debug(self.policy._model) + # define env and policy + collector_env = self._setup_env_manager(collector_env_num, context, debug) + evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug) + + with task.start(ctx=OnlineRLContext()): + task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) + task.use(EpisodeCollector(self.cfg, self.policy.collect_mode, collector_env)) + # task.use(gae_estimator(self.cfg, self.policy.collect_mode)) + task.use(pg_estimator(self.policy.collect_mode)) + task.use(trainer(self.cfg, self.policy.learn_mode)) + task.use(CkptSaver(policy=self.policy, save_dir=self.checkpoint_save_dir, train_freq=n_iter_save_ckpt)) + task.use( + wandb_online_logger( + metric_list=self.policy.monitor_vars(), + model=self.policy._model, + anonymous=True, + project_name=self.exp_name + ) + ) + task.use(termination_checker(max_env_step=step)) + task.use(final_ctx_saver(name=self.exp_name)) + task.run() + + return TrainingReturn(wandb_url=task.ctx.wandb_url) + + def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> None: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + # define env and policy + env = self.env.clone() + env.seed(self.seed, dynamic_seed=False) + + if enable_save_replay and replay_save_path: + env.enable_save_replay(replay_path=replay_save_path) + elif enable_save_replay: + env.enable_save_replay(replay_path=os.path.join(self.exp_name, 'videos')) + else: + logging.warning('No video would be generated during the deploy.') + + def single_env_forward_wrapper(forward_fn, cuda=True): + + forward_fn = model_wrap(forward_fn, wrapper_name='argmax_sample').forward + + def _forward(obs): + # unsqueeze means add batch dim, i.e. (O, ) -> (1, O) + obs = ttorch.as_tensor(obs).unsqueeze(0) + if cuda and torch.cuda.is_available(): + obs = obs.cuda() + action = forward_fn(obs, mode='compute_actor')["action"] + # squeeze means delete batch dim, i.e. (1, A) -> (A, ) + action = action.squeeze(0).detach().cpu().numpy() + return action + + return _forward + + forward_fn = single_env_forward_wrapper(self.policy._model, self.cfg.policy.cuda) + + # main loop + return_ = 0. 
+ step = 0 + obs = env.reset() + while True: + action = forward_fn(obs) + obs, rew, done, info = env.step(action) + return_ += rew + step += 1 + if done: + break + logging.info(f'PG deploy is finished, final episode return with {step} steps is: {return_}') + + def collect_data( + self, + env_num: int = 8, + save_data_path: Optional[str] = None, + n_sample: Optional[int] = None, + n_episode: Optional[int] = None, + context: Optional[str] = None, + debug: bool = False + ) -> None: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + if n_episode is not None: + raise NotImplementedError + # define env and policy + env = self._setup_env_manager(env_num, context, debug) + + if save_data_path is None: + save_data_path = os.path.join(self.exp_name, 'demo_data') + + # main execution task + with task.start(ctx=OnlineRLContext()): + task.use( + StepCollector( + self.cfg, self.policy.collect_mode, env, random_collect_size=self.cfg.policy.random_collect_size + ) + ) + task.use(offline_data_saver(save_data_path, data_type='hdf5')) + task.run(max_step=1) + logging.info( + f'PG collecting is finished, more than {n_sample} samples are collected and saved in `{save_data_path}`' + ) + + def batch_evaluate( + self, + env_num: int = 4, + n_evaluator_episode: int = 4, + context: Optional[str] = None, + debug: bool = False + ) -> EvalReturn: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + # define env and policy + env = self._setup_env_manager(env_num, context, debug) + + evaluate_cfg = self.cfg + evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode + + # main execution task + with task.start(ctx=OnlineRLContext()): + task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, env)) + task.run(max_step=1) + + return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) + + def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: bool = False) -> BaseEnvManagerV2: + if debug: + env_cls = BaseEnvManagerV2 + manager_cfg = env_cls.default_config() + else: + env_cls = SubprocessEnvManagerV2 + manager_cfg = env_cls.default_config() + if context is not None: + manager_cfg.context = context + return env_cls([self.env.clone for _ in range(env_num)], manager_cfg) + + @property + def best(self): + best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") + # Load best model if it exists + if os.path.exists(best_model_file_path): + policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu")) + self.policy.learn_mode.load_state_dict(policy_state_dict) + return self diff --git a/ding/framework/middleware/functional/__init__.py b/ding/framework/middleware/functional/__init__.py index bf5e965cae..c87832a856 100644 --- a/ding/framework/middleware/functional/__init__.py +++ b/ding/framework/middleware/functional/__init__.py @@ -9,7 +9,7 @@ # algorithm from .explorer import eps_greedy_handler, eps_greedy_masker -from .advantage_estimator import gae_estimator, ppof_adv_estimator +from .advantage_estimator import gae_estimator, ppof_adv_estimator, pg_estimator from .enhancer import reward_estimator, her_data_enhancer, nstep_reward_enhancer from .timer import epoch_timer diff --git a/ding/framework/middleware/functional/advantage_estimator.py b/ding/framework/middleware/functional/advantage_estimator.py index 53eb1d015e..bb6347bc39 100644 --- a/ding/framework/middleware/functional/advantage_estimator.py +++ b/ding/framework/middleware/functional/advantage_estimator.py @@ -79,3 +79,16 @@ def _estimator(ctx: 
"OnlineRLContext"): ctx.train_data = data return _estimator + + +def pg_estimator(policy: Policy) -> Callable: + + def _estimator(ctx: "OnlineRLContext"): + train_data = [] + for episode in ctx.episodes: + data = ttorch_collate(episode) + data = policy.get_train_sample(data) + train_data.append(data) + ctx.train_data = ttorch.cat(train_data, dim=0) + + return _estimator diff --git a/ding/framework/middleware/functional/logger.py b/ding/framework/middleware/functional/logger.py index c770d74c6a..e9258df928 100644 --- a/ding/framework/middleware/functional/logger.py +++ b/ding/framework/middleware/functional/logger.py @@ -326,10 +326,10 @@ def wandb_offline_logger( # Initialize wandb with default settings # Settings can be covered by calling wandb.init() at the top of the script if anonymous: - wandb.init(anonymous="must") + wandb.init(project=project_name, reinit=True, anonymous="must") else: - wandb.init() - if cfg == 'default': + wandb.init(project=project_name, reinit=True) + if cfg is None: cfg = EasyDict( dict( gradient_logger=False, @@ -339,9 +339,16 @@ def wandb_offline_logger( return_logger=False, ) ) + else: + if not isinstance(cfg, EasyDict): + cfg = EasyDict(cfg) + assert set(cfg.keys() + ) == set(["gradient_logger", "plot_logger", "video_logger", "action_logger", "return_logger"]) + assert all(value in [True, False] for value in cfg.values()) + # The visualizer is called to save the replay of the simulation # which will be uploaded to wandb later - if env is not None: + if env is not None and cfg.video_logger is True and record_path is not None: env.enable_save_replay(replay_path=record_path) if cfg.gradient_logger: wandb.watch(model) @@ -350,6 +357,8 @@ def wandb_offline_logger( "If you want to use wandb to visualize the gradient, please set gradient_logger = True in the config." ) + first_plot = True + def _vis_dataset(datasetpath: str): assert os.path.splitext(datasetpath)[-1] in ['.pkl', '.h5', '.hdf5'] if os.path.splitext(datasetpath)[-1] == '.pkl': @@ -399,23 +408,33 @@ def _vis_dataset(datasetpath: str): _vis_dataset(dataset_path) def _plot(ctx: "OnlineRLContext"): + nonlocal first_plot + if first_plot: + first_plot = False + ctx.wandb_url = wandb.run.get_project_url() + info_for_logging = {} - if not cfg.plot_logger: + if cfg.plot_logger: + for metric in metric_list: + if isinstance(ctx.train_output, Dict) and metric in ctx.train_output: + if isinstance(ctx.train_output[metric], torch.Tensor): + info_for_logging.update({metric: ctx.train_output[metric].cpu().detach().numpy()}) + else: + info_for_logging.update({metric: ctx.train_output[metric]}) + elif isinstance(ctx.train_output, List) and len(ctx.train_output) > 0 and metric in ctx.train_output[0]: + metric_value_list = [] + for item in ctx.train_output: + if isinstance(item[metric], torch.Tensor): + metric_value_list.append(item[metric].cpu().detach().numpy()) + else: + metric_value_list.append(item[metric]) + metric_value = np.mean(metric_value_list) + info_for_logging.update({metric: metric_value}) + else: one_time_warning( "If you want to use wandb to visualize the result, please set plot_logger = True in the config." 
) - return - for metric in metric_list: - if metric in ctx.train_output[0]: - metric_value_list = [] - for item in ctx.train_output: - if isinstance(item[metric], torch.Tensor): - metric_value_list.append(item[metric].cpu().detach().numpy()) - else: - metric_value_list.append(item[metric]) - metric_value = np.mean(metric_value_list) - info_for_logging.update({metric: metric_value}) if ctx.eval_value != -np.inf: info_for_logging.update( @@ -441,9 +460,8 @@ def _plot(ctx: "OnlineRLContext"): video_path = os.path.join(record_path, file_list[-2]) info_for_logging.update({"video": wandb.Video(video_path, format="mp4")}) - action_path = os.path.join(record_path, (str(ctx.env_step) + "_action.gif")) - return_path = os.path.join(record_path, (str(ctx.env_step) + "_return.gif")) if cfg.action_logger: + action_path = os.path.join(record_path, (str(ctx.env_step) + "_action.gif")) if all(['logit' in v for v in eval_output]) or hasattr(eval_output, "logit"): if isinstance(eval_output, tnp.ndarray): action_prob = softmax(eval_output.logit) @@ -472,6 +490,7 @@ def _plot(ctx: "OnlineRLContext"): info_for_logging.update({"actions_of_trajectory_{}".format(i): fig}) if cfg.return_logger: + return_path = os.path.join(record_path, (str(ctx.env_step) + "_return.gif")) fig, ax = plt.subplots() ax = plt.gca() ax.set_ylim([0, 1]) @@ -482,7 +501,8 @@ def _plot(ctx: "OnlineRLContext"): ani.save(return_path, writer='pillow') info_for_logging.update({"return distribution": wandb.Video(return_path, format="gif")}) - wandb.log(data=info_for_logging, step=ctx.env_step) + if bool(info_for_logging): + wandb.log(data=info_for_logging, step=ctx.env_step) plt.clf() return _plot diff --git a/ding/framework/middleware/functional/trainer.py b/ding/framework/middleware/functional/trainer.py index b71a3bd089..57443c76f8 100644 --- a/ding/framework/middleware/functional/trainer.py +++ b/ding/framework/middleware/functional/trainer.py @@ -29,19 +29,20 @@ def _train(ctx: Union["OnlineRLContext", "OfflineRLContext"]): if ctx.train_data is None: return - data = ctx.train_data train_output = policy.forward(ctx.train_data) if ctx.train_iter % cfg.policy.learn.learner.hook.log_show_after_iter == 0: + if isinstance(train_output, list): + train_output_loss = np.mean([item['total_loss'] for item in train_output]) + else: + train_output_loss = train_output['total_loss'] if isinstance(ctx, OnlineRLContext): logging.info( 'Training: Train Iter({})\tEnv Step({})\tLoss({:.3f})'.format( - ctx.train_iter, ctx.env_step, train_output['total_loss'] + ctx.train_iter, ctx.env_step, train_output_loss ) ) elif isinstance(ctx, OfflineRLContext): - logging.info( - 'Training: Train Iter({})\tLoss({:.3f})'.format(ctx.train_iter, train_output['total_loss']) - ) + logging.info('Training: Train Iter({})\tLoss({:.3f})'.format(ctx.train_iter, train_output_loss)) else: raise TypeError("not supported ctx type: {}".format(type(ctx))) ctx.train_iter += 1 diff --git a/ding/policy/pg.py b/ding/policy/pg.py index 1e9a500ce3..93a7fd3292 100644 --- a/ding/policy/pg.py +++ b/ding/policy/pg.py @@ -1,6 +1,7 @@ from typing import List, Dict, Any, Tuple, Union from collections import namedtuple import torch +import treetensor as ttorch from ding.rl_utils import get_gae_with_default_last_value, get_train_sample from ding.torch_utils import Adam, to_device @@ -169,16 +170,26 @@ def _get_train_sample(self, data: list) -> Union[None, List[Any]]: Returns: - samples (:obj:`dict`): The training samples generated """ - assert data[-1]['done'] is True, "PG needs a complete epsiode" + 
assert data[-1]['done'] == True, "PG needs a complete episode" if self._cfg.learn.ignore_done: raise NotImplementedError R = 0. - for i in reversed(range(len(data))): - R = self._gamma * R + data[i]['reward'] - data[i]['return'] = R - return get_train_sample(data, self._unroll_len) + if isinstance(data, list): + for i in reversed(range(len(data))): + R = self._gamma * R + data[i]['reward'] + data[i]['return'] = R + return get_train_sample(data, self._unroll_len) + elif isinstance(data, ttorch.Tensor): + data_size = data['done'].shape[0] + data['return'] = ttorch.Tensor([0.0 for i in range(data_size)]) + for i in reversed(range(data_size)): + R = self._gamma * R + data['reward'][i] + data['return'][i] = R + return get_train_sample(data, self._unroll_len) + else: + raise ValueError def _init_eval(self) -> None: pass @@ -207,3 +218,6 @@ def _forward_eval(self, data: dict) -> dict: def _monitor_vars_learn(self) -> List[str]: return super()._monitor_vars_learn() + ['policy_loss', 'entropy_loss', 'return_abs_max', 'grad_norm'] + + def monitor_vars(self) -> List[str]: + return self._monitor_vars_learn() diff --git a/ding/utils/default_helper.py b/ding/utils/default_helper.py index 6b53d568f1..cce57aea11 100644 --- a/ding/utils/default_helper.py +++ b/ding/utils/default_helper.py @@ -470,7 +470,7 @@ def split_data_generator(data: dict, split_size: int, shuffle: bool = True) -> d # assert len(set(length)) == 1, "data values must have the same length: {}".format(length) # if continuous action, data['logit'] is list of length 2 length = length[0] - assert split_size >= 1 and split_size <= length, f'{split_size}_{length}' + assert split_size >= 1 if shuffle: indices = np.random.permutation(length) else: From 1941951f70fda71868df9602ac05fd313e78a4e3 Mon Sep 17 00:00:00 2001 From: zhangpaipai <1124321458@qq.com> Date: Fri, 28 Apr 2023 13:12:50 +0800 Subject: [PATCH 091/244] add pendulum config --- ding/bonus/config.py | 346 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 346 insertions(+) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index e63e7b11d6..60ceb8c1d4 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -360,6 +360,121 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ), ) ) + elif env == 'bipedalwalker': + cfg.update( + dict( + exp_name='Bipedalwalker-v3-TD3', + seed=0, + env=dict( + env_id='BipedalWalker-v3', + collector_env_num=1, + evaluator_env_num=5, + # (bool) Scale output action into legal range.
+ act_scale=True, + n_evaluator_episode=5, + stop_value=300, + rew_clip=True, + # The path to save the game replay + replay_path=None, + ), + policy=dict( + cuda=True, + priority=False, + model=dict( + obs_shape=24, + action_shape=4, + twin_critic=True, + actor_head_hidden_size=400, + critic_head_hidden_size=400, + action_space='regression', + ), + learn=dict( + update_per_collect=4, + discount_factor=0.99, + batch_size=128, + learning_rate_actor=0.001, + learning_rate_critic=0.001, + target_theta=0.005, + ignore_done=False, + actor_update_freq=2, + noise=True, + noise_sigma=0.2, + noise_range=dict( + min=-0.5, + max=0.5, + ), + ), + collect=dict( + n_sample=256, + noise_sigma=0.1, + collector=dict(collect_print_freq=1000, ), + ), + eval=dict(evaluator=dict(eval_freq=100, ), ), + other=dict(replay_buffer=dict(replay_buffer_size=50000, ), ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) + elif env == 'pendulum': + cfg.update( + dict( + exp_name='Pendulum-v1-TD3', + seed=0, + env=dict( + collector_env_num=8, + evaluator_env_num=5, + # (bool) Scale output action into legal range. + act_scale=True, + n_evaluator_episode=5, + stop_value=-250, + ), + policy=dict( + cuda=False, + priority=False, + random_collect_size=800, + model=dict( + obs_shape=3, + action_shape=1, + twin_critic=True, + action_space='regression', + ), + learn=dict( + update_per_collect=2, + batch_size=128, + learning_rate_actor=0.001, + learning_rate_critic=0.001, + ignore_done=True, + actor_update_freq=2, + noise=True, + noise_sigma=0.1, + noise_range=dict( + min=-0.5, + max=0.5, + ), + ), + collect=dict( + n_sample=48, + noise_sigma=0.1, + collector=dict(collect_print_freq=1000, ), + ), + eval=dict(evaluator=dict(eval_freq=100, ), ), + other=dict(replay_buffer=dict(replay_buffer_size=20000, ), ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) else: raise KeyError("not supported env type: {}".format(env)) elif algorithm == 'DDPG': @@ -572,6 +687,60 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ), ) ) + elif env == 'bipedalwalker': + pass + elif env == 'pendulum': + cfg.update( + dict( + exp_name='Pendulum-v1-DDPG', + seed=0, + env=dict( + collector_env_num=8, + evaluator_env_num=5, + # (bool) Scale output action into legal range. 
+ act_scale=True, + n_evaluator_episode=5, + stop_value=-250, + ), + policy=dict( + cuda=False, + priority=False, + random_collect_size=800, + model=dict( + obs_shape=3, + action_shape=1, + twin_critic=False, + action_space='regression', + ), + learn=dict( + update_per_collect=2, + batch_size=128, + learning_rate_actor=0.001, + learning_rate_critic=0.001, + ignore_done=True, + actor_update_freq=1, + noise=False, + ), + collect=dict( + n_sample=48, + noise_sigma=0.1, + collector=dict(collect_print_freq=1000, ), + ), + eval=dict(evaluator=dict(eval_freq=100, )), + other=dict(replay_buffer=dict( + replay_buffer_size=20000, + max_use=16, + ), ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) else: raise KeyError("not supported env type: {}".format(env)) elif algorithm == 'SAC': @@ -885,6 +1054,169 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ), ) ) + elif env == 'Pong': + cfg.update( + dict( + exp_name='Pong-v4-C51', + seed=0, + env=dict( + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=20, + env_id='Pong-v4', + #'ALE/Pong-v5' is available. But special setting is needed after gym make. + frame_stack=4, + ), + policy=dict( + cuda=True, + priority=False, + model=dict( + obs_shape=[4, 84, 84], + action_shape=6, + encoder_hidden_size_list=[128, 128, 512], + v_min=-10, + v_max=10, + n_atom=51, + ), + nstep=3, + discount_factor=0.99, + learn=dict( + update_per_collect=10, + batch_size=32, + learning_rate=0.0001, + target_update_freq=500, + ), + collect=dict(n_sample=100, ), + eval=dict(evaluator=dict(eval_freq=4000, )), + other=dict( + eps=dict( + type='exp', + start=1., + end=0.05, + decay=250000, + ), + replay_buffer=dict(replay_buffer_size=100000, ), + ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) + elif env == 'SpaceInvaders': + cfg.update( + dict( + exp_name='SpaceInvaders-v4-C51', + seed=0, + env=dict( + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=10000000000, + env_id='SpaceInvaders-v4', + #'ALE/SpaceInvaders-v5' is available. But special setting is needed after gym make. + frame_stack=4, + manager=dict(shared_memory=False, ) + ), + policy=dict( + cuda=True, + priority=False, + model=dict( + obs_shape=[4, 84, 84], + action_shape=6, + encoder_hidden_size_list=[128, 128, 512], + v_min=-10, + v_max=10, + n_atom=51, + ), + nstep=3, + discount_factor=0.99, + learn=dict( + update_per_collect=10, + batch_size=32, + learning_rate=0.0001, + target_update_freq=500, + ), + collect=dict(n_sample=100, ), + eval=dict(evaluator=dict(eval_freq=4000, )), + other=dict( + eps=dict( + type='exp', + start=1., + end=0.05, + decay=1000000, + ), + replay_buffer=dict(replay_buffer_size=400000, ), + ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) + elif env == 'Qbert': + cfg.update( + dict( + exp_name='Qbert-v4-C51', + seed=0, + env=dict( + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=30000, + env_id='Qbert-v4', + #'ALE/Qbert-v5' is available. But special setting is needed after gym make. 
+ frame_stack=4 + ), + policy=dict( + cuda=True, + priority=True, + model=dict( + obs_shape=[4, 84, 84], + action_shape=6, + encoder_hidden_size_list=[128, 128, 512], + v_min=-10, + v_max=10, + n_atom=51, + ), + nstep=3, + discount_factor=0.99, + learn=dict( + update_per_collect=10, + batch_size=32, + learning_rate=0.0001, + target_update_freq=500, + ), + collect=dict(n_sample=100, ), + eval=dict(evaluator=dict(eval_freq=4000, )), + other=dict( + eps=dict( + type='exp', + start=1., + end=0.05, + decay=1000000, + ), + replay_buffer=dict(replay_buffer_size=400000, ), + ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) else: raise KeyError("not supported env type: {}".format(env)) else: @@ -900,6 +1232,8 @@ def get_instance_env(env: str) -> BaseEnv: return DingEnvWrapper(gym.make('LunarLander-v2', continuous=True)) elif env == 'bipedalwalker': return DingEnvWrapper(gym.make('BipedalWalker-v3'), cfg={'act_scale': True}) + elif env == 'pendulum': + return DingEnvWrapper(gym.make('Pendulum-v1')) elif env == 'acrobot': return DingEnvWrapper(gym.make('Acrobot-v1')) elif env == 'rocket_landing': @@ -989,6 +1323,18 @@ def get_instance_env(env: str) -> BaseEnv: 'env_wrapper': 'atari_default', }) return DingEnvWrapper(gym.make("SpaceInvaders-v4"), cfg=cfg) + elif env == "Pong": + cfg = EasyDict({ + 'env_id': "Pong-v4", + 'env_wrapper': 'atari_default', + }) + return DingEnvWrapper(gym.make("Pong-v4"), cfg=cfg) + elif env == "Qbert": + cfg = EasyDict({ + 'env_id': "Qbert-v4", + 'env_wrapper': 'atari_default', + }) + return DingEnvWrapper(gym.make("Qbert-v4"), cfg=cfg) elif env in ['atari_qbert', 'atari_kangaroo', 'atari_bowling', 'atari_breakout', 'atari_spaceinvader', 'atari_gopher']: from dizoo.atari.envs.atari_env import AtariEnv From af7272aa441b11186c5b0de5494360d2e32f6611 Mon Sep 17 00:00:00 2001 From: zhangpaipai <1124321458@qq.com> Date: Fri, 28 Apr 2023 13:51:54 +0800 Subject: [PATCH 092/244] add c51_atari td3_pendulum,bipedalwalker ddpg_pendulum --- ding/bonus/c51.py | 3 +++ ding/bonus/config.py | 2 +- ding/bonus/ddpg.py | 2 ++ ding/bonus/td3.py | 2 ++ 4 files changed, 8 insertions(+), 1 deletion(-) diff --git a/ding/bonus/c51.py b/ding/bonus/c51.py index b91b735d97..aa7219be05 100644 --- a/ding/bonus/c51.py +++ b/ding/bonus/c51.py @@ -23,6 +23,9 @@ class C51Agent: supported_env_list = [ 'lunarlander_discrete', + 'Pong', + 'SpaceInvaders', + 'Qbert', ] algorithm = 'C51' diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 60ceb8c1d4..398eccc64f 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -1233,7 +1233,7 @@ def get_instance_env(env: str) -> BaseEnv: elif env == 'bipedalwalker': return DingEnvWrapper(gym.make('BipedalWalker-v3'), cfg={'act_scale': True}) elif env == 'pendulum': - return DingEnvWrapper(gym.make('Pendulum-v1')) + return DingEnvWrapper(gym.make('Pendulum-v1'), cfg={'act_scale': True}) elif env == 'acrobot': return DingEnvWrapper(gym.make('Acrobot-v1')) elif env == 'rocket_landing': diff --git a/ding/bonus/ddpg.py b/ding/bonus/ddpg.py index 2166a787a7..f6bb95c175 100644 --- a/ding/bonus/ddpg.py +++ b/ding/bonus/ddpg.py @@ -26,6 +26,8 @@ class DDPGAgent: 'HalfCheetah', 'Walker2d', 'lunarlander_continuous', + 'bipedalwalker', + 'pendulum', ] algorithm = 'DDPG' diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index f265d51a33..ef1babe3f8 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -26,6 +26,8 @@ class TD3Agent: 'HalfCheetah', 'Walker2d', 
'lunarlander_continuous', + 'bipedalwalker', + 'pendulum', ] algorithm = 'TD3' From eafeada62c4adbac3cc4218a6dd63e4bb5dd6432 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 28 Apr 2023 05:54:44 +0000 Subject: [PATCH 093/244] polish code --- ding/bonus/config.py | 107 ++++++++++++++++++++++++++++++++++++++++++- ding/bonus/pg.py | 15 ++++-- ding/bonus/sac.py | 1 + 3 files changed, 117 insertions(+), 6 deletions(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 2dfa64ebe6..fb2a0e3ebf 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -485,8 +485,109 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ), ) ) - - pass + elif env == 'bipedalwalker': + cfg.update( + dict( + exp_name='BipedalWalker-v3-SAC', + seed=0, + env=dict( + env_id='BipedalWalker-v3', + collector_env_num=8, + evaluator_env_num=8, + # (bool) Scale output action into legal range. + act_scale=True, + n_evaluator_episode=8, + stop_value=300, + rew_clip=True, + ), + policy=dict( + cuda=False, + priority=False, + random_collect_size=1000, + model=dict( + obs_shape=24, + action_shape=4, + twin_critic=True, + action_space='reparameterization', + actor_head_hidden_size=128, + critic_head_hidden_size=128, + ), + learn=dict( + update_per_collect=1, + batch_size=128, + learning_rate_q=0.001, + learning_rate_policy=0.001, + learning_rate_alpha=0.0003, + ignore_done=True, + target_theta=0.005, + discount_factor=0.99, + auto_alpha=True, + value_network=False, + ), + collect=dict( + n_sample=128, + unroll_len=1, + ), + other=dict(replay_buffer=dict(replay_buffer_size=100000, ), ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) + elif env == 'pendulum': + cfg.update( + dict( + exp_name='Pendulum-v1-SAC', + seed=0, + env=dict( + collector_env_num=10, + evaluator_env_num=5, + # (bool) Scale output action into legal range. 
+ act_scale=True, + n_evaluator_episode=5, + stop_value=-250, + ), + policy=dict( + cuda=False, + priority=False, + random_collect_size=1000, + model=dict( + obs_shape=3, + action_shape=1, + twin_critic=True, + action_space='reparameterization', + actor_head_hidden_size=128, + critic_head_hidden_size=128, + ), + learn=dict( + update_per_collect=1, + batch_size=128, + learning_rate_q=0.001, + learning_rate_policy=0.001, + learning_rate_alpha=0.0003, + ignore_done=True, + target_theta=0.005, + discount_factor=0.99, + auto_alpha=True, + ), + collect=dict(n_sample=10, ), + eval=dict(evaluator=dict(eval_freq=100, )), + other=dict(replay_buffer=dict(replay_buffer_size=100000, ), ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) else: raise KeyError("not supported env type: {}".format(env)) elif algorithm == 'DQN': @@ -560,6 +661,8 @@ def get_instance_env(env: str) -> BaseEnv: return DingEnvWrapper(gym.make('LunarLander-v2', continuous=True)) elif env == 'bipedalwalker': return DingEnvWrapper(gym.make('BipedalWalker-v3'), cfg={'act_scale': True}) + elif env == 'pendulum': + return DingEnvWrapper(gym.make('Pendulum-v1')) elif env == 'acrobot': return DingEnvWrapper(gym.make('Acrobot-v1')) elif env == 'rocket_landing': diff --git a/ding/bonus/pg.py b/ding/bonus/pg.py index 749581eb62..84ec13ee0c 100644 --- a/ding/bonus/pg.py +++ b/ding/bonus/pg.py @@ -119,16 +119,23 @@ def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, def single_env_forward_wrapper(forward_fn, cuda=True): - forward_fn = model_wrap(forward_fn, wrapper_name='argmax_sample').forward - def _forward(obs): # unsqueeze means add batch dim, i.e. (O, ) -> (1, O) obs = ttorch.as_tensor(obs).unsqueeze(0) if cuda and torch.cuda.is_available(): obs = obs.cuda() - action = forward_fn(obs, mode='compute_actor')["action"] + output = forward_fn(obs) + if self.policy._cfg.deterministic_eval: + if self.policy._cfg.action_space == 'discrete': + output['action'] = output['logit'].argmax(dim=-1) + elif self.policy._cfg.action_space == 'continuous': + output['action'] = output['logit']['mu'] + else: + raise KeyError("invalid action_space: {}".format(self.policy._cfg.action_space)) + else: + output['action'] = output['dist'].sample() # squeeze means delete batch dim, i.e. 
(1, A) -> (A, ) - action = action.squeeze(0).detach().cpu().numpy() + action = output['action'].squeeze(0).detach().cpu().numpy() return action return _forward diff --git a/ding/bonus/sac.py b/ding/bonus/sac.py index 1491e094c7..2e636a19f7 100644 --- a/ding/bonus/sac.py +++ b/ding/bonus/sac.py @@ -25,6 +25,7 @@ class SACAgent: supported_env_list = [ 'hopper', 'lunarlander_continuous', + 'bipedalwalker', ] algorithm = 'SAC' From 68a738e87139195734ce8f0dec15945738b469b0 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 28 Apr 2023 09:45:34 +0000 Subject: [PATCH 094/244] polish code --- ding/bonus/sac.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ding/bonus/sac.py b/ding/bonus/sac.py index 2e636a19f7..440e528b85 100644 --- a/ding/bonus/sac.py +++ b/ding/bonus/sac.py @@ -26,6 +26,7 @@ class SACAgent: 'hopper', 'lunarlander_continuous', 'bipedalwalker', + 'pendulum', ] algorithm = 'SAC' From fdc6408055a41f646862f14d6edb5d0eaf9cc2fd Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 28 Apr 2023 12:46:15 +0000 Subject: [PATCH 095/244] polish code --- ding/bonus/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index fb2a0e3ebf..6b4b35de45 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -546,10 +546,10 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: seed=0, env=dict( collector_env_num=10, - evaluator_env_num=5, + evaluator_env_num=8, # (bool) Scale output action into legal range. act_scale=True, - n_evaluator_episode=5, + n_evaluator_episode=8, stop_value=-250, ), policy=dict( From 5cf69d643714cf48e8040751d8cc95806fdf3892 Mon Sep 17 00:00:00 2001 From: zhangpaipai <1124321458@qq.com> Date: Fri, 5 May 2023 18:11:21 +0800 Subject: [PATCH 096/244] add bipedalwalker_ddpg_config --- ding/bonus/config.py | 58 +++++++++++++++- .../config/bipedalwalker_ddpg_config.py | 67 +++++++++++++++++++ 2 files changed, 124 insertions(+), 1 deletion(-) create mode 100644 dizoo/box2d/bipedalwalker/config/bipedalwalker_ddpg_config.py diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 4cc91c6e3c..08467039f1 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -732,7 +732,63 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ) ) elif env == 'bipedalwalker': - pass + cfg.update( + dict( + exp_name='Bipedalwalker-v3-DDPG', + seed=0, + env=dict( + env_id='BipedalWalker-v3', + collector_env_num=8, + evaluator_env_num=5, + # (bool) Scale output action into legal range. 
+ act_scale=True, + n_evaluator_episode=5, + stop_value=300, + rew_clip=True, + # The path to save the game replay + replay_path=None, + ), + policy=dict( + cuda=False, + priority=False, + random_collect_size=1200, + model=dict( + obs_shape=24, + action_shape=4, + twin_critic=False, + actor_head_hidden_size=256, + critic_head_hidden_size=256, + action_space='regression', + ), + learn=dict( + update_per_collect=1, + batch_size=128, + learning_rate_actor=0.001, + learning_rate_critic=0.001, + ignore_done=True, + actor_update_freq=1, + noise=False, + ), + collect=dict( + n_sample=16, + noise_sigma=0.1, + collector=dict(collect_print_freq=1000, ), + ), + eval=dict(evaluator=dict(eval_freq=100, )), + other=dict(replay_buffer=dict( + replay_buffer_size=20000, + max_use=16, + ), ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) elif env == 'pendulum': cfg.update( dict( diff --git a/dizoo/box2d/bipedalwalker/config/bipedalwalker_ddpg_config.py b/dizoo/box2d/bipedalwalker/config/bipedalwalker_ddpg_config.py new file mode 100644 index 0000000000..1125faae35 --- /dev/null +++ b/dizoo/box2d/bipedalwalker/config/bipedalwalker_ddpg_config.py @@ -0,0 +1,67 @@ +from easydict import EasyDict + +bipedalwalker_ddpg_config = dict( + exp_name='bipedalwalker_ddpg_seed0', + env=dict( + env_id='BipedalWalker-v3', + collector_env_num=8, + evaluator_env_num=5, + # (bool) Scale output action into legal range. + act_scale=True, + n_evaluator_episode=5, + stop_value=300, + rew_clip=True, + # The path to save the game replay + replay_path=None, + ), + policy=dict( + cuda=False, + priority=False, + random_collect_size=1200, + model=dict( + obs_shape=24, + action_shape=4, + twin_critic=False, + actor_head_hidden_size=256, + critic_head_hidden_size=256, + action_space='regression', + ), + learn=dict( + update_per_collect=1, + batch_size=128, + learning_rate_actor=0.001, + learning_rate_critic=0.001, + ignore_done=True, + actor_update_freq=1, + noise=False, + ), + collect=dict( + n_sample=16, + noise_sigma=0.1, + collector=dict(collect_print_freq=1000, ), + ), + eval=dict(evaluator=dict(eval_freq=100, )), + other=dict(replay_buffer=dict( + replay_buffer_size=20000, + max_use=16, + ), ), + ), +) +bipedalwalker_ddpg_config = EasyDict(bipedalwalker_ddpg_config) +main_config = bipedalwalker_ddpg_config + +bipedalwalker_ddpg_create_config = dict( + env=dict( + type='bipedalwalker', + import_names=['dizoo.box2d.bipedalwalker.envs.bipedalwalker_env'], + ), + env_manager=dict(type='base'), + policy=dict(type='ddpg'), +) +bipedalwalker_ddpg_create_config = EasyDict(bipedalwalker_ddpg_create_config) +create_config = bipedalwalker_ddpg_create_config + +if __name__ == "__main__": + # or you can enter `ding -m serial -c bipedalwalker_ddpg_config.py -s 0` + from ding.entry import serial_pipeline + serial_pipeline([main_config, create_config], seed=0) \ No newline at end of file From ca63569bc2e6032a1a874c922fefe015768faacb Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 9 May 2023 07:20:20 +0000 Subject: [PATCH 097/244] change config --- ding/bonus/c51.py | 17 +++-------------- ding/bonus/config.py | 31 +++++++++++++------------------ ding/policy/c51.py | 4 ++-- 3 files changed, 18 insertions(+), 34 deletions(-) diff --git a/ding/bonus/c51.py b/ding/bonus/c51.py index aa7219be05..564c5b19e4 100644 --- a/ding/bonus/c51.py +++ b/ding/bonus/c51.py @@ -92,23 +92,11 @@ def train( with task.start(ctx=OnlineRLContext()): 
task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) task.use(eps_greedy_handler(self.cfg)) - task.use( - StepCollector( - self.cfg, - self.policy.collect_mode, - collector_env - ) - ) + task.use(StepCollector(self.cfg, self.policy.collect_mode, collector_env)) task.use(nstep_reward_enhancer(self.cfg)) task.use(data_pusher(self.cfg, self.buffer_)) task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_)) - task.use( - CkptSaver( - policy=self.policy, - save_dir=self.checkpoint_save_dir, - train_freq=n_iter_save_ckpt - ) - ) + task.use(CkptSaver(policy=self.policy, save_dir=self.checkpoint_save_dir, train_freq=n_iter_save_ckpt)) task.use( wandb_online_logger( metric_list=self.policy.monitor_vars(), @@ -140,6 +128,7 @@ def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, def single_env_forward_wrapper(forward_fn, cuda=True): forward_fn = model_wrap(forward_fn, wrapper_name='argmax_sample').forward + def _forward(obs): # unsqueeze means add batch dim, i.e. (O, ) -> (1, O) obs = ttorch.as_tensor(obs).unsqueeze(0) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 08467039f1..a905d05d10 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -1046,17 +1046,15 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: env=dict( env_id='BipedalWalker-v3', collector_env_num=8, - evaluator_env_num=8, + evaluator_env_num=5, # (bool) Scale output action into legal range. act_scale=True, - n_evaluator_episode=8, - stop_value=300, + n_evaluator_episode=5, rew_clip=True, ), policy=dict( - cuda=False, - priority=False, - random_collect_size=1000, + cuda=True, + random_collect_size=10000, model=dict( obs_shape=24, action_shape=4, @@ -1066,22 +1064,18 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: critic_head_hidden_size=128, ), learn=dict( - update_per_collect=1, - batch_size=128, - learning_rate_q=0.001, - learning_rate_policy=0.001, + update_per_collect=64, + batch_size=256, + learning_rate_q=0.0003, + learning_rate_policy=0.0003, learning_rate_alpha=0.0003, - ignore_done=True, target_theta=0.005, discount_factor=0.99, auto_alpha=True, - value_network=False, + learner=dict(hook=dict(log_show_after_iter=1000, )) ), - collect=dict( - n_sample=128, - unroll_len=1, - ), - other=dict(replay_buffer=dict(replay_buffer_size=100000, ), ), + collect=dict(n_sample=64, ), + other=dict(replay_buffer=dict(replay_buffer_size=300000, ), ), ), wandb_logger=dict( gradient_logger=True, @@ -1243,7 +1237,8 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: start=0.95, end=0.1, decay=50000, - ), replay_buffer=dict(replay_buffer_size=100000, ) + ), + replay_buffer=dict(replay_buffer_size=100000, ) ), ), wandb_logger=dict( diff --git a/ding/policy/c51.py b/ding/policy/c51.py index e5cd42bf52..dbb106bf42 100644 --- a/ding/policy/c51.py +++ b/ding/policy/c51.py @@ -199,7 +199,7 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: # Only discrete action satisfying len(data['action'])==1 can return this and draw histogram on tensorboard. 
# '[histogram]action_distribution': data['action'], } - + def _monitor_vars_learn(self) -> List[str]: return ['cur_lr', 'total_loss', 'q_value', 'target_q_value'] @@ -266,6 +266,6 @@ def _get_train_sample(self, data: list) -> Union[None, List[Any]]: """ data = get_nstep_return_data(data, self._nstep, gamma=self._gamma) return get_train_sample(data, self._unroll_len) - + def monitor_vars(self) -> List[str]: return ['cur_lr', 'total_loss', 'q_value', 'target_q_value'] From 2e8978b88dc6ca0963b1ead7b843f1492210dca0 Mon Sep 17 00:00:00 2001 From: zhangpaipai <1124321458@qq.com> Date: Thu, 11 May 2023 21:08:36 +0800 Subject: [PATCH 098/244] change bipedalwalker config and noframeskip --- ding/bonus/config.py | 74 +++++++++++++++++++------------------------- 1 file changed, 31 insertions(+), 43 deletions(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index a905d05d10..0d99c9f581 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -411,35 +411,31 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: seed=0, env=dict( env_id='BipedalWalker-v3', - collector_env_num=1, + collector_env_num=8, evaluator_env_num=5, # (bool) Scale output action into legal range. act_scale=True, n_evaluator_episode=5, - stop_value=300, rew_clip=True, - # The path to save the game replay - replay_path=None, ), policy=dict( cuda=True, - priority=False, + random_collect_size=10000, model=dict( obs_shape=24, action_shape=4, twin_critic=True, + action_space='regression', actor_head_hidden_size=400, critic_head_hidden_size=400, - action_space='regression', ), learn=dict( - update_per_collect=4, - discount_factor=0.99, - batch_size=128, - learning_rate_actor=0.001, - learning_rate_critic=0.001, + update_per_collect=64, + batch_size=256, + learning_rate_actor=0.0003, + learning_rate_critic=0.0003, target_theta=0.005, - ignore_done=False, + discount_factor=0.99, actor_update_freq=2, noise=True, noise_sigma=0.2, @@ -447,14 +443,14 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: min=-0.5, max=0.5, ), + learner=dict( + hook=dict(log_show_after_iter=1000, ) + ) ), collect=dict( - n_sample=256, - noise_sigma=0.1, - collector=dict(collect_print_freq=1000, ), + n_sample=64, ), - eval=dict(evaluator=dict(eval_freq=100, ), ), - other=dict(replay_buffer=dict(replay_buffer_size=50000, ), ), + other=dict(replay_buffer=dict(replay_buffer_size=300000, ), ), ), wandb_logger=dict( gradient_logger=True, @@ -743,42 +739,34 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: # (bool) Scale output action into legal range. 
act_scale=True, n_evaluator_episode=5, - stop_value=300, rew_clip=True, - # The path to save the game replay - replay_path=None, ), policy=dict( - cuda=False, - priority=False, - random_collect_size=1200, + cuda=True, + random_collect_size=10000, model=dict( obs_shape=24, action_shape=4, twin_critic=False, - actor_head_hidden_size=256, - critic_head_hidden_size=256, action_space='regression', + actor_head_hidden_size=400, + critic_head_hidden_size=400, ), learn=dict( - update_per_collect=1, - batch_size=128, - learning_rate_actor=0.001, - learning_rate_critic=0.001, - ignore_done=True, - actor_update_freq=1, - noise=False, + update_per_collect=64, + batch_size=256, + learning_rate_actor=0.0003, + learning_rate_critic=0.0003, + target_theta=0.005, + discount_factor=0.99, + learner=dict( + hook=dict(log_show_after_iter=1000, ) + ) ), collect=dict( - n_sample=16, - noise_sigma=0.1, - collector=dict(collect_print_freq=1000, ), + n_sample=64, ), - eval=dict(evaluator=dict(eval_freq=100, )), - other=dict(replay_buffer=dict( - replay_buffer_size=20000, - max_use=16, - ), ), + other=dict(replay_buffer=dict(replay_buffer_size=300000, ), ), ), wandb_logger=dict( gradient_logger=True, @@ -1518,19 +1506,19 @@ def get_instance_env(env: str) -> BaseEnv: 'env_id': "SpaceInvaders-v4", 'env_wrapper': 'atari_default', }) - return DingEnvWrapper(gym.make("SpaceInvaders-v4"), cfg=cfg) + return DingEnvWrapper(gym.make("SpaceInvadersNoFrameskip-v4"), cfg=cfg) elif env == "Pong": cfg = EasyDict({ 'env_id': "Pong-v4", 'env_wrapper': 'atari_default', }) - return DingEnvWrapper(gym.make("Pong-v4"), cfg=cfg) + return DingEnvWrapper(gym.make("PongNoFrameskip-v4"), cfg=cfg) elif env == "Qbert": cfg = EasyDict({ 'env_id': "Qbert-v4", 'env_wrapper': 'atari_default', }) - return DingEnvWrapper(gym.make("Qbert-v4"), cfg=cfg) + return DingEnvWrapper(gym.make("QbertNoFrameskip-v4"), cfg=cfg) elif env in ['atari_qbert', 'atari_kangaroo', 'atari_bowling', 'atari_breakout', 'atari_spaceinvader', 'atari_gopher']: from dizoo.atari.envs.atari_env import AtariEnv From aa3367d4d026f38b5ced86a30eb8b0fa11878e89 Mon Sep 17 00:00:00 2001 From: zhangpaipai <1124321458@qq.com> Date: Mon, 15 May 2023 12:51:11 +0800 Subject: [PATCH 099/244] polish c51-atari name --- ding/bonus/c51.py | 6 +++--- ding/bonus/config.py | 30 +++++++++++++++--------------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/ding/bonus/c51.py b/ding/bonus/c51.py index 564c5b19e4..79c1ed6779 100644 --- a/ding/bonus/c51.py +++ b/ding/bonus/c51.py @@ -23,9 +23,9 @@ class C51Agent: supported_env_list = [ 'lunarlander_discrete', - 'Pong', - 'SpaceInvaders', - 'Qbert', + 'PongNoFrameskip', + 'SpaceInvadersNoFrameskip', + 'QbertNoFrameskip', ] algorithm = 'C51' diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 0d99c9f581..a2be64ce06 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -1238,17 +1238,17 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ), ) ) - elif env == 'Pong': + elif env == 'PongNoFrameskip': cfg.update( dict( - exp_name='Pong-v4-C51', + exp_name='PongNoFrameskip-v4-C51', seed=0, env=dict( collector_env_num=8, evaluator_env_num=8, n_evaluator_episode=8, stop_value=20, - env_id='Pong-v4', + env_id='PongNoFrameskip-v4', #'ALE/Pong-v5' is available. But special setting is needed after gym make. 
frame_stack=4, ), @@ -1292,17 +1292,17 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ), ) ) - elif env == 'SpaceInvaders': + elif env == 'SpaceInvadersNoFrameskip': cfg.update( dict( - exp_name='SpaceInvaders-v4-C51', + exp_name='SpaceInvadersNoFrameskip-v4-C51', seed=0, env=dict( collector_env_num=8, evaluator_env_num=8, n_evaluator_episode=8, stop_value=10000000000, - env_id='SpaceInvaders-v4', + env_id='SpaceInvadersNoFrameskip-v4', #'ALE/SpaceInvaders-v5' is available. But special setting is needed after gym make. frame_stack=4, manager=dict(shared_memory=False, ) @@ -1347,17 +1347,17 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ), ) ) - elif env == 'Qbert': + elif env == 'QbertNoFrameskip': cfg.update( dict( - exp_name='Qbert-v4-C51', + exp_name='QbertNoFrameskip-v4-C51', seed=0, env=dict( collector_env_num=8, evaluator_env_num=8, n_evaluator_episode=8, stop_value=30000, - env_id='Qbert-v4', + env_id='QbertNoFrameskip-v4', #'ALE/Qbert-v5' is available. But special setting is needed after gym make. frame_stack=4 ), @@ -1501,21 +1501,21 @@ def get_instance_env(env: str) -> BaseEnv: env_wrapper='mujoco_default', ) return DingEnvWrapper(gym.make('Walker2d-v3'), cfg=cfg) - elif env == "SpaceInvaders": + elif env == "SpaceInvadersNoFrameskip": cfg = EasyDict({ - 'env_id': "SpaceInvaders-v4", + 'env_id': "SpaceInvadersNoFrameskip-v4", 'env_wrapper': 'atari_default', }) return DingEnvWrapper(gym.make("SpaceInvadersNoFrameskip-v4"), cfg=cfg) - elif env == "Pong": + elif env == "PongNoFrameskip": cfg = EasyDict({ - 'env_id': "Pong-v4", + 'env_id': "PongNoFrameskip-v4", 'env_wrapper': 'atari_default', }) return DingEnvWrapper(gym.make("PongNoFrameskip-v4"), cfg=cfg) - elif env == "Qbert": + elif env == "QbertNoFrameskip": cfg = EasyDict({ - 'env_id': "Qbert-v4", + 'env_id': "QbertNoFrameskip-v4", 'env_wrapper': 'atari_default', }) return DingEnvWrapper(gym.make("QbertNoFrameskip-v4"), cfg=cfg) From 4b7aa509f4dbf0054b1e47500a763218178067f6 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Mon, 15 May 2023 01:59:57 -0700 Subject: [PATCH 100/244] add pong spaceinvaders and qbert for dqn --- ding/bonus/config.py | 151 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index a2be64ce06..b0e784d7fe 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -1181,6 +1181,157 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ), ) ) + elif env == 'Pong': + cfg.update( + dict( + exp_name='Pong-v4-DQN', + seed=0, + env=dict( + env_id='Pong-v4', + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=20, + frame_stack=4, + ), + policy=dict( + cuda=True, + priority=False, + discount_factor=0.99, + nstep=3, + learn=dict( + update_per_collect=10, + batch_size=32, + learning_rate=0.0001, + # Frequency of target network update.
+ target_update_freq=500, + ), + model=dict( + obs_shape=[4, 84, 84], + action_shape=6, + encoder_hidden_size_list=[128, 128, 512], + ), + collect=dict(n_sample=96, ), + other=dict( + eps=dict( + type='exp', + start=1., + end=0.05, + decay=250000, + ), + replay_buffer=dict(replay_buffer_size=100000, ) + ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) + elif env == 'SpaceInvaders': + cfg.update( + dict( + exp_name='SpaceInvaders-v4-DQN', + seed=0, + env=dict( + env_id='SpaceInvadersNoFrameskip-v4', + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + frame_stack=4, + stop_value=2000, + ), + policy=dict( + cuda=True, + priority=False, + discount_factor=0.99, + nstep=3, + learn=dict( + update_per_collect=10, + batch_size=32, + learning_rate=0.0001, + # Frequency of target network update. + target_update_freq=500, + hook=dict(save_ckpt_after_iter=1000000, ) + ), + model=dict( + obs_shape=[4, 84, 84], + action_shape=6, + encoder_hidden_size_list=[128, 128, 512], + ), + collect=dict(n_sample=100, ), + other=dict( + eps=dict( + type='exp', + start=1., + end=0.05, + decay=1000000, + ), + replay_buffer=dict(replay_buffer_size=400000, ) + ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) + elif env == 'Qbert': + cfg.update( + dict( + exp_name='Qbert-v4-DQN', + seed=0, + env=dict( + env_id='Qbert-v4', + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + frame_stack=4, + stop_value=30000, + ), + policy=dict( + cuda=True, + priority=False, + discount_factor=0.99, + nstep=3, + learn=dict( + update_per_collect=10, + batch_size=32, + learning_rate=0.0001, + # Frequency of target network update.
+ target_update_freq=500, + ), + model=dict( + obs_shape=[4, 84, 84], + action_shape=6, + encoder_hidden_size_list=[128, 128, 512], + ), + collect=dict(n_sample=100, ), + other=dict( + eps=dict( + type='exp', + start=1., + end=0.05, + decay=1000000, + ), + replay_buffer=dict(replay_buffer_size=400000, ) + ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) else: raise KeyError("not supported env type: {}".format(env)) elif algorithm == 'C51': From efc807ee6d453769c5606fcf9e929923faab6e25 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 16 May 2023 05:42:07 +0000 Subject: [PATCH 101/244] polish code --- ding/bonus/config.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index b0e784d7fe..1073b4a68b 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -443,13 +443,9 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: min=-0.5, max=0.5, ), - learner=dict( - hook=dict(log_show_after_iter=1000, ) - ) - ), - collect=dict( - n_sample=64, + learner=dict(hook=dict(log_show_after_iter=1000, )) ), + collect=dict(n_sample=64, ), other=dict(replay_buffer=dict(replay_buffer_size=300000, ), ), ), wandb_logger=dict( @@ -759,13 +755,9 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: learning_rate_critic=0.0003, target_theta=0.005, discount_factor=0.99, - learner=dict( - hook=dict(log_show_after_iter=1000, ) - ) - ), - collect=dict( - n_sample=64, + learner=dict(hook=dict(log_show_after_iter=1000, )) ), + collect=dict(n_sample=64, ), other=dict(replay_buffer=dict(replay_buffer_size=300000, ), ), ), wandb_logger=dict( From eed925f2321894377afb37017ebb488cdf420093 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 16 May 2023 08:03:06 +0000 Subject: [PATCH 102/244] polish code; add env mode --- ding/bonus/a2c.py | 21 ++++++++++++++------- ding/bonus/c51.py | 20 ++++++++++++++------ ding/bonus/ddpg.py | 21 ++++++++++++++------- ding/bonus/dqn.py | 20 ++++++++++++++------ ding/bonus/pg.py | 20 ++++++++++++++------ ding/bonus/sac.py | 21 ++++++++++++++------- ding/bonus/td3.py | 21 ++++++++++++++------- 7 files changed, 98 insertions(+), 46 deletions(-) diff --git a/ding/bonus/a2c.py b/ding/bonus/a2c.py index bc0c8cba43..4fff18e44f 100644 --- a/ding/bonus/a2c.py +++ b/ding/bonus/a2c.py @@ -3,7 +3,7 @@ from ditk import logging from easydict import EasyDict import os -import gym +from functools import partial import torch import treetensor.torch as ttorch from ding.framework import task, OnlineRLContext @@ -83,8 +83,8 @@ def train( logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) # define env and policy - collector_env = self._setup_env_manager(collector_env_num, context, debug) - evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug) + collector_env = self._setup_env_manager(collector_env_num, context, debug, 'collector') + evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug, 'evaluator') with task.start(ctx=OnlineRLContext()): task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) @@ -165,7 +165,7 @@ def collect_data( if n_episode is not None: raise NotImplementedError # define env and policy - env = self._setup_env_manager(env_num, context, debug) + env = self._setup_env_manager(env_num, context, debug, 'collector') if save_data_path is None: save_data_path = os.path.join(self.exp_name, 
'demo_data') @@ -193,7 +193,7 @@ def batch_evaluate( if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy - env = self._setup_env_manager(env_num, context, debug) + env = self._setup_env_manager(env_num, context, debug, 'evaluator') evaluate_cfg = self.cfg evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode @@ -205,7 +205,14 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) - def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: bool = False) -> BaseEnvManagerV2: + def _setup_env_manager( + self, + env_num: int, + context: Optional[str] = None, + debug: bool = False, + caller: str = 'collector' + ) -> BaseEnvManagerV2: + assert caller in ['evaluator', 'collector'] if debug: env_cls = BaseEnvManagerV2 manager_cfg = env_cls.default_config() @@ -214,7 +221,7 @@ def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: manager_cfg = env_cls.default_config() if context is not None: manager_cfg.context = context - return env_cls([self.env.clone for _ in range(env_num)], manager_cfg) + return env_cls([partial(self.env.clone, caller) for _ in range(env_num)], manager_cfg) @property def best(self): diff --git a/ding/bonus/c51.py b/ding/bonus/c51.py index 79c1ed6779..527c728fc8 100644 --- a/ding/bonus/c51.py +++ b/ding/bonus/c51.py @@ -3,6 +3,7 @@ from ditk import logging from easydict import EasyDict import os +from functools import partial import torch import treetensor.torch as ttorch from ding.framework import task, OnlineRLContext @@ -86,8 +87,8 @@ def train( logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) # define env and policy - collector_env = self._setup_env_manager(collector_env_num, context, debug) - evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug) + collector_env = self._setup_env_manager(collector_env_num, context, debug, 'collector') + evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug, 'evaluator') with task.start(ctx=OnlineRLContext()): task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) @@ -170,7 +171,7 @@ def collect_data( if n_episode is not None: raise NotImplementedError # define env and policy - env = self._setup_env_manager(env_num, context, debug) + env = self._setup_env_manager(env_num, context, debug, 'collector') if save_data_path is None: save_data_path = os.path.join(self.exp_name, 'demo_data') @@ -198,7 +199,7 @@ def batch_evaluate( if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy - env = self._setup_env_manager(env_num, context, debug) + env = self._setup_env_manager(env_num, context, debug, 'evaluator') evaluate_cfg = self.cfg evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode @@ -210,7 +211,14 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) - def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: bool = False) -> BaseEnvManagerV2: + def _setup_env_manager( + self, + env_num: int, + context: Optional[str] = None, + debug: bool = False, + caller: str = 'collector' + ) -> BaseEnvManagerV2: + assert caller in ['evaluator', 'collector'] if debug: env_cls = BaseEnvManagerV2 manager_cfg = env_cls.default_config() @@ -219,7 +227,7 @@ def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: manager_cfg = env_cls.default_config() if context is not None: manager_cfg.context = 
context - return env_cls([self.env.clone for _ in range(env_num)], manager_cfg) + return env_cls([partial(self.env.clone, caller) for _ in range(env_num)], manager_cfg) @property def best(self): diff --git a/ding/bonus/ddpg.py b/ding/bonus/ddpg.py index f6bb95c175..03bdd7213c 100644 --- a/ding/bonus/ddpg.py +++ b/ding/bonus/ddpg.py @@ -3,7 +3,7 @@ from ditk import logging from easydict import EasyDict import os -import gym +from functools import partial import torch import treetensor.torch as ttorch from ding.framework import task, OnlineRLContext @@ -89,8 +89,8 @@ def train( logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) # define env and policy - collector_env = self._setup_env_manager(collector_env_num, context, debug) - evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug) + collector_env = self._setup_env_manager(collector_env_num, context, debug, 'collector') + evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug, 'evaluator') with task.start(ctx=OnlineRLContext()): task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) @@ -176,7 +176,7 @@ def collect_data( if n_episode is not None: raise NotImplementedError # define env and policy - env = self._setup_env_manager(env_num, context, debug) + env = self._setup_env_manager(env_num, context, debug, 'collector') if save_data_path is None: save_data_path = os.path.join(self.exp_name, 'demo_data') @@ -204,7 +204,7 @@ def batch_evaluate( if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy - env = self._setup_env_manager(env_num, context, debug) + env = self._setup_env_manager(env_num, context, debug, 'evaluator') evaluate_cfg = self.cfg evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode @@ -216,7 +216,14 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) - def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: bool = False) -> BaseEnvManagerV2: + def _setup_env_manager( + self, + env_num: int, + context: Optional[str] = None, + debug: bool = False, + caller: str = 'collector' + ) -> BaseEnvManagerV2: + assert caller in ['evaluator', 'collector'] if debug: env_cls = BaseEnvManagerV2 manager_cfg = env_cls.default_config() @@ -225,7 +232,7 @@ def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: manager_cfg = env_cls.default_config() if context is not None: manager_cfg.context = context - return env_cls([self.env.clone for _ in range(env_num)], manager_cfg) + return env_cls([partial(self.env.clone, caller) for _ in range(env_num)], manager_cfg) @property def best(self): diff --git a/ding/bonus/dqn.py b/ding/bonus/dqn.py index b48ca770f7..d03c976525 100644 --- a/ding/bonus/dqn.py +++ b/ding/bonus/dqn.py @@ -3,6 +3,7 @@ from ditk import logging from easydict import EasyDict import os +from functools import partial import torch import treetensor.torch as ttorch from ding.framework import task, OnlineRLContext @@ -83,8 +84,8 @@ def train( logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) # define env and policy - collector_env = self._setup_env_manager(collector_env_num, context, debug) - evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug) + collector_env = self._setup_env_manager(collector_env_num, context, debug, 'collector') + evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug, 'evaluator') with task.start(ctx=OnlineRLContext()): 
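# middleware registered with task.use() below runs in order on every training iteration; evaluation is registered first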
task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) @@ -175,7 +176,7 @@ def collect_data( if n_episode is not None: raise NotImplementedError # define env and policy - env = self._setup_env_manager(env_num, context, debug) + env = self._setup_env_manager(env_num, context, debug, 'collector') if save_data_path is None: save_data_path = os.path.join(self.exp_name, 'demo_data') @@ -203,7 +204,7 @@ def batch_evaluate( if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy - env = self._setup_env_manager(env_num, context, debug) + env = self._setup_env_manager(env_num, context, debug, 'evaluator') evaluate_cfg = self.cfg evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode @@ -215,7 +216,14 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) - def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: bool = False) -> BaseEnvManagerV2: + def _setup_env_manager( + self, + env_num: int, + context: Optional[str] = None, + debug: bool = False, + caller: str = 'collector' + ) -> BaseEnvManagerV2: + assert caller in ['evaluator', 'collector'] if debug: env_cls = BaseEnvManagerV2 manager_cfg = env_cls.default_config() @@ -224,7 +232,7 @@ def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: manager_cfg = env_cls.default_config() if context is not None: manager_cfg.context = context - return env_cls([self.env.clone for _ in range(env_num)], manager_cfg) + return env_cls([partial(self.env.clone, caller) for _ in range(env_num)], manager_cfg) @property def best(self): diff --git a/ding/bonus/pg.py b/ding/bonus/pg.py index 84ec13ee0c..4044880151 100644 --- a/ding/bonus/pg.py +++ b/ding/bonus/pg.py @@ -2,6 +2,7 @@ from ditk import logging from easydict import EasyDict import os +from functools import partial import torch import treetensor.torch as ttorch from ding.framework import task, OnlineRLContext @@ -79,8 +80,8 @@ def train( logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) # define env and policy - collector_env = self._setup_env_manager(collector_env_num, context, debug) - evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug) + collector_env = self._setup_env_manager(collector_env_num, context, debug, 'collector') + evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug, 'evaluator') with task.start(ctx=OnlineRLContext()): task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) @@ -169,7 +170,7 @@ def collect_data( if n_episode is not None: raise NotImplementedError # define env and policy - env = self._setup_env_manager(env_num, context, debug) + env = self._setup_env_manager(env_num, context, debug, 'collector') if save_data_path is None: save_data_path = os.path.join(self.exp_name, 'demo_data') @@ -197,7 +198,7 @@ def batch_evaluate( if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy - env = self._setup_env_manager(env_num, context, debug) + env = self._setup_env_manager(env_num, context, debug, 'evaluator') evaluate_cfg = self.cfg evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode @@ -209,7 +210,14 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) - def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: bool = False) -> BaseEnvManagerV2: + def _setup_env_manager( + self, + env_num: int, + context: Optional[str] = None, 
+ debug: bool = False, + caller: str = 'collector' + ) -> BaseEnvManagerV2: + assert caller in ['evaluator', 'collector'] if debug: env_cls = BaseEnvManagerV2 manager_cfg = env_cls.default_config() @@ -218,7 +226,7 @@ def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: manager_cfg = env_cls.default_config() if context is not None: manager_cfg.context = context - return env_cls([self.env.clone for _ in range(env_num)], manager_cfg) + return env_cls([partial(self.env.clone, caller) for _ in range(env_num)], manager_cfg) @property def best(self): diff --git a/ding/bonus/sac.py b/ding/bonus/sac.py index bab515552d..d4367d212f 100644 --- a/ding/bonus/sac.py +++ b/ding/bonus/sac.py @@ -3,7 +3,7 @@ from ditk import logging from easydict import EasyDict import os -import gym +from functools import partial import torch import treetensor.torch as ttorch from ding.framework import task, OnlineRLContext @@ -90,8 +90,8 @@ def train( logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) # define env and policy - collector_env = self._setup_env_manager(collector_env_num, context, debug) - evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug) + collector_env = self._setup_env_manager(collector_env_num, context, debug, 'collector') + evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug, 'evaluator') with task.start(ctx=OnlineRLContext()): task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) @@ -178,7 +178,7 @@ def collect_data( if n_episode is not None: raise NotImplementedError # define env and policy - env = self._setup_env_manager(env_num, context, debug) + env = self._setup_env_manager(env_num, context, debug, 'collector') if save_data_path is None: save_data_path = os.path.join(self.exp_name, 'demo_data') @@ -206,7 +206,7 @@ def batch_evaluate( if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy - env = self._setup_env_manager(env_num, context, debug) + env = self._setup_env_manager(env_num, context, debug, 'evaluator') evaluate_cfg = self.cfg evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode @@ -218,7 +218,14 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) - def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: bool = False) -> BaseEnvManagerV2: + def _setup_env_manager( + self, + env_num: int, + context: Optional[str] = None, + debug: bool = False, + caller: str = 'collector' + ) -> BaseEnvManagerV2: + assert caller in ['evaluator', 'collector'] if debug: env_cls = BaseEnvManagerV2 manager_cfg = env_cls.default_config() @@ -227,7 +234,7 @@ def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: manager_cfg = env_cls.default_config() if context is not None: manager_cfg.context = context - return env_cls([self.env.clone for _ in range(env_num)], manager_cfg) + return env_cls([partial(self.env.clone, caller) for _ in range(env_num)], manager_cfg) @property def best(self): diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index ef1babe3f8..70deb52c74 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -3,7 +3,7 @@ from ditk import logging from easydict import EasyDict import os -import gym +from functools import partial import torch import treetensor.torch as ttorch from ding.framework import task, OnlineRLContext @@ -89,8 +89,8 @@ def train( logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) # define 
env and policy - collector_env = self._setup_env_manager(collector_env_num, context, debug) - evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug) + collector_env = self._setup_env_manager(collector_env_num, context, debug, 'collector') + evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug, 'evaluator') with task.start(ctx=OnlineRLContext()): task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) @@ -176,7 +176,7 @@ def collect_data( if n_episode is not None: raise NotImplementedError # define env and policy - env = self._setup_env_manager(env_num, context, debug) + env = self._setup_env_manager(env_num, context, debug, 'collector') if save_data_path is None: save_data_path = os.path.join(self.exp_name, 'demo_data') @@ -204,7 +204,7 @@ def batch_evaluate( if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy - env = self._setup_env_manager(env_num, context, debug) + env = self._setup_env_manager(env_num, context, debug, 'evaluator') evaluate_cfg = self.cfg evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode @@ -216,7 +216,14 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) - def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: bool = False) -> BaseEnvManagerV2: + def _setup_env_manager( + self, + env_num: int, + context: Optional[str] = None, + debug: bool = False, + caller: str = 'collector' + ) -> BaseEnvManagerV2: + assert caller in ['evaluator', 'collector'] if debug: env_cls = BaseEnvManagerV2 manager_cfg = env_cls.default_config() @@ -225,7 +232,7 @@ def _setup_env_manager(self, env_num: int, context: Optional[str] = None, debug: manager_cfg = env_cls.default_config() if context is not None: manager_cfg.context = context - return env_cls([self.env.clone for _ in range(env_num)], manager_cfg) + return env_cls([partial(self.env.clone, caller) for _ in range(env_num)], manager_cfg) @property def best(self): From f37f65bdf27bbb2128d8f67cbadddb79b2eec6c3 Mon Sep 17 00:00:00 2001 From: zhangpaipai <1124321458@qq.com> Date: Tue, 16 May 2023 16:40:31 +0800 Subject: [PATCH 103/244] add rew_clip in ding_env_wrapper --- ding/bonus/config.py | 2 +- ding/envs/env/ding_env_wrapper.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 1073b4a68b..018eb1fb68 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -1558,7 +1558,7 @@ def get_instance_env(env: str) -> BaseEnv: elif env == 'lunarlander_continuous': return DingEnvWrapper(gym.make('LunarLander-v2', continuous=True)) elif env == 'bipedalwalker': - return DingEnvWrapper(gym.make('BipedalWalker-v3'), cfg={'act_scale': True}) + return DingEnvWrapper(gym.make('BipedalWalker-v3'), cfg={'act_scale': True, 'rew_clip': True}) elif env == 'pendulum': return DingEnvWrapper(gym.make('Pendulum-v1'), cfg={'act_scale': True}) elif env == 'acrobot': diff --git a/ding/envs/env/ding_env_wrapper.py b/ding/envs/env/ding_env_wrapper.py index fb806cbbbc..5914b18d98 100644 --- a/ding/envs/env/ding_env_wrapper.py +++ b/ding/envs/env/ding_env_wrapper.py @@ -33,6 +33,8 @@ def __init__(self, env: gym.Env = None, cfg: dict = None, seed_api: bool = True, self._cfg = EasyDict(self._cfg) if 'act_scale' not in self._cfg: self._cfg.act_scale = False + if 'rew_clip' not in self._cfg: + self._cfg.rew_clip = False if 'env_wrapper' not in self._cfg: self._cfg.env_wrapper = 'default' if 'env_id' not in 
self._cfg: @@ -124,6 +126,9 @@ def step(self, action: Union[np.int64, np.ndarray]) -> BaseEnvTimestep: if self._cfg.act_scale: action = affine_transform(action, min_val=self._env.action_space.low, max_val=self._env.action_space.high) obs, rew, done, info = self._env.step(action) + if self._cfg.rew_clip: + rew = max(-10, rew) + rew = np.float32(rew) if self.observation_space.dtype == np.float32: obs = to_ndarray(obs, dtype=np.float32) else: From 59cc61b7bdcb0cb122f7aac889d3daee70d729d6 Mon Sep 17 00:00:00 2001 From: zhangpaipai <1124321458@qq.com> Date: Fri, 19 May 2023 20:51:00 +0800 Subject: [PATCH 104/244] polish dqn atari --- ding/bonus/config.py | 16 ++++++++-------- ding/bonus/dqn.py | 3 +++ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 018eb1fb68..f54f14c649 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -1173,13 +1173,13 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ), ) ) - elif env == 'Pong': + elif env == 'PongNoFrameskip': cfg.update( dict( - exp_name='Pong-v4-DQN', + exp_name='PongNoFrameskip-v4-DQN', seed=0, env=dict( - env_id='Pong-v4', + env_id='PongNoFrameskip-v4', collector_env_num=8, evaluator_env_num=8, n_evaluator_episode=8, @@ -1223,10 +1223,10 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ), ) ) - elif env == 'SpaceInvaders': + elif env == 'SpaceInvadersNoFrameskip': cfg.update( dict( - exp_name='SpaceInvaders-v4-DQN', + exp_name='SpaceInvadersNoFrameskip-v4-DQN', seed=0, env=dict( env_id='SpaceInvadersNoFrameskip-v4', @@ -1274,13 +1274,13 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ), ) ) - elif env == 'Qbert': + elif env == 'QbertNoFrameskip': cfg.update( dict( - exp_name='Qbert-v4-DQN', + exp_name='QbertNoFrameskip-v4-DQN', seed=0, env=dict( - env_id='Qbert-v4', + env_id='QbertNoFrameskip-v4', collector_env_num=8, evaluator_env_num=8, n_evaluator_episode=8, diff --git a/ding/bonus/dqn.py b/ding/bonus/dqn.py index d03c976525..64606d5fec 100644 --- a/ding/bonus/dqn.py +++ b/ding/bonus/dqn.py @@ -24,6 +24,9 @@ class DQNAgent: supported_env_list = [ 'lunarlander_discrete', + 'PongNoFrameskip', + 'SpaceInvadersNoFrameskip', + 'QbertNoFrameskip', ] algorithm = 'DQN' From b1aab8d4ccd35f29ee990fd9d5eae565fd66af49 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 23 May 2023 08:34:46 +0000 Subject: [PATCH 105/244] add a2c continuous action space --- ding/policy/a2c.py | 29 +++++++-- ding/rl_utils/__init__.py | 2 +- ding/rl_utils/a2c.py | 34 +++++++++- .../config/bipedalwalker_a2c_config.py | 65 +++++++++++++++++++ 4 files changed, 123 insertions(+), 7 deletions(-) create mode 100644 dizoo/box2d/bipedalwalker/config/bipedalwalker_a2c_config.py diff --git a/ding/policy/a2c.py b/ding/policy/a2c.py index f80917e3dc..da738d9784 100644 --- a/ding/policy/a2c.py +++ b/ding/policy/a2c.py @@ -2,7 +2,8 @@ from collections import namedtuple import torch -from ding.rl_utils import a2c_data, a2c_error, get_gae_with_default_last_value, get_train_sample +from ding.rl_utils import a2c_data, a2c_error, get_gae_with_default_last_value, get_train_sample, \ + a2c_error_continuous from ding.torch_utils import Adam, to_device from ding.model import model_wrap from ding.utils import POLICY_REGISTRY, split_data_generator @@ -27,6 +28,8 @@ class A2CPolicy(Policy): priority=False, # (bool) Whether use Importance Sampling Weight to correct biased update. If True, priority must be True. 
priority_IS_weight=False, + # (str) Which kind of action space used in A2CPolicy, ['discrete', 'continuous'] + action_space='discrete', learn=dict( # (int) for a2c, update_per_collect must be 1. @@ -74,6 +77,7 @@ def _init_learn(self) -> None: Learn mode init method. Called by ``self.__init__``. Init the optimizer, algorithm config, main and target models. """ + assert self._cfg.action_space in ["continuous", "discrete"] # Optimizer self._optimizer = Adam( self._model.parameters(), @@ -120,7 +124,11 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: error_data = a2c_data(output['logit'], batch['action'], output['value'], adv, return_, batch['weight']) # Calculate A2C loss - a2c_loss = a2c_error(error_data) + if self._action_space == 'continuous': + a2c_loss = a2c_error_continuous(error_data) + elif self._action_space == 'discrete': + a2c_loss = a2c_error(error_data) + wv, we = self._value_weight, self._entropy_weight total_loss = a2c_loss.policy_loss + wv * a2c_loss.value_loss - we * a2c_loss.entropy_loss @@ -168,8 +176,14 @@ def _init_collect(self) -> None: Init traj and unroll length, collect model. """ + assert self._cfg.action_space in ["continuous", "discrete"] self._unroll_len = self._cfg.collect.unroll_len - self._collect_model = model_wrap(self._model, wrapper_name='multinomial_sample') + + self._action_space = self._cfg.action_space + if self._action_space == 'continuous': + self._collect_model = model_wrap(self._model, wrapper_name='reparam_sample') + elif self._action_space == 'discrete': + self._collect_model = model_wrap(self._model, wrapper_name='multinomial_sample') self._collect_model.reset() # Algorithm self._gamma = self._cfg.collect.discount_factor @@ -245,7 +259,12 @@ def _init_eval(self) -> None: Evaluate mode init method. Called by ``self.__init__``. Init eval model with argmax strategy.
""" - self._eval_model = model_wrap(self._model, wrapper_name='argmax_sample') + assert self._cfg.action_space in ["continuous", "discrete"] + self._action_space = self._cfg.action_space + if self._action_space == 'continuous': + self._eval_model = model_wrap(self._model, wrapper_name='deterministic_sample') + elif self._action_space == 'discrete': + self._eval_model = model_wrap(self._model, wrapper_name='argmax_sample') self._eval_model.reset() def _forward_eval(self, data: dict) -> dict: @@ -276,4 +295,4 @@ def _monitor_vars_learn(self) -> List[str]: return super()._monitor_vars_learn() + ['policy_loss', 'value_loss', 'entropy_loss', 'adv_abs_max', 'grad_norm'] def monitor_vars(self) -> List[str]: - return self._monitor_vars_learn() \ No newline at end of file + return self._monitor_vars_learn() diff --git a/ding/rl_utils/__init__.py b/ding/rl_utils/__init__.py index 080b37ead5..381ee601f7 100644 --- a/ding/rl_utils/__init__.py +++ b/ding/rl_utils/__init__.py @@ -3,7 +3,7 @@ ppo_error, ppo_error_continuous, ppo_policy_error_continuous from .ppg import ppg_data, ppg_joint_loss, ppg_joint_error from .gae import gae_data, gae -from .a2c import a2c_data, a2c_error +from .a2c import a2c_data, a2c_error, a2c_error_continuous from .coma import coma_data, coma_error from .td import q_nstep_td_data, q_nstep_td_error, q_1step_td_data, \ q_1step_td_error, m_q_1step_td_data, m_q_1step_td_error, td_lambda_data, td_lambda_error,\ diff --git a/ding/rl_utils/a2c.py b/ding/rl_utils/a2c.py index 0ed1deb29d..6d1fcb5448 100644 --- a/ding/rl_utils/a2c.py +++ b/ding/rl_utils/a2c.py @@ -1,6 +1,7 @@ from collections import namedtuple import torch import torch.nn.functional as F +from torch.distributions import Independent, Normal a2c_data = namedtuple('a2c_data', ['logit', 'action', 'value', 'adv', 'return_', 'weight']) a2c_loss = namedtuple('a2c_loss', ['policy_loss', 'value_loss', 'entropy_loss']) @@ -9,7 +10,7 @@ def a2c_error(data: namedtuple) -> namedtuple: """ Overview: - Implementation of A2C(Advantage Actor-Critic) (arXiv:1602.01783) + Implementation of A2C(Advantage Actor-Critic) (arXiv:1602.01783) for discrete action space Arguments: - data (:obj:`namedtuple`): a2c input data with fieids shown in ``a2c_data`` Returns: @@ -34,3 +35,34 @@ def a2c_error(data: namedtuple) -> namedtuple: policy_loss = -(logp * adv * weight).mean() value_loss = (F.mse_loss(return_, value, reduction='none') * weight).mean() return a2c_loss(policy_loss, value_loss, entropy_loss) + + +def a2c_error_continuous(data: namedtuple) -> namedtuple: + """ + Overview: + Implementation of A2C(Advantage Actor-Critic) (arXiv:1602.01783) for continuous action space + Arguments: + - data (:obj:`namedtuple`): a2c input data with fieids shown in ``a2c_data`` + Returns: + - a2c_loss (:obj:`namedtuple`): the a2c loss item, all of them are the differentiable 0-dim tensor + Shapes: + - logit (:obj:`torch.FloatTensor`): :math:`(B, N)`, where B is batch size and N is action dim + - action (:obj:`torch.LongTensor`): :math:`(B, )` + - value (:obj:`torch.FloatTensor`): :math:`(B, )` + - adv (:obj:`torch.FloatTensor`): :math:`(B, )` + - return (:obj:`torch.FloatTensor`): :math:`(B, )` + - weight (:obj:`torch.FloatTensor` or :obj:`None`): :math:`(B, )` + - policy_loss (:obj:`torch.FloatTensor`): :math:`()`, 0-dim tensor + - value_loss (:obj:`torch.FloatTensor`): :math:`()` + - entropy_loss (:obj:`torch.FloatTensor`): :math:`()` + """ + logit, action, value, adv, return_, weight = data + if weight is None: + weight = torch.ones_like(value) + + dist = 
Independent(Normal(logit['mu'], logit['sigma']), 1) + logp = dist.log_prob(action) + entropy_loss = (dist.entropy() * weight).mean() + policy_loss = -(logp * adv * weight).mean() + value_loss = (F.mse_loss(return_, value, reduction='none') * weight).mean() + return a2c_loss(policy_loss, value_loss, entropy_loss) diff --git a/dizoo/box2d/bipedalwalker/config/bipedalwalker_a2c_config.py b/dizoo/box2d/bipedalwalker/config/bipedalwalker_a2c_config.py new file mode 100644 index 0000000000..5dcc75055c --- /dev/null +++ b/dizoo/box2d/bipedalwalker/config/bipedalwalker_a2c_config.py @@ -0,0 +1,65 @@ +from easydict import EasyDict + +bipedalwalker_a2c_config = dict( + exp_name='bipedalwalker_a2c_seed0', + env=dict( + env_id='BipedalWalker-v3', + collector_env_num=8, + evaluator_env_num=8, + # (bool) Scale output action into legal range. + act_scale=True, + n_evaluator_episode=8, + stop_value=300, + rew_clip=True, + # The path to save the game replay + # replay_path='./bipedalwalker_a2c_seed0/video', + ), + policy=dict( + cuda=True, + # (int) the trajectory length to calculate v-trace target + unroll_len=32, + # load_path="./bipedalwalker_a2c_seed0/ckpt/ckpt_best.pth.tar", + action_space='continuous', + model=dict( + action_space='continuous', + obs_shape=24, + action_shape=4, + ), + learn=dict( + # (int) the number of data for a train iteration + batch_size=64, + learning_rate=0.0003, + # (float) loss weight of the value network, the weight of policy network is set to 1 + value_weight=0.5, + # (float) loss weight of the entropy regularization, the weight of policy network is set to 1 + entropy_weight=0.01, + # (float) discount factor for future reward, defaults int [0, 1] + discount_factor=0.99, + ), + collect=dict( + # (int) collect n_sample data, train model n_iteration times + n_sample=64, + collector=dict(collect_print_freq=1000, ), + ), + eval=dict(evaluator=dict(eval_freq=100, )), + other=dict(replay_buffer=dict(replay_buffer_size=10000, ), ), + ), +) +bipedalwalker_a2c_config = EasyDict(bipedalwalker_a2c_config) +main_config = bipedalwalker_a2c_config +bipedalwalker_a2c_create_config = dict( + env=dict( + type='bipedalwalker', + import_names=['dizoo.box2d.bipedalwalker.envs.bipedalwalker_env'], + ), + env_manager=dict(type='subprocess'), + policy=dict(type='a2c'), + replay_buffer=dict(type='naive'), +) +bipedalwalker_a2c_create_config = EasyDict(bipedalwalker_a2c_create_config) +create_config = bipedalwalker_a2c_create_config + +if __name__ == "__main__": + # or you can enter `ding -m serial_onpolicy -c bipedalwalker_a2c_config.py -s 0` + from ding.entry import serial_pipeline_onpolicy + serial_pipeline_onpolicy([main_config, create_config], seed=0) From 0584404a346a2a95f7138fcbb82cfb0358acbf9d Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 23 May 2023 08:38:45 +0000 Subject: [PATCH 106/244] add a2c continuous action space --- ding/rl_utils/tests/test_a2c.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/ding/rl_utils/tests/test_a2c.py b/ding/rl_utils/tests/test_a2c.py index e2c635a7f2..2248671127 100644 --- a/ding/rl_utils/tests/test_a2c.py +++ b/ding/rl_utils/tests/test_a2c.py @@ -2,7 +2,7 @@ from itertools import product import numpy as np import torch -from ding.rl_utils import a2c_data, a2c_error +from ding.rl_utils import a2c_data, a2c_error, a2c_error_continuous random_weight = torch.rand(4) + 1 weight_args = [None, random_weight] @@ -26,3 +26,26 @@ def test_a2c(weight): total_loss.backward() assert isinstance(logit.grad, torch.Tensor) 
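# after backward(), both the policy logits and the value estimate should carry gradients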
assert isinstance(value.grad, torch.Tensor) + + +@pytest.mark.unittest +@pytest.mark.parametrize('weight, ', weight_args) +def test_a2c_continuous(weight): + B, N = 4, 32 + logit = { + "mu": torch.randn(B, N).requires_grad_(True), + "sigma": torch.randn(B, N).requires_grad_(True), + } + action = torch.randint(0, N, size=(B, )) + value = torch.randn(B).requires_grad_(True) + adv = torch.rand(B) + return_ = torch.randn(B) * 2 + data = a2c_data(logit, action, value, adv, return_, weight) + loss = a2c_error_continuous(data) + assert all([l.shape == tuple() for l in loss]) + assert logit.grad is None + assert value.grad is None + total_loss = sum(loss) + total_loss.backward() + assert isinstance(logit.grad, torch.Tensor) + assert isinstance(value.grad, torch.Tensor) From f651f680cfcdbfbaea4e9db6ca27bc0e9d76398e Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 23 May 2023 09:08:41 +0000 Subject: [PATCH 107/244] add a2c continuous for mujoco --- dizoo/mujoco/config/halfcheetah_a2c_config.py | 64 +++++++++++++++++++ dizoo/mujoco/config/hopper_a2c_config.py | 64 +++++++++++++++++++ dizoo/mujoco/config/walker2d_a2c_config.py | 64 +++++++++++++++++++ 3 files changed, 192 insertions(+) create mode 100644 dizoo/mujoco/config/halfcheetah_a2c_config.py create mode 100644 dizoo/mujoco/config/hopper_a2c_config.py create mode 100644 dizoo/mujoco/config/walker2d_a2c_config.py diff --git a/dizoo/mujoco/config/halfcheetah_a2c_config.py b/dizoo/mujoco/config/halfcheetah_a2c_config.py new file mode 100644 index 0000000000..a53fdbb8d3 --- /dev/null +++ b/dizoo/mujoco/config/halfcheetah_a2c_config.py @@ -0,0 +1,64 @@ +from easydict import EasyDict + +halfcheetah_a2c_config = dict( + exp_name='halfcheetah_a2c_seed0', + env=dict( + env_id='HalfCheetah-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=12000, + ), + policy=dict( + cuda=True, + random_collect_size=10000, + model=dict( + action_space='continuous', + obs_shape=17, + action_shape=6, + ), + learn=dict( + # (int) the number of data for a train iteration + batch_size=256, + learning_rate=0.0003, + # (float) loss weight of the value network, the weight of policy network is set to 1 + value_weight=0.5, + # (float) loss weight of the entropy regularization, the weight of policy network is set to 1 + entropy_weight=0.01, + # (float) discount factor for future reward, defaults int [0, 1] + discount_factor=0.99, + ), + collect=dict( + n_sample=4096, + unroll_len=1, + ), + command=dict(), + eval=dict(), + other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), + ), +) + +halfcheetah_a2c_config = EasyDict(halfcheetah_a2c_config) +main_config = halfcheetah_a2c_config + +halfcheetah_a2c_create_config = dict( + env=dict( + type='mujoco', + import_names=['dizoo.mujoco.envs.mujoco_env'], + ), + env_manager=dict(type='subprocess'), + policy=dict( + type='a2c', + import_names=['ding.policy.a2c'], + ), + replay_buffer=dict(type='naive', ), +) +halfcheetah_a2c_create_config = EasyDict(halfcheetah_a2c_create_config) +create_config = halfcheetah_a2c_create_config + +if __name__ == "__main__": + # or you can enter `ding -m serial -c halfcheetah_a2c_config.py -s 0` + from ding.entry import serial_pipeline_onpolicy + serial_pipeline_onpolicy((main_config, create_config), seed=0) diff --git a/dizoo/mujoco/config/hopper_a2c_config.py b/dizoo/mujoco/config/hopper_a2c_config.py new file mode 100644 index 0000000000..354dceddbf --- /dev/null +++ 
b/dizoo/mujoco/config/hopper_a2c_config.py @@ -0,0 +1,64 @@ +from easydict import EasyDict + +hopper_a2c_config = dict( + exp_name='hopper_a2c_seed0', + env=dict( + env_id='Hopper-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=6000, + ), + policy=dict( + cuda=True, + random_collect_size=10000, + model=dict( + obs_shape=11, + action_shape=3, + action_space='continuous', + ), + learn=dict( + # (int) the number of data for a train iteration + batch_size=256, + learning_rate=0.0003, + # (float) loss weight of the value network, the weight of policy network is set to 1 + value_weight=0.5, + # (float) loss weight of the entropy regularization, the weight of policy network is set to 1 + entropy_weight=0.01, + # (float) discount factor for future reward, defaults int [0, 1] + discount_factor=0.99, + ), + collect=dict( + n_sample=4096, + unroll_len=1, + ), + command=dict(), + eval=dict(), + other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), + ), +) + +hopper_a2c_config = EasyDict(hopper_a2c_config) +main_config = hopper_a2c_config + +hopper_a2c_create_config = dict( + env=dict( + type='mujoco', + import_names=['dizoo.mujoco.envs.mujoco_env'], + ), + env_manager=dict(type='subprocess'), + policy=dict( + type='a2c', + import_names=['ding.policy.a2c'], + ), + replay_buffer=dict(type='naive', ), +) +hopper_a2c_create_config = EasyDict(hopper_a2c_create_config) +create_config = hopper_a2c_create_config + +if __name__ == "__main__": + # or you can enter `ding -m serial -c hopper_a2c_config.py -s 0` + from ding.entry import serial_pipeline_onpolicy + serial_pipeline_onpolicy([main_config, create_config], seed=0) diff --git a/dizoo/mujoco/config/walker2d_a2c_config.py b/dizoo/mujoco/config/walker2d_a2c_config.py new file mode 100644 index 0000000000..320f9851df --- /dev/null +++ b/dizoo/mujoco/config/walker2d_a2c_config.py @@ -0,0 +1,64 @@ +from easydict import EasyDict + +walker2d_a2c_config = dict( + exp_name='walker2d_a2c_seed0', + env=dict( + env_id='Walker2d-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=6000, + ), + policy=dict( + cuda=True, + random_collect_size=10000, + model=dict( + action_space='continuous', + obs_shape=17, + action_shape=6, + ), + learn=dict( + # (int) the number of data for a train iteration + batch_size=256, + learning_rate=0.0003, + # (float) loss weight of the value network, the weight of policy network is set to 1 + value_weight=0.5, + # (float) loss weight of the entropy regularization, the weight of policy network is set to 1 + entropy_weight=0.01, + # (float) discount factor for future reward, defaults int [0, 1] + discount_factor=0.99, + ), + collect=dict( + n_sample=4096, + unroll_len=1, + ), + command=dict(), + eval=dict(), + other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), + ), +) + +walker2d_a2c_config = EasyDict(walker2d_a2c_config) +main_config = walker2d_a2c_config + +walker2d_a2c_create_config = dict( + env=dict( + type='mujoco', + import_names=['dizoo.mujoco.envs.mujoco_env'], + ), + env_manager=dict(type='subprocess'), + policy=dict( + type='a2c', + import_names=['ding.policy.a2c'], + ), + replay_buffer=dict(type='naive', ), +) +walker2d_a2c_create_config = EasyDict(walker2d_a2c_create_config) +create_config = walker2d_a2c_create_config + +if __name__ == "__main__": + # or you can enter `ding -m serial -c 
walker2d_a2c_config.py -s 0` + from ding.entry import serial_pipeline_onpolicy + serial_pipeline_onpolicy([main_config, create_config], seed=0) From 92bfff3b779ff0e40f8e210c5617b3644ca74847 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 23 May 2023 09:14:11 +0000 Subject: [PATCH 108/244] add a2c continuous for mujoco --- dizoo/mujoco/config/halfcheetah_a2c_config.py | 2 +- dizoo/mujoco/config/hopper_a2c_config.py | 2 +- dizoo/mujoco/config/walker2d_a2c_config.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dizoo/mujoco/config/halfcheetah_a2c_config.py b/dizoo/mujoco/config/halfcheetah_a2c_config.py index a53fdbb8d3..d8dc67e65f 100644 --- a/dizoo/mujoco/config/halfcheetah_a2c_config.py +++ b/dizoo/mujoco/config/halfcheetah_a2c_config.py @@ -13,7 +13,7 @@ ), policy=dict( cuda=True, - random_collect_size=10000, + action_space='continuous', model=dict( action_space='continuous', obs_shape=17, diff --git a/dizoo/mujoco/config/hopper_a2c_config.py b/dizoo/mujoco/config/hopper_a2c_config.py index 354dceddbf..b9c554a0aa 100644 --- a/dizoo/mujoco/config/hopper_a2c_config.py +++ b/dizoo/mujoco/config/hopper_a2c_config.py @@ -13,7 +13,7 @@ ), policy=dict( cuda=True, - random_collect_size=10000, + action_space='continuous', model=dict( obs_shape=11, action_shape=3, diff --git a/dizoo/mujoco/config/walker2d_a2c_config.py b/dizoo/mujoco/config/walker2d_a2c_config.py index 320f9851df..68c47adf95 100644 --- a/dizoo/mujoco/config/walker2d_a2c_config.py +++ b/dizoo/mujoco/config/walker2d_a2c_config.py @@ -13,7 +13,7 @@ ), policy=dict( cuda=True, - random_collect_size=10000, + action_space='continuous', model=dict( action_space='continuous', obs_shape=17, From a72de147db7326194be691a5650b4dc8e42346bf Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 23 May 2023 09:26:32 +0000 Subject: [PATCH 109/244] add a2c continuous for mujoco --- dizoo/mujoco/config/halfcheetah_a2c_config.py | 1 - dizoo/mujoco/config/hopper_a2c_config.py | 1 - dizoo/mujoco/config/walker2d_a2c_config.py | 1 - 3 files changed, 3 deletions(-) diff --git a/dizoo/mujoco/config/halfcheetah_a2c_config.py b/dizoo/mujoco/config/halfcheetah_a2c_config.py index d8dc67e65f..81fdae9243 100644 --- a/dizoo/mujoco/config/halfcheetah_a2c_config.py +++ b/dizoo/mujoco/config/halfcheetah_a2c_config.py @@ -36,7 +36,6 @@ ), command=dict(), eval=dict(), - other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), ), ) diff --git a/dizoo/mujoco/config/hopper_a2c_config.py b/dizoo/mujoco/config/hopper_a2c_config.py index b9c554a0aa..fe2498d54d 100644 --- a/dizoo/mujoco/config/hopper_a2c_config.py +++ b/dizoo/mujoco/config/hopper_a2c_config.py @@ -36,7 +36,6 @@ ), command=dict(), eval=dict(), - other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), ), ) diff --git a/dizoo/mujoco/config/walker2d_a2c_config.py b/dizoo/mujoco/config/walker2d_a2c_config.py index 68c47adf95..402c670f3e 100644 --- a/dizoo/mujoco/config/walker2d_a2c_config.py +++ b/dizoo/mujoco/config/walker2d_a2c_config.py @@ -36,7 +36,6 @@ ), command=dict(), eval=dict(), - other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), ), ) From 4e59519d436927e2e48ab63bb83efc9650045ed8 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 24 May 2023 06:05:10 +0000 Subject: [PATCH 110/244] add a2c mujoco config; add ppo atari config --- ding/bonus/a2c.py | 3 + ding/bonus/config.py | 136 +++++++++++++++++++++++++++++++++++++++++++ ding/bonus/ppof.py | 3 + 3 files changed, 142 insertions(+) diff --git a/ding/bonus/a2c.py b/ding/bonus/a2c.py index 
4fff18e44f..79a944ae17 100644 --- a/ding/bonus/a2c.py +++ b/ding/bonus/a2c.py @@ -23,6 +23,9 @@ class A2CAgent: supported_env_list = [ 'lunarlander_discrete', + 'hopper', + 'HalfCheetah', + 'Walker2d', ] algorithm = 'A2C' diff --git a/ding/bonus/config.py b/ding/bonus/config.py index f54f14c649..77be97a046 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -94,6 +94,16 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: critic_head_hidden_size=128, critic_head_layer_num=2, ) + elif env in ['PongNoFrameskip', 'SpaceInvadersNoFrameskip', 'QbertNoFrameskip']: + cfg.n_sample = 3200 + cfg.batch_size = 320 + cfg.epoch_per_collect = 10 + cfg.learning_rate = 3e-4 + cfg.model = dict( + encoder_hidden_size_list=[64, 64, 128], + actor_head_hidden_size=128, + critic_head_hidden_size=128, + ) elif env == 'minigrid_fourroom': cfg.n_sample = 3200 cfg.batch_size = 320 @@ -161,6 +171,132 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ), ) ) + if env == 'hopper': + cfg.update( + dict( + exp_name='Hopper-v3-A2C', + env=dict( + env_id='Hopper-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=12000, + ), + policy=dict( + cuda=True, + action_space='continuous', + model=dict( + obs_shape=11, + action_shape=3, + action_space='continuous', + ), + learn=dict( + batch_size=256, + learning_rate=0.0003, + value_weight=0.5, + entropy_weight=0.01, + discount_factor=0.99, + ), + collect=dict( + n_sample=4096, + unroll_len=1, + ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) + if env == 'HalfCheetah': + cfg.update( + dict( + exp_name='HalfCheetah-v3-A2C', + env=dict( + env_id='HalfCheetah-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=12000, + ), + policy=dict( + cuda=True, + action_space='continuous', + model=dict( + action_space='continuous', + obs_shape=17, + action_shape=6, + ), + learn=dict( + batch_size=256, + learning_rate=0.0003, + value_weight=0.5, + entropy_weight=0.01, + discount_factor=0.99, + ), + collect=dict( + n_sample=4096, + unroll_len=1, + ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) + if env == 'Walker2d': + cfg.update( + dict( + exp_name='Walker2d-v3-A2C', + env=dict( + env_id='Walker2d-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=12000, + ), + policy=dict( + cuda=True, + action_space='continuous', + model=dict( + action_space='continuous', + obs_shape=17, + action_shape=6, + ), + learn=dict( + batch_size=256, + learning_rate=0.0003, + value_weight=0.5, + entropy_weight=0.01, + discount_factor=0.99, + ), + collect=dict( + n_sample=4096, + unroll_len=1, + ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) else: raise KeyError("not supported env type: {}".format(env)) elif algorithm == 'PG': diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py index 34c85487b5..f46f9136da 100644 --- a/ding/bonus/ppof.py +++ b/ding/bonus/ppof.py @@ -42,6 +42,9 @@ class PPOF: 'atari_qbert', 'atari_kangaroo', 'atari_bowling', + 
'PongNoFrameskip', + 'SpaceInvadersNoFrameskip', + 'QbertNoFrameskip', # mujoco 'hopper', ] From f104d81509b3d50faa85730eb0c70aef3096da60 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 24 May 2023 10:02:20 +0000 Subject: [PATCH 111/244] add a2c mujoco config; add ppo atari config --- ding/bonus/config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 77be97a046..b48f90373d 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -171,7 +171,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ), ) ) - if env == 'hopper': + elif env == 'hopper': cfg.update( dict( exp_name='Hopper-v3-A2C', @@ -213,7 +213,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ), ) ) - if env == 'HalfCheetah': + elif env == 'HalfCheetah': cfg.update( dict( exp_name='HalfCheetah-v3-A2C', @@ -255,7 +255,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ), ) ) - if env == 'Walker2d': + elif env == 'Walker2d': cfg.update( dict( exp_name='Walker2d-v3-A2C', From 308e25a606fb13ba0250ee9feb0ee4e81316411c Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 24 May 2023 10:17:41 +0000 Subject: [PATCH 112/244] fix a2c deploy bug --- ding/bonus/a2c.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ding/bonus/a2c.py b/ding/bonus/a2c.py index 79a944ae17..7c527099bf 100644 --- a/ding/bonus/a2c.py +++ b/ding/bonus/a2c.py @@ -125,7 +125,12 @@ def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, def single_env_forward_wrapper(forward_fn, cuda=True): - forward_fn = model_wrap(forward_fn, wrapper_name='argmax_sample').forward + if self.cfg.policy.action_space == 'continuous': + forward_fn = model_wrap(forward_fn, wrapper_name='deterministic_sample').forward + elif self.cfg.policy.action_space == 'discrete': + forward_fn = model_wrap(forward_fn, wrapper_name='argmax_sample').forward + else: + raise NotImplementedError def _forward(obs): # unsqueeze means add batch dim, i.e. 
(O, ) -> (1, O) From 522b0ffabbd156d5a4a20abdb3923ea5c8672300 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 25 May 2023 09:28:54 +0000 Subject: [PATCH 113/244] Add bipedalwalker a2c --- ding/bonus/config.py | 43 +++++++++++++++++++ .../config/bipedalwalker_a2c_config.py | 13 +++--- 2 files changed, 49 insertions(+), 7 deletions(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index b48f90373d..4c08fda29c 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -171,6 +171,49 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ), ) ) + elif env == 'bipedalwalker': + cfg.update( + dict( + exp_name='Bipedalwalker-v3-A2C', + seed=0, + env=dict( + env_id='BipedalWalker-v3', + collector_env_num=8, + evaluator_env_num=8, + act_scale=True, + n_evaluator_episode=8, + rew_clip=True, + ), + policy=dict( + cuda=True, + action_space='continuous', + model=dict( + action_space='continuous', + obs_shape=24, + action_shape=4, + ), + learn=dict( + batch_size=128, + learning_rate=0.0003, + value_weight=0.5, + entropy_weight=0.001, + discount_factor=0.99, + adv_norm=True, + ), + collect=dict( + n_sample=128, + discount_factor=0.99, + ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) elif env == 'hopper': cfg.update( dict( diff --git a/dizoo/box2d/bipedalwalker/config/bipedalwalker_a2c_config.py b/dizoo/box2d/bipedalwalker/config/bipedalwalker_a2c_config.py index 5dcc75055c..c82542597f 100644 --- a/dizoo/box2d/bipedalwalker/config/bipedalwalker_a2c_config.py +++ b/dizoo/box2d/bipedalwalker/config/bipedalwalker_a2c_config.py @@ -16,8 +16,6 @@ ), policy=dict( cuda=True, - # (int) the trajectory length to calculate v-trace target - unroll_len=32, # load_path="./bipedalwalker_a2c_seed0/ckpt/ckpt_best.pth.tar", action_space='continuous', model=dict( @@ -27,22 +25,23 @@ ), learn=dict( # (int) the number of data for a train iteration - batch_size=64, + batch_size=256, learning_rate=0.0003, # (float) loss weight of the value network, the weight of policy network is set to 1 value_weight=0.5, # (float) loss weight of the entropy regularization, the weight of policy network is set to 1 - entropy_weight=0.01, + entropy_weight=0.001, # (float) discount factor for future reward, defaults int [0, 1] discount_factor=0.99, + adv_norm=True, ), collect=dict( # (int) collect n_sample data, train model n_iteration times - n_sample=64, - collector=dict(collect_print_freq=1000, ), + n_sample=512, + discount_factor=0.99, + collector=dict(collect_print_freq=100, ), ), eval=dict(evaluator=dict(eval_freq=100, )), - other=dict(replay_buffer=dict(replay_buffer_size=10000, ), ), ), ) bipedalwalker_a2c_config = EasyDict(bipedalwalker_a2c_config) From 1e87b1d13f61c0cb9dcb61b41e449fe902fd3389 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 25 May 2023 09:42:04 +0000 Subject: [PATCH 114/244] polish code --- ding/bonus/config.py | 27 ++++++++++--------- dizoo/mujoco/config/halfcheetah_a2c_config.py | 14 +++++----- dizoo/mujoco/config/hopper_a2c_config.py | 11 ++++---- dizoo/mujoco/config/walker2d_a2c_config.py | 11 ++++---- 4 files changed, 34 insertions(+), 29 deletions(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 4c08fda29c..324146a033 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -239,12 +239,12 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: batch_size=256, learning_rate=0.0003, value_weight=0.5, - entropy_weight=0.01, - discount_factor=0.99, + 
entropy_weight=0.001, + discount_factor=0.999, + adv_norm=True, ), collect=dict( - n_sample=4096, - unroll_len=1, + n_sample=256, ), ), wandb_logger=dict( @@ -278,15 +278,16 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: action_shape=6, ), learn=dict( - batch_size=256, + batch_size=1000, learning_rate=0.0003, value_weight=0.5, - entropy_weight=0.01, - discount_factor=0.99, + entropy_weight=0.001, + discount_factor=0.999, + ignore_done=True, + adv_norm=True, ), collect=dict( - n_sample=4096, - unroll_len=1, + n_sample=1000, ), ), wandb_logger=dict( @@ -323,12 +324,12 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: batch_size=256, learning_rate=0.0003, value_weight=0.5, - entropy_weight=0.01, - discount_factor=0.99, + entropy_weight=0.001, + discount_factor=0.999, + adv_norm=True, ), collect=dict( - n_sample=4096, - unroll_len=1, + n_sample=256, ), ), wandb_logger=dict( diff --git a/dizoo/mujoco/config/halfcheetah_a2c_config.py b/dizoo/mujoco/config/halfcheetah_a2c_config.py index 81fdae9243..125acfe78c 100644 --- a/dizoo/mujoco/config/halfcheetah_a2c_config.py +++ b/dizoo/mujoco/config/halfcheetah_a2c_config.py @@ -21,21 +21,23 @@ ), learn=dict( # (int) the number of data for a train iteration - batch_size=256, + batch_size=1000, learning_rate=0.0003, # (float) loss weight of the value network, the weight of policy network is set to 1 value_weight=0.5, # (float) loss weight of the entropy regularization, the weight of policy network is set to 1 - entropy_weight=0.01, + entropy_weight=0.001, # (float) discount factor for future reward, defaults int [0, 1] - discount_factor=0.99, + discount_factor=0.999, + ignore_done=True, + adv_norm=True, ), collect=dict( - n_sample=4096, - unroll_len=1, + n_sample=1000, + collector=dict(collect_print_freq=100, ), ), command=dict(), - eval=dict(), + eval=dict(evaluator=dict(eval_freq=100, )), ), ) diff --git a/dizoo/mujoco/config/hopper_a2c_config.py b/dizoo/mujoco/config/hopper_a2c_config.py index fe2498d54d..c8a890f938 100644 --- a/dizoo/mujoco/config/hopper_a2c_config.py +++ b/dizoo/mujoco/config/hopper_a2c_config.py @@ -26,16 +26,17 @@ # (float) loss weight of the value network, the weight of policy network is set to 1 value_weight=0.5, # (float) loss weight of the entropy regularization, the weight of policy network is set to 1 - entropy_weight=0.01, + entropy_weight=0.001, # (float) discount factor for future reward, defaults int [0, 1] - discount_factor=0.99, + discount_factor=0.999, + adv_norm=True, ), collect=dict( - n_sample=4096, - unroll_len=1, + n_sample=256, + collector=dict(collect_print_freq=100, ), ), command=dict(), - eval=dict(), + eval=dict(evaluator=dict(eval_freq=100, )), ), ) diff --git a/dizoo/mujoco/config/walker2d_a2c_config.py b/dizoo/mujoco/config/walker2d_a2c_config.py index 402c670f3e..baf942dc96 100644 --- a/dizoo/mujoco/config/walker2d_a2c_config.py +++ b/dizoo/mujoco/config/walker2d_a2c_config.py @@ -26,16 +26,17 @@ # (float) loss weight of the value network, the weight of policy network is set to 1 value_weight=0.5, # (float) loss weight of the entropy regularization, the weight of policy network is set to 1 - entropy_weight=0.01, + entropy_weight=0.001, # (float) discount factor for future reward, defaults int [0, 1] - discount_factor=0.99, + discount_factor=0.999, + adv_norm=True, ), collect=dict( - n_sample=4096, - unroll_len=1, + n_sample=256, + collector=dict(collect_print_freq=100, ), ), command=dict(), - eval=dict(), + eval=dict(evaluator=dict(eval_freq=100, )), ), ) From 
06f4046f7f3b35ebdf0fdbc1d6b099133c5b0ce9 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 25 May 2023 11:16:17 +0000 Subject: [PATCH 115/244] polish code --- ding/bonus/a2c.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ding/bonus/a2c.py b/ding/bonus/a2c.py index 7c527099bf..b0166798db 100644 --- a/ding/bonus/a2c.py +++ b/ding/bonus/a2c.py @@ -23,6 +23,7 @@ class A2CAgent: supported_env_list = [ 'lunarlander_discrete', + 'bipedalwalker', 'hopper', 'HalfCheetah', 'Walker2d', From 7fc70324e0f0be8893777e4cb50df94c0628b9c7 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 25 May 2023 11:36:48 +0000 Subject: [PATCH 116/244] polish code --- ding/bonus/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 324146a033..9da16f9906 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -194,7 +194,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ), learn=dict( batch_size=128, - learning_rate=0.0003, + learning_rate=0.005, value_weight=0.5, entropy_weight=0.001, discount_factor=0.99, From bb74395fe15537444d690bbe52cedb6f902be7c9 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 25 May 2023 12:04:23 +0000 Subject: [PATCH 117/244] polish code --- ding/bonus/config.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 9da16f9906..ce6a69ec1c 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -193,15 +193,14 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: action_shape=4, ), learn=dict( - batch_size=128, - learning_rate=0.005, - value_weight=0.5, - entropy_weight=0.001, - discount_factor=0.99, + batch_size=64, + learning_rate=0.01, + value_weight=2, + entropy_weight=0.01, adv_norm=True, ), collect=dict( - n_sample=128, + n_sample=64, discount_factor=0.99, ), ), @@ -240,11 +239,11 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: learning_rate=0.0003, value_weight=0.5, entropy_weight=0.001, - discount_factor=0.999, adv_norm=True, ), collect=dict( n_sample=256, + discount_factor=0.999, ), ), wandb_logger=dict( @@ -282,12 +281,12 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: learning_rate=0.0003, value_weight=0.5, entropy_weight=0.001, - discount_factor=0.999, ignore_done=True, adv_norm=True, ), collect=dict( n_sample=1000, + discount_factor=0.999, ), ), wandb_logger=dict( @@ -325,11 +324,11 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: learning_rate=0.0003, value_weight=0.5, entropy_weight=0.001, - discount_factor=0.999, adv_norm=True, ), collect=dict( n_sample=256, + discount_factor=0.999, ), ), wandb_logger=dict( From 5ea92332868a400511b1ac10fa3f0f6fa8cfe8bb Mon Sep 17 00:00:00 2001 From: zjowowen Date: Mon, 29 May 2023 08:59:04 +0000 Subject: [PATCH 118/244] polish code --- dizoo/mujoco/config/halfcheetah_a2c_config.py | 6 +++--- dizoo/mujoco/config/hopper_a2c_config.py | 2 +- dizoo/mujoco/config/walker2d_a2c_config.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dizoo/mujoco/config/halfcheetah_a2c_config.py b/dizoo/mujoco/config/halfcheetah_a2c_config.py index 125acfe78c..017431633d 100644 --- a/dizoo/mujoco/config/halfcheetah_a2c_config.py +++ b/dizoo/mujoco/config/halfcheetah_a2c_config.py @@ -21,19 +21,19 @@ ), learn=dict( # (int) the number of data for a train iteration - batch_size=1000, + batch_size=256, learning_rate=0.0003, # (float) loss weight of the value network, the weight of policy network is 
set to 1 value_weight=0.5, # (float) loss weight of the entropy regularization, the weight of policy network is set to 1 entropy_weight=0.001, # (float) discount factor for future reward, defaults int [0, 1] - discount_factor=0.999, + discount_factor=0.99, ignore_done=True, adv_norm=True, ), collect=dict( - n_sample=1000, + n_sample=256, collector=dict(collect_print_freq=100, ), ), command=dict(), diff --git a/dizoo/mujoco/config/hopper_a2c_config.py b/dizoo/mujoco/config/hopper_a2c_config.py index c8a890f938..cd0a12a881 100644 --- a/dizoo/mujoco/config/hopper_a2c_config.py +++ b/dizoo/mujoco/config/hopper_a2c_config.py @@ -28,7 +28,7 @@ # (float) loss weight of the entropy regularization, the weight of policy network is set to 1 entropy_weight=0.001, # (float) discount factor for future reward, defaults int [0, 1] - discount_factor=0.999, + discount_factor=0.99, adv_norm=True, ), collect=dict( diff --git a/dizoo/mujoco/config/walker2d_a2c_config.py b/dizoo/mujoco/config/walker2d_a2c_config.py index baf942dc96..0d822b8bbe 100644 --- a/dizoo/mujoco/config/walker2d_a2c_config.py +++ b/dizoo/mujoco/config/walker2d_a2c_config.py @@ -28,7 +28,7 @@ # (float) loss weight of the entropy regularization, the weight of policy network is set to 1 entropy_weight=0.001, # (float) discount factor for future reward, defaults int [0, 1] - discount_factor=0.999, + discount_factor=0.99, adv_norm=True, ), collect=dict( From 59b7080c15eca6b0e64157479fefb0873d11ef07 Mon Sep 17 00:00:00 2001 From: zhangpaipai <1124321458@qq.com> Date: Tue, 30 May 2023 11:33:35 +0800 Subject: [PATCH 119/244] add pendulum a2c+pg --- ding/bonus/a2c.py | 1 + ding/bonus/config.py | 2 + .../pendulum/config/pendulum_a2c_config.py | 51 +++++++++++++++++++ .../pendulum/config/pendulum_pg_config.py | 50 ++++++++++++++++++ 4 files changed, 104 insertions(+) create mode 100644 dizoo/classic_control/pendulum/config/pendulum_a2c_config.py create mode 100644 dizoo/classic_control/pendulum/config/pendulum_pg_config.py diff --git a/ding/bonus/a2c.py b/ding/bonus/a2c.py index b0166798db..368ec3e845 100644 --- a/ding/bonus/a2c.py +++ b/ding/bonus/a2c.py @@ -24,6 +24,7 @@ class A2CAgent: supported_env_list = [ 'lunarlander_discrete', 'bipedalwalker', + 'pendulum', 'hopper', 'HalfCheetah', 'Walker2d', diff --git a/ding/bonus/config.py b/ding/bonus/config.py index ce6a69ec1c..97fd693970 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -213,6 +213,8 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ), ) ) + elif env == 'pendulum': + pass elif env == 'hopper': cfg.update( dict( diff --git a/dizoo/classic_control/pendulum/config/pendulum_a2c_config.py b/dizoo/classic_control/pendulum/config/pendulum_a2c_config.py new file mode 100644 index 0000000000..d66e8b1a17 --- /dev/null +++ b/dizoo/classic_control/pendulum/config/pendulum_a2c_config.py @@ -0,0 +1,51 @@ +from easydict import EasyDict + +pendulum_a2c_config = dict( + exp_name='pendulum_a2c_seed0', + env=dict( + collector_env_num=10, + evaluator_env_num=5, + act_scale=True, + n_evaluator_episode=5, + stop_value=-200, + ), + policy=dict( + cuda=False, + action_space='continuous', + model=dict( + action_space='continuous', + obs_shape=3, + action_shape=1, + ), + learn=dict( + epoch_per_collect=10, + batch_size=32, + learning_rate=3e-5, + value_weight=0.5, + entropy_weight=0.0, + ), + collect=dict( + n_sample=200, + unroll_len=1, + discount_factor=0.99, + ), + eval=dict(evaluator=dict(eval_freq=200, )) + ), +) +pendulum_a2c_config = EasyDict(pendulum_a2c_config) 
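+# `main_config` and `create_config` are the conventional aliases read by DI-engine's serial entry points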
+main_config = pendulum_a2c_config +pendulum_a2c_create_config = dict( + env=dict( + type='pendulum', + import_names=['dizoo.classic_control.pendulum.envs.pendulum_env'], + ), + env_manager=dict(type='base'), + policy=dict(type='a2c'), +) +pendulum_a2c_create_config = EasyDict(pendulum_a2c_create_config) +create_config = pendulum_a2c_create_config + +if __name__ == "__main__": + # or you can enter `ding -m serial_onpolicy -c pendulum_a2c_config.py -s 0` + from ding.entry import serial_pipeline_onpolicy + serial_pipeline_onpolicy([main_config, create_config], seed=0) diff --git a/dizoo/classic_control/pendulum/config/pendulum_pg_config.py b/dizoo/classic_control/pendulum/config/pendulum_pg_config.py new file mode 100644 index 0000000000..546e694106 --- /dev/null +++ b/dizoo/classic_control/pendulum/config/pendulum_pg_config.py @@ -0,0 +1,50 @@ +from easydict import EasyDict + +pendulum_pg_config = dict( + exp_name='pendulum_pg_seed0', + env=dict( + collector_env_num=10, + evaluator_env_num=5, + act_scale=True, + n_evaluator_episode=5, + stop_value=-200, + ), + policy=dict( + cuda=False, + action_space='continuous', + model=dict( + action_space='continuous', + obs_shape=3, + action_shape=1, + ), + learn=dict( + batch_size=64, + learning_rate=0.001, + entropy_weight=0.001, + ), + collect=dict( + n_episode=40, + unroll_len=1, + discount_factor=0.99, + ), + eval=dict(evaluator=dict(eval_freq=200, )) + ), +) +pendulum_pg_config = EasyDict(pendulum_pg_config) +main_config = pendulum_pg_config +pendulum_pg_create_config = dict( + env=dict( + type='pendulum', + import_names=['dizoo.classic_control.pendulum.envs.pendulum_env'], + ), + env_manager=dict(type='base'), + policy=dict(type='pg'), + collector=dict(type='episode'), +) +pendulum_pg_create_config = EasyDict(pendulum_pg_create_config) +create_config = pendulum_pg_create_config + +if __name__ == "__main__": + # or you can enter `ding -m serial_onpolicy -c pendulum_pg_config.py -s 0` + from ding.entry import serial_pipeline_onpolicy + serial_pipeline_onpolicy([main_config, create_config], seed=0) From e6e100b93dcf4d54a7374f0c1a4118ab909bd23c Mon Sep 17 00:00:00 2001 From: zhangpaipai <1124321458@qq.com> Date: Tue, 30 May 2023 16:10:43 +0800 Subject: [PATCH 120/244] add pg bipedalwalker+mujoco --- .../config/bipedalwalker_pg_config.py | 52 +++++++++++++++++++ dizoo/mujoco/config/halfcheetah_pg_config.py | 52 +++++++++++++++++++ dizoo/mujoco/config/hopper_pg_config.py | 52 +++++++++++++++++++ dizoo/mujoco/config/walker2d_pg_config.py | 52 +++++++++++++++++++ 4 files changed, 208 insertions(+) create mode 100644 dizoo/box2d/bipedalwalker/config/bipedalwalker_pg_config.py create mode 100644 dizoo/mujoco/config/halfcheetah_pg_config.py create mode 100644 dizoo/mujoco/config/hopper_pg_config.py create mode 100644 dizoo/mujoco/config/walker2d_pg_config.py diff --git a/dizoo/box2d/bipedalwalker/config/bipedalwalker_pg_config.py b/dizoo/box2d/bipedalwalker/config/bipedalwalker_pg_config.py new file mode 100644 index 0000000000..96aa08aee8 --- /dev/null +++ b/dizoo/box2d/bipedalwalker/config/bipedalwalker_pg_config.py @@ -0,0 +1,52 @@ +from easydict import EasyDict + +bipedalwalker_pg_config = dict( + exp_name='bipedalwalker_pg_seed0', + env=dict( + env_id='BipedalWalker-v3', + collector_env_num=8, + evaluator_env_num=8, + act_scale=True, + n_evaluator_episode=8, + stop_value=300, + rew_clip=True, + ), + policy=dict( + cuda=True, + action_space='continuous', + model=dict( + action_space='continuous', + obs_shape=24, + action_shape=4, + ), + learn=dict( + 
batch_size=64, + learning_rate=0.001, + entropy_weight=0.001, + ), + collect=dict( + n_episode=8, + unroll_len=1, + discount_factor=0.99, + ), + eval=dict(evaluator=dict(eval_freq=200, )) + ), +) +bipedalwalker_pg_config = EasyDict(bipedalwalker_pg_config) +main_config = bipedalwalker_pg_config +bipedalwalker_pg_create_config = dict( + env=dict( + type='bipedalwalker', + import_names=['dizoo.box2d.bipedalwalker.envs.bipedalwalker_env'], + ), + env_manager=dict(type='subprocess'), + policy=dict(type='pg'), + collector=dict(type='episode'), +) +bipedalwalker_pg_create_config = EasyDict(bipedalwalker_pg_create_config) +create_config = bipedalwalker_pg_create_config + +if __name__ == "__main__": + # or you can enter `ding -m serial_onpolicy -c bipedalwalker_pg_config.py -s 0` + from ding.entry import serial_pipeline_onpolicy + serial_pipeline_onpolicy([main_config, create_config], seed=0) diff --git a/dizoo/mujoco/config/halfcheetah_pg_config.py b/dizoo/mujoco/config/halfcheetah_pg_config.py new file mode 100644 index 0000000000..acc8be1c7e --- /dev/null +++ b/dizoo/mujoco/config/halfcheetah_pg_config.py @@ -0,0 +1,52 @@ +from easydict import EasyDict + +halfcheetah_pg_config = dict( + exp_name='halfcheetah_pg_seed0', + env=dict( + env_id='HalfCheetah-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=6000, + ), + policy=dict( + cuda=True, + action_space='continuous', + model=dict( + action_space='continuous', + obs_shape=17, + action_shape=6, + ), + learn=dict( + batch_size=64, + learning_rate=0.001, + entropy_weight=0.001, + ), + collect=dict( + n_episode=8, + unroll_len=1, + discount_factor=0.99, + ), + eval=dict(evaluator=dict(eval_freq=200, )) + ), +) +halfcheetah_pg_config = EasyDict(halfcheetah_pg_config) +main_config = halfcheetah_pg_config +halfcheetah_pg_create_config = dict( + env=dict( + type='mujoco', + import_names=['dizoo.mujoco.envs.mujoco_env'], + ), + env_manager=dict(type='subprocess'), + policy=dict(type='pg'), + collector=dict(type='episode'), +) +halfcheetah_pg_create_config = EasyDict(halfcheetah_pg_create_config) +create_config = halfcheetah_pg_create_config + +if __name__ == "__main__": + # or you can enter `ding -m serial_onpolicy -c halfcheetah_pg_config.py -s 0` + from ding.entry import serial_pipeline_onpolicy + serial_pipeline_onpolicy([main_config, create_config], seed=0) diff --git a/dizoo/mujoco/config/hopper_pg_config.py b/dizoo/mujoco/config/hopper_pg_config.py new file mode 100644 index 0000000000..f1cb72e5f9 --- /dev/null +++ b/dizoo/mujoco/config/hopper_pg_config.py @@ -0,0 +1,52 @@ +from easydict import EasyDict + +hopper_pg_config = dict( + exp_name='hopper_pg_seed0', + env=dict( + env_id='Hopper-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=6000, + ), + policy=dict( + cuda=True, + action_space='continuous', + model=dict( + action_space='continuous', + obs_shape=11, + action_shape=3, + ), + learn=dict( + batch_size=64, + learning_rate=0.001, + entropy_weight=0.001, + ), + collect=dict( + n_episode=8, + unroll_len=1, + discount_factor=0.99, + ), + eval=dict(evaluator=dict(eval_freq=200, )) + ), +) +hopper_pg_config = EasyDict(hopper_pg_config) +main_config = hopper_pg_config +hopper_pg_create_config = dict( + env=dict( + type='mujoco', + import_names=['dizoo.mujoco.envs.mujoco_env'], + ), + env_manager=dict(type='subprocess'), + 
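# PG learns from complete Monte Carlo returns, so this create_config pairs the 'pg' policy with the
# episode-based collector declared just below rather than the default per-sample collector.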
policy=dict(type='pg'), + collector=dict(type='episode'), +) +hopper_pg_create_config = EasyDict(hopper_pg_create_config) +create_config = hopper_pg_create_config + +if __name__ == "__main__": + # or you can enter `ding -m serial_onpolicy -c hopper_pg_config.py -s 0` + from ding.entry import serial_pipeline_onpolicy + serial_pipeline_onpolicy([main_config, create_config], seed=0) diff --git a/dizoo/mujoco/config/walker2d_pg_config.py b/dizoo/mujoco/config/walker2d_pg_config.py new file mode 100644 index 0000000000..ab031f0635 --- /dev/null +++ b/dizoo/mujoco/config/walker2d_pg_config.py @@ -0,0 +1,52 @@ +from easydict import EasyDict + +walker2d_pg_config = dict( + exp_name='walker2d_pg_seed0', + env=dict( + env_id='Walker2d-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=6000, + ), + policy=dict( + cuda=True, + action_space='continuous', + model=dict( + action_space='continuous', + obs_shape=17, + action_shape=6, + ), + learn=dict( + batch_size=64, + learning_rate=0.001, + entropy_weight=0.001, + ), + collect=dict( + n_episode=8, + unroll_len=1, + discount_factor=0.99, + ), + eval=dict(evaluator=dict(eval_freq=200, )) + ), +) +walker2d_pg_config = EasyDict(walker2d_pg_config) +main_config = walker2d_pg_config +walker2d_pg_create_config = dict( + env=dict( + type='mujoco', + import_names=['dizoo.mujoco.envs.mujoco_env'], + ), + env_manager=dict(type='subprocess'), + policy=dict(type='pg'), + collector=dict(type='episode'), +) +walker2d_pg_create_config = EasyDict(walker2d_pg_create_config) +create_config = walker2d_pg_create_config + +if __name__ == "__main__": + # or you can enter `ding -m serial_onpolicy -c walker2d_pg_config.py -s 0` + from ding.entry import serial_pipeline_onpolicy + serial_pipeline_onpolicy([main_config, create_config], seed=0) From f69c4483d6783938cfea229f8be982be717ec8fe Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 30 May 2023 10:42:41 +0000 Subject: [PATCH 121/244] polish code for wandb sweep --- ding/bonus/a2c.py | 34 +++++++++++-------- ding/bonus/config.py | 7 ++-- .../framework/middleware/functional/logger.py | 17 +++++++--- 3 files changed, 35 insertions(+), 23 deletions(-) diff --git a/ding/bonus/a2c.py b/ding/bonus/a2c.py index 368ec3e845..6e53833f26 100644 --- a/ding/bonus/a2c.py +++ b/ding/bonus/a2c.py @@ -75,14 +75,16 @@ def __init__( self.checkpoint_save_dir = os.path.join(self.exp_name, "ckpt") def train( - self, - step: int = int(1e7), - collector_env_num: int = 4, - evaluator_env_num: int = 4, - n_iter_log_show: int = 500, - n_iter_save_ckpt: int = 1000, - context: Optional[str] = None, - debug: bool = False + self, + step: int = int(1e7), + collector_env_num: int = 4, + evaluator_env_num: int = 4, + n_iter_log_show: int = 500, + n_iter_save_ckpt: int = 1000, + context: Optional[str] = None, + debug: bool = False, + wandb: bool = True, + wandb_sweep: bool = False, ) -> TrainingReturn: if debug: logging.getLogger().setLevel(logging.DEBUG) @@ -97,14 +99,16 @@ def train( task.use(gae_estimator(self.cfg, self.policy.collect_mode)) task.use(trainer(self.cfg, self.policy.learn_mode)) task.use(CkptSaver(policy=self.policy, save_dir=self.checkpoint_save_dir, train_freq=n_iter_save_ckpt)) - task.use( - wandb_online_logger( - metric_list=self.policy.monitor_vars(), - model=self.policy._model, - anonymous=True, - project_name=self.exp_name + if wandb: + task.use( + wandb_online_logger( + metric_list=self.policy.monitor_vars(), + 
model=self.policy._model, + anonymous=True, + project_name=self.exp_name, + wandb_sweep=wandb_sweep, + ) ) - ) task.use(termination_checker(max_env_step=step)) task.use(final_ctx_saver(name=self.exp_name)) task.run() diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 97fd693970..18e8183c93 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -194,9 +194,10 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ), learn=dict( batch_size=64, - learning_rate=0.01, - value_weight=2, - entropy_weight=0.01, + learning_rate=0.0003, + value_weight=0.5, + entropy_weight=0.001, + discount_factor=0.99, adv_norm=True, ), collect=dict( diff --git a/ding/framework/middleware/functional/logger.py b/ding/framework/middleware/functional/logger.py index 54584a41c3..6dcd45d605 100644 --- a/ding/framework/middleware/functional/logger.py +++ b/ding/framework/middleware/functional/logger.py @@ -3,8 +3,6 @@ from easydict import EasyDict from matplotlib import pyplot as plt from matplotlib import animation -from matplotlib import ticker as mtick -from torch.nn import functional as F from sklearn.manifold import TSNE import numpy as np import torch @@ -128,6 +126,7 @@ def wandb_online_logger( model: Optional[torch.nn.Module] = None, anonymous: bool = False, project_name: str = 'default-project', + wandb_sweep: bool = False, ) -> Callable: ''' Overview: @@ -154,10 +153,17 @@ def wandb_online_logger( metric_list = ["q_value", "target q_value", "loss", "lr", "entropy", "target_q_value", "td_error"] # Initialize wandb with default settings # Settings can be covered by calling wandb.init() at the top of the script - if anonymous: - wandb.init(project=project_name, reinit=True, anonymous="must") + if not wandb_sweep: + if anonymous: + wandb.init(project=project_name, reinit=True, anonymous="must") + else: + wandb.init(project=project_name, reinit=True) else: - wandb.init(project=project_name, reinit=True) + if anonymous: + wandb.init(project=project_name, anonymous="must") + else: + wandb.init(project=project_name) + plt.switch_backend('agg') if cfg is None: cfg = EasyDict( dict( @@ -284,6 +290,7 @@ def _plot(ctx: "OnlineRLContext"): if bool(info_for_logging): wandb.log(data=info_for_logging, step=ctx.env_step) + # wandb.log(data=info_for_logging) plt.clf() return _plot From 98877dec02f5e24a5cb4fa41a7ef49faf3ebc744 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 30 May 2023 10:45:19 +0000 Subject: [PATCH 122/244] polish code for wandb sweep --- ding/bonus/a2c.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/ding/bonus/a2c.py b/ding/bonus/a2c.py index 6e53833f26..c0add29d79 100644 --- a/ding/bonus/a2c.py +++ b/ding/bonus/a2c.py @@ -83,7 +83,6 @@ def train( n_iter_save_ckpt: int = 1000, context: Optional[str] = None, debug: bool = False, - wandb: bool = True, wandb_sweep: bool = False, ) -> TrainingReturn: if debug: @@ -99,16 +98,15 @@ def train( task.use(gae_estimator(self.cfg, self.policy.collect_mode)) task.use(trainer(self.cfg, self.policy.learn_mode)) task.use(CkptSaver(policy=self.policy, save_dir=self.checkpoint_save_dir, train_freq=n_iter_save_ckpt)) - if wandb: - task.use( - wandb_online_logger( - metric_list=self.policy.monitor_vars(), - model=self.policy._model, - anonymous=True, - project_name=self.exp_name, - wandb_sweep=wandb_sweep, - ) + task.use( + wandb_online_logger( + metric_list=self.policy.monitor_vars(), + model=self.policy._model, + anonymous=True, + project_name=self.exp_name, + wandb_sweep=wandb_sweep, ) + ) 
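# A minimal sketch of driving the new wandb_sweep flag from a Weights & Biases sweep; the sweep
# definition, project name, seed values and step budget are illustrative assumptions, while
# A2CAgent and train(..., wandb_sweep=True) are the interfaces touched by this patch series:
#
#   import wandb
#   from ding.bonus import A2CAgent
#
#   def run_trial():
#       wandb.init()                               # join the run created by the sweep agent
#       # wandb_sweep=True makes wandb_online_logger reuse this run instead of re-initializing it.
#       agent = A2CAgent(env='hopper', seed=wandb.config.seed, exp_name='hopper-a2c-sweep')
#       agent.train(step=int(1e6), wandb_sweep=True)
#
#   sweep_id = wandb.sweep({'method': 'random', 'parameters': {'seed': {'values': [0, 1, 2]}}},
#                          project='di-engine-sweep-demo')
#   wandb.agent(sweep_id, function=run_trial, count=3)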
task.use(termination_checker(max_env_step=step)) task.use(final_ctx_saver(name=self.exp_name)) task.run() From dbec6a7bdd4d41f9ef8e8b34536caca4b91a5c16 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 30 May 2023 11:13:50 +0000 Subject: [PATCH 123/244] polish code for wandb sweep --- ding/bonus/c51.py | 6 ++++-- ding/bonus/ddpg.py | 6 ++++-- ding/bonus/dqn.py | 6 ++++-- ding/bonus/pg.py | 6 ++++-- ding/bonus/ppof.py | 6 ++++-- ding/bonus/sac.py | 6 ++++-- ding/bonus/td3.py | 6 ++++-- 7 files changed, 28 insertions(+), 14 deletions(-) diff --git a/ding/bonus/c51.py b/ding/bonus/c51.py index 527c728fc8..a299e2d478 100644 --- a/ding/bonus/c51.py +++ b/ding/bonus/c51.py @@ -81,7 +81,8 @@ def train( evaluator_env_num: int = 4, n_iter_save_ckpt: int = 1000, context: Optional[str] = None, - debug: bool = False + debug: bool = False, + wandb_sweep: bool = False, ) -> TrainingReturn: if debug: logging.getLogger().setLevel(logging.DEBUG) @@ -103,7 +104,8 @@ def train( metric_list=self.policy.monitor_vars(), model=self.policy._model, anonymous=True, - project_name=self.exp_name + project_name=self.exp_name, + wandb_sweep=wandb_sweep, ) ) task.use(termination_checker(max_env_step=step)) diff --git a/ding/bonus/ddpg.py b/ding/bonus/ddpg.py index 03bdd7213c..bc3e5f515f 100644 --- a/ding/bonus/ddpg.py +++ b/ding/bonus/ddpg.py @@ -83,7 +83,8 @@ def train( n_iter_log_show: int = 500, n_iter_save_ckpt: int = 1000, context: Optional[str] = None, - debug: bool = False + debug: bool = False, + wandb_sweep: bool = False, ) -> TrainingReturn: if debug: logging.getLogger().setLevel(logging.DEBUG) @@ -110,7 +111,8 @@ def train( metric_list=self.policy.monitor_vars(), model=self.policy._model, anonymous=True, - project_name=self.exp_name + project_name=self.exp_name, + wandb_sweep=wandb_sweep, ) ) task.use(termination_checker(max_env_step=step)) diff --git a/ding/bonus/dqn.py b/ding/bonus/dqn.py index 64606d5fec..47ccf74e16 100644 --- a/ding/bonus/dqn.py +++ b/ding/bonus/dqn.py @@ -81,7 +81,8 @@ def train( evaluator_env_num: int = 4, n_iter_save_ckpt: int = 1000, context: Optional[str] = None, - debug: bool = False + debug: bool = False, + wandb_sweep: bool = False, ) -> TrainingReturn: if debug: logging.getLogger().setLevel(logging.DEBUG) @@ -111,7 +112,8 @@ def train( metric_list=self.policy.monitor_vars(), model=self.policy._model, anonymous=True, - project_name=self.exp_name + project_name=self.exp_name, + wandb_sweep=wandb_sweep, ) ) task.use(termination_checker(max_env_step=step)) diff --git a/ding/bonus/pg.py b/ding/bonus/pg.py index 4044880151..3c2e3a2b46 100644 --- a/ding/bonus/pg.py +++ b/ding/bonus/pg.py @@ -74,7 +74,8 @@ def train( n_iter_log_show: int = 500, n_iter_save_ckpt: int = 1000, context: Optional[str] = None, - debug: bool = False + debug: bool = False, + wandb_sweep: bool = False, ) -> TrainingReturn: if debug: logging.getLogger().setLevel(logging.DEBUG) @@ -95,7 +96,8 @@ def train( metric_list=self.policy.monitor_vars(), model=self.policy._model, anonymous=True, - project_name=self.exp_name + project_name=self.exp_name, + wandb_sweep=wandb_sweep, ) ) task.use(termination_checker(max_env_step=step)) diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py index f46f9136da..c324d318b8 100644 --- a/ding/bonus/ppof.py +++ b/ding/bonus/ppof.py @@ -119,7 +119,8 @@ def train( n_iter_save_ckpt: int = 1000, context: Optional[str] = None, reward_model: Optional[str] = None, - debug: bool = False + debug: bool = False, + wandb_sweep: bool = False, ) -> TrainingReturn: if debug: 
logging.getLogger().setLevel(logging.DEBUG) @@ -143,7 +144,8 @@ def train( metric_list=self.policy.monitor_vars(), model=self.policy._model, anonymous=True, - project_name=self.exp_name + project_name=self.exp_name, + wandb_sweep=wandb_sweep, ) ) task.use(termination_checker(max_env_step=step)) diff --git a/ding/bonus/sac.py b/ding/bonus/sac.py index d4367d212f..4fb7dc3778 100644 --- a/ding/bonus/sac.py +++ b/ding/bonus/sac.py @@ -84,7 +84,8 @@ def train( n_iter_log_show: int = 500, n_iter_save_ckpt: int = 1000, context: Optional[str] = None, - debug: bool = False + debug: bool = False, + wandb_sweep: bool = False, ) -> TrainingReturn: if debug: logging.getLogger().setLevel(logging.DEBUG) @@ -111,7 +112,8 @@ def train( metric_list=self.policy.monitor_vars(), model=self.policy._model, anonymous=True, - project_name=self.exp_name + project_name=self.exp_name, + wandb_sweep=wandb_sweep, ) ) task.use(termination_checker(max_env_step=step)) diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 70deb52c74..0cb501c68e 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -83,7 +83,8 @@ def train( n_iter_log_show: int = 500, n_iter_save_ckpt: int = 1000, context: Optional[str] = None, - debug: bool = False + debug: bool = False, + wandb_sweep: bool = False, ) -> TrainingReturn: if debug: logging.getLogger().setLevel(logging.DEBUG) @@ -110,7 +111,8 @@ def train( metric_list=self.policy.monitor_vars(), model=self.policy._model, anonymous=True, - project_name=self.exp_name + project_name=self.exp_name, + wandb_sweep=wandb_sweep, ) ) task.use(termination_checker(max_env_step=step)) From d2d7e8e41af2c08850f0f885c0e2600c996c45d2 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 30 May 2023 12:32:45 +0000 Subject: [PATCH 124/244] polish code for a2c mujoco --- ding/bonus/config.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 18e8183c93..59d590a993 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -238,15 +238,15 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: action_space='continuous', ), learn=dict( - batch_size=256, + batch_size=64, learning_rate=0.0003, value_weight=0.5, - entropy_weight=0.001, + entropy_weight=0.01, adv_norm=True, ), collect=dict( - n_sample=256, - discount_factor=0.999, + n_sample=64, + discount_factor=0.99, ), ), wandb_logger=dict( @@ -280,16 +280,16 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: action_shape=6, ), learn=dict( - batch_size=1000, + batch_size=128, learning_rate=0.0003, value_weight=0.5, - entropy_weight=0.001, + entropy_weight=0.01, ignore_done=True, adv_norm=True, ), collect=dict( - n_sample=1000, - discount_factor=0.999, + n_sample=128, + discount_factor=0.99, ), ), wandb_logger=dict( @@ -323,15 +323,15 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: action_shape=6, ), learn=dict( - batch_size=256, + batch_size=64, learning_rate=0.0003, value_weight=0.5, - entropy_weight=0.001, + entropy_weight=0.01, adv_norm=True, ), collect=dict( - n_sample=256, - discount_factor=0.999, + n_sample=64, + discount_factor=0.99, ), ), wandb_logger=dict( From 168fd41e0fac1525966d163fe71f0a0835040699 Mon Sep 17 00:00:00 2001 From: zhangpaipai <1124321458@qq.com> Date: Wed, 31 May 2023 16:35:42 +0800 Subject: [PATCH 125/244] add pg pendulum new pipeline --- ding/bonus/config.py | 41 +++++++++++++++++++++++++++++++++++++++++ ding/bonus/pg.py | 5 +++++ 2 files changed, 46 insertions(+) diff --git 
a/ding/bonus/config.py b/ding/bonus/config.py index 59d590a993..e35a5dff76 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -385,6 +385,47 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ), ) ) + elif env == 'pendulum': + cfg.update( + dict( + exp_name='Pendulum-v1-PG', + seed=0, + env=dict( + collector_env_num=8, + evaluator_env_num=8, + act_scale=True, + n_evaluator_episode=8, + stop_value=-200, + ), + policy=dict( + cuda=False, + action_space='continuous', + model=dict( + action_space='continuous', + obs_shape=3, + action_shape=1, + ), + learn=dict( + batch_size=64, + learning_rate=0.001, + entropy_weight=0.001, + ), + collect=dict( + n_episode=40, + unroll_len=1, + discount_factor=0.99, + ), + eval=dict(evaluator=dict(eval_freq=100, )) + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) else: raise KeyError("not supported env type: {}".format(env)) elif algorithm == 'TD3': diff --git a/ding/bonus/pg.py b/ding/bonus/pg.py index 3c2e3a2b46..b1a78279b7 100644 --- a/ding/bonus/pg.py +++ b/ding/bonus/pg.py @@ -22,6 +22,11 @@ class PGAgent: supported_env_list = [ 'lunarlander_discrete', + 'bipedalwalker', + 'pendulum', + 'hopper', + 'HalfCheetah', + 'Walker2d', ] algorithm = 'PG' From de2d180522eb2d2c6be9ede218fe38403fdc5c7b Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 31 May 2023 15:14:27 +0000 Subject: [PATCH 126/244] fix scalar action bug in random collect --- ding/bonus/dqn.py | 17 ++++++++--------- .../middleware/functional/collector.py | 3 +++ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/ding/bonus/dqn.py b/ding/bonus/dqn.py index 47ccf74e16..bf1a509b14 100644 --- a/ding/bonus/dqn.py +++ b/ding/bonus/dqn.py @@ -94,15 +94,14 @@ def train( with task.start(ctx=OnlineRLContext()): task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) task.use(eps_greedy_handler(self.cfg)) - # task.use( - # StepCollector( - # self.cfg, - # self.policy.collect_mode, - # collector_env, - # random_collect_size=self.cfg.policy.random_collect_size - # ) - # ) - task.use(StepCollector(self.cfg, self.policy.collect_mode, collector_env)) + task.use( + StepCollector( + self.cfg, + self.policy.collect_mode, + collector_env, + random_collect_size=self.cfg.policy.random_collect_size + ) + ) task.use(nstep_reward_enhancer(self.cfg)) task.use(data_pusher(self.cfg, self.buffer_)) task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_)) diff --git a/ding/framework/middleware/functional/collector.py b/ding/framework/middleware/functional/collector.py index 62d183e6d8..39b722188a 100644 --- a/ding/framework/middleware/functional/collector.py +++ b/ding/framework/middleware/functional/collector.py @@ -77,6 +77,9 @@ def _inference(ctx: "OnlineRLContext"): obs = {i: obs[i] for i in range(get_shape0(obs))} # TBD inference_output = policy.forward(obs, **ctx.collect_kwargs) + for key, value in inference_output.items(): + if value['action'].dim()==0: + inference_output[key]['action']=value['action'].unsqueeze(0) ctx.action = [to_ndarray(v['action']) for v in inference_output.values()] # TBD ctx.inference_output = inference_output From 1a2d4dd490651012ea5e0dfe34574a8272275307 Mon Sep 17 00:00:00 2001 From: zhangpaipai <1124321458@qq.com> Date: Thu, 1 Jun 2023 12:41:20 +0800 Subject: [PATCH 127/244] polish pg algorithm --- ding/policy/pg.py | 6 ++++-- dizoo/classic_control/pendulum/config/pendulum_pg_config.py | 4 ++-- 2 files changed, 6 
insertions(+), 4 deletions(-) diff --git a/ding/policy/pg.py b/ding/policy/pg.py index 93a7fd3292..8006b4128f 100644 --- a/ding/policy/pg.py +++ b/ding/policy/pg.py @@ -90,13 +90,15 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: self._model.train() return_infos = [] - for batch in split_data_generator(data, self._cfg.learn.batch_size, shuffle=True): + dummy_batch = len(data['obs']) + for batch in split_data_generator(data, dummy_batch, shuffle=True): # forward output = self._learn_model.forward(batch['obs']) return_ = batch['return'] dist = output['dist'] - # calculate PG loss + if len(batch['action'].shape) == 1: + batch['action'] = batch['action'].unsqueeze(-1) log_prob = dist.log_prob(batch['action']) policy_loss = -(log_prob * return_).mean() entropy_loss = -self._cfg.learn.entropy_weight * dist.entropy().mean() diff --git a/dizoo/classic_control/pendulum/config/pendulum_pg_config.py b/dizoo/classic_control/pendulum/config/pendulum_pg_config.py index 546e694106..0dfb8e90ad 100644 --- a/dizoo/classic_control/pendulum/config/pendulum_pg_config.py +++ b/dizoo/classic_control/pendulum/config/pendulum_pg_config.py @@ -3,7 +3,7 @@ pendulum_pg_config = dict( exp_name='pendulum_pg_seed0', env=dict( - collector_env_num=10, + collector_env_num=8, evaluator_env_num=5, act_scale=True, n_evaluator_episode=5, @@ -18,7 +18,7 @@ action_shape=1, ), learn=dict( - batch_size=64, + batch_size=200, learning_rate=0.001, entropy_weight=0.001, ), From 1221565cf3d3e2a2ecc17387e65a6503fd520fda Mon Sep 17 00:00:00 2001 From: zhangpaipai <1124321458@qq.com> Date: Thu, 1 Jun 2023 15:05:38 +0800 Subject: [PATCH 128/244] add bonus pg config --- ding/bonus/config.py | 184 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 178 insertions(+), 6 deletions(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index e35a5dff76..60f0640003 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -347,7 +347,136 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: raise KeyError("not supported env type: {}".format(env)) elif algorithm == 'PG': cfg = EasyDict({"policy": PGPolicy.default_config()}) - if env == 'lunarlander_discrete': + if env == 'hopper': + cfg.update( + dict( + exp_name='Hopper-v3-PG', + seed=0, + env=dict( + env_id='Hopper-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=6000, + ), + policy=dict( + cuda=True, + action_space='continuous', + model=dict( + action_space='continuous', + obs_shape=11, + action_shape=3, + ), + learn=dict( + batch_size=64, + learning_rate=0.001, + entropy_weight=0.001, + ), + collect=dict( + n_episode=8, + unroll_len=1, + discount_factor=0.99, + ), + eval=dict(evaluator=dict(eval_freq=200, )) + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) + elif env == 'HalfCheetah': + cfg.update( + dict( + exp_name='HalfCheetah-v3-PG', + seed=0, + env=dict( + env_id='HalfCheetah-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=6000, + ), + policy=dict( + cuda=True, + action_space='continuous', + model=dict( + action_space='continuous', + obs_shape=17, + action_shape=6, + ), + learn=dict( + batch_size=64, + learning_rate=0.001, + entropy_weight=0.001, + ), + collect=dict( + n_episode=8, + unroll_len=1, + discount_factor=0.99, + ), 
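# n_episode counts whole episodes gathered per collect iteration; with collector_env_num=8 above,
# each of the vectorized envs contributes roughly one full trajectory per iteration.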
+ eval=dict(evaluator=dict(eval_freq=200, )) + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) + elif env == 'Walker2d': + cfg.update( + dict( + exp_name='Walker2d-v3-PG', + seed=0, + env=dict( + env_id='Walker2d-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=6000, + ), + policy=dict( + cuda=True, + action_space='continuous', + model=dict( + action_space='continuous', + obs_shape=17, + action_shape=6, + ), + learn=dict( + batch_size=64, + learning_rate=0.001, + entropy_weight=0.001, + ), + collect=dict( + n_episode=8, + unroll_len=1, + discount_factor=0.99, + ), + eval=dict(evaluator=dict(eval_freq=200, )) + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) + elif env == 'lunarlander_discrete': cfg.update( dict( exp_name='LunarLander-v2-PG', @@ -385,16 +514,59 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ), ) ) - elif env == 'pendulum': + elif env == 'bipedalwalker': cfg.update( dict( - exp_name='Pendulum-v1-PG', + exp_name='Bipedalwalker-v3-PG', seed=0, env=dict( + env_id='BipedalWalker-v3', collector_env_num=8, evaluator_env_num=8, act_scale=True, n_evaluator_episode=8, + stop_value=300, + rew_clip=True, + ), + policy=dict( + cuda=True, + action_space='continuous', + model=dict( + action_space='continuous', + obs_shape=24, + action_shape=4, + ), + learn=dict( + batch_size=64, + learning_rate=0.001, + entropy_weight=0.001, + ), + collect=dict( + n_episode=8, + unroll_len=1, + discount_factor=0.99, + ), + eval=dict(evaluator=dict(eval_freq=200, )) + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) + elif env == 'pendulum': + cfg.update( + dict( + exp_name='Pendulum-v1-PG', + seed=0, + env=dict( + collector_env_num=8, + evaluator_env_num=5, + act_scale=True, + n_evaluator_episode=5, stop_value=-200, ), policy=dict( @@ -406,16 +578,16 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: action_shape=1, ), learn=dict( - batch_size=64, + batch_size=200, learning_rate=0.001, entropy_weight=0.001, ), collect=dict( - n_episode=40, + n_episode=20, unroll_len=1, discount_factor=0.99, ), - eval=dict(evaluator=dict(eval_freq=100, )) + eval=dict(evaluator=dict(eval_freq=200, )) ), wandb_logger=dict( gradient_logger=True, From 1018ddae85917ab8a3d46b97fdd1fe3cec99dcb4 Mon Sep 17 00:00:00 2001 From: zhangpaipai <1124321458@qq.com> Date: Thu, 1 Jun 2023 15:56:01 +0800 Subject: [PATCH 129/244] polish pg config --- ding/policy/pg.py | 3 +-- dizoo/classic_control/pendulum/config/pendulum_pg_config.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/ding/policy/pg.py b/ding/policy/pg.py index 8006b4128f..d3c678235c 100644 --- a/ding/policy/pg.py +++ b/ding/policy/pg.py @@ -90,8 +90,7 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: self._model.train() return_infos = [] - dummy_batch = len(data['obs']) - for batch in split_data_generator(data, dummy_batch, shuffle=True): + for batch in split_data_generator(data, self._cfg.learn.batch_size, shuffle=True): # forward output = self._learn_model.forward(batch['obs']) return_ = batch['return'] diff --git a/dizoo/classic_control/pendulum/config/pendulum_pg_config.py 
b/dizoo/classic_control/pendulum/config/pendulum_pg_config.py index 0dfb8e90ad..d448dee002 100644 --- a/dizoo/classic_control/pendulum/config/pendulum_pg_config.py +++ b/dizoo/classic_control/pendulum/config/pendulum_pg_config.py @@ -18,12 +18,12 @@ action_shape=1, ), learn=dict( - batch_size=200, + batch_size=400 learning_rate=0.001, entropy_weight=0.001, ), collect=dict( - n_episode=40, + n_episode=2, unroll_len=1, discount_factor=0.99, ), From f197844dc7cb8dfceb9733f39c5e032e28cc15a5 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 1 Jun 2023 10:01:35 +0000 Subject: [PATCH 130/244] polish config --- ding/bonus/config.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index e35a5dff76..426aa15cff 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -195,8 +195,8 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: learn=dict( batch_size=64, learning_rate=0.0003, - value_weight=0.5, - entropy_weight=0.001, + value_weight=0.7, + entropy_weight=0.0005, discount_factor=0.99, adv_norm=True, ), @@ -280,15 +280,16 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: action_shape=6, ), learn=dict( - batch_size=128, + batch_size=256, learning_rate=0.0003, value_weight=0.5, entropy_weight=0.01, + grad_norm=0.5, ignore_done=True, adv_norm=True, ), collect=dict( - n_sample=128, + n_sample=256, discount_factor=0.99, ), ), From 0bc6923c8c433dfede30bc1651fc2b6287f9e39e Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 1 Jun 2023 10:07:03 +0000 Subject: [PATCH 131/244] polish code --- ding/bonus/c51.py | 16 ++++++++-------- ding/bonus/ddpg.py | 18 +++++++++--------- ding/bonus/dqn.py | 16 ++++++++-------- ding/bonus/pg.py | 18 +++++++++--------- ding/bonus/ppof.py | 20 ++++++++++---------- ding/bonus/sac.py | 18 +++++++++--------- ding/bonus/td3.py | 18 +++++++++--------- 7 files changed, 62 insertions(+), 62 deletions(-) diff --git a/ding/bonus/c51.py b/ding/bonus/c51.py index a299e2d478..f9d9518bce 100644 --- a/ding/bonus/c51.py +++ b/ding/bonus/c51.py @@ -75,14 +75,14 @@ def __init__( self.checkpoint_save_dir = os.path.join(self.exp_name, "ckpt") def train( - self, - step: int = int(1e7), - collector_env_num: int = 4, - evaluator_env_num: int = 4, - n_iter_save_ckpt: int = 1000, - context: Optional[str] = None, - debug: bool = False, - wandb_sweep: bool = False, + self, + step: int = int(1e7), + collector_env_num: int = 4, + evaluator_env_num: int = 4, + n_iter_save_ckpt: int = 1000, + context: Optional[str] = None, + debug: bool = False, + wandb_sweep: bool = False, ) -> TrainingReturn: if debug: logging.getLogger().setLevel(logging.DEBUG) diff --git a/ding/bonus/ddpg.py b/ding/bonus/ddpg.py index bc3e5f515f..6e7ec97eb1 100644 --- a/ding/bonus/ddpg.py +++ b/ding/bonus/ddpg.py @@ -76,15 +76,15 @@ def __init__( self.checkpoint_save_dir = os.path.join(self.exp_name, "ckpt") def train( - self, - step: int = int(1e7), - collector_env_num: int = 4, - evaluator_env_num: int = 4, - n_iter_log_show: int = 500, - n_iter_save_ckpt: int = 1000, - context: Optional[str] = None, - debug: bool = False, - wandb_sweep: bool = False, + self, + step: int = int(1e7), + collector_env_num: int = 4, + evaluator_env_num: int = 4, + n_iter_log_show: int = 500, + n_iter_save_ckpt: int = 1000, + context: Optional[str] = None, + debug: bool = False, + wandb_sweep: bool = False, ) -> TrainingReturn: if debug: logging.getLogger().setLevel(logging.DEBUG) diff --git a/ding/bonus/dqn.py b/ding/bonus/dqn.py index 
bf1a509b14..46326be9c9 100644 --- a/ding/bonus/dqn.py +++ b/ding/bonus/dqn.py @@ -75,14 +75,14 @@ def __init__( self.checkpoint_save_dir = os.path.join(self.exp_name, "ckpt") def train( - self, - step: int = int(1e7), - collector_env_num: int = 4, - evaluator_env_num: int = 4, - n_iter_save_ckpt: int = 1000, - context: Optional[str] = None, - debug: bool = False, - wandb_sweep: bool = False, + self, + step: int = int(1e7), + collector_env_num: int = 4, + evaluator_env_num: int = 4, + n_iter_save_ckpt: int = 1000, + context: Optional[str] = None, + debug: bool = False, + wandb_sweep: bool = False, ) -> TrainingReturn: if debug: logging.getLogger().setLevel(logging.DEBUG) diff --git a/ding/bonus/pg.py b/ding/bonus/pg.py index b1a78279b7..d14c60e949 100644 --- a/ding/bonus/pg.py +++ b/ding/bonus/pg.py @@ -72,15 +72,15 @@ def __init__( self.checkpoint_save_dir = os.path.join(self.exp_name, "ckpt") def train( - self, - step: int = int(1e7), - collector_env_num: int = 4, - evaluator_env_num: int = 4, - n_iter_log_show: int = 500, - n_iter_save_ckpt: int = 1000, - context: Optional[str] = None, - debug: bool = False, - wandb_sweep: bool = False, + self, + step: int = int(1e7), + collector_env_num: int = 4, + evaluator_env_num: int = 4, + n_iter_log_show: int = 500, + n_iter_save_ckpt: int = 1000, + context: Optional[str] = None, + debug: bool = False, + wandb_sweep: bool = False, ) -> TrainingReturn: if debug: logging.getLogger().setLevel(logging.DEBUG) diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py index c324d318b8..5cc3be8379 100644 --- a/ding/bonus/ppof.py +++ b/ding/bonus/ppof.py @@ -111,16 +111,16 @@ def __init__( self.checkpoint_save_dir = os.path.join(self.exp_name, "ckpt") def train( - self, - step: int = int(1e7), - collector_env_num: int = 4, - evaluator_env_num: int = 4, - n_iter_log_show: int = 500, - n_iter_save_ckpt: int = 1000, - context: Optional[str] = None, - reward_model: Optional[str] = None, - debug: bool = False, - wandb_sweep: bool = False, + self, + step: int = int(1e7), + collector_env_num: int = 4, + evaluator_env_num: int = 4, + n_iter_log_show: int = 500, + n_iter_save_ckpt: int = 1000, + context: Optional[str] = None, + reward_model: Optional[str] = None, + debug: bool = False, + wandb_sweep: bool = False, ) -> TrainingReturn: if debug: logging.getLogger().setLevel(logging.DEBUG) diff --git a/ding/bonus/sac.py b/ding/bonus/sac.py index 4fb7dc3778..0cef4d6cef 100644 --- a/ding/bonus/sac.py +++ b/ding/bonus/sac.py @@ -77,15 +77,15 @@ def __init__( self.checkpoint_save_dir = os.path.join(self.exp_name, "ckpt") def train( - self, - step: int = int(1e7), - collector_env_num: int = 4, - evaluator_env_num: int = 4, - n_iter_log_show: int = 500, - n_iter_save_ckpt: int = 1000, - context: Optional[str] = None, - debug: bool = False, - wandb_sweep: bool = False, + self, + step: int = int(1e7), + collector_env_num: int = 4, + evaluator_env_num: int = 4, + n_iter_log_show: int = 500, + n_iter_save_ckpt: int = 1000, + context: Optional[str] = None, + debug: bool = False, + wandb_sweep: bool = False, ) -> TrainingReturn: if debug: logging.getLogger().setLevel(logging.DEBUG) diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 0cb501c68e..763dd2d722 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -76,15 +76,15 @@ def __init__( self.checkpoint_save_dir = os.path.join(self.exp_name, "ckpt") def train( - self, - step: int = int(1e7), - collector_env_num: int = 4, - evaluator_env_num: int = 4, - n_iter_log_show: int = 500, - n_iter_save_ckpt: int = 1000, - context: 
Optional[str] = None, - debug: bool = False, - wandb_sweep: bool = False, + self, + step: int = int(1e7), + collector_env_num: int = 4, + evaluator_env_num: int = 4, + n_iter_log_show: int = 500, + n_iter_save_ckpt: int = 1000, + context: Optional[str] = None, + debug: bool = False, + wandb_sweep: bool = False, ) -> TrainingReturn: if debug: logging.getLogger().setLevel(logging.DEBUG) From 5473706b19981746e4a5d8bf2a0ffd36ca13d2ff Mon Sep 17 00:00:00 2001 From: zhangpaipai <1124321458@qq.com> Date: Thu, 1 Jun 2023 19:25:31 +0800 Subject: [PATCH 132/244] change pendulum pg config --- ding/bonus/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 48ab910a3b..905eae2a9e 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -579,12 +579,12 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: action_shape=1, ), learn=dict( - batch_size=200, + batch_size=800, learning_rate=0.001, entropy_weight=0.001, ), collect=dict( - n_episode=20, + n_episode=4, unroll_len=1, discount_factor=0.99, ), From db8176b35ab743ed0f060ab2c4708f066f2249a6 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 1 Jun 2023 13:17:16 +0000 Subject: [PATCH 133/244] fix continuous action dim=1 bug --- .../functional/advantage_estimator.py | 21 +++++++++++++++---- .../middleware/functional/collector.py | 6 +++--- .../tests/test_advantage_estimator.py | 4 ++-- ding/policy/common_utils.py | 5 ++++- ding/policy/pg.py | 2 -- ding/policy/policy_factory.py | 2 ++ ding/utils/data/collate_fn.py | 7 ++++--- 7 files changed, 32 insertions(+), 15 deletions(-) diff --git a/ding/framework/middleware/functional/advantage_estimator.py b/ding/framework/middleware/functional/advantage_estimator.py index bb6347bc39..21ef898f33 100644 --- a/ding/framework/middleware/functional/advantage_estimator.py +++ b/ding/framework/middleware/functional/advantage_estimator.py @@ -41,8 +41,15 @@ def _gae(ctx: "OnlineRLContext"): - train_data (:obj:`List[treetensor.torch.Tensor]`): The processed data if `buffer_` is None. 
""" cuda = cfg.policy.cuda and torch.cuda.is_available() - data = ctx.trajectories # List - data = ttorch_collate(data) # ttorch.Tensor + + # action shape (B,) for discete action, (B, D,) for continuous action + # reward shape (B,) done shape (B,) value shape (B,) + data = ttorch_collate(ctx.trajectories, cat_1dim=True) + if data['action'].dtype in [torch.float16,torch.float32,torch.double] \ + and data['action'].dim() == 1 : + # action shape + data['action'] = data['action'].unsqueeze(-1) + with torch.no_grad(): if cuda: data = data.cuda() @@ -72,7 +79,10 @@ def _gae(ctx: "OnlineRLContext"): def ppof_adv_estimator(policy: Policy) -> Callable: def _estimator(ctx: "OnlineRLContext"): - data = ttorch_collate(ctx.trajectories) + data = ttorch_collate(ctx.trajectories, cat_1dim=True) + if data['action'].dtype in [torch.float16,torch.float32,torch.double] \ + and data['action'].dim() == 1 : + data['action'] = data['action'].unsqueeze(-1) traj_flag = data.done.clone() traj_flag[ctx.trajectory_end_idx] = True data.traj_flag = traj_flag @@ -86,7 +96,10 @@ def pg_estimator(policy: Policy) -> Callable: def _estimator(ctx: "OnlineRLContext"): train_data = [] for episode in ctx.episodes: - data = ttorch_collate(episode) + data = ttorch_collate(episode, cat_1dim=True) + if data['action'].dtype in [torch.float16,torch.float32,torch.double] \ + and data['action'].dim() == 1 : + data['action'] = data['action'].unsqueeze(-1) data = policy.get_train_sample(data) train_data.append(data) ctx.train_data = ttorch.cat(train_data, dim=0) diff --git a/ding/framework/middleware/functional/collector.py b/ding/framework/middleware/functional/collector.py index 39b722188a..f530bfabd4 100644 --- a/ding/framework/middleware/functional/collector.py +++ b/ding/framework/middleware/functional/collector.py @@ -77,9 +77,9 @@ def _inference(ctx: "OnlineRLContext"): obs = {i: obs[i] for i in range(get_shape0(obs))} # TBD inference_output = policy.forward(obs, **ctx.collect_kwargs) - for key, value in inference_output.items(): - if value['action'].dim()==0: - inference_output[key]['action']=value['action'].unsqueeze(0) + # for key, value in inference_output.items(): + # if value['action'].dim() == 0: + # inference_output[key]['action'] = value['action'].unsqueeze(0) ctx.action = [to_ndarray(v['action']) for v in inference_output.values()] # TBD ctx.inference_output = inference_output diff --git a/ding/framework/middleware/tests/test_advantage_estimator.py b/ding/framework/middleware/tests/test_advantage_estimator.py index 66ad45e77d..3c4cc5d849 100644 --- a/ding/framework/middleware/tests/test_advantage_estimator.py +++ b/ding/framework/middleware/tests/test_advantage_estimator.py @@ -53,7 +53,7 @@ def call_gae_estimator(batch_size: int = 32, trajectory_end_idx_size: int = 5, b } ) for _ in range(batch_size) ] - ctx.trajectories_copy = ttorch_collate(copy.deepcopy(ctx.trajectories)) + ctx.trajectories_copy = ttorch_collate(copy.deepcopy(ctx.trajectories), cat_1dim=True) traj_flag = ctx.trajectories_copy.done.clone() traj_flag[ctx.trajectory_end_idx] = True ctx.trajectories_copy.traj_flag = traj_flag @@ -67,7 +67,7 @@ def call_gae_estimator(batch_size: int = 32, trajectory_end_idx_size: int = 5, b d.logit = d.logit[0] d.next_obs = d.next_obs[0] d.obs = d.obs[0] - ctx.train_data = ttorch_collate(train_data) + ctx.train_data = ttorch_collate(train_data, cat_1dim=True) assert ctx.trajectories is None assert torch.equal(ctx.trajectories_copy.action, ctx.train_data.action) diff --git a/ding/policy/common_utils.py 
b/ding/policy/common_utils.py index c25b2f9bf3..714583e63f 100644 --- a/ding/policy/common_utils.py +++ b/ding/policy/common_utils.py @@ -13,7 +13,10 @@ def default_preprocess_learn( ignore_done: bool = False, ) -> dict: # data preprocess - data = default_collate(data) + if data[0]['action'].dtype in [torch.int8, torch.int16, torch.int32, torch.int64]: + data = default_collate(data, cat_1dim=True) # for discrete action + else: + data = default_collate(data, cat_1dim=False) # for continuous action if ignore_done: data['done'] = torch.zeros_like(data['done']).float() else: diff --git a/ding/policy/pg.py b/ding/policy/pg.py index d3c678235c..754a44b994 100644 --- a/ding/policy/pg.py +++ b/ding/policy/pg.py @@ -96,8 +96,6 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: return_ = batch['return'] dist = output['dist'] # calculate PG loss - if len(batch['action'].shape) == 1: - batch['action'] = batch['action'].unsqueeze(-1) log_prob = dist.log_prob(batch['action']) policy_loss = -(log_prob * return_).mean() entropy_loss = -self._cfg.learn.entropy_weight * dist.entropy().mean() diff --git a/ding/policy/policy_factory.py b/ding/policy/policy_factory.py index 3ccdccbb9b..410b24ef1d 100644 --- a/ding/policy/policy_factory.py +++ b/ding/policy/policy_factory.py @@ -38,6 +38,8 @@ def forward(data: Dict[int, Any], *args, **kwargs) -> Dict[int, Any]: action = torch.as_tensor(action_space.sample()) if isinstance(action_space, gym.spaces.MultiDiscrete): action = [torch.LongTensor([v]) for v in action] + elif isinstance(action_space, gym.spaces.Discrete): + action = action.unsqueeze(0) actions[env_id] = {'action': action} elif 'global_state' in data[env_id].keys(): # for smac diff --git a/ding/utils/data/collate_fn.py b/ding/utils/data/collate_fn.py index 94cfa105ee..aa416f0df5 100644 --- a/ding/utils/data/collate_fn.py +++ b/ding/utils/data/collate_fn.py @@ -17,7 +17,7 @@ ) -def ttorch_collate(x, json=False): +def ttorch_collate(x, json: bool = False, cat_1dim: bool = True): def inplace_fn(t): for k in t.keys(): @@ -28,7 +28,8 @@ def inplace_fn(t): inplace_fn(t[k]) x = ttorch.stack(x) - inplace_fn(x) + if cat_1dim: + inplace_fn(x) if json: x = x.json() return x @@ -86,7 +87,7 @@ def default_collate(batch: Sequence, else: return torch.stack(batch, 0, out=out) elif isinstance(elem, ttorch.Tensor): - return ttorch_collate(batch, json=True) + return ttorch_collate(batch, json=True, cat_1dim=cat_1dim) elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \ and elem_type.__name__ != 'string_': if elem_type.__name__ == 'ndarray': From d16fa8608cd8d317f4d90fe5b5a06bf8be16c119 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Mon, 5 Jun 2023 02:25:16 +0000 Subject: [PATCH 134/244] Add ppof lr scheduler --- ding/policy/ppof.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/ding/policy/ppof.py b/ding/policy/ppof.py index aaad6ce241..049f768431 100644 --- a/ding/policy/ppof.py +++ b/ding/policy/ppof.py @@ -27,6 +27,8 @@ class PPOFPolicy: epoch_per_collect=10, batch_size=64, learning_rate=3e-4, + # learningrate scheduler + lr_scheduler=None, # (10000, 0.1) weight_decay=0, value_weight=0.5, entropy_weight=0.01, @@ -81,6 +83,13 @@ def __init__(self, cfg: "EasyDict", model: torch.nn.Module, enable_mode: List[st lr=self._cfg.learning_rate, weight_decay=self._cfg.weight_decay, ) + # define linear lr scheduler + if self._cfg.lr_scheduler is not None: + epoch_num, min_lr_lambda = self._cfg.lr_scheduler + + self._lr_scheduler = torch.optim.lr_scheduler.LambdaLR( + self._optimizer, 
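# The lr_lambda below decays the LR multiplier linearly from 1.0 down to min_lr_lambda over
# epoch_num scheduler steps and then holds it constant; step() is invoked once per learn-mode
# forward() call, as added further down in this patch.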
lr_lambda=lambda epoch: max(1.0 - epoch*(1.0-min_lr_lambda)/epoch_num , min_lr_lambda) + ) if self._cfg.value_norm: self._running_mean_std = RunningMeanStd(epsilon=1e-4, device=self._device) @@ -281,6 +290,10 @@ def forward(self, data: ttorch.Tensor) -> Dict[str, Any]: } ) return_infos.append(return_info) + + if self._cfg.lr_scheduler is not None: + self._lr_scheduler.step() + return return_infos def state_dict(self) -> Dict[str, Any]: From eb86c63118356f4fabcd2d8b9d542759c87e42f7 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 6 Jun 2023 05:15:10 +0000 Subject: [PATCH 135/244] polish config --- ding/bonus/config.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 905eae2a9e..7d06b5852a 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -99,6 +99,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: cfg.batch_size = 320 cfg.epoch_per_collect = 10 cfg.learning_rate = 3e-4 + cfg.lr_scheduler = (2000, 0.1) cfg.model = dict( encoder_hidden_size_list=[64, 64, 128], actor_head_hidden_size=128, @@ -151,13 +152,13 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: action_shape=4, ), learn=dict( - batch_size=160, + batch_size=64, learning_rate=3e-4, entropy_weight=0.001, adv_norm=True, ), collect=dict( - n_sample=320, + n_sample=64, discount_factor=0.99, gae_lambda=0.95, ), @@ -238,14 +239,14 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: action_space='continuous', ), learn=dict( - batch_size=64, + batch_size=128, learning_rate=0.0003, value_weight=0.5, - entropy_weight=0.01, + entropy_weight=0.02, adv_norm=True, ), collect=dict( - n_sample=64, + n_sample=128, discount_factor=0.99, ), ), @@ -324,14 +325,14 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: action_shape=6, ), learn=dict( - batch_size=64, + batch_size=32, learning_rate=0.0003, value_weight=0.5, - entropy_weight=0.01, + entropy_weight=0.005, adv_norm=True, ), collect=dict( - n_sample=64, + n_sample=32, discount_factor=0.99, ), ), From eab79124344de7fede2c69192965cd6147c7ba47 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 6 Jun 2023 12:44:33 +0000 Subject: [PATCH 136/244] fix random collect bug for dqn --- ding/bonus/dqn.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ding/bonus/dqn.py b/ding/bonus/dqn.py index 46326be9c9..a4b6222df6 100644 --- a/ding/bonus/dqn.py +++ b/ding/bonus/dqn.py @@ -99,7 +99,8 @@ def train( self.cfg, self.policy.collect_mode, collector_env, - random_collect_size=self.cfg.policy.random_collect_size + random_collect_size=self.cfg.policy.random_collect_size \ + if hasattr(self.cfg.policy, 'random_collect_size') else 0, ) ) task.use(nstep_reward_enhancer(self.cfg)) From 98a901741fa9c415f64dccff4bdef74a67685627 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 7 Jun 2023 07:06:34 +0000 Subject: [PATCH 137/244] polish ppo qbert spaceinvader config --- ding/bonus/config.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 7d06b5852a..611f7af6f7 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -94,17 +94,40 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: critic_head_hidden_size=128, critic_head_layer_num=2, ) - elif env in ['PongNoFrameskip', 'SpaceInvadersNoFrameskip', 'QbertNoFrameskip']: + elif env == 'PongNoFrameskip': cfg.n_sample = 3200 cfg.batch_size = 320 cfg.epoch_per_collect = 10 cfg.learning_rate 
= 3e-4 + cfg.model = dict( + encoder_hidden_size_list=[64, 64, 128], + actor_head_hidden_size=128, + critic_head_hidden_size=128, + ) + elif env == 'SpaceInvadersNoFrameskip': + cfg.n_sample = 320 + cfg.batch_size = 320 + cfg.epoch_per_collect = 1 + cfg.learning_rate = 1e-3 + cfg.entropy_weight = 0.01 cfg.lr_scheduler = (2000, 0.1) cfg.model = dict( encoder_hidden_size_list=[64, 64, 128], actor_head_hidden_size=128, critic_head_hidden_size=128, ) + elif env == 'QbertNoFrameskip': + cfg.n_sample = 3200 + cfg.batch_size = 320 + cfg.epoch_per_collect = 10 + cfg.learning_rate = 5e-4 + cfg.lr_scheduler = (1000, 0.1) + cfg.deterministic_eval = True + cfg.model = dict( + encoder_hidden_size_list=[64, 64, 128], + actor_head_hidden_size=128, + critic_head_hidden_size=128, + ) elif env == 'minigrid_fourroom': cfg.n_sample = 3200 cfg.batch_size = 320 From b52d8f1333450eea9d9cbd8ae5695f6c8d569ec6 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 9 Jun 2023 06:36:16 +0000 Subject: [PATCH 138/244] remove mujoco wrapper --- ding/envs/env/default_wrapper.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ding/envs/env/default_wrapper.py b/ding/envs/env/default_wrapper.py index 1c9b2c7ed9..a7ce75f763 100644 --- a/ding/envs/env/default_wrapper.py +++ b/ding/envs/env/default_wrapper.py @@ -9,7 +9,6 @@ def get_default_wrappers(env_wrapper_name: str, env_id: Optional[str] = None, ca assert caller == 'collector' or 'evaluator' if env_wrapper_name == 'mujoco_default': return [ - EasyDict(type='delay_reward', kwargs=dict(delay_reward_step=3)), copy.deepcopy(eval_episode_return_wrapper), ] elif env_wrapper_name == 'atari_default': From 8b15b5299d882acb39bbea60664d8679a64f65ed Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 9 Jun 2023 06:37:12 +0000 Subject: [PATCH 139/244] polish a2c mujoco config; add ppo offpolicy agent pipeline --- ding/bonus/__init__.py | 1 + ding/bonus/config.py | 51 +++++++- ding/bonus/ppo_offpolicy.py | 239 ++++++++++++++++++++++++++++++++++++ ding/policy/ppo.py | 5 + 4 files changed, 295 insertions(+), 1 deletion(-) create mode 100644 ding/bonus/ppo_offpolicy.py diff --git a/ding/bonus/__init__.py b/ding/bonus/__init__.py index 482b8b49d6..3589f6aaf6 100644 --- a/ding/bonus/__init__.py +++ b/ding/bonus/__init__.py @@ -1,5 +1,6 @@ from .a2c import A2CAgent from .ppof import PPOF +from .ppo_offpolicy import PPOOffPolicyAgent from .c51 import C51Agent from .td3 import TD3Agent from .ddpg import DDPGAgent diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 611f7af6f7..607a7ed3aa 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -5,7 +5,7 @@ from ding.envs.env_wrappers import MaxAndSkipWrapper, WarpFrameWrapper, ScaledFloatFrameWrapper, FrameStackWrapper, \ EvalEpisodeReturnEnv, TransposeWrapper, TimeLimitWrapper, FlatObsWrapper, GymToGymnasiumWrapper from ding.policy import PPOFPolicy, A2CPolicy, TD3Policy, DDPGPolicy, SACPolicy, DQNPolicy, IMPALAPolicy, \ - PGPolicy, C51Policy + PGPolicy, C51Policy, PPOOffPolicy def get_instance_config(env: str, algorithm: str) -> EasyDict: @@ -155,6 +155,52 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: cfg.learning_rate = 3e-4 else: raise KeyError("not supported env type: {}".format(env)) + elif algorithm == 'PPOOffPolicy': + cfg = EasyDict({"policy": PPOOffPolicy.default_config()}) + if env == 'lunarlander_discrete': + cfg.update( + dict( + exp_name='LunarLander-v2-PPOOffPolicy', + env=dict( + collector_env_num=8, + evaluator_env_num=8, + env_id='LunarLander-v2', + n_evaluator_episode=8, + stop_value=240, + 
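# LunarLander-v2 is conventionally regarded as solved at an average return of 200; stop_value=240
# above sets a slightly stricter stopping target for this off-policy PPO baseline.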
), + policy=dict( + cuda=True, + model=dict( + obs_shape=8, + action_shape=4, + ), + learn=dict( + update_per_collect=4, + batch_size=64, + learning_rate=0.001, + value_weight=0.5, + entropy_weight=0.01, + clip_ratio=0.2, + nstep=1, + nstep_return=False, + adv_norm=True, + ), + collect=dict( + n_sample=128, + unroll_len=1, + discount_factor=0.99, + gae_lambda=0.95, + ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) elif algorithm == 'A2C': cfg = EasyDict({"policy": A2CPolicy.default_config()}) if env == 'lunarlander_discrete': @@ -271,6 +317,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: collect=dict( n_sample=128, discount_factor=0.99, + gae_lambda=0.95, ), ), wandb_logger=dict( @@ -315,6 +362,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: collect=dict( n_sample=256, discount_factor=0.99, + gae_lambda=0.95, ), ), wandb_logger=dict( @@ -357,6 +405,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: collect=dict( n_sample=32, discount_factor=0.99, + gae_lambda=0.95, ), ), wandb_logger=dict( diff --git a/ding/bonus/ppo_offpolicy.py b/ding/bonus/ppo_offpolicy.py new file mode 100644 index 0000000000..5fd01b246e --- /dev/null +++ b/ding/bonus/ppo_offpolicy.py @@ -0,0 +1,239 @@ +from dataclasses import dataclass +from typing import Optional, Union +from ditk import logging +from easydict import EasyDict +import os +from functools import partial +import torch +import treetensor.torch as ttorch +from ding.framework import task, OnlineRLContext +from ding.framework.middleware import CkptSaver, final_ctx_saver, OffPolicyLearner, StepCollector, \ + wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, gae_estimator +from ding.envs import BaseEnv, BaseEnvManagerV2, SubprocessEnvManagerV2 +from ding.policy import PPOOffPolicy +from ding.utils import set_pkg_seed +from ding.config import Config, save_config_py, compile_config +from ding.model import VAC +from ding.data import DequeBuffer +from ding.bonus.config import get_instance_config, get_instance_env +from ding.bonus.common import TrainingReturn, EvalReturn + + +class PPOOffPolicyAgent: + supported_env_list = [ + 'lunarlander_discrete', + ] + algorithm = 'PPOOffPolicy' + + def __init__( + self, + env: Union[str, BaseEnv], + seed: int = 0, + exp_name: str = None, + model: Optional[torch.nn.Module] = None, + cfg: Optional[Union[EasyDict, dict, str]] = None, + policy_state_dict: str = None, + ) -> None: + if isinstance(env, str): + assert env in PPOOffPolicyAgent.supported_env_list, "Please use supported envs: {}".format( + PPOOffPolicyAgent.supported_env_list + ) + self.env = get_instance_env(env) + if cfg is None: + # 'It should be default env tuned config' + cfg = get_instance_config(env, algorithm=PPOOffPolicyAgent.algorithm) + else: + assert isinstance(cfg, EasyDict), "Please use EasyDict as config data type." 
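# A minimal usage sketch for the agent defined in this file; the experiment name and step budget
# are arbitrary examples, everything else follows the interface added below (train / deploy / best):
#
#   from ding.bonus import PPOOffPolicyAgent
#
#   agent = PPOOffPolicyAgent(env='lunarlander_discrete', exp_name='lunarlander-ppo-off-demo')
#   agent.train(step=int(1e6))
#   agent.best.deploy(enable_save_replay=True)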
+ + if exp_name is not None: + cfg.exp_name = exp_name + self.cfg = compile_config(cfg, policy=PPOOffPolicy) + self.exp_name = self.cfg.exp_name + + elif isinstance(env, BaseEnv): + self.cfg = compile_config(cfg, policy=PPOOffPolicy) + raise NotImplementedError + else: + raise TypeError("not support env type: {}, only strings and instances of `BaseEnv` now".format(type(env))) + logging.getLogger().setLevel(logging.INFO) + self.seed = seed + set_pkg_seed(self.seed, use_cuda=self.cfg.policy.cuda) + if not os.path.exists(self.exp_name): + os.makedirs(self.exp_name) + save_config_py(self.cfg, os.path.join(self.exp_name, 'policy_config.py')) + if model is None: + model = VAC(**self.cfg.policy.model) + self.buffer_ = DequeBuffer(size=self.cfg.policy.other.replay_buffer.replay_buffer_size) + self.policy = PPOOffPolicy(self.cfg.policy, model=model) + if policy_state_dict is not None: + self.policy.learn_mode.load_state_dict(policy_state_dict) + self.checkpoint_save_dir = os.path.join(self.exp_name, "ckpt") + + def train( + self, + step: int = int(1e7), + collector_env_num: int = 4, + evaluator_env_num: int = 4, + n_iter_log_show: int = 500, + n_iter_save_ckpt: int = 1000, + context: Optional[str] = None, + debug: bool = False, + wandb_sweep: bool = False, + ) -> TrainingReturn: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + logging.debug(self.policy._model) + # define env and policy + collector_env = self._setup_env_manager(collector_env_num, context, debug, 'collector') + evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug, 'evaluator') + + with task.start(ctx=OnlineRLContext()): + task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) + task.use( + StepCollector( + self.cfg, + self.policy.collect_mode, + collector_env, + ) + ) + task.use(gae_estimator(self.cfg, self.policy.collect_mode, self.buffer_)) + task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_)) + task.use(CkptSaver(policy=self.policy, save_dir=self.checkpoint_save_dir, train_freq=n_iter_save_ckpt)) + task.use( + wandb_online_logger( + metric_list=self.policy.monitor_vars(), + model=self.policy._model, + anonymous=True, + project_name=self.exp_name, + wandb_sweep=wandb_sweep, + ) + ) + task.use(termination_checker(max_env_step=step)) + task.use(final_ctx_saver(name=self.exp_name)) + task.run() + + return TrainingReturn(wandb_url=task.ctx.wandb_url) + + def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> None: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + # define env and policy + env = self.env.clone() + env.seed(self.seed, dynamic_seed=False) + + if enable_save_replay and replay_save_path: + env.enable_save_replay(replay_path=replay_save_path) + elif enable_save_replay: + env.enable_save_replay(replay_path=os.path.join(self.exp_name, 'videos')) + else: + logging.warning('No video would be generated during the deploy.') + + def single_env_forward_wrapper(forward_fn, cuda=True): + + def _forward(obs): + # unsqueeze means add batch dim, i.e. (O, ) -> (1, O) + obs = ttorch.as_tensor(obs).unsqueeze(0) + if cuda and torch.cuda.is_available(): + obs = obs.cuda() + action = forward_fn(obs, mode='compute_actor')["action"] + # squeeze means delete batch dim, i.e. (1, A) -> (A, ) + action = action.squeeze(0).detach().cpu().numpy() + return action + + return _forward + + forward_fn = single_env_forward_wrapper(self.policy._model, self.cfg.policy.cuda) + + # main loop + return_ = 0. 
+ step = 0 + obs = env.reset() + while True: + action = forward_fn(obs) + obs, rew, done, info = env.step(action) + return_ += rew + step += 1 + if done: + break + logging.info(f'PPOOffPolicy deploy is finished, final episode return with {step} steps is: {return_}') + + def collect_data( + self, + env_num: int = 8, + save_data_path: Optional[str] = None, + n_sample: Optional[int] = None, + n_episode: Optional[int] = None, + context: Optional[str] = None, + debug: bool = False + ) -> None: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + if n_episode is not None: + raise NotImplementedError + # define env and policy + env = self._setup_env_manager(env_num, context, debug, 'collector') + + if save_data_path is None: + save_data_path = os.path.join(self.exp_name, 'demo_data') + + # main execution task + with task.start(ctx=OnlineRLContext()): + task.use( + StepCollector( + self.cfg, self.policy.collect_mode, env, random_collect_size=self.cfg.policy.random_collect_size + ) + ) + task.use(offline_data_saver(save_data_path, data_type='hdf5')) + task.run(max_step=1) + logging.info( + f'PPOOffPolicy collecting is finished, more than {n_sample} samples are collected and saved in `{save_data_path}`' + ) + + def batch_evaluate( + self, + env_num: int = 4, + n_evaluator_episode: int = 4, + context: Optional[str] = None, + debug: bool = False + ) -> EvalReturn: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + # define env and policy + env = self._setup_env_manager(env_num, context, debug, 'evaluator') + + evaluate_cfg = self.cfg + evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode + + # main execution task + with task.start(ctx=OnlineRLContext()): + task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, env)) + task.run(max_step=1) + + return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) + + def _setup_env_manager( + self, + env_num: int, + context: Optional[str] = None, + debug: bool = False, + caller: str = 'collector' + ) -> BaseEnvManagerV2: + assert caller in ['evaluator', 'collector'] + if debug: + env_cls = BaseEnvManagerV2 + manager_cfg = env_cls.default_config() + else: + env_cls = SubprocessEnvManagerV2 + manager_cfg = env_cls.default_config() + if context is not None: + manager_cfg.context = context + return env_cls([partial(self.env.clone, caller) for _ in range(env_num)], manager_cfg) + + @property + def best(self): + best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") + # Load best model if it exists + if os.path.exists(best_model_file_path): + policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu")) + self.policy.learn_mode.load_state_dict(policy_state_dict) + return self diff --git a/ding/policy/ppo.py b/ding/policy/ppo.py index ff7ee51bc2..cbd267ca27 100644 --- a/ding/policy/ppo.py +++ b/ding/policy/ppo.py @@ -457,6 +457,9 @@ def _monitor_vars_learn(self) -> List[str]: if self._action_space == 'continuous': variables += ['mu_mean', 'sigma_mean', 'sigma_grad', 'act'] return variables + + def monitor_vars(self) -> List[str]: + return self._monitor_vars_learn() @POLICY_REGISTRY.register('ppo_pg') @@ -970,6 +973,8 @@ def _monitor_vars_learn(self) -> List[str]: 'policy_loss', 'value_loss', 'entropy_loss', 'adv_abs_max', 'approx_kl', 'clipfrac' ] + def monitor_vars(self) -> List[str]: + return self._monitor_vars_learn() @POLICY_REGISTRY.register('ppo_stdim') class PPOSTDIMPolicy(PPOPolicy): From dc61317c839db7b162aba65195ae45d6bbcebeb6 Mon Sep 17 00:00:00 2001 
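For reference, the PPOOffPolicyAgent introduced in PATCH 139 exposes the same entry points as the other ding.bonus agents. A minimal end-to-end sketch (illustrative only, not part of any patch; it assumes the lunarlander_discrete config registered above and that wandb is reachable for the online logger):

from ding.bonus import PPOOffPolicyAgent

# Build the agent from the tuned LunarLander config added in ding/bonus/config.py.
agent = PPOOffPolicyAgent(env='lunarlander_discrete', seed=0)

# Online training: StepCollector -> gae_estimator -> OffPolicyLearner, with periodic
# evaluation, checkpointing and wandb logging, until `step` env steps are collected.
train_info = agent.train(step=int(1e6), collector_env_num=4, evaluator_env_num=4)
print(train_info.wandb_url)

# Vectorized evaluation of the trained policy.
eval_info = agent.batch_evaluate(env_num=4, n_evaluator_episode=4)
print(eval_info.eval_value, eval_info.eval_value_std)

# Roll out one episode locally; with enable_save_replay=True a replay video is
# written under '<exp_name>/videos'.
agent.deploy(enable_save_replay=True)
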
From: zjowowen Date: Fri, 9 Jun 2023 07:19:45 +0000 Subject: [PATCH 140/244] Add wandb monitor evaluate return std --- ding/framework/middleware/functional/logger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ding/framework/middleware/functional/logger.py b/ding/framework/middleware/functional/logger.py index 892a1afeb0..815cc21a8a 100644 --- a/ding/framework/middleware/functional/logger.py +++ b/ding/framework/middleware/functional/logger.py @@ -226,6 +226,7 @@ def _plot(ctx: "OnlineRLContext"): info_for_logging.update( { "episode return mean": ctx.eval_value, + "episode return std": ctx.eval_value_std, "train iter": ctx.train_iter, "env step": ctx.env_step } @@ -289,7 +290,6 @@ def _plot(ctx: "OnlineRLContext"): if bool(info_for_logging): wandb.log(data=info_for_logging, step=ctx.env_step) - # wandb.log(data=info_for_logging) plt.clf() return _plot From ea5f1e7d4ce370a6788d4d366c9193483c267257 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 9 Jun 2023 08:09:36 +0000 Subject: [PATCH 141/244] polish deploy method --- ding/bonus/a2c.py | 4 +++- ding/bonus/c51.py | 4 +++- ding/bonus/ddpg.py | 4 +++- ding/bonus/dqn.py | 4 +++- ding/bonus/pg.py | 4 +++- ding/bonus/ppo_offpolicy.py | 4 +++- ding/bonus/ppof.py | 4 +++- ding/bonus/sac.py | 4 +++- ding/bonus/td3.py | 4 +++- 9 files changed, 27 insertions(+), 9 deletions(-) diff --git a/ding/bonus/a2c.py b/ding/bonus/a2c.py index c0add29d79..0145850e5b 100644 --- a/ding/bonus/a2c.py +++ b/ding/bonus/a2c.py @@ -113,7 +113,7 @@ def train( return TrainingReturn(wandb_url=task.ctx.wandb_url) - def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> None: + def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> float: if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -163,6 +163,8 @@ def _forward(obs): break logging.info(f'A2C deploy is finished, final episode return with {step} steps is: {return_}') + return return_ + def collect_data( self, env_num: int = 8, diff --git a/ding/bonus/c51.py b/ding/bonus/c51.py index f9d9518bce..822b633e83 100644 --- a/ding/bonus/c51.py +++ b/ding/bonus/c51.py @@ -114,7 +114,7 @@ def train( return TrainingReturn(wandb_url=task.ctx.wandb_url) - def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> None: + def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> float: if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -159,6 +159,8 @@ def _forward(obs): break logging.info(f'C51 deploy is finished, final episode return with {step} steps is: {return_}') + return return_ + def collect_data( self, env_num: int = 8, diff --git a/ding/bonus/ddpg.py b/ding/bonus/ddpg.py index 6e7ec97eb1..12873f35b5 100644 --- a/ding/bonus/ddpg.py +++ b/ding/bonus/ddpg.py @@ -121,7 +121,7 @@ def train( return TrainingReturn(wandb_url=task.ctx.wandb_url) - def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> None: + def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> float: if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -164,6 +164,8 @@ def _forward(obs): break logging.info(f'DDPG deploy is finished, final episode return with {step} steps is: {return_}') + return return_ + def collect_data( self, env_num: int = 8, diff --git 
a/ding/bonus/dqn.py b/ding/bonus/dqn.py index a4b6222df6..fafc6a98ea 100644 --- a/ding/bonus/dqn.py +++ b/ding/bonus/dqn.py @@ -122,7 +122,7 @@ def train( return TrainingReturn(wandb_url=task.ctx.wandb_url) - def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> None: + def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> float: if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -167,6 +167,8 @@ def _forward(obs): break logging.info(f'DQN deploy is finished, final episode return with {step} steps is: {return_}') + return return_ + def collect_data( self, env_num: int = 8, diff --git a/ding/bonus/pg.py b/ding/bonus/pg.py index d14c60e949..0ba0dfdb65 100644 --- a/ding/bonus/pg.py +++ b/ding/bonus/pg.py @@ -111,7 +111,7 @@ def train( return TrainingReturn(wandb_url=task.ctx.wandb_url) - def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> None: + def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> float: if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -163,6 +163,8 @@ def _forward(obs): break logging.info(f'PG deploy is finished, final episode return with {step} steps is: {return_}') + return return_ + def collect_data( self, env_num: int = 8, diff --git a/ding/bonus/ppo_offpolicy.py b/ding/bonus/ppo_offpolicy.py index 5fd01b246e..badaefece2 100644 --- a/ding/bonus/ppo_offpolicy.py +++ b/ding/bonus/ppo_offpolicy.py @@ -114,7 +114,7 @@ def train( return TrainingReturn(wandb_url=task.ctx.wandb_url) - def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> None: + def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> float: if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -157,6 +157,8 @@ def _forward(obs): break logging.info(f'PPOOffPolicy deploy is finished, final episode return with {step} steps is: {return_}') + return return_ + def collect_data( self, env_num: int = 8, diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py index 5cc3be8379..04b6876395 100644 --- a/ding/bonus/ppof.py +++ b/ding/bonus/ppof.py @@ -153,7 +153,7 @@ def train( return TrainingReturn(wandb_url=task.ctx.wandb_url) - def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> None: + def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> float: if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -182,6 +182,8 @@ def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, break logging.info(f'PPOF deploy is finished, final episode return with {step} steps is: {return_}') + return return_ + def collect_data( self, env_num: int = 8, diff --git a/ding/bonus/sac.py b/ding/bonus/sac.py index 0cef4d6cef..23acd9ea9e 100644 --- a/ding/bonus/sac.py +++ b/ding/bonus/sac.py @@ -122,7 +122,7 @@ def train( return TrainingReturn(wandb_url=task.ctx.wandb_url) - def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> None: + def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> float: if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -166,6 +166,8 @@ def _forward(obs): 
break logging.info(f'SAC deploy is finished, final episode return with {step} steps is: {return_}') + return return_ + def collect_data( self, env_num: int = 8, diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 763dd2d722..3882603ca3 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -121,7 +121,7 @@ def train( return TrainingReturn(wandb_url=task.ctx.wandb_url) - def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> None: + def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> float: if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy @@ -164,6 +164,8 @@ def _forward(obs): break logging.info(f'TD3 deploy is finished, final episode return with {step} steps is: {return_}') + return return_ + def collect_data( self, env_num: int = 8, From 35a21b47bbd55dc72faf30643ef44155d06a33ec Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 9 Jun 2023 08:12:47 +0000 Subject: [PATCH 142/244] format code --- ding/bonus/ppo_offpolicy.py | 12 +++++------- ding/policy/ppo.py | 3 ++- ding/policy/ppof.py | 7 ++++--- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/ding/bonus/ppo_offpolicy.py b/ding/bonus/ppo_offpolicy.py index badaefece2..75413aebb5 100644 --- a/ding/bonus/ppo_offpolicy.py +++ b/ding/bonus/ppo_offpolicy.py @@ -89,13 +89,11 @@ def train( with task.start(ctx=OnlineRLContext()): task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) - task.use( - StepCollector( - self.cfg, - self.policy.collect_mode, - collector_env, - ) - ) + task.use(StepCollector( + self.cfg, + self.policy.collect_mode, + collector_env, + )) task.use(gae_estimator(self.cfg, self.policy.collect_mode, self.buffer_)) task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_)) task.use(CkptSaver(policy=self.policy, save_dir=self.checkpoint_save_dir, train_freq=n_iter_save_ckpt)) diff --git a/ding/policy/ppo.py b/ding/policy/ppo.py index cbd267ca27..619fdd437c 100644 --- a/ding/policy/ppo.py +++ b/ding/policy/ppo.py @@ -457,7 +457,7 @@ def _monitor_vars_learn(self) -> List[str]: if self._action_space == 'continuous': variables += ['mu_mean', 'sigma_mean', 'sigma_grad', 'act'] return variables - + def monitor_vars(self) -> List[str]: return self._monitor_vars_learn() @@ -976,6 +976,7 @@ def _monitor_vars_learn(self) -> List[str]: def monitor_vars(self) -> List[str]: return self._monitor_vars_learn() + @POLICY_REGISTRY.register('ppo_stdim') class PPOSTDIMPolicy(PPOPolicy): """ diff --git a/ding/policy/ppof.py b/ding/policy/ppof.py index 049f768431..72f6002a2c 100644 --- a/ding/policy/ppof.py +++ b/ding/policy/ppof.py @@ -88,7 +88,8 @@ def __init__(self, cfg: "EasyDict", model: torch.nn.Module, enable_mode: List[st epoch_num, min_lr_lambda = self._cfg.lr_scheduler self._lr_scheduler = torch.optim.lr_scheduler.LambdaLR( - self._optimizer, lr_lambda=lambda epoch: max(1.0 - epoch*(1.0-min_lr_lambda)/epoch_num , min_lr_lambda) + self._optimizer, + lr_lambda=lambda epoch: max(1.0 - epoch * (1.0 - min_lr_lambda) / epoch_num, min_lr_lambda) ) if self._cfg.value_norm: @@ -290,10 +291,10 @@ def forward(self, data: ttorch.Tensor) -> Dict[str, Any]: } ) return_infos.append(return_info) - + if self._cfg.lr_scheduler is not None: self._lr_scheduler.step() - + return return_infos def state_dict(self) -> Dict[str, Any]: From f95e8eb3e4b1ce4d56f3a6ffe5c9d38a1e493f49 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 13 Jun 2023 03:33:14 +0000 Subject: 
[PATCH 143/244] polish code --- ding/bonus/config.py | 1 - ding/framework/middleware/functional/logger.py | 15 ++++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 607a7ed3aa..375bc7fad7 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -122,7 +122,6 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: cfg.epoch_per_collect = 10 cfg.learning_rate = 5e-4 cfg.lr_scheduler = (1000, 0.1) - cfg.deterministic_eval = True cfg.model = dict( encoder_hidden_size_list=[64, 64, 128], actor_head_hidden_size=128, diff --git a/ding/framework/middleware/functional/logger.py b/ding/framework/middleware/functional/logger.py index 815cc21a8a..638a47228d 100644 --- a/ding/framework/middleware/functional/logger.py +++ b/ding/framework/middleware/functional/logger.py @@ -304,6 +304,7 @@ def wandb_offline_logger( model: Optional[torch.nn.Module] = None, anonymous: bool = False, project_name: str = 'default-project', + wandb_sweep: bool = False, ) -> Callable: ''' Overview: @@ -331,10 +332,17 @@ def wandb_offline_logger( metric_list = ["q_value", "target q_value", "loss", "lr", "entropy", "target_q_value", "td_error"] # Initialize wandb with default settings # Settings can be covered by calling wandb.init() at the top of the script - if anonymous: - wandb.init(project=project_name, reinit=True, anonymous="must") + if not wandb_sweep: + if anonymous: + wandb.init(project=project_name, reinit=True, anonymous="must") + else: + wandb.init(project=project_name, reinit=True) else: - wandb.init(project=project_name, reinit=True) + if anonymous: + wandb.init(project=project_name, anonymous="must") + else: + wandb.init(project=project_name) + plt.switch_backend('agg') if cfg is None: cfg = EasyDict( dict( @@ -458,6 +466,7 @@ def _plot(ctx: "OnlineRLContext"): info_for_logging.update( { "episode return mean": ctx.eval_value, + "episode return std": ctx.eval_value_std, "train iter": ctx.train_iter, "env step": ctx.env_step } From 603fa5ec30897449546103439ee9546a6ded0334 Mon Sep 17 00:00:00 2001 From: zhangpaipai <1124321458@qq.com> Date: Tue, 13 Jun 2023 11:42:17 +0800 Subject: [PATCH 144/244] polish pg pendulum+hopper config --- ding/bonus/config.py | 26 ++++++++++++------------- dizoo/mujoco/config/hopper_pg_config.py | 8 ++++---- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 607a7ed3aa..2c2b60b265 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -445,15 +445,15 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ), learn=dict( batch_size=64, - learning_rate=0.001, - entropy_weight=0.001, + learning_rate=0.005, + entropy_weight=0.01, ), collect=dict( - n_episode=8, + n_episode=34, unroll_len=1, discount_factor=0.99, ), - eval=dict(evaluator=dict(eval_freq=200, )) + eval=dict(evaluator=dict(eval_freq=1, )) ), wandb_logger=dict( gradient_logger=True, @@ -492,11 +492,11 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: entropy_weight=0.001, ), collect=dict( - n_episode=8, + n_episode=20, unroll_len=1, discount_factor=0.99, ), - eval=dict(evaluator=dict(eval_freq=200, )) + eval=dict(evaluator=dict(eval_freq=1, )) ), wandb_logger=dict( gradient_logger=True, @@ -535,11 +535,11 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: entropy_weight=0.001, ), collect=dict( - n_episode=8, + n_episode=20, unroll_len=1, discount_factor=0.99, ), - eval=dict(evaluator=dict(eval_freq=200, )) + 
eval=dict(evaluator=dict(eval_freq=1, )) ), wandb_logger=dict( gradient_logger=True, @@ -616,11 +616,11 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: entropy_weight=0.001, ), collect=dict( - n_episode=8, + n_episode=20, unroll_len=1, discount_factor=0.99, ), - eval=dict(evaluator=dict(eval_freq=200, )) + eval=dict(evaluator=dict(eval_freq=1, )) ), wandb_logger=dict( gradient_logger=True, @@ -652,16 +652,16 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: action_shape=1, ), learn=dict( - batch_size=800, + batch_size=4000, learning_rate=0.001, entropy_weight=0.001, ), collect=dict( - n_episode=4, + n_episode=20, unroll_len=1, discount_factor=0.99, ), - eval=dict(evaluator=dict(eval_freq=200, )) + eval=dict(evaluator=dict(eval_freq=1, )) ), wandb_logger=dict( gradient_logger=True, diff --git a/dizoo/mujoco/config/hopper_pg_config.py b/dizoo/mujoco/config/hopper_pg_config.py index f1cb72e5f9..18427131aa 100644 --- a/dizoo/mujoco/config/hopper_pg_config.py +++ b/dizoo/mujoco/config/hopper_pg_config.py @@ -21,15 +21,15 @@ ), learn=dict( batch_size=64, - learning_rate=0.001, - entropy_weight=0.001, + learning_rate=0.005, + entropy_weight=0.01, ), collect=dict( - n_episode=8, + n_episode=34, unroll_len=1, discount_factor=0.99, ), - eval=dict(evaluator=dict(eval_freq=200, )) + eval=dict(evaluator=dict(eval_freq=1, )) ), ) hopper_pg_config = EasyDict(hopper_pg_config) From ddd6550594a86b0da38fe294e5913937351da4a0 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 13 Jun 2023 07:08:18 +0000 Subject: [PATCH 145/244] fix data shape bug --- ding/bonus/pg.py | 1 - .../functional/advantage_estimator.py | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/ding/bonus/pg.py b/ding/bonus/pg.py index 0ba0dfdb65..977a147c06 100644 --- a/ding/bonus/pg.py +++ b/ding/bonus/pg.py @@ -92,7 +92,6 @@ def train( with task.start(ctx=OnlineRLContext()): task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) task.use(EpisodeCollector(self.cfg, self.policy.collect_mode, collector_env)) - # task.use(gae_estimator(self.cfg, self.policy.collect_mode)) task.use(pg_estimator(self.policy.collect_mode)) task.use(trainer(self.cfg, self.policy.learn_mode)) task.use(CkptSaver(policy=self.policy, save_dir=self.checkpoint_save_dir, train_freq=n_iter_save_ckpt)) diff --git a/ding/framework/middleware/functional/advantage_estimator.py b/ding/framework/middleware/functional/advantage_estimator.py index 21ef898f33..f0766350e7 100644 --- a/ding/framework/middleware/functional/advantage_estimator.py +++ b/ding/framework/middleware/functional/advantage_estimator.py @@ -28,6 +28,12 @@ def gae_estimator(cfg: EasyDict, policy: Policy, buffer_: Optional[Buffer] = Non """ model = policy.get_attribute('model') + obs_shape = cfg['policy']['model']['obs_shape'] + obs_shape = torch.Size(torch.tensor(obs_shape)) if isinstance(obs_shape, list) \ + else torch.Size(torch.tensor(obs_shape).unsqueeze(0)) + action_shape = cfg['policy']['model']['action_shape'] + action_shape = torch.Size(torch.tensor(action_shape)) if isinstance(action_shape, list) \ + else torch.Size(torch.tensor(action_shape).unsqueeze(0)) def _gae(ctx: "OnlineRLContext"): """ @@ -69,6 +75,17 @@ def _gae(ctx: "OnlineRLContext"): else: data = data.cpu() data = ttorch.split(data, 1) + if data[0]['obs'].shape == obs_shape: + pass + elif data[0]['obs'].shape[0] == 1 and data[0]['obs'].shape[1:] == obs_shape: + for d in data: + d['obs'] = d['obs'].squeeze(0) + d['next_obs'] = d['next_obs'].squeeze(0) 
+ if hasattr(data[0], 'logit'): + for d in data: + d['logit'] = d['logit'].squeeze(0) + else: + raise RuntimeError("The shape of obs is {}, which is not same as config.".format(data[0]['obs'].shape)) for d in data: buffer_.push(d) ctx.trajectories = None From a7c3cf49849e87199fb9dec1677aed3a264ab776 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 13 Jun 2023 09:02:15 +0000 Subject: [PATCH 146/244] fix ppo offpolicy deploy bug --- ding/bonus/ppo_offpolicy.py | 3 +++ ding/framework/middleware/functional/advantage_estimator.py | 1 + 2 files changed, 4 insertions(+) diff --git a/ding/bonus/ppo_offpolicy.py b/ding/bonus/ppo_offpolicy.py index 75413aebb5..8f3f36f1a7 100644 --- a/ding/bonus/ppo_offpolicy.py +++ b/ding/bonus/ppo_offpolicy.py @@ -14,6 +14,7 @@ from ding.utils import set_pkg_seed from ding.config import Config, save_config_py, compile_config from ding.model import VAC +from ding.model import model_wrap from ding.data import DequeBuffer from ding.bonus.config import get_instance_config, get_instance_env from ding.bonus.common import TrainingReturn, EvalReturn @@ -128,6 +129,8 @@ def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, def single_env_forward_wrapper(forward_fn, cuda=True): + forward_fn = model_wrap(forward_fn, wrapper_name='argmax_sample').forward + def _forward(obs): # unsqueeze means add batch dim, i.e. (O, ) -> (1, O) obs = ttorch.as_tensor(obs).unsqueeze(0) diff --git a/ding/framework/middleware/functional/advantage_estimator.py b/ding/framework/middleware/functional/advantage_estimator.py index f0766350e7..69e6125947 100644 --- a/ding/framework/middleware/functional/advantage_estimator.py +++ b/ding/framework/middleware/functional/advantage_estimator.py @@ -75,6 +75,7 @@ def _gae(ctx: "OnlineRLContext"): else: data = data.cpu() data = ttorch.split(data, 1) + # To ensure the shape of obs is same as config if data[0]['obs'].shape == obs_shape: pass elif data[0]['obs'].shape[0] == 1 and data[0]['obs'].shape[1:] == obs_shape: From ea979e84c5063c6c6fb385a71137a0e54f2136e4 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 13 Jun 2023 16:21:59 +0000 Subject: [PATCH 147/244] fix mujoco reward action env clip bug --- ding/bonus/config.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index c10d853111..686382b31a 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -293,7 +293,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: env_id='Hopper-v3', norm_obs=dict(use_norm=False, ), norm_reward=dict(use_norm=False, ), - collector_env_num=8, + collector_env_num=1, evaluator_env_num=8, n_evaluator_episode=8, stop_value=12000, @@ -336,9 +336,10 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: env_id='HalfCheetah-v3', norm_obs=dict(use_norm=False, ), norm_reward=dict(use_norm=False, ), - collector_env_num=8, + collector_env_num=1, evaluator_env_num=8, n_evaluator_episode=8, + act_scale=True, stop_value=12000, ), policy=dict( @@ -381,7 +382,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: env_id='Walker2d-v3', norm_obs=dict(use_norm=False, ), norm_reward=dict(use_norm=False, ), - collector_env_num=8, + collector_env_num=1, evaluator_env_num=8, n_evaluator_episode=8, stop_value=12000, @@ -2104,12 +2105,16 @@ def get_instance_env(env: str) -> BaseEnv: cfg = EasyDict( env_id='HalfCheetah-v3', env_wrapper='mujoco_default', + act_scale=True, + rew_clip=True, ) return DingEnvWrapper(gym.make('HalfCheetah-v3'), cfg=cfg) elif 
env == 'Walker2d': cfg = EasyDict( env_id='Walker2d-v3', env_wrapper='mujoco_default', + act_scale=True, + rew_clip=True, ) return DingEnvWrapper(gym.make('Walker2d-v3'), cfg=cfg) elif env == "SpaceInvadersNoFrameskip": From ff7f6393a1169916e24dcce244e127e9035475a9 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 13 Jun 2023 16:27:13 +0000 Subject: [PATCH 148/244] fix mujoco reward action env clip bug --- ding/bonus/config.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 686382b31a..9fbbae8910 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -2099,6 +2099,8 @@ def get_instance_env(env: str) -> BaseEnv: cfg = EasyDict( env_id='Hopper-v3', env_wrapper='mujoco_default', + act_scale=True, + rew_clip=True, ) return DingEnvWrapper(gym.make('Hopper-v3'), cfg=cfg) elif env == 'HalfCheetah': From ed5b1a393a530119b23f78ec09ae9ecb9106b00d Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 14 Jun 2023 05:53:05 +0000 Subject: [PATCH 149/244] fix deploy env mode bug --- ding/bonus/a2c.py | 2 +- ding/bonus/c51.py | 2 +- ding/bonus/ddpg.py | 2 +- ding/bonus/dqn.py | 2 +- ding/bonus/pg.py | 2 +- ding/bonus/ppo_offpolicy.py | 2 +- ding/bonus/ppof.py | 3 ++- ding/bonus/sac.py | 2 +- ding/bonus/td3.py | 2 +- 9 files changed, 10 insertions(+), 9 deletions(-) diff --git a/ding/bonus/a2c.py b/ding/bonus/a2c.py index 0145850e5b..08eb976d72 100644 --- a/ding/bonus/a2c.py +++ b/ding/bonus/a2c.py @@ -117,7 +117,7 @@ def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy - env = self.env.clone() + env = self.env.clone(caller='evaluator') env.seed(self.seed, dynamic_seed=False) if enable_save_replay and replay_save_path: diff --git a/ding/bonus/c51.py b/ding/bonus/c51.py index 822b633e83..326dd92fd6 100644 --- a/ding/bonus/c51.py +++ b/ding/bonus/c51.py @@ -118,7 +118,7 @@ def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy - env = self.env.clone() + env = self.env.clone(caller='evaluator') env.seed(self.seed, dynamic_seed=False) if enable_save_replay and replay_save_path: diff --git a/ding/bonus/ddpg.py b/ding/bonus/ddpg.py index 12873f35b5..d781e10a61 100644 --- a/ding/bonus/ddpg.py +++ b/ding/bonus/ddpg.py @@ -125,7 +125,7 @@ def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy - env = self.env.clone() + env = self.env.clone(caller='evaluator') env.seed(self.seed, dynamic_seed=False) if enable_save_replay and replay_save_path: diff --git a/ding/bonus/dqn.py b/ding/bonus/dqn.py index fafc6a98ea..745ce43727 100644 --- a/ding/bonus/dqn.py +++ b/ding/bonus/dqn.py @@ -126,7 +126,7 @@ def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy - env = self.env.clone() + env = self.env.clone(caller='evaluator') env.seed(self.seed, dynamic_seed=False) if enable_save_replay and replay_save_path: diff --git a/ding/bonus/pg.py b/ding/bonus/pg.py index 977a147c06..83bd7874d2 100644 --- a/ding/bonus/pg.py +++ b/ding/bonus/pg.py @@ -114,7 +114,7 @@ def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy - env = self.env.clone() + env = 
self.env.clone(caller='evaluator') env.seed(self.seed, dynamic_seed=False) if enable_save_replay and replay_save_path: diff --git a/ding/bonus/ppo_offpolicy.py b/ding/bonus/ppo_offpolicy.py index 8f3f36f1a7..c123638aff 100644 --- a/ding/bonus/ppo_offpolicy.py +++ b/ding/bonus/ppo_offpolicy.py @@ -117,7 +117,7 @@ def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy - env = self.env.clone() + env = self.env.clone(caller='evaluator') env.seed(self.seed, dynamic_seed=False) if enable_save_replay and replay_save_path: diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py index 04b6876395..62a2ab90f1 100644 --- a/ding/bonus/ppof.py +++ b/ding/bonus/ppof.py @@ -3,6 +3,7 @@ from ditk import logging from easydict import EasyDict from functools import partial +import random import os import gym import gymnasium @@ -157,7 +158,7 @@ def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy - env = self.env.clone() + env = self.env.clone(caller='evaluator') env.seed(self.seed, dynamic_seed=False) if enable_save_replay and replay_save_path: diff --git a/ding/bonus/sac.py b/ding/bonus/sac.py index 23acd9ea9e..ec1dda0243 100644 --- a/ding/bonus/sac.py +++ b/ding/bonus/sac.py @@ -126,7 +126,7 @@ def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy - env = self.env.clone() + env = self.env.clone(caller='evaluator') env.seed(self.seed, dynamic_seed=False) if enable_save_replay and replay_save_path: diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 3882603ca3..36db518e01 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -125,7 +125,7 @@ def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy - env = self.env.clone() + env = self.env.clone(caller='evaluator') env.seed(self.seed, dynamic_seed=False) if enable_save_replay and replay_save_path: From 05f8c479a4cb0f8cee140459ed1addbfc702ed29 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 14 Jun 2023 08:00:57 +0000 Subject: [PATCH 150/244] fix env reset bug for deployment and evaluation --- ding/bonus/a2c.py | 9 +++++++++ ding/bonus/c51.py | 9 +++++++++ ding/bonus/ddpg.py | 9 +++++++++ ding/bonus/dqn.py | 9 +++++++++ ding/bonus/pg.py | 9 +++++++++ ding/bonus/ppo_offpolicy.py | 9 +++++++++ ding/bonus/ppof.py | 9 +++++++++ ding/bonus/sac.py | 9 +++++++++ ding/bonus/td3.py | 9 +++++++++ 9 files changed, 81 insertions(+) diff --git a/ding/bonus/a2c.py b/ding/bonus/a2c.py index 08eb976d72..627597780e 100644 --- a/ding/bonus/a2c.py +++ b/ding/bonus/a2c.py @@ -150,6 +150,10 @@ def _forward(obs): forward_fn = single_env_forward_wrapper(self.policy._model, self.cfg.policy.cuda) + # reset first to make sure the env is in the initial state + # env will be reset again in the main loop + env.reset() + # main loop return_ = 0. 
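Taken together with PATCH 141 (deploy now returns the episode return as a float) and PATCH 149 (the local env is cloned with caller='evaluator'), the user-facing call ends up looking roughly like this (a sketch using the pre-PATCH-153 string-based constructor; the env name and exp_name are illustrative):

from ding.bonus import A2CAgent

agent = A2CAgent(env='lunarlander_discrete', exp_name='lunarlander-a2c-deploy')
# agent.best reloads the best evaluation checkpoint (eval.pth.tar) if a previous
# train() run produced one.
agent = agent.best
# One local episode; with enable_save_replay=True and no explicit path, the video
# is saved under '<exp_name>/videos'.
episode_return = agent.deploy(enable_save_replay=True)
print(episode_return)
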
step = 0 @@ -209,6 +213,11 @@ def batch_evaluate( # define env and policy env = self._setup_env_manager(env_num, context, debug, 'evaluator') + # reset first to make sure the env is in the initial state + # env will be reset again in the main loop + env.launch() + env.reset() + evaluate_cfg = self.cfg evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode diff --git a/ding/bonus/c51.py b/ding/bonus/c51.py index 326dd92fd6..1470062307 100644 --- a/ding/bonus/c51.py +++ b/ding/bonus/c51.py @@ -146,6 +146,10 @@ def _forward(obs): forward_fn = single_env_forward_wrapper(self.policy._model, self.cfg.policy.cuda) + # reset first to make sure the env is in the initial state + # env will be reset again in the main loop + env.reset() + # main loop return_ = 0. step = 0 @@ -205,6 +209,11 @@ def batch_evaluate( # define env and policy env = self._setup_env_manager(env_num, context, debug, 'evaluator') + # reset first to make sure the env is in the initial state + # env will be reset again in the main loop + env.launch() + env.reset() + evaluate_cfg = self.cfg evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode diff --git a/ding/bonus/ddpg.py b/ding/bonus/ddpg.py index d781e10a61..8515b9754c 100644 --- a/ding/bonus/ddpg.py +++ b/ding/bonus/ddpg.py @@ -151,6 +151,10 @@ def _forward(obs): forward_fn = single_env_forward_wrapper(self.policy._model, self.cfg.policy.cuda) + # reset first to make sure the env is in the initial state + # env will be reset again in the main loop + env.reset() + # main loop return_ = 0. step = 0 @@ -210,6 +214,11 @@ def batch_evaluate( # define env and policy env = self._setup_env_manager(env_num, context, debug, 'evaluator') + # reset first to make sure the env is in the initial state + # env will be reset again in the main loop + env.launch() + env.reset() + evaluate_cfg = self.cfg evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode diff --git a/ding/bonus/dqn.py b/ding/bonus/dqn.py index 745ce43727..8bc5973548 100644 --- a/ding/bonus/dqn.py +++ b/ding/bonus/dqn.py @@ -154,6 +154,10 @@ def _forward(obs): forward_fn = single_env_forward_wrapper(self.policy._model, self.cfg.policy.cuda) + # reset first to make sure the env is in the initial state + # env will be reset again in the main loop + env.reset() + # main loop return_ = 0. step = 0 @@ -213,6 +217,11 @@ def batch_evaluate( # define env and policy env = self._setup_env_manager(env_num, context, debug, 'evaluator') + # reset first to make sure the env is in the initial state + # env will be reset again in the main loop + env.launch() + env.reset() + evaluate_cfg = self.cfg evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode diff --git a/ding/bonus/pg.py b/ding/bonus/pg.py index 83bd7874d2..46354837a6 100644 --- a/ding/bonus/pg.py +++ b/ding/bonus/pg.py @@ -149,6 +149,10 @@ def _forward(obs): forward_fn = single_env_forward_wrapper(self.policy._model, self.cfg.policy.cuda) + # reset first to make sure the env is in the initial state + # env will be reset again in the main loop + env.reset() + # main loop return_ = 0. 
step = 0 @@ -208,6 +212,11 @@ def batch_evaluate( # define env and policy env = self._setup_env_manager(env_num, context, debug, 'evaluator') + # reset first to make sure the env is in the initial state + # env will be reset again in the main loop + env.launch() + env.reset() + evaluate_cfg = self.cfg evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode diff --git a/ding/bonus/ppo_offpolicy.py b/ding/bonus/ppo_offpolicy.py index c123638aff..b7730ea288 100644 --- a/ding/bonus/ppo_offpolicy.py +++ b/ding/bonus/ppo_offpolicy.py @@ -145,6 +145,10 @@ def _forward(obs): forward_fn = single_env_forward_wrapper(self.policy._model, self.cfg.policy.cuda) + # reset first to make sure the env is in the initial state + # env will be reset again in the main loop + env.reset() + # main loop return_ = 0. step = 0 @@ -204,6 +208,11 @@ def batch_evaluate( # define env and policy env = self._setup_env_manager(env_num, context, debug, 'evaluator') + # reset first to make sure the env is in the initial state + # env will be reset again in the main loop + env.launch() + env.reset() + evaluate_cfg = self.cfg evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py index 62a2ab90f1..c0d158dabe 100644 --- a/ding/bonus/ppof.py +++ b/ding/bonus/ppof.py @@ -170,6 +170,10 @@ def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, forward_fn = single_env_forward_wrapper_ttorch(self.policy.eval, self.cfg.cuda) + # reset first to make sure the env is in the initial state + # env will be reset again in the main loop + env.reset() + # main loop return_ = 0. step = 0 @@ -224,6 +228,11 @@ def batch_evaluate( # define env and policy env = self._setup_env_manager(env_num, context, debug, 'evaluator') + # reset first to make sure the env is in the initial state + # env will be reset again in the main loop + env.launch() + env.reset() + # main execution task with task.start(ctx=OnlineRLContext()): task.use(interaction_evaluator_ttorch( diff --git a/ding/bonus/sac.py b/ding/bonus/sac.py index ec1dda0243..b430674f65 100644 --- a/ding/bonus/sac.py +++ b/ding/bonus/sac.py @@ -153,6 +153,10 @@ def _forward(obs): forward_fn = single_env_forward_wrapper(self.policy._model, self.cfg.policy.cuda) + # reset first to make sure the env is in the initial state + # env will be reset again in the main loop + env.reset() + # main loop return_ = 0. step = 0 @@ -212,6 +216,11 @@ def batch_evaluate( # define env and policy env = self._setup_env_manager(env_num, context, debug, 'evaluator') + # reset first to make sure the env is in the initial state + # env will be reset again in the main loop + env.launch() + env.reset() + evaluate_cfg = self.cfg evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 36db518e01..88cbbaf644 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -151,6 +151,10 @@ def _forward(obs): forward_fn = single_env_forward_wrapper(self.policy._model, self.cfg.policy.cuda) + # reset first to make sure the env is in the initial state + # env will be reset again in the main loop + env.reset() + # main loop return_ = 0. 
step = 0 @@ -210,6 +214,11 @@ def batch_evaluate( # define env and policy env = self._setup_env_manager(env_num, context, debug, 'evaluator') + # reset first to make sure the env is in the initial state + # env will be reset again in the main loop + env.launch() + env.reset() + evaluate_cfg = self.cfg evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode From df200333949de4987905ca49b68d3f9548658a4d Mon Sep 17 00:00:00 2001 From: zjowowen Date: Sun, 25 Jun 2023 08:49:45 +0000 Subject: [PATCH 151/244] Add ppo offpolicy atari config --- ding/bonus/config.py | 148 ++++++++++++++++++++++++++++++++++++ ding/bonus/ppo_offpolicy.py | 3 + 2 files changed, 151 insertions(+) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 9fbbae8910..7a0986b55c 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -200,6 +200,154 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: ), ) ) + elif env == 'PongNoFrameskip': + cfg.update( + dict( + exp_name='PongNoFrameskip-v4-PPOOffPolicy', + env=dict( + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=20, + env_id='PongNoFrameskip-v4', + frame_stack=4, + ), + policy=dict( + cuda=True, + recompute_adv=True, + action_space='discrete', + model=dict( + obs_shape=[4, 84, 84], + action_shape=6, + action_space='discrete', + encoder_hidden_size_list=[64, 64, 128], + actor_head_hidden_size=128, + critic_head_hidden_size=128, + ), + learn=dict( + update_per_collect=10, + batch_size=320, + learning_rate=3e-4, + value_weight=0.5, + entropy_weight=0.001, + clip_ratio=0.2, + adv_norm=True, + # value_norm=True, + ignore_done=False, + grad_clip_type='clip_norm', + grad_clip_value=0.5, + ), + collect=dict( + n_sample=3200, + unroll_len=1, + discount_factor=0.99, + gae_lambda=0.95, + ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) + elif env == 'SpaceInvadersNoFrameskip': + cfg.update( + dict( + exp_name='SpaceInvadersNoFrameskip-v4-PPOOffPolicy', + env=dict( + collector_env_num=16, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=10000000000, + env_id='SpaceInvadersNoFrameskip-v4', + frame_stack=4, + manager=dict(shared_memory=False, ) + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=[4, 84, 84], + action_shape=6, + encoder_hidden_size_list=[32, 64, 64, 128], + actor_head_hidden_size=128, + critic_head_hidden_size=128, + critic_head_layer_num=2, + ), + learn=dict( + update_per_collect=24, + batch_size=128, + learning_rate=0.0001, + value_weight=1.0, + entropy_weight=0.03, + clip_ratio=0.1, + adv_norm=False, + ), + collect=dict( + n_sample=1024, + unroll_len=1, + discount_factor=0.99, + gae_lambda=0.95, + ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) + elif env == 'QbertNoFrameskip': + cfg.update( + dict( + exp_name='QbertNoFrameskip-v4-PPOOffPolicy', + env=dict( + collector_env_num=16, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=10000000000, + env_id='QbertNoFrameskip-v4', + frame_stack=4 + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=[4, 84, 84], + action_shape=6, + encoder_hidden_size_list=[32, 64, 64, 128], + actor_head_hidden_size=128, + critic_head_hidden_size=128, + critic_head_layer_num=2, + ), + learn=dict( + update_per_collect=24, + batch_size=128, + learning_rate=0.0001, + value_weight=1.0, + entropy_weight=0.03, + clip_ratio=0.1, + 
adv_norm=False, + ), + collect=dict( + n_sample=1024, + unroll_len=1, + discount_factor=0.99, + gae_lambda=0.95, + ), + ), + wandb_logger=dict( + gradient_logger=True, + video_logger=True, + plot_logger=True, + action_logger=True, + return_logger=False + ), + ) + ) elif algorithm == 'A2C': cfg = EasyDict({"policy": A2CPolicy.default_config()}) if env == 'lunarlander_discrete': diff --git a/ding/bonus/ppo_offpolicy.py b/ding/bonus/ppo_offpolicy.py index b7730ea288..524f66d254 100644 --- a/ding/bonus/ppo_offpolicy.py +++ b/ding/bonus/ppo_offpolicy.py @@ -23,6 +23,9 @@ class PPOOffPolicyAgent: supported_env_list = [ 'lunarlander_discrete', + 'PongNoFrameskip', + 'SpaceInvadersNoFrameskip', + 'QbertNoFrameskip', ] algorithm = 'PPOOffPolicy' From 5ecc9dc79140806b10d276621f765aa258b3de5b Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 29 Jun 2023 05:30:48 +0000 Subject: [PATCH 152/244] polish config --- ding/bonus/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 7a0986b55c..95440ecc90 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -324,11 +324,11 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: critic_head_layer_num=2, ), learn=dict( - update_per_collect=24, + update_per_collect=18, batch_size=128, learning_rate=0.0001, value_weight=1.0, - entropy_weight=0.03, + entropy_weight=0.005, clip_ratio=0.1, adv_norm=False, ), From c621c35c6695e269fe540b2fc75e47b94ff1a526 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Mon, 10 Jul 2023 12:30:39 +0000 Subject: [PATCH 153/244] polish config code --- ding/bonus/__init__.py | 108 + ding/bonus/a2c.py | 94 +- ding/bonus/c51.py | 102 +- ding/bonus/cfg/A2C/__init__.py | 27 + ding/bonus/cfg/A2C/gym_bipedalwalker_v3.py | 49 + ding/bonus/cfg/A2C/gym_halfcheetah_v3.py | 53 + ding/bonus/cfg/A2C/gym_hopper_v3.py | 50 + ding/bonus/cfg/A2C/gym_lunarlander_v2.py | 39 + ding/bonus/cfg/A2C/gym_pendulum_v1.py | 1 + ding/bonus/cfg/A2C/gym_walker2d_v3.py | 50 + ding/bonus/cfg/C51/__init__.py | 23 + ding/bonus/cfg/C51/gym_lunarlander_v2.py | 52 + ding/bonus/cfg/C51/gym_pongnoframeskip_v4.py | 55 + ding/bonus/cfg/C51/gym_qbertnoframeskip_v4.py | 55 + .../C51/gym_spaceInvadersnoframeskip_v4.py | 55 + ding/bonus/cfg/DDPG/__init__.py | 29 + ding/bonus/cfg/DDPG/gym_bipedalwalker_v3.py | 52 + ding/bonus/cfg/DDPG/gym_halfcheetah_v3.py | 59 + ding/bonus/cfg/DDPG/gym_hopper_v3.py | 59 + .../cfg/DDPG/gym_lunarlandercontinuous_v2.py | 60 + ding/bonus/cfg/DDPG/gym_pendulum_v1.py | 53 + ding/bonus/cfg/DDPG/gym_walker2d_v3.py | 59 + ding/bonus/cfg/DQN/__init__.py | 23 + ding/bonus/cfg/DQN/gym_lunarlander_v2.py | 53 + ding/bonus/cfg/DQN/gym_pongnoframeskip_v4.py | 51 + ding/bonus/cfg/DQN/gym_qbertnoframeskip_v4.py | 51 + .../DQN/gym_spaceInvadersnoframeskip_v4.py | 52 + ding/bonus/cfg/PG/__init__ .py | 27 + ding/bonus/cfg/PG/gym_bipedalwalker_v3.py | 49 + ding/bonus/cfg/PG/gym_halfcheetah_v3.py | 50 + ding/bonus/cfg/PG/gym_hopper_v3.py | 50 + ding/bonus/cfg/PG/gym_lunarlander_v2.py | 39 + ding/bonus/cfg/PG/gym_pendulum_v1.py | 42 + ding/bonus/cfg/PG/gym_walker2d_v3.py | 50 + ding/bonus/cfg/PPOF/__init__.py | 17 + ding/bonus/cfg/PPOF/gym_lunarlander_v2.py | 13 + .../cfg/PPOF/gym_lunarlandercontinuous_v2.py | 14 + ding/bonus/cfg/PPOOffPolicy/__init__.py | 23 + .../cfg/PPOOffPolicy/gym_lunarlander_v2.py | 44 + .../PPOOffPolicy/gym_pongnoframeskip_v4.py | 54 + .../PPOOffPolicy/gym_qbertnoframeskip_v4.py | 48 + .../gym_spaceInvadersnoframeskip_v4.py | 49 + 
ding/bonus/cfg/SAC/__init__.py | 29 + ding/bonus/cfg/SAC/gym_bipedalwalker_v3.py | 53 + ding/bonus/cfg/SAC/gym_halfcheetah_v3.py | 62 + ding/bonus/cfg/SAC/gym_hopper_v3.py | 47 + .../cfg/SAC/gym_lunarlandercontinuous_v2.py | 44 + ding/bonus/cfg/SAC/gym_pendulum_v1.py | 49 + ding/bonus/cfg/SAC/gym_walker2d_v3.py | 62 + ding/bonus/cfg/TD3/__init__.py | 29 + ding/bonus/cfg/TD3/gym_bipedalwalker_v3.py | 58 + ding/bonus/cfg/TD3/gym_halfcheetah_v3.py | 64 + ding/bonus/cfg/TD3/gym_hopper_v3.py | 41 + .../cfg/TD3/gym_lunarlandercontinuous_v2.py | 50 + ding/bonus/cfg/TD3/gym_pendulum_v1.py | 54 + ding/bonus/cfg/TD3/gym_walker2d_v3.py | 64 + ding/bonus/cfg/__init__.py | 9 + ding/bonus/config.py | 2014 +---------------- ding/bonus/ddpg.py | 106 +- ding/bonus/dqn.py | 102 +- ding/bonus/pg.py | 107 +- ding/bonus/ppo_offpolicy.py | 103 +- ding/bonus/ppof.py | 4 +- ding/bonus/sac.py | 107 +- ding/bonus/td3.py | 107 +- ding/envs/__init__.py | 2 + ding/envs/ding_env_manager.py | 23 + ding/envs/gym_env.py | 6 + 68 files changed, 2920 insertions(+), 2459 deletions(-) create mode 100644 ding/bonus/cfg/A2C/__init__.py create mode 100644 ding/bonus/cfg/A2C/gym_bipedalwalker_v3.py create mode 100644 ding/bonus/cfg/A2C/gym_halfcheetah_v3.py create mode 100644 ding/bonus/cfg/A2C/gym_hopper_v3.py create mode 100644 ding/bonus/cfg/A2C/gym_lunarlander_v2.py create mode 100644 ding/bonus/cfg/A2C/gym_pendulum_v1.py create mode 100644 ding/bonus/cfg/A2C/gym_walker2d_v3.py create mode 100644 ding/bonus/cfg/C51/__init__.py create mode 100644 ding/bonus/cfg/C51/gym_lunarlander_v2.py create mode 100644 ding/bonus/cfg/C51/gym_pongnoframeskip_v4.py create mode 100644 ding/bonus/cfg/C51/gym_qbertnoframeskip_v4.py create mode 100644 ding/bonus/cfg/C51/gym_spaceInvadersnoframeskip_v4.py create mode 100644 ding/bonus/cfg/DDPG/__init__.py create mode 100644 ding/bonus/cfg/DDPG/gym_bipedalwalker_v3.py create mode 100644 ding/bonus/cfg/DDPG/gym_halfcheetah_v3.py create mode 100644 ding/bonus/cfg/DDPG/gym_hopper_v3.py create mode 100644 ding/bonus/cfg/DDPG/gym_lunarlandercontinuous_v2.py create mode 100644 ding/bonus/cfg/DDPG/gym_pendulum_v1.py create mode 100644 ding/bonus/cfg/DDPG/gym_walker2d_v3.py create mode 100644 ding/bonus/cfg/DQN/__init__.py create mode 100644 ding/bonus/cfg/DQN/gym_lunarlander_v2.py create mode 100644 ding/bonus/cfg/DQN/gym_pongnoframeskip_v4.py create mode 100644 ding/bonus/cfg/DQN/gym_qbertnoframeskip_v4.py create mode 100644 ding/bonus/cfg/DQN/gym_spaceInvadersnoframeskip_v4.py create mode 100644 ding/bonus/cfg/PG/__init__ .py create mode 100644 ding/bonus/cfg/PG/gym_bipedalwalker_v3.py create mode 100644 ding/bonus/cfg/PG/gym_halfcheetah_v3.py create mode 100644 ding/bonus/cfg/PG/gym_hopper_v3.py create mode 100644 ding/bonus/cfg/PG/gym_lunarlander_v2.py create mode 100644 ding/bonus/cfg/PG/gym_pendulum_v1.py create mode 100644 ding/bonus/cfg/PG/gym_walker2d_v3.py create mode 100644 ding/bonus/cfg/PPOF/__init__.py create mode 100644 ding/bonus/cfg/PPOF/gym_lunarlander_v2.py create mode 100644 ding/bonus/cfg/PPOF/gym_lunarlandercontinuous_v2.py create mode 100644 ding/bonus/cfg/PPOOffPolicy/__init__.py create mode 100644 ding/bonus/cfg/PPOOffPolicy/gym_lunarlander_v2.py create mode 100644 ding/bonus/cfg/PPOOffPolicy/gym_pongnoframeskip_v4.py create mode 100644 ding/bonus/cfg/PPOOffPolicy/gym_qbertnoframeskip_v4.py create mode 100644 ding/bonus/cfg/PPOOffPolicy/gym_spaceInvadersnoframeskip_v4.py create mode 100644 ding/bonus/cfg/SAC/__init__.py create mode 100644 ding/bonus/cfg/SAC/gym_bipedalwalker_v3.py 
create mode 100644 ding/bonus/cfg/SAC/gym_halfcheetah_v3.py create mode 100644 ding/bonus/cfg/SAC/gym_hopper_v3.py create mode 100644 ding/bonus/cfg/SAC/gym_lunarlandercontinuous_v2.py create mode 100644 ding/bonus/cfg/SAC/gym_pendulum_v1.py create mode 100644 ding/bonus/cfg/SAC/gym_walker2d_v3.py create mode 100644 ding/bonus/cfg/TD3/__init__.py create mode 100644 ding/bonus/cfg/TD3/gym_bipedalwalker_v3.py create mode 100644 ding/bonus/cfg/TD3/gym_halfcheetah_v3.py create mode 100644 ding/bonus/cfg/TD3/gym_hopper_v3.py create mode 100644 ding/bonus/cfg/TD3/gym_lunarlandercontinuous_v2.py create mode 100644 ding/bonus/cfg/TD3/gym_pendulum_v1.py create mode 100644 ding/bonus/cfg/TD3/gym_walker2d_v3.py create mode 100644 ding/bonus/cfg/__init__.py create mode 100644 ding/envs/ding_env_manager.py create mode 100644 ding/envs/gym_env.py diff --git a/ding/bonus/__init__.py b/ding/bonus/__init__.py index 3589f6aaf6..a9571dd342 100644 --- a/ding/bonus/__init__.py +++ b/ding/bonus/__init__.py @@ -1,3 +1,4 @@ +from . import cfg from .a2c import A2CAgent from .ppof import PPOF from .ppo_offpolicy import PPOOffPolicyAgent @@ -7,3 +8,110 @@ from .dqn import DQNAgent from .sac import SACAgent from .pg import PGAgent + +supported_algo = dict( + A2C=A2CAgent, + PPOF=PPOF, + PPOOffPolicy=PPOOffPolicyAgent, + C51=C51Agent, + TD3=TD3Agent, + DDPG=DDPGAgent, + DQN=DQNAgent, + SAC=SACAgent, + PG=PGAgent, +) + +supported_algo_list = list(supported_algo.keys()) + + +def env_supported(algo: str = None) -> list: + """ + return list of the envs that supported by di-engine. + """ + + if algo is not None: + if algo.upper() == "A2C": + return list(cfg.A2C.supported_env.keys()) + elif algo.upper() == "C51": + return list(cfg.C51.supported_env.keys()) + elif algo.upper() == "DDPG": + return list(cfg.DDPG.supported_env.keys()) + elif algo.upper() == "DQN": + return list(cfg.DQN.supported_env.keys()) + elif algo.upper() == "PG": + return list(cfg.PG.supported_env.keys()) + elif algo.upper() == "PPOF": + return list(cfg.PPOF.supported_env.keys()) + elif algo.upper() == "PPOOFFPOLICY": + return list(cfg.PPOOffPolicy.supported_env.keys()) + elif algo.upper() == "SAC": + return list(cfg.SAC.supported_env.keys()) + elif algo.upper() == "TD3": + return list(cfg.TD3.supported_env.keys()) + else: + raise ValueError("The algo {} is not supported by di-engine.".format(algo)) + else: + #merge all the supported envs from all the algos + supported_env = [] + supported_env.extend(list(cfg.SAC.supported_env.keys())) + return supported_env + + +supported_env = env_supported() + + +def algo_supported(env_id: str = None) -> list: + """ + return list of the algos that supported by di-engine. 
+ """ + if env_id is not None: + algo = [] + if env_id.upper() in [item.upper() for item in cfg.A2C.supported_env.keys()]: + algo.append("A2C") + if env_id.upper() in [item.upper() for item in cfg.C51.supported_env.keys()]: + algo.append("C51") + if env_id.upper() in [item.upper() for item in cfg.DDPG.supported_env.keys()]: + algo.append("DDPG") + if env_id.upper() in [item.upper() for item in cfg.DQN.supported_env.keys()]: + algo.append("DQN") + if env_id.upper() in [item.upper() for item in cfg.PG.supported_env.keys()]: + algo.append("PG") + if env_id.upper() in [item.upper() for item in cfg.PPOF.supported_env.keys()]: + algo.append("PPOF") + if env_id.upper() in [item.upper() for item in cfg.PPOOffPolicy.supported_env.keys()]: + algo.append("PPOOffPolicy") + if env_id.upper() in [item.upper() for item in cfg.SAC.supported_env.keys()]: + algo.append("SAC") + if env_id.upper() in [item.upper() for item in cfg.TD3.supported_env.keys()]: + algo.append("TD3") + + if len(algo) == 0: + raise ValueError("The env {} is not supported by di-engine.".format(env_id)) + return algo + else: + return supported_algo_list + + +def is_supported(env_id: str = None, algo: str = None) -> bool: + """ + Check if the env-algo pair is supported by di-engine. + """ + if env_id is not None and env_id.upper() in [item.upper() for item in supported_env.keys()]: + if algo is not None and algo.upper() in supported_algo_list: + if env_id.upper() in env_supported(algo): + return True + else: + return False + elif algo is None: + return True + else: + return False + elif env_id is None: + if algo is not None and algo.upper() in supported_algo_list: + return True + elif algo is None: + raise ValueError("Please specify the env or algo.") + else: + return False + else: + return False diff --git a/ding/bonus/a2c.py b/ding/bonus/a2c.py index 627597780e..9b25e42909 100644 --- a/ding/bonus/a2c.py +++ b/ding/bonus/a2c.py @@ -1,66 +1,70 @@ -from dataclasses import dataclass from typing import Optional, Union from ditk import logging from easydict import EasyDict import os -from functools import partial import torch import treetensor.torch as ttorch from ding.framework import task, OnlineRLContext from ding.framework.middleware import CkptSaver, trainer, \ wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, \ gae_estimator, final_ctx_saver -from ding.envs import BaseEnv, BaseEnvManagerV2, SubprocessEnvManagerV2 +from ding.envs import BaseEnv +from ding.envs import setup_ding_env_manager from ding.policy import A2CPolicy from ding.utils import set_pkg_seed -from ding.config import Config, save_config_py, compile_config +from ding.config import save_config_py, compile_config from ding.model import VAC from ding.model import model_wrap -from ding.bonus.config import get_instance_config, get_instance_env from ding.bonus.common import TrainingReturn, EvalReturn +from ding.bonus.cfg.A2C import supported_env_cfg +from ding.bonus.cfg.A2C import supported_env class A2CAgent: - supported_env_list = [ - 'lunarlander_discrete', - 'bipedalwalker', - 'pendulum', - 'hopper', - 'HalfCheetah', - 'Walker2d', - ] - algorithm = 'A2C' + supported_env_list = list(supported_env_cfg.keys()) def __init__( self, - env: Union[str, BaseEnv], + env_id: str = None, + env: BaseEnv = None, seed: int = 0, exp_name: str = None, model: Optional[torch.nn.Module] = None, - cfg: Optional[Union[EasyDict, dict, str]] = None, + cfg: Optional[Union[EasyDict, dict]] = None, policy_state_dict: str = None, ) -> None: - if 
isinstance(env, str): - assert env in A2CAgent.supported_env_list, "Please use supported envs: {}".format( + assert env_id is not None or cfg is not None, "Please specify env_id or cfg." + + if cfg is not None and not isinstance(cfg, EasyDict): + cfg = EasyDict(cfg) + + if env_id is not None: + assert env_id in A2CAgent.supported_env_list, "Please use supported envs: {}".format( A2CAgent.supported_env_list ) - self.env = get_instance_env(env) if cfg is None: - # 'It should be default env tuned config' - cfg = get_instance_config(env, algorithm=A2CAgent.algorithm) + cfg = supported_env_cfg[env_id] else: - assert isinstance(cfg, EasyDict), "Please use EasyDict as config data type." - - if exp_name is not None: - cfg.exp_name = exp_name - self.cfg = compile_config(cfg, policy=A2CPolicy) - self.exp_name = self.cfg.exp_name - - elif isinstance(env, BaseEnv): - self.cfg = compile_config(cfg, policy=A2CPolicy) - raise NotImplementedError + assert cfg.env.env_id == env_id, "env_id in cfg should be the same as env_id in args." + else: + assert hasattr(cfg.env, "env_id"), "Please specify env_id in cfg." + assert cfg.env.env_id in A2CAgent.supported_env_list, "Please use supported envs: {}".format( + A2CAgent.supported_env_list + ) + default_policy_config = EasyDict({"policy": A2CPolicy.default_config()}) + default_policy_config.update(cfg) + cfg = default_policy_config + + if exp_name is not None: + cfg.exp_name = exp_name + self.cfg = compile_config(cfg, policy=A2CPolicy) + self.exp_name = self.cfg.exp_name + if env is None: + self.env = supported_env[cfg.env.env_id](cfg=cfg.env) else: - raise TypeError("not support env type: {}, only strings and instances of `BaseEnv` now".format(type(env))) + assert isinstance(env, BaseEnv), "Please use BaseEnv as env data type." 
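# Sketch of how the reworked A2CAgent constructor above is meant to be called with the
# new env_id-or-cfg contract. 'LunarLander-v2' is one of the env ids registered under
# ding/bonus/cfg/A2C in this patch; exp_name and seed are arbitrary, and the commented
# train() call is an assumption that mirrors the other bonus agents, not a verified API.
from ding.bonus import A2CAgent

agent = A2CAgent(env_id='LunarLander-v2', exp_name='LunarLander-v2-A2C-demo', seed=0)
# agent.train(step=int(1e6))  # assumed to follow the common bonus-agent training entry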
+ self.env = env + logging.getLogger().setLevel(logging.INFO) self.seed = seed set_pkg_seed(self.seed, use_cuda=self.cfg.policy.cuda) @@ -167,6 +171,8 @@ def _forward(obs): break logging.info(f'A2C deploy is finished, final episode return with {step} steps is: {return_}') + env.close() + return return_ def collect_data( @@ -183,7 +189,8 @@ def collect_data( if n_episode is not None: raise NotImplementedError # define env and policy - env = self._setup_env_manager(env_num, context, debug, 'collector') + env_num = env_num if env_num else self.cfg.env.collector_env_num + env = setup_ding_env_manager(self.env, env_num, context, debug, 'collector') if save_data_path is None: save_data_path = os.path.join(self.exp_name, 'demo_data') @@ -211,7 +218,8 @@ def batch_evaluate( if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy - env = self._setup_env_manager(env_num, context, debug, 'evaluator') + env_num = env_num if env_num else self.cfg.env.evaluator_env_num + env = setup_ding_env_manager(self.env, env_num, context, debug, 'evaluator') # reset first to make sure the env is in the initial state # env will be reset again in the main loop @@ -228,24 +236,6 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) - def _setup_env_manager( - self, - env_num: int, - context: Optional[str] = None, - debug: bool = False, - caller: str = 'collector' - ) -> BaseEnvManagerV2: - assert caller in ['evaluator', 'collector'] - if debug: - env_cls = BaseEnvManagerV2 - manager_cfg = env_cls.default_config() - else: - env_cls = SubprocessEnvManagerV2 - manager_cfg = env_cls.default_config() - if context is not None: - manager_cfg.context = context - return env_cls([partial(self.env.clone, caller) for _ in range(env_num)], manager_cfg) - @property def best(self): best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") diff --git a/ding/bonus/c51.py b/ding/bonus/c51.py index 1470062307..820ecf0130 100644 --- a/ding/bonus/c51.py +++ b/ding/bonus/c51.py @@ -1,65 +1,71 @@ -from dataclasses import dataclass from typing import Optional, Union from ditk import logging from easydict import EasyDict import os -from functools import partial import torch import treetensor.torch as ttorch from ding.framework import task, OnlineRLContext -from ding.framework.middleware import CkptSaver, multistep_trainer, \ +from ding.framework.middleware import CkptSaver, \ wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, data_pusher, \ OffPolicyLearner, final_ctx_saver, eps_greedy_handler, nstep_reward_enhancer -from ding.envs import BaseEnv, BaseEnvManagerV2, SubprocessEnvManagerV2 +from ding.envs import BaseEnv +from ding.envs import setup_ding_env_manager from ding.policy import C51Policy from ding.utils import set_pkg_seed from ding.config import save_config_py, compile_config from ding.model import C51DQN from ding.model import model_wrap from ding.data import DequeBuffer -from ding.bonus.config import get_instance_config, get_instance_env from ding.bonus.common import TrainingReturn, EvalReturn +from ding.bonus.cfg.C51 import supported_env_cfg +from ding.bonus.cfg.C51 import supported_env class C51Agent: - supported_env_list = [ - 'lunarlander_discrete', - 'PongNoFrameskip', - 'SpaceInvadersNoFrameskip', - 'QbertNoFrameskip', - ] - algorithm = 'C51' + supported_env_list = list(supported_env_cfg.keys()) def __init__( self, - env: Union[str, BaseEnv], + env_id: str = None, + env: 
BaseEnv = None, seed: int = 0, exp_name: str = None, model: Optional[torch.nn.Module] = None, - cfg: Optional[Union[EasyDict, dict, str]] = None, + cfg: Optional[Union[EasyDict, dict]] = None, policy_state_dict: str = None, ) -> None: - if isinstance(env, str): - assert env in C51Agent.supported_env_list, "Please use supported envs: {}".format( + assert env_id is not None or cfg is not None, "Please specify env_id or cfg." + + if cfg is not None and not isinstance(cfg, EasyDict): + cfg = EasyDict(cfg) + + if env_id is not None: + assert env_id in C51Agent.supported_env_list, "Please use supported envs: {}".format( C51Agent.supported_env_list ) - self.env = get_instance_env(env) if cfg is None: - # 'It should be default env tuned config' - cfg = get_instance_config(env, algorithm=C51Agent.algorithm) + cfg = supported_env_cfg[env_id] else: - assert isinstance(cfg, EasyDict), "Please use EasyDict as config data type." - - if exp_name is not None: - cfg.exp_name = exp_name - self.cfg = compile_config(cfg, policy=C51Policy) - self.exp_name = self.cfg.exp_name - - elif isinstance(env, BaseEnv): - self.cfg = compile_config(cfg, policy=C51Policy) - raise NotImplementedError + assert cfg.env.env_id == env_id, "env_id in cfg should be the same as env_id in args." + else: + assert hasattr(cfg.env, "env_id"), "Please specify env_id in cfg." + assert cfg.env.env_id in C51Agent.supported_env_list, "Please use supported envs: {}".format( + C51Agent.supported_env_list + ) + default_policy_config = EasyDict({"policy": C51Policy.default_config()}) + default_policy_config.update(cfg) + cfg = default_policy_config + + if exp_name is not None: + cfg.exp_name = exp_name + self.cfg = compile_config(cfg, policy=C51Policy) + self.exp_name = self.cfg.exp_name + if env is None: + self.env = supported_env[cfg.env.env_id](cfg=cfg.env) else: - raise TypeError("not support env type: {}, only strings and instances of `BaseEnv` now".format(type(env))) + assert isinstance(env, BaseEnv), "Please use BaseEnv as env data type." 
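# Sketch of the cfg-only path handled above: a plain dict is wrapped into EasyDict,
# checked against supported_env_list via cfg.env.env_id, and merged over
# C51Policy.default_config(). Field names mirror the bundled LunarLander config; a
# complete run may still need the remaining keys from
# ding/bonus/cfg/C51/gym_lunarlander_v2.py (e.g. other.eps, wandb_logger).
from easydict import EasyDict
from ding.bonus import C51Agent

my_cfg = EasyDict(dict(
    exp_name='LunarLander-v2-C51-custom',
    env=dict(env_id='LunarLander-v2', collector_env_num=4, evaluator_env_num=4, n_evaluator_episode=4),
    policy=dict(cuda=False, model=dict(obs_shape=8, action_shape=4, v_min=-30, v_max=30, n_atom=51)),
))
agent = C51Agent(cfg=my_cfg)  # env_id is read from my_cfg.env.env_id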
+ self.env = env + logging.getLogger().setLevel(logging.INFO) self.seed = seed set_pkg_seed(self.seed, use_cuda=self.cfg.policy.cuda) @@ -77,8 +83,8 @@ def __init__( def train( self, step: int = int(1e7), - collector_env_num: int = 4, - evaluator_env_num: int = 4, + collector_env_num: int = None, + evaluator_env_num: int = None, n_iter_save_ckpt: int = 1000, context: Optional[str] = None, debug: bool = False, @@ -88,8 +94,10 @@ def train( logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) # define env and policy - collector_env = self._setup_env_manager(collector_env_num, context, debug, 'collector') - evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug, 'evaluator') + collector_env_num = collector_env_num if collector_env_num else self.cfg.env.collector_env_num + evaluator_env_num = evaluator_env_num if evaluator_env_num else self.cfg.env.evaluator_env_num + collector_env = setup_ding_env_manager(self.env, collector_env_num, context, debug, 'collector') + evaluator_env = setup_ding_env_manager(self.env, evaluator_env_num, context, debug, 'evaluator') with task.start(ctx=OnlineRLContext()): task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) @@ -163,6 +171,8 @@ def _forward(obs): break logging.info(f'C51 deploy is finished, final episode return with {step} steps is: {return_}') + env.close() + return return_ def collect_data( @@ -179,7 +189,8 @@ def collect_data( if n_episode is not None: raise NotImplementedError # define env and policy - env = self._setup_env_manager(env_num, context, debug, 'collector') + env_num = env_num if env_num else self.cfg.env.collector_env_num + env = setup_ding_env_manager(self.env, env_num, context, debug, 'collector') if save_data_path is None: save_data_path = os.path.join(self.exp_name, 'demo_data') @@ -207,7 +218,8 @@ def batch_evaluate( if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy - env = self._setup_env_manager(env_num, context, debug, 'evaluator') + env_num = env_num if env_num else self.cfg.env.evaluator_env_num + env = setup_ding_env_manager(self.env, env_num, context, debug, 'evaluator') # reset first to make sure the env is in the initial state # env will be reset again in the main loop @@ -224,24 +236,6 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) - def _setup_env_manager( - self, - env_num: int, - context: Optional[str] = None, - debug: bool = False, - caller: str = 'collector' - ) -> BaseEnvManagerV2: - assert caller in ['evaluator', 'collector'] - if debug: - env_cls = BaseEnvManagerV2 - manager_cfg = env_cls.default_config() - else: - env_cls = SubprocessEnvManagerV2 - manager_cfg = env_cls.default_config() - if context is not None: - manager_cfg.context = context - return env_cls([partial(self.env.clone, caller) for _ in range(env_num)], manager_cfg) - @property def best(self): best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") diff --git a/ding/bonus/cfg/A2C/__init__.py b/ding/bonus/cfg/A2C/__init__.py new file mode 100644 index 0000000000..63c6804ef6 --- /dev/null +++ b/ding/bonus/cfg/A2C/__init__.py @@ -0,0 +1,27 @@ +from easydict import EasyDict +from . import gym_bipedalwalker_v3 +from . import gym_halfcheetah_v3 +from . import gym_hopper_v3 +from . import gym_lunarlander_v2 +from . import gym_pendulum_v1 +from . 
import gym_walker2d_v3 + +supported_env_cfg = { + gym_bipedalwalker_v3.cfg.env.env_id: gym_bipedalwalker_v3.cfg, + gym_halfcheetah_v3.cfg.env.env_id: gym_halfcheetah_v3.cfg, + gym_hopper_v3.cfg.env.env_id: gym_hopper_v3.cfg, + gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.cfg, + gym_walker2d_v3.cfg.env.env_id: gym_walker2d_v3.cfg, +} + +supported_env_cfg = EasyDict(supported_env_cfg) + +supported_env = { + gym_bipedalwalker_v3.cfg.env.env_id: gym_bipedalwalker_v3.env, + gym_halfcheetah_v3.cfg.env.env_id: gym_halfcheetah_v3.env, + gym_hopper_v3.cfg.env.env_id: gym_hopper_v3.env, + gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.env, + gym_walker2d_v3.cfg.env.env_id: gym_walker2d_v3.env, +} + +supported_env = EasyDict(supported_env) diff --git a/ding/bonus/cfg/A2C/gym_bipedalwalker_v3.py b/ding/bonus/cfg/A2C/gym_bipedalwalker_v3.py new file mode 100644 index 0000000000..3ab43bd168 --- /dev/null +++ b/ding/bonus/cfg/A2C/gym_bipedalwalker_v3.py @@ -0,0 +1,49 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='Bipedalwalker-v3-A2C', + seed=0, + env=dict( + env_id='BipedalWalker-v3', + collector_env_num=8, + evaluator_env_num=8, + act_scale=True, + n_evaluator_episode=8, + rew_clip=True, + ), + policy=dict( + cuda=True, + action_space='continuous', + model=dict( + action_space='continuous', + obs_shape=24, + action_shape=4, + ), + learn=dict( + batch_size=64, + learning_rate=0.0003, + value_weight=0.7, + entropy_weight=0.0005, + discount_factor=0.99, + adv_norm=True, + ), + collect=dict( + n_sample=64, + discount_factor=0.99, + ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial( + ding.envs.gym_env.env, cfg=dict( + act_scale=cfg.env.act_scale, + rew_clip=cfg.env.rew_clip, + ) +) diff --git a/ding/bonus/cfg/A2C/gym_halfcheetah_v3.py b/ding/bonus/cfg/A2C/gym_halfcheetah_v3.py new file mode 100644 index 0000000000..5b4e1e181d --- /dev/null +++ b/ding/bonus/cfg/A2C/gym_halfcheetah_v3.py @@ -0,0 +1,53 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='HalfCheetah-v3-A2C', + env=dict( + env_id='HalfCheetah-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=1, + evaluator_env_num=8, + n_evaluator_episode=8, + act_scale=True, + stop_value=12000, + env_wrapper='mujoco_default', + act_scale=True, + rew_clip=True, + ), + policy=dict( + cuda=True, + action_space='continuous', + model=dict( + action_space='continuous', + obs_shape=17, + action_shape=6, + ), + learn=dict( + batch_size=256, + learning_rate=0.0003, + value_weight=0.5, + entropy_weight=0.01, + grad_norm=0.5, + ignore_done=True, + adv_norm=True, + ), + collect=dict( + n_sample=256, + discount_factor=0.99, + gae_lambda=0.95, + ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial( + ding.envs.gym_env.env, + cfg=dict(env_wrapper=cfg.env.env_wrapper, act_scale=cfg.env.act_scale, rew_clip=cfg.env.rew_clip) +) diff --git a/ding/bonus/cfg/A2C/gym_hopper_v3.py b/ding/bonus/cfg/A2C/gym_hopper_v3.py new file mode 100644 index 0000000000..de461a79d7 --- /dev/null +++ b/ding/bonus/cfg/A2C/gym_hopper_v3.py @@ -0,0 +1,50 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='Hopper-v3-A2C', + env=dict( + 
env_id='Hopper-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=1, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=12000, + env_wrapper='mujoco_default', + act_scale=True, + rew_clip=True, + ), + policy=dict( + cuda=True, + action_space='continuous', + model=dict( + obs_shape=11, + action_shape=3, + action_space='continuous', + ), + learn=dict( + batch_size=128, + learning_rate=0.0003, + value_weight=0.5, + entropy_weight=0.02, + adv_norm=True, + ), + collect=dict( + n_sample=128, + discount_factor=0.99, + gae_lambda=0.95, + ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial( + ding.envs.gym_env.env, + cfg=dict(env_wrapper=cfg.env.env_wrapper, act_scale=cfg.env.act_scale, rew_clip=cfg.env.rew_clip) +) diff --git a/ding/bonus/cfg/A2C/gym_lunarlander_v2.py b/ding/bonus/cfg/A2C/gym_lunarlander_v2.py new file mode 100644 index 0000000000..cf2e3b50b0 --- /dev/null +++ b/ding/bonus/cfg/A2C/gym_lunarlander_v2.py @@ -0,0 +1,39 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='LunarLander-v2-A2C', + env=dict( + collector_env_num=8, + evaluator_env_num=8, + env_id='LunarLander-v2', + n_evaluator_episode=8, + stop_value=240, + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=8, + action_shape=4, + ), + learn=dict( + batch_size=64, + learning_rate=3e-4, + entropy_weight=0.001, + adv_norm=True, + ), + collect=dict( + n_sample=64, + discount_factor=0.99, + gae_lambda=0.95, + ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial(ding.envs.gym_env.env, continuous=True) diff --git a/ding/bonus/cfg/A2C/gym_pendulum_v1.py b/ding/bonus/cfg/A2C/gym_pendulum_v1.py new file mode 100644 index 0000000000..464090415c --- /dev/null +++ b/ding/bonus/cfg/A2C/gym_pendulum_v1.py @@ -0,0 +1 @@ +# TODO diff --git a/ding/bonus/cfg/A2C/gym_walker2d_v3.py b/ding/bonus/cfg/A2C/gym_walker2d_v3.py new file mode 100644 index 0000000000..a714e53f26 --- /dev/null +++ b/ding/bonus/cfg/A2C/gym_walker2d_v3.py @@ -0,0 +1,50 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='Walker2d-v3-A2C', + env=dict( + env_id='Walker2d-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=1, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=12000, + env_wrapper='mujoco_default', + act_scale=True, + rew_clip=True, + ), + policy=dict( + cuda=True, + action_space='continuous', + model=dict( + action_space='continuous', + obs_shape=17, + action_shape=6, + ), + learn=dict( + batch_size=32, + learning_rate=0.0003, + value_weight=0.5, + entropy_weight=0.005, + adv_norm=True, + ), + collect=dict( + n_sample=32, + discount_factor=0.99, + gae_lambda=0.95, + ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial( + ding.envs.gym_env.env, + cfg=dict(env_wrapper=cfg.env.env_wrapper, act_scale=cfg.env.act_scale, rew_clip=cfg.env.rew_clip) +) diff --git a/ding/bonus/cfg/C51/__init__.py b/ding/bonus/cfg/C51/__init__.py new file mode 100644 index 0000000000..2704b04c53 --- /dev/null +++ 
b/ding/bonus/cfg/C51/__init__.py @@ -0,0 +1,23 @@ +from easydict import EasyDict +from . import gym_lunarlander_v2 +from . import gym_pongnoframeskip_v4 +from . import gym_qbertnoframeskip_v4 +from . import gym_spaceInvadersnoframeskip_v4 + +supported_env_cfg = { + gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.cfg, + gym_pongnoframeskip_v4.cfg.env.env_id: gym_pongnoframeskip_v4.cfg, + gym_qbertnoframeskip_v4.cfg.env.env_id: gym_qbertnoframeskip_v4.cfg, + gym_spaceInvadersnoframeskip_v4.cfg.env.env_id: gym_spaceInvadersnoframeskip_v4.cfg, +} + +supported_env_cfg = EasyDict(supported_env_cfg) + +supported_env = { + gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.env, + gym_pongnoframeskip_v4.cfg.env.env_id: gym_pongnoframeskip_v4.env, + gym_qbertnoframeskip_v4.cfg.env.env_id: gym_qbertnoframeskip_v4.env, + gym_spaceInvadersnoframeskip_v4.cfg.env.env_id: gym_spaceInvadersnoframeskip_v4.env, +} + +supported_env = EasyDict(supported_env) diff --git a/ding/bonus/cfg/C51/gym_lunarlander_v2.py b/ding/bonus/cfg/C51/gym_lunarlander_v2.py new file mode 100644 index 0000000000..97ea8a7abe --- /dev/null +++ b/ding/bonus/cfg/C51/gym_lunarlander_v2.py @@ -0,0 +1,52 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='lunarlander_c51', + seed=0, + env=dict( + collector_env_num=8, + evaluator_env_num=8, + env_id='LunarLander-v2', + n_evaluator_episode=8, + stop_value=200, + ), + policy=dict( + cuda=False, + model=dict( + obs_shape=8, + action_shape=4, + encoder_hidden_size_list=[512, 64], + v_min=-30, + v_max=30, + n_atom=51, + ), + discount_factor=0.99, + nstep=3, + learn=dict( + update_per_collect=10, + batch_size=64, + learning_rate=0.001, + target_update_freq=100, + ), + collect=dict( + n_sample=64, + unroll_len=1, + ), + other=dict( + eps=dict( + type='exp', + start=0.95, + end=0.1, + decay=50000, + ), replay_buffer=dict(replay_buffer_size=100000, ) + ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +env = ding.envs.gym_env.env diff --git a/ding/bonus/cfg/C51/gym_pongnoframeskip_v4.py b/ding/bonus/cfg/C51/gym_pongnoframeskip_v4.py new file mode 100644 index 0000000000..5cbf8debd3 --- /dev/null +++ b/ding/bonus/cfg/C51/gym_pongnoframeskip_v4.py @@ -0,0 +1,55 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='PongNoFrameskip-v4-C51', + seed=0, + env=dict( + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=20, + env_id='PongNoFrameskip-v4', + frame_stack=4, + env_wrapper='atari_default', + ), + policy=dict( + cuda=True, + priority=False, + model=dict( + obs_shape=[4, 84, 84], + action_shape=6, + encoder_hidden_size_list=[128, 128, 512], + v_min=-10, + v_max=10, + n_atom=51, + ), + nstep=3, + discount_factor=0.99, + learn=dict( + update_per_collect=10, + batch_size=32, + learning_rate=0.0001, + target_update_freq=500, + ), + collect=dict(n_sample=100, ), + eval=dict(evaluator=dict(eval_freq=4000, )), + other=dict( + eps=dict( + type='exp', + start=1., + end=0.05, + decay=250000, + ), + replay_buffer=dict(replay_buffer_size=100000, ), + ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial(ding.envs.gym_env.env, cfg=dict(env_id=cfg.env.env_id, env_wrapper=cfg.env.env_wrapper)) diff --git 
a/ding/bonus/cfg/C51/gym_qbertnoframeskip_v4.py b/ding/bonus/cfg/C51/gym_qbertnoframeskip_v4.py new file mode 100644 index 0000000000..8442414e1e --- /dev/null +++ b/ding/bonus/cfg/C51/gym_qbertnoframeskip_v4.py @@ -0,0 +1,55 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='QbertNoFrameskip-v4-C51', + seed=0, + env=dict( + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=30000, + env_id='QbertNoFrameskip-v4', + frame_stack=4, + env_wrapper='atari_default', + ), + policy=dict( + cuda=True, + priority=True, + model=dict( + obs_shape=[4, 84, 84], + action_shape=6, + encoder_hidden_size_list=[128, 128, 512], + v_min=-10, + v_max=10, + n_atom=51, + ), + nstep=3, + discount_factor=0.99, + learn=dict( + update_per_collect=10, + batch_size=32, + learning_rate=0.0001, + target_update_freq=500, + ), + collect=dict(n_sample=100, ), + eval=dict(evaluator=dict(eval_freq=4000, )), + other=dict( + eps=dict( + type='exp', + start=1., + end=0.05, + decay=1000000, + ), + replay_buffer=dict(replay_buffer_size=400000, ), + ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial(ding.envs.gym_env.env, cfg=dict(env_id=cfg.env.env_id, env_wrapper=cfg.env.env_wrapper)) diff --git a/ding/bonus/cfg/C51/gym_spaceInvadersnoframeskip_v4.py b/ding/bonus/cfg/C51/gym_spaceInvadersnoframeskip_v4.py new file mode 100644 index 0000000000..42490e4fda --- /dev/null +++ b/ding/bonus/cfg/C51/gym_spaceInvadersnoframeskip_v4.py @@ -0,0 +1,55 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='SpaceInvadersNoFrameskip-v4-C51', + seed=0, + env=dict( + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=10000000000, + env_id='SpaceInvadersNoFrameskip-v4', + frame_stack=4, + env_wrapper='atari_default', + ), + policy=dict( + cuda=True, + priority=False, + model=dict( + obs_shape=[4, 84, 84], + action_shape=6, + encoder_hidden_size_list=[128, 128, 512], + v_min=-10, + v_max=10, + n_atom=51, + ), + nstep=3, + discount_factor=0.99, + learn=dict( + update_per_collect=10, + batch_size=32, + learning_rate=0.0001, + target_update_freq=500, + ), + collect=dict(n_sample=100, ), + eval=dict(evaluator=dict(eval_freq=4000, )), + other=dict( + eps=dict( + type='exp', + start=1., + end=0.05, + decay=1000000, + ), + replay_buffer=dict(replay_buffer_size=400000, ), + ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial(ding.envs.gym_env.env, cfg=dict(env_id=cfg.env.env_id, env_wrapper=cfg.env.env_wrapper)) diff --git a/ding/bonus/cfg/DDPG/__init__.py b/ding/bonus/cfg/DDPG/__init__.py new file mode 100644 index 0000000000..6e01f29d74 --- /dev/null +++ b/ding/bonus/cfg/DDPG/__init__.py @@ -0,0 +1,29 @@ +from easydict import EasyDict +from . import gym_bipedalwalker_v3 +from . import gym_halfcheetah_v3 +from . import gym_hopper_v3 +from . import gym_lunarlandercontinuous_v2 +from . import gym_pendulum_v1 +from . 
import gym_walker2d_v3 + +supported_env_cfg = { + gym_bipedalwalker_v3.cfg.env.env_id: gym_bipedalwalker_v3.cfg, + gym_halfcheetah_v3.cfg.env.env_id: gym_halfcheetah_v3.cfg, + gym_hopper_v3.cfg.env.env_id: gym_hopper_v3.cfg, + gym_lunarlandercontinuous_v2.cfg.env.env_id: gym_lunarlandercontinuous_v2.cfg, + gym_pendulum_v1.cfg.env.env_id: gym_pendulum_v1.cfg, + gym_walker2d_v3.cfg.env.env_id: gym_walker2d_v3.cfg, +} + +supported_env_cfg = EasyDict(supported_env_cfg) + +supported_env = { + gym_bipedalwalker_v3.cfg.env.env_id: gym_bipedalwalker_v3.env, + gym_halfcheetah_v3.cfg.env.env_id: gym_halfcheetah_v3.env, + gym_hopper_v3.cfg.env.env_id: gym_hopper_v3.env, + gym_lunarlandercontinuous_v2.cfg.env.env_id: gym_lunarlandercontinuous_v2.env, + gym_pendulum_v1.cfg.env.env_id: gym_pendulum_v1.env, + gym_walker2d_v3.cfg.env.env_id: gym_walker2d_v3.env, +} + +supported_env = EasyDict(supported_env) diff --git a/ding/bonus/cfg/DDPG/gym_bipedalwalker_v3.py b/ding/bonus/cfg/DDPG/gym_bipedalwalker_v3.py new file mode 100644 index 0000000000..227c773891 --- /dev/null +++ b/ding/bonus/cfg/DDPG/gym_bipedalwalker_v3.py @@ -0,0 +1,52 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='Bipedalwalker-v3-DDPG', + seed=0, + env=dict( + env_id='BipedalWalker-v3', + collector_env_num=8, + evaluator_env_num=5, + # (bool) Scale output action into legal range. + act_scale=True, + n_evaluator_episode=5, + rew_clip=True, + ), + policy=dict( + cuda=True, + random_collect_size=10000, + model=dict( + obs_shape=24, + action_shape=4, + twin_critic=False, + action_space='regression', + actor_head_hidden_size=400, + critic_head_hidden_size=400, + ), + learn=dict( + update_per_collect=64, + batch_size=256, + learning_rate_actor=0.0003, + learning_rate_critic=0.0003, + target_theta=0.005, + discount_factor=0.99, + learner=dict(hook=dict(log_show_after_iter=1000, )) + ), + collect=dict(n_sample=64, ), + other=dict(replay_buffer=dict(replay_buffer_size=300000, ), ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial( + ding.envs.gym_env.env, cfg=dict( + act_scale=cfg.env.act_scale, + rew_clip=cfg.env.rew_clip, + ) +) diff --git a/ding/bonus/cfg/DDPG/gym_halfcheetah_v3.py b/ding/bonus/cfg/DDPG/gym_halfcheetah_v3.py new file mode 100644 index 0000000000..66845e5997 --- /dev/null +++ b/ding/bonus/cfg/DDPG/gym_halfcheetah_v3.py @@ -0,0 +1,59 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='HalfCheetah-v3-DDPG', + seed=0, + env=dict( + env_id='HalfCheetah-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=1, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=11000, + env_wrapper='mujoco_default', + act_scale=True, + rew_clip=True, + ), + policy=dict( + cuda=True, + random_collect_size=25000, + model=dict( + obs_shape=17, + action_shape=6, + twin_critic=False, + actor_head_hidden_size=256, + critic_head_hidden_size=256, + action_space='regression', + ), + learn=dict( + update_per_collect=1, + batch_size=256, + learning_rate_actor=1e-3, + learning_rate_critic=1e-3, + ignore_done=True, + target_theta=0.005, + discount_factor=0.99, + actor_update_freq=1, + noise=False, + ), + collect=dict( + n_sample=1, + unroll_len=1, + noise_sigma=0.1, + ), + other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, 
plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial( + ding.envs.gym_env.env, + cfg=dict(env_wrapper=cfg.env.env_wrapper, act_scale=cfg.env.act_scale, rew_clip=cfg.env.rew_clip) +) diff --git a/ding/bonus/cfg/DDPG/gym_hopper_v3.py b/ding/bonus/cfg/DDPG/gym_hopper_v3.py new file mode 100644 index 0000000000..865571c285 --- /dev/null +++ b/ding/bonus/cfg/DDPG/gym_hopper_v3.py @@ -0,0 +1,59 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='Hopper-v3-DDPG', + seed=0, + env=dict( + env_id='Hopper-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=1, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=6000, + env_wrapper='mujoco_default', + act_scale=True, + rew_clip=True, + ), + policy=dict( + cuda=True, + random_collect_size=25000, + model=dict( + obs_shape=11, + action_shape=3, + twin_critic=False, + actor_head_hidden_size=256, + critic_head_hidden_size=256, + action_space='regression', + ), + learn=dict( + update_per_collect=1, + batch_size=256, + learning_rate_actor=1e-3, + learning_rate_critic=1e-3, + ignore_done=False, + target_theta=0.005, + discount_factor=0.99, + actor_update_freq=1, + noise=False, + ), + collect=dict( + n_sample=1, + unroll_len=1, + noise_sigma=0.1, + ), + other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial( + ding.envs.gym_env.env, + cfg=dict(env_wrapper=cfg.env.env_wrapper, act_scale=cfg.env.act_scale, rew_clip=cfg.env.rew_clip) +) diff --git a/ding/bonus/cfg/DDPG/gym_lunarlandercontinuous_v2.py b/ding/bonus/cfg/DDPG/gym_lunarlandercontinuous_v2.py new file mode 100644 index 0000000000..dd902cd191 --- /dev/null +++ b/ding/bonus/cfg/DDPG/gym_lunarlandercontinuous_v2.py @@ -0,0 +1,60 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='LunarLanderContinuous-V2-DDPG', + seed=0, + env=dict( + env_id='LunarLanderContinuous-v2', + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + act_scale=True, + stop_value=240, + ), + policy=dict( + cuda=True, + random_collect_size=0, + model=dict( + obs_shape=8, + action_shape=2, + twin_critic=True, + action_space='regression', + ), + learn=dict( + update_per_collect=2, + batch_size=128, + learning_rate_actor=0.001, + learning_rate_critic=0.001, + ignore_done=False, # TODO(pu) + # (int) When critic network updates once, how many times will actor network update. + # Delayed Policy Updates in original TD3 paper(https://arxiv.org/pdf/1802.09477.pdf). + # Default 1 for DDPG, 2 for TD3. + actor_update_freq=1, + # (bool) Whether to add noise on target network's action. + # Target Policy Smoothing Regularization in original TD3 paper(https://arxiv.org/pdf/1802.09477.pdf). + # Default True for TD3, False for DDPG. 
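# Sketch of what the noise / noise_sigma / noise_range fields control when enabled:
# target policy smoothing from the TD3 paper referenced in the comments above. This is
# a generic torch illustration, not DI-engine's internal implementation.
import torch

def smoothed_target_action(target_actor, next_obs, sigma=0.2, noise_min=-0.5, noise_max=0.5):
    # clipped Gaussian noise added to the target actor's action before computing the critic target
    action = target_actor(next_obs)
    noise = torch.clamp(torch.randn_like(action) * sigma, noise_min, noise_max)
    return torch.clamp(action + noise, -1.0, 1.0)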
+ noise=False, + noise_sigma=0.1, + noise_range=dict( + min=-0.5, + max=0.5, + ), + ), + collect=dict( + n_sample=48, + noise_sigma=0.1, + collector=dict(collect_print_freq=1000, ), + ), + eval=dict(evaluator=dict(eval_freq=100, ), ), + other=dict(replay_buffer=dict(replay_buffer_size=20000, ), ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial(ding.envs.gym_env.env, continuous=True) diff --git a/ding/bonus/cfg/DDPG/gym_pendulum_v1.py b/ding/bonus/cfg/DDPG/gym_pendulum_v1.py new file mode 100644 index 0000000000..34e76c7750 --- /dev/null +++ b/ding/bonus/cfg/DDPG/gym_pendulum_v1.py @@ -0,0 +1,53 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='Pendulum-v1-DDPG', + seed=0, + env=dict( + collector_env_num=8, + evaluator_env_num=5, + # (bool) Scale output action into legal range. + act_scale=True, + n_evaluator_episode=5, + stop_value=-250, + ), + policy=dict( + cuda=False, + priority=False, + random_collect_size=800, + model=dict( + obs_shape=3, + action_shape=1, + twin_critic=False, + action_space='regression', + ), + learn=dict( + update_per_collect=2, + batch_size=128, + learning_rate_actor=0.001, + learning_rate_critic=0.001, + ignore_done=True, + actor_update_freq=1, + noise=False, + ), + collect=dict( + n_sample=48, + noise_sigma=0.1, + collector=dict(collect_print_freq=1000, ), + ), + eval=dict(evaluator=dict(eval_freq=100, )), + other=dict(replay_buffer=dict( + replay_buffer_size=20000, + max_use=16, + ), ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial(ding.envs.gym_env.env, cfg=dict(act_scale=cfg.env.act_scale)) diff --git a/ding/bonus/cfg/DDPG/gym_walker2d_v3.py b/ding/bonus/cfg/DDPG/gym_walker2d_v3.py new file mode 100644 index 0000000000..1611ca7d62 --- /dev/null +++ b/ding/bonus/cfg/DDPG/gym_walker2d_v3.py @@ -0,0 +1,59 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='Walker2d-v3-DDPG', + seed=0, + env=dict( + env_id='Walker2d-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=1, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=6000, + env_wrapper='mujoco_default', + act_scale=True, + rew_clip=True, + ), + policy=dict( + cuda=True, + random_collect_size=25000, + model=dict( + obs_shape=17, + action_shape=6, + twin_critic=False, + actor_head_hidden_size=256, + critic_head_hidden_size=256, + action_space='regression', + ), + learn=dict( + update_per_collect=1, + batch_size=256, + learning_rate_actor=1e-3, + learning_rate_critic=1e-3, + ignore_done=False, + target_theta=0.005, + discount_factor=0.99, + actor_update_freq=1, + noise=False, + ), + collect=dict( + n_sample=1, + unroll_len=1, + noise_sigma=0.1, + ), + other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial( + ding.envs.gym_env.env, + cfg=dict(env_wrapper=cfg.env.env_wrapper, act_scale=cfg.env.act_scale, rew_clip=cfg.env.rew_clip) +) diff --git a/ding/bonus/cfg/DQN/__init__.py b/ding/bonus/cfg/DQN/__init__.py new file mode 100644 index 
0000000000..2704b04c53 --- /dev/null +++ b/ding/bonus/cfg/DQN/__init__.py @@ -0,0 +1,23 @@ +from easydict import EasyDict +from . import gym_lunarlander_v2 +from . import gym_pongnoframeskip_v4 +from . import gym_qbertnoframeskip_v4 +from . import gym_spaceInvadersnoframeskip_v4 + +supported_env_cfg = { + gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.cfg, + gym_pongnoframeskip_v4.cfg.env.env_id: gym_pongnoframeskip_v4.cfg, + gym_qbertnoframeskip_v4.cfg.env.env_id: gym_qbertnoframeskip_v4.cfg, + gym_spaceInvadersnoframeskip_v4.cfg.env.env_id: gym_spaceInvadersnoframeskip_v4.cfg, +} + +supported_env_cfg = EasyDict(supported_env_cfg) + +supported_env = { + gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.env, + gym_pongnoframeskip_v4.cfg.env.env_id: gym_pongnoframeskip_v4.env, + gym_qbertnoframeskip_v4.cfg.env.env_id: gym_qbertnoframeskip_v4.env, + gym_spaceInvadersnoframeskip_v4.cfg.env.env_id: gym_spaceInvadersnoframeskip_v4.env, +} + +supported_env = EasyDict(supported_env) diff --git a/ding/bonus/cfg/DQN/gym_lunarlander_v2.py b/ding/bonus/cfg/DQN/gym_lunarlander_v2.py new file mode 100644 index 0000000000..510c7b2e18 --- /dev/null +++ b/ding/bonus/cfg/DQN/gym_lunarlander_v2.py @@ -0,0 +1,53 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='LunarLander-v2-DQN', + seed=0, + env=dict( + env_id='LunarLander-v2', + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=240, + ), + policy=dict( + cuda=True, + random_collect_size=25000, + discount_factor=0.99, + nstep=3, + learn=dict( + update_per_collect=10, + batch_size=64, + learning_rate=0.001, + # Frequency of target network update. + target_update_freq=100, + ), + model=dict( + obs_shape=8, + action_shape=4, + encoder_hidden_size_list=[512, 64], + # Whether to use dueling head. + dueling=True, + ), + collect=dict( + n_sample=64, + unroll_len=1, + ), + other=dict( + eps=dict( + type='exp', + start=0.95, + end=0.1, + decay=50000, + ), replay_buffer=dict(replay_buffer_size=100000, ) + ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +env = ding.envs.gym_env.env diff --git a/ding/bonus/cfg/DQN/gym_pongnoframeskip_v4.py b/ding/bonus/cfg/DQN/gym_pongnoframeskip_v4.py new file mode 100644 index 0000000000..5777aae121 --- /dev/null +++ b/ding/bonus/cfg/DQN/gym_pongnoframeskip_v4.py @@ -0,0 +1,51 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='PongNoFrameskip-v4-DQN', + seed=0, + env=dict( + env_id='PongNoFrameskip-v4', + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=20, + fram_stack=4, + env_wrapper='atari_default', + ), + policy=dict( + cuda=True, + priority=False, + discount_factor=0.99, + nstep=3, + learn=dict( + update_per_collect=10, + batch_size=32, + learning_rate=0.0001, + # Frequency of target network update. 
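# Sketch of the hard target update that target_update_freq configures: every N learner
# iterations the target network copies the online network's weights. Generic torch code,
# not DI-engine's target-network wrapper itself.
import torch

def maybe_sync_target(online: torch.nn.Module, target: torch.nn.Module, train_iter: int, freq: int = 500) -> None:
    if train_iter % freq == 0:
        target.load_state_dict(online.state_dict())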
+ target_update_freq=500, + ), + model=dict( + obs_shape=[4, 84, 84], + action_shape=6, + encoder_hidden_size_list=[128, 128, 512], + ), + collect=dict(n_sample=96, ), + other=dict( + eps=dict( + type='exp', + start=1., + end=0.05, + decay=250000, + ), replay_buffer=dict(replay_buffer_size=100000, ) + ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial(ding.envs.gym_env.env, cfg=dict(env_id=cfg.env.env_id, env_wrapper=cfg.env.env_wrapper)) diff --git a/ding/bonus/cfg/DQN/gym_qbertnoframeskip_v4.py b/ding/bonus/cfg/DQN/gym_qbertnoframeskip_v4.py new file mode 100644 index 0000000000..94c0a48c57 --- /dev/null +++ b/ding/bonus/cfg/DQN/gym_qbertnoframeskip_v4.py @@ -0,0 +1,51 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='QbertNoFrameskip-v4-DQN', + seed=0, + env=dict( + env_id='QbertNoFrameskip-v4', + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + fram_stack=4, + stop_value=30000, + env_wrapper='atari_default', + ), + policy=dict( + cuda=True, + priority=False, + discount_factor=0.99, + nstep=3, + learn=dict( + update_per_collect=10, + batch_size=32, + learning_rate=0.0001, + # Frequency of target network update. + target_update_freq=500, + ), + model=dict( + obs_shape=[4, 84, 84], + action_shape=6, + encoder_hidden_size_list=[128, 128, 512], + ), + collect=dict(n_sample=100, ), + other=dict( + eps=dict( + type='exp', + start=1., + end=0.05, + decay=1000000, + ), replay_buffer=dict(replay_buffer_size=400000, ) + ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial(ding.envs.gym_env.env, cfg=dict(env_id=cfg.env.env_id, env_wrapper=cfg.env.env_wrapper)) diff --git a/ding/bonus/cfg/DQN/gym_spaceInvadersnoframeskip_v4.py b/ding/bonus/cfg/DQN/gym_spaceInvadersnoframeskip_v4.py new file mode 100644 index 0000000000..17a4b5cad5 --- /dev/null +++ b/ding/bonus/cfg/DQN/gym_spaceInvadersnoframeskip_v4.py @@ -0,0 +1,52 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='SpaceInvadersNoFrameskip-v4-DQN', + seed=0, + env=dict( + env_id='SpaceInvadersNoFrameskip-v4', + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + fram_stack=4, + stop_value=2000, + env_wrapper='atari_default', + ), + policy=dict( + cuda=True, + priority=False, + discount_factor=0.99, + nstep=3, + learn=dict( + update_per_collect=10, + batch_size=32, + learning_rate=0.0001, + # Frequency of target network update. 
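# Sketch of the exponential epsilon schedule these DQN/C51 configs request through
# other.eps (type='exp', start, end, decay). The exact formula is assumed from the
# field names; the library's scheduler may differ in detail.
import math

def exp_epsilon(env_step: int, start: float = 1.0, end: float = 0.05, decay: int = 1000000) -> float:
    return end + (start - end) * math.exp(-env_step / decay)

# e.g. exp_epsilon(0) == 1.0, decaying toward 0.05 as env_step grows past `decay`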
+ target_update_freq=500, + hook=dict(save_ckpt_after_iter=1000000, ) + ), + model=dict( + obs_shape=[4, 84, 84], + action_shape=6, + encoder_hidden_size_list=[128, 128, 512], + ), + collect=dict(n_sample=100, ), + other=dict( + eps=dict( + type='exp', + start=1., + end=0.05, + decay=1000000, + ), replay_buffer=dict(replay_buffer_size=400000, ) + ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial(ding.envs.gym_env.env, cfg=dict(env_id=cfg.env.env_id, env_wrapper=cfg.env.env_wrapper)) diff --git a/ding/bonus/cfg/PG/__init__ .py b/ding/bonus/cfg/PG/__init__ .py new file mode 100644 index 0000000000..63c6804ef6 --- /dev/null +++ b/ding/bonus/cfg/PG/__init__ .py @@ -0,0 +1,27 @@ +from easydict import EasyDict +from . import gym_bipedalwalker_v3 +from . import gym_halfcheetah_v3 +from . import gym_hopper_v3 +from . import gym_lunarlander_v2 +from . import gym_pendulum_v1 +from . import gym_walker2d_v3 + +supported_env_cfg = { + gym_bipedalwalker_v3.cfg.env.env_id: gym_bipedalwalker_v3.cfg, + gym_halfcheetah_v3.cfg.env.env_id: gym_halfcheetah_v3.cfg, + gym_hopper_v3.cfg.env.env_id: gym_hopper_v3.cfg, + gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.cfg, + gym_walker2d_v3.cfg.env.env_id: gym_walker2d_v3.cfg, +} + +supported_env_cfg = EasyDict(supported_env_cfg) + +supported_env = { + gym_bipedalwalker_v3.cfg.env.env_id: gym_bipedalwalker_v3.env, + gym_halfcheetah_v3.cfg.env.env_id: gym_halfcheetah_v3.env, + gym_hopper_v3.cfg.env.env_id: gym_hopper_v3.env, + gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.env, + gym_walker2d_v3.cfg.env.env_id: gym_walker2d_v3.env, +} + +supported_env = EasyDict(supported_env) diff --git a/ding/bonus/cfg/PG/gym_bipedalwalker_v3.py b/ding/bonus/cfg/PG/gym_bipedalwalker_v3.py new file mode 100644 index 0000000000..e3c194f4b9 --- /dev/null +++ b/ding/bonus/cfg/PG/gym_bipedalwalker_v3.py @@ -0,0 +1,49 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='Bipedalwalker-v3-PG', + seed=0, + env=dict( + env_id='BipedalWalker-v3', + collector_env_num=8, + evaluator_env_num=8, + act_scale=True, + n_evaluator_episode=8, + stop_value=300, + rew_clip=True, + ), + policy=dict( + cuda=True, + action_space='continuous', + model=dict( + action_space='continuous', + obs_shape=24, + action_shape=4, + ), + learn=dict( + batch_size=64, + learning_rate=0.001, + entropy_weight=0.001, + ), + collect=dict( + n_episode=20, + unroll_len=1, + discount_factor=0.99, + ), + eval=dict(evaluator=dict(eval_freq=1, )) + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial( + ding.envs.gym_env.env, cfg=dict( + act_scale=cfg.env.act_scale, + rew_clip=cfg.env.rew_clip, + ) +) diff --git a/ding/bonus/cfg/PG/gym_halfcheetah_v3.py b/ding/bonus/cfg/PG/gym_halfcheetah_v3.py new file mode 100644 index 0000000000..2a869e84a3 --- /dev/null +++ b/ding/bonus/cfg/PG/gym_halfcheetah_v3.py @@ -0,0 +1,50 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='HalfCheetah-v3-PG', + seed=0, + env=dict( + env_id='HalfCheetah-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=6000, + env_wrapper='mujoco_default', + act_scale=True, + 
rew_clip=True, + ), + policy=dict( + cuda=True, + action_space='continuous', + model=dict( + action_space='continuous', + obs_shape=17, + action_shape=6, + ), + learn=dict( + batch_size=64, + learning_rate=0.001, + entropy_weight=0.001, + ), + collect=dict( + n_episode=20, + unroll_len=1, + discount_factor=0.99, + ), + eval=dict(evaluator=dict(eval_freq=1, )) + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial( + ding.envs.gym_env.env, + cfg=dict(env_wrapper=cfg.env.env_wrapper, act_scale=cfg.env.act_scale, rew_clip=cfg.env.rew_clip) +) diff --git a/ding/bonus/cfg/PG/gym_hopper_v3.py b/ding/bonus/cfg/PG/gym_hopper_v3.py new file mode 100644 index 0000000000..2c81292094 --- /dev/null +++ b/ding/bonus/cfg/PG/gym_hopper_v3.py @@ -0,0 +1,50 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='Hopper-v3-PG', + seed=0, + env=dict( + env_id='Hopper-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=6000, + env_wrapper='mujoco_default', + act_scale=True, + rew_clip=True, + ), + policy=dict( + cuda=True, + action_space='continuous', + model=dict( + action_space='continuous', + obs_shape=11, + action_shape=3, + ), + learn=dict( + batch_size=64, + learning_rate=0.005, + entropy_weight=0.01, + ), + collect=dict( + n_episode=34, + unroll_len=1, + discount_factor=0.99, + ), + eval=dict(evaluator=dict(eval_freq=1, )) + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial( + ding.envs.gym_env.env, + cfg=dict(env_wrapper=cfg.env.env_wrapper, act_scale=cfg.env.act_scale, rew_clip=cfg.env.rew_clip) +) diff --git a/ding/bonus/cfg/PG/gym_lunarlander_v2.py b/ding/bonus/cfg/PG/gym_lunarlander_v2.py new file mode 100644 index 0000000000..ea1e855430 --- /dev/null +++ b/ding/bonus/cfg/PG/gym_lunarlander_v2.py @@ -0,0 +1,39 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='LunarLander-v2-PG', + env=dict( + collector_env_num=8, + evaluator_env_num=8, + env_id='LunarLander-v2', + n_evaluator_episode=8, + stop_value=240, + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=8, + action_shape=4, + ), + learn=dict( + batch_size=320, + learning_rate=3e-4, + entropy_weight=0.001, + grad_norm=0.5, + ), + collect=dict( + n_episode=8, + discount_factor=0.99, + ), + eval=dict(evaluator=dict(eval_freq=1000, ), ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial(ding.envs.gym_env.env, continuous=True) diff --git a/ding/bonus/cfg/PG/gym_pendulum_v1.py b/ding/bonus/cfg/PG/gym_pendulum_v1.py new file mode 100644 index 0000000000..08baf62322 --- /dev/null +++ b/ding/bonus/cfg/PG/gym_pendulum_v1.py @@ -0,0 +1,42 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='Pendulum-v1-PG', + seed=0, + env=dict( + collector_env_num=8, + evaluator_env_num=5, + act_scale=True, + n_evaluator_episode=5, + stop_value=-200, + ), + policy=dict( + cuda=False, + action_space='continuous', + model=dict( + action_space='continuous', + obs_shape=3, + action_shape=1, + ), + learn=dict( 
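# Sketch of the discounted episode return that these PG (REINFORCE-style) configs rely
# on: whole episodes are collected (collect.n_episode) and each step is weighted by the
# return-to-go with collect.discount_factor. Generic illustration of the return
# computation only, not DI-engine's PG policy code.
from typing import List

def discounted_returns(rewards: List[float], gamma: float = 0.99) -> List[float]:
    returns, g = [], 0.0
    for r in reversed(rewards):
        g = r + gamma * g
        returns.append(g)
    return list(reversed(returns))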
+ batch_size=4000, + learning_rate=0.001, + entropy_weight=0.001, + ), + collect=dict( + n_episode=20, + unroll_len=1, + discount_factor=0.99, + ), + eval=dict(evaluator=dict(eval_freq=1, )) + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial(ding.envs.gym_env.env, cfg=dict(act_scale=cfg.env.act_scale)) diff --git a/ding/bonus/cfg/PG/gym_walker2d_v3.py b/ding/bonus/cfg/PG/gym_walker2d_v3.py new file mode 100644 index 0000000000..d71b0fdc5f --- /dev/null +++ b/ding/bonus/cfg/PG/gym_walker2d_v3.py @@ -0,0 +1,50 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='Walker2d-v3-PG', + seed=0, + env=dict( + env_id='Walker2d-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=6000, + env_wrapper='mujoco_default', + act_scale=True, + rew_clip=True, + ), + policy=dict( + cuda=True, + action_space='continuous', + model=dict( + action_space='continuous', + obs_shape=17, + action_shape=6, + ), + learn=dict( + batch_size=64, + learning_rate=0.001, + entropy_weight=0.001, + ), + collect=dict( + n_episode=20, + unroll_len=1, + discount_factor=0.99, + ), + eval=dict(evaluator=dict(eval_freq=1, )) + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial( + ding.envs.gym_env.env, + cfg=dict(env_wrapper=cfg.env.env_wrapper, act_scale=cfg.env.act_scale, rew_clip=cfg.env.rew_clip) +) diff --git a/ding/bonus/cfg/PPOF/__init__.py b/ding/bonus/cfg/PPOF/__init__.py new file mode 100644 index 0000000000..2fba110833 --- /dev/null +++ b/ding/bonus/cfg/PPOF/__init__.py @@ -0,0 +1,17 @@ +from easydict import EasyDict +from . import gym_lunarlander_v2 +from . 
import gym_lunarlandercontinuous_v2 + +supported_env_cfg = { + gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.cfg, + gym_lunarlandercontinuous_v2.cfg.env.env_id: gym_lunarlandercontinuous_v2.cfg, +} + +supported_env_cfg = EasyDict(supported_env_cfg) + +supported_env = { + gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.env, + gym_lunarlandercontinuous_v2.cfg.env.env_id: gym_lunarlandercontinuous_v2.env, +} + +supported_env = EasyDict(supported_env) diff --git a/ding/bonus/cfg/PPOF/gym_lunarlander_v2.py b/ding/bonus/cfg/PPOF/gym_lunarlander_v2.py new file mode 100644 index 0000000000..4f21511e03 --- /dev/null +++ b/ding/bonus/cfg/PPOF/gym_lunarlander_v2.py @@ -0,0 +1,13 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='LunarLander-v2-PPO', + n_sample=400, + value_norm='popart', +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial(ding.envs.gym_env.env, continuous=True) diff --git a/ding/bonus/cfg/PPOF/gym_lunarlandercontinuous_v2.py b/ding/bonus/cfg/PPOF/gym_lunarlandercontinuous_v2.py new file mode 100644 index 0000000000..546e37bb25 --- /dev/null +++ b/ding/bonus/cfg/PPOF/gym_lunarlandercontinuous_v2.py @@ -0,0 +1,14 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='LunarLanderContinuous-V2-PPO', + action_space='continuous', + n_sample=400, + act_scale=True, +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial(ding.envs.gym_env.env, cfg=dict(act_scale=cfg.act_scale)) diff --git a/ding/bonus/cfg/PPOOffPolicy/__init__.py b/ding/bonus/cfg/PPOOffPolicy/__init__.py new file mode 100644 index 0000000000..2704b04c53 --- /dev/null +++ b/ding/bonus/cfg/PPOOffPolicy/__init__.py @@ -0,0 +1,23 @@ +from easydict import EasyDict +from . import gym_lunarlander_v2 +from . import gym_pongnoframeskip_v4 +from . import gym_qbertnoframeskip_v4 +from . 
import gym_spaceInvadersnoframeskip_v4 + +supported_env_cfg = { + gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.cfg, + gym_pongnoframeskip_v4.cfg.env.env_id: gym_pongnoframeskip_v4.cfg, + gym_qbertnoframeskip_v4.cfg.env.env_id: gym_qbertnoframeskip_v4.cfg, + gym_spaceInvadersnoframeskip_v4.cfg.env.env_id: gym_spaceInvadersnoframeskip_v4.cfg, +} + +supported_env_cfg = EasyDict(supported_env_cfg) + +supported_env = { + gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.env, + gym_pongnoframeskip_v4.cfg.env.env_id: gym_pongnoframeskip_v4.env, + gym_qbertnoframeskip_v4.cfg.env.env_id: gym_qbertnoframeskip_v4.env, + gym_spaceInvadersnoframeskip_v4.cfg.env.env_id: gym_spaceInvadersnoframeskip_v4.env, +} + +supported_env = EasyDict(supported_env) diff --git a/ding/bonus/cfg/PPOOffPolicy/gym_lunarlander_v2.py b/ding/bonus/cfg/PPOOffPolicy/gym_lunarlander_v2.py new file mode 100644 index 0000000000..18ba940f3e --- /dev/null +++ b/ding/bonus/cfg/PPOOffPolicy/gym_lunarlander_v2.py @@ -0,0 +1,44 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='LunarLander-v2-PPOOffPolicy', + env=dict( + collector_env_num=8, + evaluator_env_num=8, + env_id='LunarLander-v2', + n_evaluator_episode=8, + stop_value=240, + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=8, + action_shape=4, + ), + learn=dict( + update_per_collect=4, + batch_size=64, + learning_rate=0.001, + value_weight=0.5, + entropy_weight=0.01, + clip_ratio=0.2, + nstep=1, + nstep_return=False, + adv_norm=True, + ), + collect=dict( + n_sample=128, + unroll_len=1, + discount_factor=0.99, + gae_lambda=0.95, + ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +env = ding.envs.gym_env.env diff --git a/ding/bonus/cfg/PPOOffPolicy/gym_pongnoframeskip_v4.py b/ding/bonus/cfg/PPOOffPolicy/gym_pongnoframeskip_v4.py new file mode 100644 index 0000000000..5b2dc8be6a --- /dev/null +++ b/ding/bonus/cfg/PPOOffPolicy/gym_pongnoframeskip_v4.py @@ -0,0 +1,54 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='PongNoFrameskip-v4-PPOOffPolicy', + env=dict( + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=20, + env_id='PongNoFrameskip-v4', + frame_stack=4, + ), + policy=dict( + cuda=True, + recompute_adv=True, + action_space='discrete', + model=dict( + obs_shape=[4, 84, 84], + action_shape=6, + action_space='discrete', + encoder_hidden_size_list=[64, 64, 128], + actor_head_hidden_size=128, + critic_head_hidden_size=128, + ), + learn=dict( + update_per_collect=10, + batch_size=320, + learning_rate=3e-4, + value_weight=0.5, + entropy_weight=0.001, + clip_ratio=0.2, + adv_norm=True, + # value_norm=True, + ignore_done=False, + grad_clip_type='clip_norm', + grad_clip_value=0.5, + ), + collect=dict( + n_sample=3200, + unroll_len=1, + discount_factor=0.99, + gae_lambda=0.95, + ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial(ding.envs.gym_env.env, cfg=dict(env_id=cfg.env.env_id, env_wrapper=cfg.env.env_wrapper)) diff --git a/ding/bonus/cfg/PPOOffPolicy/gym_qbertnoframeskip_v4.py b/ding/bonus/cfg/PPOOffPolicy/gym_qbertnoframeskip_v4.py new file mode 100644 index 0000000000..4e06e99550 --- /dev/null +++ b/ding/bonus/cfg/PPOOffPolicy/gym_qbertnoframeskip_v4.py @@ -0,0 +1,48 @@ +from easydict 
import EasyDict + +cfg = dict( + exp_name='QbertNoFrameskip-v4-PPOOffPolicy', + env=dict( + collector_env_num=16, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=10000000000, + env_id='QbertNoFrameskip-v4', + frame_stack=4 + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=[4, 84, 84], + action_shape=6, + encoder_hidden_size_list=[32, 64, 64, 128], + actor_head_hidden_size=128, + critic_head_hidden_size=128, + critic_head_layer_num=2, + ), + learn=dict( + update_per_collect=18, + batch_size=128, + learning_rate=0.0001, + value_weight=1.0, + entropy_weight=0.005, + clip_ratio=0.1, + adv_norm=False, + ), + collect=dict( + n_sample=1024, + unroll_len=1, + discount_factor=0.99, + gae_lambda=0.95, + ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial(ding.envs.gym_env.env, cfg=dict(env_id=cfg.env.env_id, env_wrapper=cfg.env.env_wrapper)) diff --git a/ding/bonus/cfg/PPOOffPolicy/gym_spaceInvadersnoframeskip_v4.py b/ding/bonus/cfg/PPOOffPolicy/gym_spaceInvadersnoframeskip_v4.py new file mode 100644 index 0000000000..7aa6909b35 --- /dev/null +++ b/ding/bonus/cfg/PPOOffPolicy/gym_spaceInvadersnoframeskip_v4.py @@ -0,0 +1,49 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='SpaceInvadersNoFrameskip-v4-PPOOffPolicy', + env=dict( + collector_env_num=16, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=10000000000, + env_id='SpaceInvadersNoFrameskip-v4', + frame_stack=4, + manager=dict(shared_memory=False, ) + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=[4, 84, 84], + action_shape=6, + encoder_hidden_size_list=[32, 64, 64, 128], + actor_head_hidden_size=128, + critic_head_hidden_size=128, + critic_head_layer_num=2, + ), + learn=dict( + update_per_collect=24, + batch_size=128, + learning_rate=0.0001, + value_weight=1.0, + entropy_weight=0.03, + clip_ratio=0.1, + adv_norm=False, + ), + collect=dict( + n_sample=1024, + unroll_len=1, + discount_factor=0.99, + gae_lambda=0.95, + ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial(ding.envs.gym_env.env, cfg=dict(env_id=cfg.env.env_id, env_wrapper=cfg.env.env_wrapper)) diff --git a/ding/bonus/cfg/SAC/__init__.py b/ding/bonus/cfg/SAC/__init__.py new file mode 100644 index 0000000000..6e01f29d74 --- /dev/null +++ b/ding/bonus/cfg/SAC/__init__.py @@ -0,0 +1,29 @@ +from easydict import EasyDict +from . import gym_bipedalwalker_v3 +from . import gym_halfcheetah_v3 +from . import gym_hopper_v3 +from . import gym_lunarlandercontinuous_v2 +from . import gym_pendulum_v1 +from . 
import gym_walker2d_v3 + +supported_env_cfg = { + gym_bipedalwalker_v3.cfg.env.env_id: gym_bipedalwalker_v3.cfg, + gym_halfcheetah_v3.cfg.env.env_id: gym_halfcheetah_v3.cfg, + gym_hopper_v3.cfg.env.env_id: gym_hopper_v3.cfg, + gym_lunarlandercontinuous_v2.cfg.env.env_id: gym_lunarlandercontinuous_v2.cfg, + gym_pendulum_v1.cfg.env.env_id: gym_pendulum_v1.cfg, + gym_walker2d_v3.cfg.env.env_id: gym_walker2d_v3.cfg, +} + +supported_env_cfg = EasyDict(supported_env_cfg) + +supported_env = { + gym_bipedalwalker_v3.cfg.env.env_id: gym_bipedalwalker_v3.env, + gym_halfcheetah_v3.cfg.env.env_id: gym_halfcheetah_v3.env, + gym_hopper_v3.cfg.env.env_id: gym_hopper_v3.env, + gym_lunarlandercontinuous_v2.cfg.env.env_id: gym_lunarlandercontinuous_v2.env, + gym_pendulum_v1.cfg.env.env_id: gym_pendulum_v1.env, + gym_walker2d_v3.cfg.env.env_id: gym_walker2d_v3.env, +} + +supported_env = EasyDict(supported_env) diff --git a/ding/bonus/cfg/SAC/gym_bipedalwalker_v3.py b/ding/bonus/cfg/SAC/gym_bipedalwalker_v3.py new file mode 100644 index 0000000000..89443e451b --- /dev/null +++ b/ding/bonus/cfg/SAC/gym_bipedalwalker_v3.py @@ -0,0 +1,53 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='BipedalWalker-v3-SAC', + seed=0, + env=dict( + env_id='BipedalWalker-v3', + collector_env_num=8, + evaluator_env_num=5, + act_scale=True, + n_evaluator_episode=5, + rew_clip=True, + ), + policy=dict( + cuda=True, + random_collect_size=10000, + model=dict( + obs_shape=24, + action_shape=4, + twin_critic=True, + action_space='reparameterization', + actor_head_hidden_size=128, + critic_head_hidden_size=128, + ), + learn=dict( + update_per_collect=64, + batch_size=256, + learning_rate_q=0.0003, + learning_rate_policy=0.0003, + learning_rate_alpha=0.0003, + target_theta=0.005, + discount_factor=0.99, + auto_alpha=True, + learner=dict(hook=dict(log_show_after_iter=1000, )) + ), + collect=dict(n_sample=64, ), + other=dict(replay_buffer=dict(replay_buffer_size=300000, ), ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial( + ding.envs.gym_env.env, cfg=dict( + act_scale=cfg.env.act_scale, + rew_clip=cfg.env.rew_clip, + ) +) diff --git a/ding/bonus/cfg/SAC/gym_halfcheetah_v3.py b/ding/bonus/cfg/SAC/gym_halfcheetah_v3.py new file mode 100644 index 0000000000..9d1b445834 --- /dev/null +++ b/ding/bonus/cfg/SAC/gym_halfcheetah_v3.py @@ -0,0 +1,62 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='HalfCheetah-v3-SAC', + seed=0, + env=dict( + env_id='HalfCheetah-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=1, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=12000, + env_wrapper='mujoco_default', + act_scale=True, + rew_clip=True, + ), + policy=dict( + cuda=False, + random_collect_size=10000, + model=dict( + obs_shape=17, + action_shape=6, + twin_critic=True, + action_space='reparameterization', + actor_head_hidden_size=256, + critic_head_hidden_size=256, + ), + learn=dict( + update_per_collect=1, + batch_size=256, + learning_rate_q=1e-3, + learning_rate_policy=1e-3, + learning_rate_alpha=3e-4, + ignore_done=True, + target_theta=0.005, + discount_factor=0.99, + alpha=0.2, + reparameterization=True, + auto_alpha=False, + ), + collect=dict( + n_sample=1, + unroll_len=1, + ), + command=dict(), + eval=dict(), + other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), + ), + 
wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial( + ding.envs.gym_env.env, + cfg=dict(env_wrapper=cfg.env.env_wrapper, act_scale=cfg.env.act_scale, rew_clip=cfg.env.rew_clip) +) diff --git a/ding/bonus/cfg/SAC/gym_hopper_v3.py b/ding/bonus/cfg/SAC/gym_hopper_v3.py new file mode 100644 index 0000000000..db7b90e8ab --- /dev/null +++ b/ding/bonus/cfg/SAC/gym_hopper_v3.py @@ -0,0 +1,47 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='Hopper-v3-SAC', + seed=0, + env=dict( + env_id='Hopper-v3', + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=6000, + env_wrapper='mujoco_default', + act_scale=True, + rew_clip=True, + ), + policy=dict( + cuda=True, + random_collect_size=10000, + model=dict( + obs_shape=11, + action_shape=3, + action_space='reparameterization', + actor_head_hidden_size=256, + critic_head_hidden_size=256, + ), + learn=dict( + update_per_collect=1, + batch_size=256, + learning_rate_q=1e-3, + learning_rate_policy=1e-3, + reparameterization=True, + auto_alpha=False, + ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial( + ding.envs.gym_env.env, + cfg=dict(env_wrapper=cfg.env.env_wrapper, act_scale=cfg.env.act_scale, rew_clip=cfg.env.rew_clip) +) diff --git a/ding/bonus/cfg/SAC/gym_lunarlandercontinuous_v2.py b/ding/bonus/cfg/SAC/gym_lunarlandercontinuous_v2.py new file mode 100644 index 0000000000..3aa9f30d79 --- /dev/null +++ b/ding/bonus/cfg/SAC/gym_lunarlandercontinuous_v2.py @@ -0,0 +1,44 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='LunarLander-v2-SAC', + seed=0, + env=dict( + env_id='LunarLanderContinuous-v2', + collector_env_num=4, + evaluator_env_num=8, + act_scale=True, + n_evaluator_episode=8, + stop_value=240, + ), + policy=dict( + cuda=True, + random_collect_size=10000, + model=dict( + obs_shape=8, + action_shape=2, + action_space='reparameterization', + twin_critic=True, + ), + learn=dict( + update_per_collect=256, + batch_size=128, + learning_rate_q=1e-3, + learning_rate_policy=3e-4, + learning_rate_alpha=3e-4, + auto_alpha=True, + ), + collect=dict(n_sample=256, ), + eval=dict(evaluator=dict(eval_freq=1000, ), ), + other=dict(replay_buffer=dict(replay_buffer_size=int(1e5), ), ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial(ding.envs.gym_env.env, continuous=True) diff --git a/ding/bonus/cfg/SAC/gym_pendulum_v1.py b/ding/bonus/cfg/SAC/gym_pendulum_v1.py new file mode 100644 index 0000000000..9c2f713314 --- /dev/null +++ b/ding/bonus/cfg/SAC/gym_pendulum_v1.py @@ -0,0 +1,49 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='Pendulum-v1-SAC', + seed=0, + env=dict( + collector_env_num=10, + evaluator_env_num=8, + act_scale=True, + n_evaluator_episode=8, + stop_value=-250, + ), + policy=dict( + cuda=False, + priority=False, + random_collect_size=1000, + model=dict( + obs_shape=3, + action_shape=1, + twin_critic=True, + action_space='reparameterization', + actor_head_hidden_size=128, + critic_head_hidden_size=128, + ), + learn=dict( + update_per_collect=1, + 
batch_size=128, + learning_rate_q=0.001, + learning_rate_policy=0.001, + learning_rate_alpha=0.0003, + ignore_done=True, + target_theta=0.005, + discount_factor=0.99, + auto_alpha=True, + ), + collect=dict(n_sample=10, ), + eval=dict(evaluator=dict(eval_freq=100, )), + other=dict(replay_buffer=dict(replay_buffer_size=100000, ), ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial(ding.envs.gym_env.env, cfg=dict(act_scale=cfg.env.act_scale)) diff --git a/ding/bonus/cfg/SAC/gym_walker2d_v3.py b/ding/bonus/cfg/SAC/gym_walker2d_v3.py new file mode 100644 index 0000000000..32a2bafac8 --- /dev/null +++ b/ding/bonus/cfg/SAC/gym_walker2d_v3.py @@ -0,0 +1,62 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='Walker2d-v3-SAC', + seed=0, + env=dict( + env_id='Walker2d-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=1, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=6000, + env_wrapper='mujoco_default', + act_scale=True, + rew_clip=True, + ), + policy=dict( + cuda=True, + random_collect_size=10000, + model=dict( + obs_shape=17, + action_shape=6, + twin_critic=True, + action_space='reparameterization', + actor_head_hidden_size=256, + critic_head_hidden_size=256, + ), + learn=dict( + update_per_collect=1, + batch_size=256, + learning_rate_q=1e-3, + learning_rate_policy=1e-3, + learning_rate_alpha=3e-4, + ignore_done=False, + target_theta=0.005, + discount_factor=0.99, + alpha=0.2, + reparameterization=True, + auto_alpha=False, + ), + collect=dict( + n_sample=1, + unroll_len=1, + ), + command=dict(), + eval=dict(), + other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial( + ding.envs.gym_env.env, + cfg=dict(env_wrapper=cfg.env.env_wrapper, act_scale=cfg.env.act_scale, rew_clip=cfg.env.rew_clip) +) diff --git a/ding/bonus/cfg/TD3/__init__.py b/ding/bonus/cfg/TD3/__init__.py new file mode 100644 index 0000000000..6e01f29d74 --- /dev/null +++ b/ding/bonus/cfg/TD3/__init__.py @@ -0,0 +1,29 @@ +from easydict import EasyDict +from . import gym_bipedalwalker_v3 +from . import gym_halfcheetah_v3 +from . import gym_hopper_v3 +from . import gym_lunarlandercontinuous_v2 +from . import gym_pendulum_v1 +from . 
import gym_walker2d_v3 + +supported_env_cfg = { + gym_bipedalwalker_v3.cfg.env.env_id: gym_bipedalwalker_v3.cfg, + gym_halfcheetah_v3.cfg.env.env_id: gym_halfcheetah_v3.cfg, + gym_hopper_v3.cfg.env.env_id: gym_hopper_v3.cfg, + gym_lunarlandercontinuous_v2.cfg.env.env_id: gym_lunarlandercontinuous_v2.cfg, + gym_pendulum_v1.cfg.env.env_id: gym_pendulum_v1.cfg, + gym_walker2d_v3.cfg.env.env_id: gym_walker2d_v3.cfg, +} + +supported_env_cfg = EasyDict(supported_env_cfg) + +supported_env = { + gym_bipedalwalker_v3.cfg.env.env_id: gym_bipedalwalker_v3.env, + gym_halfcheetah_v3.cfg.env.env_id: gym_halfcheetah_v3.env, + gym_hopper_v3.cfg.env.env_id: gym_hopper_v3.env, + gym_lunarlandercontinuous_v2.cfg.env.env_id: gym_lunarlandercontinuous_v2.env, + gym_pendulum_v1.cfg.env.env_id: gym_pendulum_v1.env, + gym_walker2d_v3.cfg.env.env_id: gym_walker2d_v3.env, +} + +supported_env = EasyDict(supported_env) diff --git a/ding/bonus/cfg/TD3/gym_bipedalwalker_v3.py b/ding/bonus/cfg/TD3/gym_bipedalwalker_v3.py new file mode 100644 index 0000000000..7634f43b5e --- /dev/null +++ b/ding/bonus/cfg/TD3/gym_bipedalwalker_v3.py @@ -0,0 +1,58 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='Bipedalwalker-v3-TD3', + seed=0, + env=dict( + env_id='BipedalWalker-v3', + collector_env_num=8, + evaluator_env_num=5, + act_scale=True, + n_evaluator_episode=5, + rew_clip=True, + ), + policy=dict( + cuda=True, + random_collect_size=10000, + model=dict( + obs_shape=24, + action_shape=4, + twin_critic=True, + action_space='regression', + actor_head_hidden_size=400, + critic_head_hidden_size=400, + ), + learn=dict( + update_per_collect=64, + batch_size=256, + learning_rate_actor=0.0003, + learning_rate_critic=0.0003, + target_theta=0.005, + discount_factor=0.99, + actor_update_freq=2, + noise=True, + noise_sigma=0.2, + noise_range=dict( + min=-0.5, + max=0.5, + ), + learner=dict(hook=dict(log_show_after_iter=1000, )) + ), + collect=dict(n_sample=64, ), + other=dict(replay_buffer=dict(replay_buffer_size=300000, ), ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial( + ding.envs.gym_env.env, cfg=dict( + act_scale=cfg.env.act_scale, + rew_clip=cfg.env.rew_clip, + ) +) diff --git a/ding/bonus/cfg/TD3/gym_halfcheetah_v3.py b/ding/bonus/cfg/TD3/gym_halfcheetah_v3.py new file mode 100644 index 0000000000..a04c023b8e --- /dev/null +++ b/ding/bonus/cfg/TD3/gym_halfcheetah_v3.py @@ -0,0 +1,64 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='HalfCheetah-v3-TD3', + seed=0, + env=dict( + env_id='HalfCheetah-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=1, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=11000, + env_wrapper='mujoco_default', + act_scale=True, + rew_clip=True, + ), + policy=dict( + cuda=True, + random_collect_size=25000, + model=dict( + obs_shape=17, + action_shape=6, + twin_critic=True, + actor_head_hidden_size=256, + critic_head_hidden_size=256, + action_space='regression', + ), + learn=dict( + update_per_collect=1, + batch_size=256, + learning_rate_actor=1e-3, + learning_rate_critic=1e-3, + ignore_done=True, + target_theta=0.005, + discount_factor=0.99, + actor_update_freq=2, + noise=True, + noise_sigma=0.2, + noise_range=dict( + min=-0.5, + max=0.5, + ), + ), + collect=dict( + n_sample=1, + unroll_len=1, + noise_sigma=0.1, + ), + 
other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial( + ding.envs.gym_env.env, + cfg=dict(env_wrapper=cfg.env.env_wrapper, act_scale=cfg.env.act_scale, rew_clip=cfg.env.rew_clip) +) diff --git a/ding/bonus/cfg/TD3/gym_hopper_v3.py b/ding/bonus/cfg/TD3/gym_hopper_v3.py new file mode 100644 index 0000000000..62791007aa --- /dev/null +++ b/ding/bonus/cfg/TD3/gym_hopper_v3.py @@ -0,0 +1,41 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='Hopper-v3-TD3', + seed=0, + env=dict( + env_id='Hopper-v3', + collector_env_num=8, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=6000, + env_wrapper='mujoco_default', + act_scale=True, + rew_clip=True, + ), + policy=dict( + cuda=True, + random_collect_size=25000, + model=dict( + obs_shape=11, + action_shape=3, + actor_head_hidden_size=256, + critic_head_hidden_size=256, + action_space='regression', + ), + collect=dict(n_sample=1, ), + other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial( + ding.envs.gym_env.env, + cfg=dict(env_wrapper=cfg.env.env_wrapper, act_scale=cfg.env.act_scale, rew_clip=cfg.env.rew_clip) +) diff --git a/ding/bonus/cfg/TD3/gym_lunarlandercontinuous_v2.py b/ding/bonus/cfg/TD3/gym_lunarlandercontinuous_v2.py new file mode 100644 index 0000000000..b91315a68a --- /dev/null +++ b/ding/bonus/cfg/TD3/gym_lunarlandercontinuous_v2.py @@ -0,0 +1,50 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='LunarLanderContinuous-V2-TD3', + seed=0, + env=dict( + env_id='LunarLanderContinuous-v2', + collector_env_num=4, + evaluator_env_num=8, + n_evaluator_episode=8, + act_scale=True, + stop_value=240, + ), + policy=dict( + cuda=True, + random_collect_size=10000, + model=dict( + obs_shape=8, + action_shape=2, + action_space='regression', + ), + learn=dict( + update_per_collect=256, + batch_size=256, + learning_rate_actor=3e-4, + learning_rate_critic=1e-3, + noise=True, + noise_sigma=0.1, + noise_range=dict( + min=-0.5, + max=0.5, + ), + ), + collect=dict( + n_sample=256, + noise_sigma=0.1, + ), + eval=dict(evaluator=dict(eval_freq=1000, ), ), + other=dict(replay_buffer=dict(replay_buffer_size=100000, ), ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial(ding.envs.gym_env.env, cfg=dict(act_scale=cfg.env.act_scale)) diff --git a/ding/bonus/cfg/TD3/gym_pendulum_v1.py b/ding/bonus/cfg/TD3/gym_pendulum_v1.py new file mode 100644 index 0000000000..64eced070d --- /dev/null +++ b/ding/bonus/cfg/TD3/gym_pendulum_v1.py @@ -0,0 +1,54 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='Pendulum-v1-TD3', + seed=0, + env=dict( + collector_env_num=8, + evaluator_env_num=5, + act_scale=True, + n_evaluator_episode=5, + stop_value=-250, + ), + policy=dict( + cuda=False, + priority=False, + random_collect_size=800, + model=dict( + obs_shape=3, + action_shape=1, + twin_critic=True, + action_space='regression', + ), + learn=dict( + update_per_collect=2, + batch_size=128, + 
learning_rate_actor=0.001, + learning_rate_critic=0.001, + ignore_done=True, + actor_update_freq=2, + noise=True, + noise_sigma=0.1, + noise_range=dict( + min=-0.5, + max=0.5, + ), + ), + collect=dict( + n_sample=48, + noise_sigma=0.1, + collector=dict(collect_print_freq=1000, ), + ), + eval=dict(evaluator=dict(eval_freq=100, ), ), + other=dict(replay_buffer=dict(replay_buffer_size=20000, ), ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial(ding.envs.gym_env.env, cfg=dict(act_scale=cfg.env.act_scale)) diff --git a/ding/bonus/cfg/TD3/gym_walker2d_v3.py b/ding/bonus/cfg/TD3/gym_walker2d_v3.py new file mode 100644 index 0000000000..155d9332d0 --- /dev/null +++ b/ding/bonus/cfg/TD3/gym_walker2d_v3.py @@ -0,0 +1,64 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='Walker2d-v3-TD3', + seed=0, + env=dict( + env_id='Walker2d-v3', + norm_obs=dict(use_norm=False, ), + norm_reward=dict(use_norm=False, ), + collector_env_num=1, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=6000, + env_wrapper='mujoco_default', + act_scale=True, + rew_clip=True, + ), + policy=dict( + cuda=True, + random_collect_size=25000, + model=dict( + obs_shape=17, + action_shape=6, + twin_critic=True, + actor_head_hidden_size=256, + critic_head_hidden_size=256, + action_space='regression', + ), + learn=dict( + update_per_collect=1, + batch_size=256, + learning_rate_actor=1e-3, + learning_rate_critic=1e-3, + ignore_done=False, + target_theta=0.005, + discount_factor=0.99, + actor_update_freq=2, + noise=True, + noise_sigma=0.2, + noise_range=dict( + min=-0.5, + max=0.5, + ), + ), + collect=dict( + n_sample=1, + unroll_len=1, + noise_sigma=0.1, + ), + other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial( + ding.envs.gym_env.env, + cfg=dict(env_wrapper=cfg.env.env_wrapper, act_scale=cfg.env.act_scale, rew_clip=cfg.env.rew_clip) +) diff --git a/ding/bonus/cfg/__init__.py b/ding/bonus/cfg/__init__.py new file mode 100644 index 0000000000..71bd7c5a71 --- /dev/null +++ b/ding/bonus/cfg/__init__.py @@ -0,0 +1,9 @@ +from . import A2C +from . import C51 +from . import DDPG +from . import DQN +from . import PG +from . import PPOF +from . import PPOOffPolicy +from . import SAC +from . 
import TD3 diff --git a/ding/bonus/config.py b/ding/bonus/config.py index 95440ecc90..d571b307e7 100644 --- a/ding/bonus/config.py +++ b/ding/bonus/config.py @@ -4,12 +4,11 @@ from ding.envs import BaseEnv, DingEnvWrapper from ding.envs.env_wrappers import MaxAndSkipWrapper, WarpFrameWrapper, ScaledFloatFrameWrapper, FrameStackWrapper, \ EvalEpisodeReturnEnv, TransposeWrapper, TimeLimitWrapper, FlatObsWrapper, GymToGymnasiumWrapper -from ding.policy import PPOFPolicy, A2CPolicy, TD3Policy, DDPGPolicy, SACPolicy, DQNPolicy, IMPALAPolicy, \ - PGPolicy, C51Policy, PPOOffPolicy +from ding.policy import PPOFPolicy def get_instance_config(env: str, algorithm: str) -> EasyDict: - if algorithm == 'PPO': + if algorithm == 'PPOF': cfg = PPOFPolicy.default_config() if env == 'lunarlander_discrete': cfg.n_sample = 400 @@ -154,2015 +153,6 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict: cfg.learning_rate = 3e-4 else: raise KeyError("not supported env type: {}".format(env)) - elif algorithm == 'PPOOffPolicy': - cfg = EasyDict({"policy": PPOOffPolicy.default_config()}) - if env == 'lunarlander_discrete': - cfg.update( - dict( - exp_name='LunarLander-v2-PPOOffPolicy', - env=dict( - collector_env_num=8, - evaluator_env_num=8, - env_id='LunarLander-v2', - n_evaluator_episode=8, - stop_value=240, - ), - policy=dict( - cuda=True, - model=dict( - obs_shape=8, - action_shape=4, - ), - learn=dict( - update_per_collect=4, - batch_size=64, - learning_rate=0.001, - value_weight=0.5, - entropy_weight=0.01, - clip_ratio=0.2, - nstep=1, - nstep_return=False, - adv_norm=True, - ), - collect=dict( - n_sample=128, - unroll_len=1, - discount_factor=0.99, - gae_lambda=0.95, - ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif env == 'PongNoFrameskip': - cfg.update( - dict( - exp_name='PongNoFrameskip-v4-PPOOffPolicy', - env=dict( - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=20, - env_id='PongNoFrameskip-v4', - frame_stack=4, - ), - policy=dict( - cuda=True, - recompute_adv=True, - action_space='discrete', - model=dict( - obs_shape=[4, 84, 84], - action_shape=6, - action_space='discrete', - encoder_hidden_size_list=[64, 64, 128], - actor_head_hidden_size=128, - critic_head_hidden_size=128, - ), - learn=dict( - update_per_collect=10, - batch_size=320, - learning_rate=3e-4, - value_weight=0.5, - entropy_weight=0.001, - clip_ratio=0.2, - adv_norm=True, - # value_norm=True, - ignore_done=False, - grad_clip_type='clip_norm', - grad_clip_value=0.5, - ), - collect=dict( - n_sample=3200, - unroll_len=1, - discount_factor=0.99, - gae_lambda=0.95, - ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif env == 'SpaceInvadersNoFrameskip': - cfg.update( - dict( - exp_name='SpaceInvadersNoFrameskip-v4-PPOOffPolicy', - env=dict( - collector_env_num=16, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=10000000000, - env_id='SpaceInvadersNoFrameskip-v4', - frame_stack=4, - manager=dict(shared_memory=False, ) - ), - policy=dict( - cuda=True, - model=dict( - obs_shape=[4, 84, 84], - action_shape=6, - encoder_hidden_size_list=[32, 64, 64, 128], - actor_head_hidden_size=128, - critic_head_hidden_size=128, - critic_head_layer_num=2, - ), - learn=dict( - update_per_collect=24, - batch_size=128, - learning_rate=0.0001, - value_weight=1.0, - entropy_weight=0.03, - 
clip_ratio=0.1, - adv_norm=False, - ), - collect=dict( - n_sample=1024, - unroll_len=1, - discount_factor=0.99, - gae_lambda=0.95, - ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif env == 'QbertNoFrameskip': - cfg.update( - dict( - exp_name='QbertNoFrameskip-v4-PPOOffPolicy', - env=dict( - collector_env_num=16, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=10000000000, - env_id='QbertNoFrameskip-v4', - frame_stack=4 - ), - policy=dict( - cuda=True, - model=dict( - obs_shape=[4, 84, 84], - action_shape=6, - encoder_hidden_size_list=[32, 64, 64, 128], - actor_head_hidden_size=128, - critic_head_hidden_size=128, - critic_head_layer_num=2, - ), - learn=dict( - update_per_collect=18, - batch_size=128, - learning_rate=0.0001, - value_weight=1.0, - entropy_weight=0.005, - clip_ratio=0.1, - adv_norm=False, - ), - collect=dict( - n_sample=1024, - unroll_len=1, - discount_factor=0.99, - gae_lambda=0.95, - ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif algorithm == 'A2C': - cfg = EasyDict({"policy": A2CPolicy.default_config()}) - if env == 'lunarlander_discrete': - cfg.update( - dict( - exp_name='LunarLander-v2-A2C', - env=dict( - collector_env_num=8, - evaluator_env_num=8, - env_id='LunarLander-v2', - n_evaluator_episode=8, - stop_value=240, - ), - policy=dict( - cuda=True, - model=dict( - obs_shape=8, - action_shape=4, - ), - learn=dict( - batch_size=64, - learning_rate=3e-4, - entropy_weight=0.001, - adv_norm=True, - ), - collect=dict( - n_sample=64, - discount_factor=0.99, - gae_lambda=0.95, - ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif env == 'bipedalwalker': - cfg.update( - dict( - exp_name='Bipedalwalker-v3-A2C', - seed=0, - env=dict( - env_id='BipedalWalker-v3', - collector_env_num=8, - evaluator_env_num=8, - act_scale=True, - n_evaluator_episode=8, - rew_clip=True, - ), - policy=dict( - cuda=True, - action_space='continuous', - model=dict( - action_space='continuous', - obs_shape=24, - action_shape=4, - ), - learn=dict( - batch_size=64, - learning_rate=0.0003, - value_weight=0.7, - entropy_weight=0.0005, - discount_factor=0.99, - adv_norm=True, - ), - collect=dict( - n_sample=64, - discount_factor=0.99, - ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif env == 'pendulum': - pass - elif env == 'hopper': - cfg.update( - dict( - exp_name='Hopper-v3-A2C', - env=dict( - env_id='Hopper-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=1, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=12000, - ), - policy=dict( - cuda=True, - action_space='continuous', - model=dict( - obs_shape=11, - action_shape=3, - action_space='continuous', - ), - learn=dict( - batch_size=128, - learning_rate=0.0003, - value_weight=0.5, - entropy_weight=0.02, - adv_norm=True, - ), - collect=dict( - n_sample=128, - discount_factor=0.99, - gae_lambda=0.95, - ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif env == 'HalfCheetah': - cfg.update( - dict( - exp_name='HalfCheetah-v3-A2C', - env=dict( - 
env_id='HalfCheetah-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=1, - evaluator_env_num=8, - n_evaluator_episode=8, - act_scale=True, - stop_value=12000, - ), - policy=dict( - cuda=True, - action_space='continuous', - model=dict( - action_space='continuous', - obs_shape=17, - action_shape=6, - ), - learn=dict( - batch_size=256, - learning_rate=0.0003, - value_weight=0.5, - entropy_weight=0.01, - grad_norm=0.5, - ignore_done=True, - adv_norm=True, - ), - collect=dict( - n_sample=256, - discount_factor=0.99, - gae_lambda=0.95, - ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif env == 'Walker2d': - cfg.update( - dict( - exp_name='Walker2d-v3-A2C', - env=dict( - env_id='Walker2d-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=1, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=12000, - ), - policy=dict( - cuda=True, - action_space='continuous', - model=dict( - action_space='continuous', - obs_shape=17, - action_shape=6, - ), - learn=dict( - batch_size=32, - learning_rate=0.0003, - value_weight=0.5, - entropy_weight=0.005, - adv_norm=True, - ), - collect=dict( - n_sample=32, - discount_factor=0.99, - gae_lambda=0.95, - ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - else: - raise KeyError("not supported env type: {}".format(env)) - elif algorithm == 'PG': - cfg = EasyDict({"policy": PGPolicy.default_config()}) - if env == 'hopper': - cfg.update( - dict( - exp_name='Hopper-v3-PG', - seed=0, - env=dict( - env_id='Hopper-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=6000, - ), - policy=dict( - cuda=True, - action_space='continuous', - model=dict( - action_space='continuous', - obs_shape=11, - action_shape=3, - ), - learn=dict( - batch_size=64, - learning_rate=0.005, - entropy_weight=0.01, - ), - collect=dict( - n_episode=34, - unroll_len=1, - discount_factor=0.99, - ), - eval=dict(evaluator=dict(eval_freq=1, )) - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif env == 'HalfCheetah': - cfg.update( - dict( - exp_name='HalfCheetah-v3-PG', - seed=0, - env=dict( - env_id='HalfCheetah-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=6000, - ), - policy=dict( - cuda=True, - action_space='continuous', - model=dict( - action_space='continuous', - obs_shape=17, - action_shape=6, - ), - learn=dict( - batch_size=64, - learning_rate=0.001, - entropy_weight=0.001, - ), - collect=dict( - n_episode=20, - unroll_len=1, - discount_factor=0.99, - ), - eval=dict(evaluator=dict(eval_freq=1, )) - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif env == 'Walker2d': - cfg.update( - dict( - exp_name='Walker2d-v3-PG', - seed=0, - env=dict( - env_id='Walker2d-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=6000, - ), - policy=dict( - cuda=True, - 
action_space='continuous', - model=dict( - action_space='continuous', - obs_shape=17, - action_shape=6, - ), - learn=dict( - batch_size=64, - learning_rate=0.001, - entropy_weight=0.001, - ), - collect=dict( - n_episode=20, - unroll_len=1, - discount_factor=0.99, - ), - eval=dict(evaluator=dict(eval_freq=1, )) - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif env == 'lunarlander_discrete': - cfg.update( - dict( - exp_name='LunarLander-v2-PG', - env=dict( - collector_env_num=8, - evaluator_env_num=8, - env_id='LunarLander-v2', - n_evaluator_episode=8, - stop_value=240, - ), - policy=dict( - cuda=True, - model=dict( - obs_shape=8, - action_shape=4, - ), - learn=dict( - batch_size=320, - learning_rate=3e-4, - entropy_weight=0.001, - grad_norm=0.5, - ), - collect=dict( - n_episode=8, - discount_factor=0.99, - ), - eval=dict(evaluator=dict(eval_freq=1000, ), ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif env == 'bipedalwalker': - cfg.update( - dict( - exp_name='Bipedalwalker-v3-PG', - seed=0, - env=dict( - env_id='BipedalWalker-v3', - collector_env_num=8, - evaluator_env_num=8, - act_scale=True, - n_evaluator_episode=8, - stop_value=300, - rew_clip=True, - ), - policy=dict( - cuda=True, - action_space='continuous', - model=dict( - action_space='continuous', - obs_shape=24, - action_shape=4, - ), - learn=dict( - batch_size=64, - learning_rate=0.001, - entropy_weight=0.001, - ), - collect=dict( - n_episode=20, - unroll_len=1, - discount_factor=0.99, - ), - eval=dict(evaluator=dict(eval_freq=1, )) - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif env == 'pendulum': - cfg.update( - dict( - exp_name='Pendulum-v1-PG', - seed=0, - env=dict( - collector_env_num=8, - evaluator_env_num=5, - act_scale=True, - n_evaluator_episode=5, - stop_value=-200, - ), - policy=dict( - cuda=False, - action_space='continuous', - model=dict( - action_space='continuous', - obs_shape=3, - action_shape=1, - ), - learn=dict( - batch_size=4000, - learning_rate=0.001, - entropy_weight=0.001, - ), - collect=dict( - n_episode=20, - unroll_len=1, - discount_factor=0.99, - ), - eval=dict(evaluator=dict(eval_freq=1, )) - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - else: - raise KeyError("not supported env type: {}".format(env)) - elif algorithm == 'TD3': - cfg = EasyDict({"policy": TD3Policy.default_config()}) - if env == 'hopper': - cfg.update( - dict( - exp_name='Hopper-v3-TD3', - seed=0, - env=dict( - env_id='Hopper-v3', - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=6000, - ), - policy=dict( - cuda=True, - random_collect_size=25000, - model=dict( - obs_shape=11, - action_shape=3, - actor_head_hidden_size=256, - critic_head_hidden_size=256, - action_space='regression', - ), - collect=dict(n_sample=1, ), - other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif env == 'HalfCheetah': - cfg.update( - dict( - exp_name='HalfCheetah-v3-TD3', - seed=0, - env=dict( - env_id='HalfCheetah-v3', - norm_obs=dict(use_norm=False, 
), - norm_reward=dict(use_norm=False, ), - collector_env_num=1, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=11000, - ), - policy=dict( - cuda=True, - random_collect_size=25000, - model=dict( - obs_shape=17, - action_shape=6, - twin_critic=True, - actor_head_hidden_size=256, - critic_head_hidden_size=256, - action_space='regression', - ), - learn=dict( - update_per_collect=1, - batch_size=256, - learning_rate_actor=1e-3, - learning_rate_critic=1e-3, - ignore_done=True, - target_theta=0.005, - discount_factor=0.99, - actor_update_freq=2, - noise=True, - noise_sigma=0.2, - noise_range=dict( - min=-0.5, - max=0.5, - ), - ), - collect=dict( - n_sample=1, - unroll_len=1, - noise_sigma=0.1, - ), - other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif env == 'Walker2d': - cfg.update( - dict( - exp_name='Walker2d-v3-TD3', - seed=0, - env=dict( - env_id='Walker2d-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=1, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=6000, - ), - policy=dict( - cuda=True, - random_collect_size=25000, - model=dict( - obs_shape=17, - action_shape=6, - twin_critic=True, - actor_head_hidden_size=256, - critic_head_hidden_size=256, - action_space='regression', - ), - learn=dict( - update_per_collect=1, - batch_size=256, - learning_rate_actor=1e-3, - learning_rate_critic=1e-3, - ignore_done=False, - target_theta=0.005, - discount_factor=0.99, - actor_update_freq=2, - noise=True, - noise_sigma=0.2, - noise_range=dict( - min=-0.5, - max=0.5, - ), - ), - collect=dict( - n_sample=1, - unroll_len=1, - noise_sigma=0.1, - ), - other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif env == 'lunarlander_continuous': - cfg.update( - dict( - exp_name='LunarLanderContinuous-V2-TD3', - seed=0, - env=dict( - env_id='LunarLanderContinuous-v2', - collector_env_num=4, - evaluator_env_num=8, - n_evaluator_episode=8, - act_scale=True, - stop_value=240, - ), - policy=dict( - cuda=True, - random_collect_size=10000, - model=dict( - obs_shape=8, - action_shape=2, - action_space='regression', - ), - learn=dict( - update_per_collect=256, - batch_size=256, - learning_rate_actor=3e-4, - learning_rate_critic=1e-3, - noise=True, - noise_sigma=0.1, - noise_range=dict( - min=-0.5, - max=0.5, - ), - ), - collect=dict( - n_sample=256, - noise_sigma=0.1, - ), - eval=dict(evaluator=dict(eval_freq=1000, ), ), - other=dict(replay_buffer=dict(replay_buffer_size=100000, ), ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif env == 'bipedalwalker': - cfg.update( - dict( - exp_name='Bipedalwalker-v3-TD3', - seed=0, - env=dict( - env_id='BipedalWalker-v3', - collector_env_num=8, - evaluator_env_num=5, - # (bool) Scale output action into legal range. 
- act_scale=True, - n_evaluator_episode=5, - rew_clip=True, - ), - policy=dict( - cuda=True, - random_collect_size=10000, - model=dict( - obs_shape=24, - action_shape=4, - twin_critic=True, - action_space='regression', - actor_head_hidden_size=400, - critic_head_hidden_size=400, - ), - learn=dict( - update_per_collect=64, - batch_size=256, - learning_rate_actor=0.0003, - learning_rate_critic=0.0003, - target_theta=0.005, - discount_factor=0.99, - actor_update_freq=2, - noise=True, - noise_sigma=0.2, - noise_range=dict( - min=-0.5, - max=0.5, - ), - learner=dict(hook=dict(log_show_after_iter=1000, )) - ), - collect=dict(n_sample=64, ), - other=dict(replay_buffer=dict(replay_buffer_size=300000, ), ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif env == 'pendulum': - cfg.update( - dict( - exp_name='Pendulum-v1-TD3', - seed=0, - env=dict( - collector_env_num=8, - evaluator_env_num=5, - # (bool) Scale output action into legal range. - act_scale=True, - n_evaluator_episode=5, - stop_value=-250, - ), - policy=dict( - cuda=False, - priority=False, - random_collect_size=800, - model=dict( - obs_shape=3, - action_shape=1, - twin_critic=True, - action_space='regression', - ), - learn=dict( - update_per_collect=2, - batch_size=128, - learning_rate_actor=0.001, - learning_rate_critic=0.001, - ignore_done=True, - actor_update_freq=2, - noise=True, - noise_sigma=0.1, - noise_range=dict( - min=-0.5, - max=0.5, - ), - ), - collect=dict( - n_sample=48, - noise_sigma=0.1, - collector=dict(collect_print_freq=1000, ), - ), - eval=dict(evaluator=dict(eval_freq=100, ), ), - other=dict(replay_buffer=dict(replay_buffer_size=20000, ), ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - else: - raise KeyError("not supported env type: {}".format(env)) - elif algorithm == 'DDPG': - cfg = EasyDict({"policy": DDPGPolicy.default_config()}) - if env == 'hopper': - cfg.update( - dict( - exp_name='Hopper-v3-DDPG', - seed=0, - env=dict( - env_id='Hopper-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=1, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=6000, - ), - policy=dict( - cuda=True, - random_collect_size=25000, - model=dict( - obs_shape=11, - action_shape=3, - twin_critic=False, - actor_head_hidden_size=256, - critic_head_hidden_size=256, - action_space='regression', - ), - learn=dict( - update_per_collect=1, - batch_size=256, - learning_rate_actor=1e-3, - learning_rate_critic=1e-3, - ignore_done=False, - target_theta=0.005, - discount_factor=0.99, - actor_update_freq=1, - noise=False, - ), - collect=dict( - n_sample=1, - unroll_len=1, - noise_sigma=0.1, - ), - other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), - ) - ) - ) - elif env == 'HalfCheetah': - cfg.update( - dict( - exp_name='HalfCheetah-v3-DDPG', - seed=0, - env=dict( - env_id='HalfCheetah-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=1, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=11000, - ), - policy=dict( - cuda=True, - random_collect_size=25000, - model=dict( - obs_shape=17, - action_shape=6, - twin_critic=False, - actor_head_hidden_size=256, - critic_head_hidden_size=256, - action_space='regression', - ), - learn=dict( - update_per_collect=1, - batch_size=256, - learning_rate_actor=1e-3, - 
learning_rate_critic=1e-3, - ignore_done=True, - target_theta=0.005, - discount_factor=0.99, - actor_update_freq=1, - noise=False, - ), - collect=dict( - n_sample=1, - unroll_len=1, - noise_sigma=0.1, - ), - other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif env == 'Walker2d': - cfg.update( - dict( - exp_name='Walker2d-v3-DDPG', - seed=0, - env=dict( - env_id='Walker2d-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=1, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=6000, - ), - policy=dict( - cuda=True, - random_collect_size=25000, - model=dict( - obs_shape=17, - action_shape=6, - twin_critic=False, - actor_head_hidden_size=256, - critic_head_hidden_size=256, - action_space='regression', - ), - learn=dict( - update_per_collect=1, - batch_size=256, - learning_rate_actor=1e-3, - learning_rate_critic=1e-3, - ignore_done=False, - target_theta=0.005, - discount_factor=0.99, - actor_update_freq=1, - noise=False, - ), - collect=dict( - n_sample=1, - unroll_len=1, - noise_sigma=0.1, - ), - other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif env == 'lunarlander_continuous': - cfg.update( - dict( - exp_name='LunarLanderContinuous-V2-DDPG', - seed=0, - env=dict( - env_id='LunarLanderContinuous-v2', - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - act_scale=True, - stop_value=240, - ), - policy=dict( - cuda=True, - random_collect_size=0, - model=dict( - obs_shape=8, - action_shape=2, - twin_critic=True, - action_space='regression', - ), - learn=dict( - update_per_collect=2, - batch_size=128, - learning_rate_actor=0.001, - learning_rate_critic=0.001, - ignore_done=False, # TODO(pu) - # (int) When critic network updates once, how many times will actor network update. - # Delayed Policy Updates in original TD3 paper(https://arxiv.org/pdf/1802.09477.pdf). - # Default 1 for DDPG, 2 for TD3. - actor_update_freq=1, - # (bool) Whether to add noise on target network's action. - # Target Policy Smoothing Regularization in original TD3 paper(https://arxiv.org/pdf/1802.09477.pdf). - # Default True for TD3, False for DDPG. - noise=False, - noise_sigma=0.1, - noise_range=dict( - min=-0.5, - max=0.5, - ), - ), - collect=dict( - n_sample=48, - noise_sigma=0.1, - collector=dict(collect_print_freq=1000, ), - ), - eval=dict(evaluator=dict(eval_freq=100, ), ), - other=dict(replay_buffer=dict(replay_buffer_size=20000, ), ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif env == 'bipedalwalker': - cfg.update( - dict( - exp_name='Bipedalwalker-v3-DDPG', - seed=0, - env=dict( - env_id='BipedalWalker-v3', - collector_env_num=8, - evaluator_env_num=5, - # (bool) Scale output action into legal range. 
- act_scale=True, - n_evaluator_episode=5, - rew_clip=True, - ), - policy=dict( - cuda=True, - random_collect_size=10000, - model=dict( - obs_shape=24, - action_shape=4, - twin_critic=False, - action_space='regression', - actor_head_hidden_size=400, - critic_head_hidden_size=400, - ), - learn=dict( - update_per_collect=64, - batch_size=256, - learning_rate_actor=0.0003, - learning_rate_critic=0.0003, - target_theta=0.005, - discount_factor=0.99, - learner=dict(hook=dict(log_show_after_iter=1000, )) - ), - collect=dict(n_sample=64, ), - other=dict(replay_buffer=dict(replay_buffer_size=300000, ), ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif env == 'pendulum': - cfg.update( - dict( - exp_name='Pendulum-v1-DDPG', - seed=0, - env=dict( - collector_env_num=8, - evaluator_env_num=5, - # (bool) Scale output action into legal range. - act_scale=True, - n_evaluator_episode=5, - stop_value=-250, - ), - policy=dict( - cuda=False, - priority=False, - random_collect_size=800, - model=dict( - obs_shape=3, - action_shape=1, - twin_critic=False, - action_space='regression', - ), - learn=dict( - update_per_collect=2, - batch_size=128, - learning_rate_actor=0.001, - learning_rate_critic=0.001, - ignore_done=True, - actor_update_freq=1, - noise=False, - ), - collect=dict( - n_sample=48, - noise_sigma=0.1, - collector=dict(collect_print_freq=1000, ), - ), - eval=dict(evaluator=dict(eval_freq=100, )), - other=dict(replay_buffer=dict( - replay_buffer_size=20000, - max_use=16, - ), ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - else: - raise KeyError("not supported env type: {}".format(env)) - elif algorithm == 'SAC': - cfg = EasyDict({"policy": SACPolicy.default_config()}) - if env == 'hopper': - cfg.update( - dict( - exp_name='Hopper-v3-SAC', - seed=0, - env=dict( - env_id='Hopper-v3', - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=6000, - ), - policy=dict( - cuda=True, - random_collect_size=10000, - model=dict( - obs_shape=11, - action_shape=3, - action_space='reparameterization', - actor_head_hidden_size=256, - critic_head_hidden_size=256, - ), - learn=dict( - update_per_collect=1, - batch_size=256, - learning_rate_q=1e-3, - learning_rate_policy=1e-3, - reparameterization=True, - auto_alpha=False, - ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif env == 'HalfCheetah': - cfg.update( - dict( - exp_name='HalfCheetah-v3-SAC', - seed=0, - env=dict( - env_id='HalfCheetah-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=1, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=12000, - ), - policy=dict( - cuda=False, - random_collect_size=10000, - model=dict( - obs_shape=17, - action_shape=6, - twin_critic=True, - action_space='reparameterization', - actor_head_hidden_size=256, - critic_head_hidden_size=256, - ), - learn=dict( - update_per_collect=1, - batch_size=256, - learning_rate_q=1e-3, - learning_rate_policy=1e-3, - learning_rate_alpha=3e-4, - ignore_done=True, - target_theta=0.005, - discount_factor=0.99, - alpha=0.2, - reparameterization=True, - auto_alpha=False, - ), - collect=dict( - n_sample=1, - unroll_len=1, - ), - command=dict(), - eval=dict(), - 
other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif env == 'Walker2d': - cfg.update( - dict( - exp_name='Walker2d-v3-SAC', - seed=0, - env=dict( - env_id='Walker2d-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=1, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=6000, - ), - policy=dict( - cuda=True, - random_collect_size=10000, - model=dict( - obs_shape=17, - action_shape=6, - twin_critic=True, - action_space='reparameterization', - actor_head_hidden_size=256, - critic_head_hidden_size=256, - ), - learn=dict( - update_per_collect=1, - batch_size=256, - learning_rate_q=1e-3, - learning_rate_policy=1e-3, - learning_rate_alpha=3e-4, - ignore_done=False, - target_theta=0.005, - discount_factor=0.99, - alpha=0.2, - reparameterization=True, - auto_alpha=False, - ), - collect=dict( - n_sample=1, - unroll_len=1, - ), - command=dict(), - eval=dict(), - other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif env == 'lunarlander_continuous': - cfg.update( - dict( - exp_name='LunarLander-v2-SAC', - seed=0, - env=dict( - env_id='LunarLanderContinuous-v2', - collector_env_num=4, - evaluator_env_num=8, - act_scale=True, - n_evaluator_episode=8, - stop_value=240, - ), - policy=dict( - cuda=True, - random_collect_size=10000, - model=dict( - obs_shape=8, - action_shape=2, - action_space='reparameterization', - twin_critic=True, - ), - learn=dict( - update_per_collect=256, - batch_size=128, - learning_rate_q=1e-3, - learning_rate_policy=3e-4, - learning_rate_alpha=3e-4, - auto_alpha=True, - ), - collect=dict(n_sample=256, ), - eval=dict(evaluator=dict(eval_freq=1000, ), ), - other=dict(replay_buffer=dict(replay_buffer_size=int(1e5), ), ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif env == 'bipedalwalker': - cfg.update( - dict( - exp_name='BipedalWalker-v3-SAC', - seed=0, - env=dict( - env_id='BipedalWalker-v3', - collector_env_num=8, - evaluator_env_num=5, - # (bool) Scale output action into legal range. - act_scale=True, - n_evaluator_episode=5, - rew_clip=True, - ), - policy=dict( - cuda=True, - random_collect_size=10000, - model=dict( - obs_shape=24, - action_shape=4, - twin_critic=True, - action_space='reparameterization', - actor_head_hidden_size=128, - critic_head_hidden_size=128, - ), - learn=dict( - update_per_collect=64, - batch_size=256, - learning_rate_q=0.0003, - learning_rate_policy=0.0003, - learning_rate_alpha=0.0003, - target_theta=0.005, - discount_factor=0.99, - auto_alpha=True, - learner=dict(hook=dict(log_show_after_iter=1000, )) - ), - collect=dict(n_sample=64, ), - other=dict(replay_buffer=dict(replay_buffer_size=300000, ), ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif env == 'pendulum': - cfg.update( - dict( - exp_name='Pendulum-v1-SAC', - seed=0, - env=dict( - collector_env_num=10, - evaluator_env_num=8, - # (bool) Scale output action into legal range. 
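# Illustrative usage sketch, not patch content: the per-environment presets being deleted
# from get_instance_config in this hunk are superseded by the ding.bonus.cfg packages added
# above. Assuming those packages are importable, a lookup uses the env_id keys registered in
# ding/bonus/cfg/SAC/__init__.py:
from easydict import EasyDict
from ding.bonus.cfg.SAC import supported_env_cfg, supported_env

sac_hopper_cfg: EasyDict = supported_env_cfg['Hopper-v3']  # tuned preset keyed by gym env_id
assert sac_hopper_cfg.exp_name == 'Hopper-v3-SAC'
env_factory = supported_env['Hopper-v3']  # functools.partial over ding.envs.gym_env.env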
- act_scale=True, - n_evaluator_episode=8, - stop_value=-250, - ), - policy=dict( - cuda=False, - priority=False, - random_collect_size=1000, - model=dict( - obs_shape=3, - action_shape=1, - twin_critic=True, - action_space='reparameterization', - actor_head_hidden_size=128, - critic_head_hidden_size=128, - ), - learn=dict( - update_per_collect=1, - batch_size=128, - learning_rate_q=0.001, - learning_rate_policy=0.001, - learning_rate_alpha=0.0003, - ignore_done=True, - target_theta=0.005, - discount_factor=0.99, - auto_alpha=True, - ), - collect=dict(n_sample=10, ), - eval=dict(evaluator=dict(eval_freq=100, )), - other=dict(replay_buffer=dict(replay_buffer_size=100000, ), ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - else: - raise KeyError("not supported env type: {}".format(env)) - elif algorithm == 'DQN': - cfg = EasyDict({"policy": DQNPolicy.default_config()}) - if env == 'lunarlander_discrete': - cfg.update( - dict( - exp_name='LunarLander-v2-DQN', - seed=0, - env=dict( - env_id='LunarLander-v2', - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=240, - ), - policy=dict( - cuda=True, - random_collect_size=25000, - discount_factor=0.99, - nstep=3, - learn=dict( - update_per_collect=10, - batch_size=64, - learning_rate=0.001, - # Frequency of target network update. - target_update_freq=100, - ), - model=dict( - obs_shape=8, - action_shape=4, - encoder_hidden_size_list=[512, 64], - # Whether to use dueling head. - dueling=True, - ), - collect=dict( - n_sample=64, - unroll_len=1, - ), - other=dict( - eps=dict( - type='exp', - start=0.95, - end=0.1, - decay=50000, - ), - replay_buffer=dict(replay_buffer_size=100000, ) - ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif env == 'PongNoFrameskip': - cfg.update( - dict( - exp_name='PongNoFrameskip-v4-DQN', - seed=0, - env=dict( - env_id='PongNoFrameskip-v4', - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=20, - fram_stack=4, - ), - policy=dict( - cuda=True, - priority=False, - discount_factor=0.99, - nstep=3, - learn=dict( - update_per_collect=10, - batch_size=32, - learning_rate=0.0001, - # Frequency of target network update. - target_update_freq=500, - ), - model=dict( - obs_shape=[4, 84, 84], - action_shape=6, - encoder_hidden_size_list=[128, 128, 512], - ), - collect=dict(n_sample=96, ), - other=dict( - eps=dict( - type='exp', - start=1., - end=0.05, - decay=250000, - ), - replay_buffer=dict(replay_buffer_size=100000, ) - ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif env == 'SpaceInvadersNoFrameskip': - cfg.update( - dict( - exp_name='SpaceInvadersNoFrameskip-v4-DQN', - seed=0, - env=dict( - env_id='SpaceInvadersNoFrameskip-v4', - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - fram_stack=4, - stop_value=2000, - ), - policy=dict( - cuda=True, - priority=False, - discount_factor=0.99, - nstep=3, - learn=dict( - update_per_collect=10, - batch_size=32, - learning_rate=0.0001, - # Frequency of target network update. 
- target_update_freq=500, - hook=dict(save_ckpt_after_iter=1000000, ) - ), - model=dict( - obs_shape=[4, 84, 84], - action_shape=6, - encoder_hidden_size_list=[128, 128, 512], - ), - collect=dict(n_sample=100, ), - other=dict( - eps=dict( - type='exp', - start=1., - end=0.05, - decay=1000000, - ), - replay_buffer=dict(replay_buffer_size=400000, ) - ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif env == 'QbertNoFrameskip': - cfg.update( - dict( - exp_name='QbertNoFrameskip-v4-DQN', - seed=0, - env=dict( - env_id='QbertNoFrameskip-v4', - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - fram_stack=4, - stop_value=30000, - ), - policy=dict( - cuda=True, - priority=False, - discount_factor=0.99, - nstep=3, - learn=dict( - update_per_collect=10, - batch_size=32, - learning_rate=0.0001, - # Frequency of target network update. - target_update_freq=500, - ), - model=dict( - obs_shape=[4, 84, 84], - action_shape=6, - encoder_hidden_size_list=[128, 128, 512], - ), - collect=dict(n_sample=100, ), - other=dict( - eps=dict( - type='exp', - start=1., - end=0.05, - decay=1000000, - ), - replay_buffer=dict(replay_buffer_size=400000, ) - ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - else: - raise KeyError("not supported env type: {}".format(env)) - elif algorithm == 'C51': - cfg = EasyDict({"policy": C51Policy.default_config()}) - if env == 'lunarlander_discrete': - cfg.update( - dict( - exp_name='lunarlander_c51', - seed=0, - env=dict( - collector_env_num=8, - evaluator_env_num=8, - env_id='LunarLander-v2', - n_evaluator_episode=8, - stop_value=200, - ), - policy=dict( - cuda=False, - model=dict( - obs_shape=8, - action_shape=4, - encoder_hidden_size_list=[512, 64], - v_min=-30, - v_max=30, - n_atom=51, - ), - discount_factor=0.99, - nstep=3, - learn=dict( - update_per_collect=10, - batch_size=64, - learning_rate=0.001, - target_update_freq=100, - ), - collect=dict( - n_sample=64, - unroll_len=1, - ), - other=dict( - eps=dict( - type='exp', - start=0.95, - end=0.1, - decay=50000, - ), - replay_buffer=dict(replay_buffer_size=100000, ) - ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif env == 'PongNoFrameskip': - cfg.update( - dict( - exp_name='PongNoFrameskip-v4-C51', - seed=0, - env=dict( - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=20, - env_id='PongNoFrameskip-v4', - #'ALE/Pong-v5' is available. But special setting is needed after gym make. 
- frame_stack=4, - ), - policy=dict( - cuda=True, - priority=False, - model=dict( - obs_shape=[4, 84, 84], - action_shape=6, - encoder_hidden_size_list=[128, 128, 512], - v_min=-10, - v_max=10, - n_atom=51, - ), - nstep=3, - discount_factor=0.99, - learn=dict( - update_per_collect=10, - batch_size=32, - learning_rate=0.0001, - target_update_freq=500, - ), - collect=dict(n_sample=100, ), - eval=dict(evaluator=dict(eval_freq=4000, )), - other=dict( - eps=dict( - type='exp', - start=1., - end=0.05, - decay=250000, - ), - replay_buffer=dict(replay_buffer_size=100000, ), - ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif env == 'SpaceInvadersNoFrameskip': - cfg.update( - dict( - exp_name='SpaceInvadersNoFrameskip-v4-C51', - seed=0, - env=dict( - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=10000000000, - env_id='SpaceInvadersNoFrameskip-v4', - #'ALE/SpaceInvaders-v5' is available. But special setting is needed after gym make. - frame_stack=4, - manager=dict(shared_memory=False, ) - ), - policy=dict( - cuda=True, - priority=False, - model=dict( - obs_shape=[4, 84, 84], - action_shape=6, - encoder_hidden_size_list=[128, 128, 512], - v_min=-10, - v_max=10, - n_atom=51, - ), - nstep=3, - discount_factor=0.99, - learn=dict( - update_per_collect=10, - batch_size=32, - learning_rate=0.0001, - target_update_freq=500, - ), - collect=dict(n_sample=100, ), - eval=dict(evaluator=dict(eval_freq=4000, )), - other=dict( - eps=dict( - type='exp', - start=1., - end=0.05, - decay=1000000, - ), - replay_buffer=dict(replay_buffer_size=400000, ), - ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - elif env == 'QbertNoFrameskip': - cfg.update( - dict( - exp_name='QbertNoFrameskip-v4-C51', - seed=0, - env=dict( - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=30000, - env_id='QbertNoFrameskip-v4', - #'ALE/Qbert-v5' is available. But special setting is needed after gym make. 
- frame_stack=4 - ), - policy=dict( - cuda=True, - priority=True, - model=dict( - obs_shape=[4, 84, 84], - action_shape=6, - encoder_hidden_size_list=[128, 128, 512], - v_min=-10, - v_max=10, - n_atom=51, - ), - nstep=3, - discount_factor=0.99, - learn=dict( - update_per_collect=10, - batch_size=32, - learning_rate=0.0001, - target_update_freq=500, - ), - collect=dict(n_sample=100, ), - eval=dict(evaluator=dict(eval_freq=4000, )), - other=dict( - eps=dict( - type='exp', - start=1., - end=0.05, - decay=1000000, - ), - replay_buffer=dict(replay_buffer_size=400000, ), - ), - ), - wandb_logger=dict( - gradient_logger=True, - video_logger=True, - plot_logger=True, - action_logger=True, - return_logger=False - ), - ) - ) - else: - raise KeyError("not supported env type: {}".format(env)) else: raise KeyError("not supported algorithm type: {}".format(algorithm)) diff --git a/ding/bonus/ddpg.py b/ding/bonus/ddpg.py index 8515b9754c..15cd0d03b5 100644 --- a/ding/bonus/ddpg.py +++ b/ding/bonus/ddpg.py @@ -1,66 +1,70 @@ -from dataclasses import dataclass from typing import Optional, Union from ditk import logging from easydict import EasyDict import os -from functools import partial import torch import treetensor.torch as ttorch from ding.framework import task, OnlineRLContext -from ding.framework.middleware import CkptSaver, multistep_trainer, \ +from ding.framework.middleware import CkptSaver, \ wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, data_pusher, \ OffPolicyLearner, final_ctx_saver -from ding.envs import BaseEnv, BaseEnvManagerV2, SubprocessEnvManagerV2 +from ding.envs import BaseEnv +from ding.envs import setup_ding_env_manager from ding.policy import DDPGPolicy from ding.utils import set_pkg_seed -from ding.config import Config, save_config_py, compile_config +from ding.config import save_config_py, compile_config from ding.model import QAC from ding.data import DequeBuffer -from ding.bonus.config import get_instance_config, get_instance_env from ding.bonus.common import TrainingReturn, EvalReturn +from ding.bonus.cfg.DDPG import supported_env_cfg +from ding.bonus.cfg.DDPG import supported_env class DDPGAgent: - supported_env_list = [ - 'hopper', - 'HalfCheetah', - 'Walker2d', - 'lunarlander_continuous', - 'bipedalwalker', - 'pendulum', - ] - algorithm = 'DDPG' + supported_env_list = list(supported_env_cfg.keys()) def __init__( self, - env: Union[str, BaseEnv], + env_id: str = None, + env: BaseEnv = None, seed: int = 0, exp_name: str = None, model: Optional[torch.nn.Module] = None, - cfg: Optional[Union[EasyDict, dict, str]] = None, + cfg: Optional[Union[EasyDict, dict]] = None, policy_state_dict: str = None, ) -> None: - if isinstance(env, str): - assert env in DDPGAgent.supported_env_list, "Please use supported envs: {}".format( + assert env_id is not None or cfg is not None, "Please specify env_id or cfg." + + if cfg is not None and not isinstance(cfg, EasyDict): + cfg = EasyDict(cfg) + + if env_id is not None: + assert env_id in DDPGAgent.supported_env_list, "Please use supported envs: {}".format( DDPGAgent.supported_env_list ) - self.env = get_instance_env(env) if cfg is None: - # 'It should be default env tuned config' - cfg = get_instance_config(env, algorithm=DDPGAgent.algorithm) + cfg = supported_env_cfg[env_id] else: - assert isinstance(cfg, EasyDict), "Please use EasyDict as config data type." 
- - if exp_name is not None: - cfg.exp_name = exp_name - self.cfg = compile_config(cfg, policy=DDPGPolicy) - self.exp_name = self.cfg.exp_name - - elif isinstance(env, BaseEnv): - self.cfg = compile_config(cfg, policy=DDPGPolicy) - raise NotImplementedError + assert cfg.env.env_id == env_id, "env_id in cfg should be the same as env_id in args." + else: + assert hasattr(cfg.env, "env_id"), "Please specify env_id in cfg." + assert cfg.env.env_id in DDPGAgent.supported_env_list, "Please use supported envs: {}".format( + DDPGAgent.supported_env_list + ) + default_policy_config = EasyDict({"policy": DDPGPolicy.default_config()}) + default_policy_config.update(cfg) + cfg = default_policy_config + + if exp_name is not None: + cfg.exp_name = exp_name + self.cfg = compile_config(cfg, policy=DDPGPolicy) + self.exp_name = self.cfg.exp_name + if env is None: + self.env = supported_env[cfg.env.env_id](cfg=cfg.env) else: - raise TypeError("not support env type: {}, only strings and instances of `BaseEnv` now".format(type(env))) + assert isinstance(env, BaseEnv), "Please use BaseEnv as env data type." + self.env = env + logging.getLogger().setLevel(logging.INFO) self.seed = seed set_pkg_seed(self.seed, use_cuda=self.cfg.policy.cuda) @@ -78,8 +82,8 @@ def __init__( def train( self, step: int = int(1e7), - collector_env_num: int = 4, - evaluator_env_num: int = 4, + collector_env_num: int = None, + evaluator_env_num: int = None, n_iter_log_show: int = 500, n_iter_save_ckpt: int = 1000, context: Optional[str] = None, @@ -90,8 +94,10 @@ def train( logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) # define env and policy - collector_env = self._setup_env_manager(collector_env_num, context, debug, 'collector') - evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug, 'evaluator') + collector_env_num = collector_env_num if collector_env_num else self.cfg.env.collector_env_num + evaluator_env_num = evaluator_env_num if evaluator_env_num else self.cfg.env.evaluator_env_num + collector_env = setup_ding_env_manager(self.env, collector_env_num, context, debug, 'collector') + evaluator_env = setup_ding_env_manager(self.env, evaluator_env_num, context, debug, 'evaluator') with task.start(ctx=OnlineRLContext()): task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) @@ -168,6 +174,8 @@ def _forward(obs): break logging.info(f'DDPG deploy is finished, final episode return with {step} steps is: {return_}') + env.close() + return return_ def collect_data( @@ -184,7 +192,8 @@ def collect_data( if n_episode is not None: raise NotImplementedError # define env and policy - env = self._setup_env_manager(env_num, context, debug, 'collector') + env_num = env_num if env_num else self.cfg.env.collector_env_num + env = setup_ding_env_manager(self.env, env_num, context, debug, 'collector') if save_data_path is None: save_data_path = os.path.join(self.exp_name, 'demo_data') @@ -212,7 +221,8 @@ def batch_evaluate( if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy - env = self._setup_env_manager(env_num, context, debug, 'evaluator') + env_num = env_num if env_num else self.cfg.env.evaluator_env_num + env = setup_ding_env_manager(self.env, env_num, context, debug, 'evaluator') # reset first to make sure the env is in the initial state # env will be reset again in the main loop @@ -229,24 +239,6 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) - def _setup_env_manager( - self, - 
env_num: int, - context: Optional[str] = None, - debug: bool = False, - caller: str = 'collector' - ) -> BaseEnvManagerV2: - assert caller in ['evaluator', 'collector'] - if debug: - env_cls = BaseEnvManagerV2 - manager_cfg = env_cls.default_config() - else: - env_cls = SubprocessEnvManagerV2 - manager_cfg = env_cls.default_config() - if context is not None: - manager_cfg.context = context - return env_cls([partial(self.env.clone, caller) for _ in range(env_num)], manager_cfg) - @property def best(self): best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") diff --git a/ding/bonus/dqn.py b/ding/bonus/dqn.py index 8bc5973548..b15bb7184e 100644 --- a/ding/bonus/dqn.py +++ b/ding/bonus/dqn.py @@ -1,65 +1,71 @@ -from dataclasses import dataclass from typing import Optional, Union from ditk import logging from easydict import EasyDict import os -from functools import partial import torch import treetensor.torch as ttorch from ding.framework import task, OnlineRLContext -from ding.framework.middleware import CkptSaver, multistep_trainer, \ +from ding.framework.middleware import CkptSaver, \ wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, data_pusher, \ OffPolicyLearner, final_ctx_saver, nstep_reward_enhancer, eps_greedy_handler -from ding.envs import BaseEnv, BaseEnvManagerV2, SubprocessEnvManagerV2 +from ding.envs import BaseEnv +from ding.envs import setup_ding_env_manager from ding.policy import DQNPolicy from ding.utils import set_pkg_seed from ding.config import save_config_py, compile_config from ding.model import DQN from ding.model import model_wrap from ding.data import DequeBuffer -from ding.bonus.config import get_instance_config, get_instance_env from ding.bonus.common import TrainingReturn, EvalReturn +from ding.bonus.cfg.DQN import supported_env_cfg +from ding.bonus.cfg.DQN import supported_env class DQNAgent: - supported_env_list = [ - 'lunarlander_discrete', - 'PongNoFrameskip', - 'SpaceInvadersNoFrameskip', - 'QbertNoFrameskip', - ] - algorithm = 'DQN' + supported_env_list = list(supported_env_cfg.keys()) def __init__( self, - env: Union[str, BaseEnv], + env_id: str = None, + env: BaseEnv = None, seed: int = 0, exp_name: str = None, model: Optional[torch.nn.Module] = None, - cfg: Optional[Union[EasyDict, dict, str]] = None, + cfg: Optional[Union[EasyDict, dict]] = None, policy_state_dict: str = None, ) -> None: - if isinstance(env, str): - assert env in DQNAgent.supported_env_list, "Please use supported envs: {}".format( + assert env_id is not None or cfg is not None, "Please specify env_id or cfg." + + if cfg is not None and not isinstance(cfg, EasyDict): + cfg = EasyDict(cfg) + + if env_id is not None: + assert env_id in DQNAgent.supported_env_list, "Please use supported envs: {}".format( DQNAgent.supported_env_list ) - self.env = get_instance_env(env) if cfg is None: - # 'It should be default env tuned config' - cfg = get_instance_config(env, algorithm=DQNAgent.algorithm) + cfg = supported_env_cfg[env_id] else: - assert isinstance(cfg, EasyDict), "Please use EasyDict as config data type." - - if exp_name is not None: - cfg.exp_name = exp_name - self.cfg = compile_config(cfg, policy=DQNPolicy) - self.exp_name = self.cfg.exp_name - - elif isinstance(env, BaseEnv): - self.cfg = compile_config(cfg, policy=DQNPolicy) - raise NotImplementedError + assert cfg.env.env_id == env_id, "env_id in cfg should be the same as env_id in args." 
+ else: + assert hasattr(cfg.env, "env_id"), "Please specify env_id in cfg." + assert cfg.env.env_id in DQNAgent.supported_env_list, "Please use supported envs: {}".format( + DQNAgent.supported_env_list + ) + default_policy_config = EasyDict({"policy": DQNPolicy.default_config()}) + default_policy_config.update(cfg) + cfg = default_policy_config + + if exp_name is not None: + cfg.exp_name = exp_name + self.cfg = compile_config(cfg, policy=DQNPolicy) + self.exp_name = self.cfg.exp_name + if env is None: + self.env = supported_env[cfg.env.env_id](cfg=cfg.env) else: - raise TypeError("not support env type: {}, only strings and instances of `BaseEnv` now".format(type(env))) + assert isinstance(env, BaseEnv), "Please use BaseEnv as env data type." + self.env = env + logging.getLogger().setLevel(logging.INFO) self.seed = seed set_pkg_seed(self.seed, use_cuda=self.cfg.policy.cuda) @@ -77,8 +83,8 @@ def __init__( def train( self, step: int = int(1e7), - collector_env_num: int = 4, - evaluator_env_num: int = 4, + collector_env_num: int = None, + evaluator_env_num: int = None, n_iter_save_ckpt: int = 1000, context: Optional[str] = None, debug: bool = False, @@ -88,8 +94,10 @@ def train( logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) # define env and policy - collector_env = self._setup_env_manager(collector_env_num, context, debug, 'collector') - evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug, 'evaluator') + collector_env_num = collector_env_num if collector_env_num else self.cfg.env.collector_env_num + evaluator_env_num = evaluator_env_num if evaluator_env_num else self.cfg.env.evaluator_env_num + collector_env = setup_ding_env_manager(self.env, collector_env_num, context, debug, 'collector') + evaluator_env = setup_ding_env_manager(self.env, evaluator_env_num, context, debug, 'evaluator') with task.start(ctx=OnlineRLContext()): task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) @@ -171,6 +179,8 @@ def _forward(obs): break logging.info(f'DQN deploy is finished, final episode return with {step} steps is: {return_}') + env.close() + return return_ def collect_data( @@ -187,7 +197,8 @@ def collect_data( if n_episode is not None: raise NotImplementedError # define env and policy - env = self._setup_env_manager(env_num, context, debug, 'collector') + env_num = env_num if env_num else self.cfg.env.collector_env_num + env = setup_ding_env_manager(self.env, env_num, context, debug, 'collector') if save_data_path is None: save_data_path = os.path.join(self.exp_name, 'demo_data') @@ -215,7 +226,8 @@ def batch_evaluate( if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy - env = self._setup_env_manager(env_num, context, debug, 'evaluator') + env_num = env_num if env_num else self.cfg.env.evaluator_env_num + env = setup_ding_env_manager(self.env, env_num, context, debug, 'evaluator') # reset first to make sure the env is in the initial state # env will be reset again in the main loop @@ -232,24 +244,6 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) - def _setup_env_manager( - self, - env_num: int, - context: Optional[str] = None, - debug: bool = False, - caller: str = 'collector' - ) -> BaseEnvManagerV2: - assert caller in ['evaluator', 'collector'] - if debug: - env_cls = BaseEnvManagerV2 - manager_cfg = env_cls.default_config() - else: - env_cls = SubprocessEnvManagerV2 - manager_cfg = env_cls.default_config() - if context is not 
None: - manager_cfg.context = context - return env_cls([partial(self.env.clone, caller) for _ in range(env_num)], manager_cfg) - @property def best(self): best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") diff --git a/ding/bonus/pg.py b/ding/bonus/pg.py index 46354837a6..d966263efb 100644 --- a/ding/bonus/pg.py +++ b/ding/bonus/pg.py @@ -2,62 +2,68 @@ from ditk import logging from easydict import EasyDict import os -from functools import partial import torch import treetensor.torch as ttorch from ding.framework import task, OnlineRLContext from ding.framework.middleware import CkptSaver, trainer, \ wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, \ pg_estimator, final_ctx_saver, EpisodeCollector -from ding.envs import BaseEnv, BaseEnvManagerV2, SubprocessEnvManagerV2 +from ding.envs import BaseEnv +from ding.envs import setup_ding_env_manager from ding.policy import PGPolicy from ding.utils import set_pkg_seed -from ding.config import Config, save_config_py, compile_config +from ding.config import save_config_py, compile_config from ding.model import PG -from ding.model import model_wrap -from ding.bonus.config import get_instance_config, get_instance_env from ding.bonus.common import TrainingReturn, EvalReturn +from ding.bonus.cfg.PG import supported_env_cfg +from ding.bonus.cfg.PG import supported_env class PGAgent: - supported_env_list = [ - 'lunarlander_discrete', - 'bipedalwalker', - 'pendulum', - 'hopper', - 'HalfCheetah', - 'Walker2d', - ] - algorithm = 'PG' + supported_env_list = list(supported_env_cfg.keys()) def __init__( self, - env: Union[str, BaseEnv], + env_id: str = None, + env: BaseEnv = None, seed: int = 0, exp_name: str = None, model: Optional[torch.nn.Module] = None, - cfg: Optional[Union[EasyDict, dict, str]] = None, + cfg: Optional[Union[EasyDict, dict]] = None, policy_state_dict: str = None, ) -> None: - if isinstance(env, str): - assert env in PGAgent.supported_env_list, "Please use supported envs: {}".format(PGAgent.supported_env_list) - self.env = get_instance_env(env) - if cfg is None: - # 'It should be default env tuned config' - cfg = get_instance_config(env, algorithm=PGAgent.algorithm) - else: - assert isinstance(cfg, EasyDict), "Please use EasyDict as config data type." + assert env_id is not None or cfg is not None, "Please specify env_id or cfg." - if exp_name is not None: - cfg.exp_name = exp_name - self.cfg = compile_config(cfg, policy=PGPolicy) - self.exp_name = self.cfg.exp_name + if cfg is not None and not isinstance(cfg, EasyDict): + cfg = EasyDict(cfg) - elif isinstance(env, BaseEnv): - self.cfg = compile_config(cfg, policy=PGPolicy) - raise NotImplementedError + if env_id is not None: + assert env_id in PGAgent.supported_env_list, "Please use supported envs: {}".format( + PGAgent.supported_env_list + ) + if cfg is None: + cfg = supported_env_cfg[env_id] + else: + assert cfg.env.env_id == env_id, "env_id in cfg should be the same as env_id in args." + else: + assert hasattr(cfg.env, "env_id"), "Please specify env_id in cfg." 
+ assert cfg.env.env_id in PGAgent.supported_env_list, "Please use supported envs: {}".format( + PGAgent.supported_env_list + ) + default_policy_config = EasyDict({"policy": PGPolicy.default_config()}) + default_policy_config.update(cfg) + cfg = default_policy_config + + if exp_name is not None: + cfg.exp_name = exp_name + self.cfg = compile_config(cfg, policy=PGPolicy) + self.exp_name = self.cfg.exp_name + if env is None: + self.env = supported_env[cfg.env.env_id](cfg=cfg.env) else: - raise TypeError("not support env type: {}, only strings and instances of `BaseEnv` now".format(type(env))) + assert isinstance(env, BaseEnv), "Please use BaseEnv as env data type." + self.env = env + logging.getLogger().setLevel(logging.INFO) self.seed = seed set_pkg_seed(self.seed, use_cuda=self.cfg.policy.cuda) @@ -74,9 +80,8 @@ def __init__( def train( self, step: int = int(1e7), - collector_env_num: int = 4, - evaluator_env_num: int = 4, - n_iter_log_show: int = 500, + collector_env_num: int = None, + evaluator_env_num: int = None, n_iter_save_ckpt: int = 1000, context: Optional[str] = None, debug: bool = False, @@ -86,8 +91,10 @@ def train( logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) # define env and policy - collector_env = self._setup_env_manager(collector_env_num, context, debug, 'collector') - evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug, 'evaluator') + collector_env_num = collector_env_num if collector_env_num else self.cfg.env.collector_env_num + evaluator_env_num = evaluator_env_num if evaluator_env_num else self.cfg.env.evaluator_env_num + collector_env = setup_ding_env_manager(self.env, collector_env_num, context, debug, 'collector') + evaluator_env = setup_ding_env_manager(self.env, evaluator_env_num, context, debug, 'evaluator') with task.start(ctx=OnlineRLContext()): task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) @@ -166,6 +173,8 @@ def _forward(obs): break logging.info(f'PG deploy is finished, final episode return with {step} steps is: {return_}') + env.close() + return return_ def collect_data( @@ -182,7 +191,8 @@ def collect_data( if n_episode is not None: raise NotImplementedError # define env and policy - env = self._setup_env_manager(env_num, context, debug, 'collector') + env_num = env_num if env_num else self.cfg.env.collector_env_num + env = setup_ding_env_manager(self.env, env_num, context, debug, 'collector') if save_data_path is None: save_data_path = os.path.join(self.exp_name, 'demo_data') @@ -210,7 +220,8 @@ def batch_evaluate( if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy - env = self._setup_env_manager(env_num, context, debug, 'evaluator') + env_num = env_num if env_num else self.cfg.env.evaluator_env_num + env = setup_ding_env_manager(self.env, env_num, context, debug, 'evaluator') # reset first to make sure the env is in the initial state # env will be reset again in the main loop @@ -227,24 +238,6 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) - def _setup_env_manager( - self, - env_num: int, - context: Optional[str] = None, - debug: bool = False, - caller: str = 'collector' - ) -> BaseEnvManagerV2: - assert caller in ['evaluator', 'collector'] - if debug: - env_cls = BaseEnvManagerV2 - manager_cfg = env_cls.default_config() - else: - env_cls = SubprocessEnvManagerV2 - manager_cfg = env_cls.default_config() - if context is not None: - manager_cfg.context = context - return 
env_cls([partial(self.env.clone, caller) for _ in range(env_num)], manager_cfg) - @property def best(self): best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") diff --git a/ding/bonus/ppo_offpolicy.py b/ding/bonus/ppo_offpolicy.py index 524f66d254..e37d79c60f 100644 --- a/ding/bonus/ppo_offpolicy.py +++ b/ding/bonus/ppo_offpolicy.py @@ -1,64 +1,70 @@ -from dataclasses import dataclass from typing import Optional, Union from ditk import logging from easydict import EasyDict import os -from functools import partial import torch import treetensor.torch as ttorch from ding.framework import task, OnlineRLContext from ding.framework.middleware import CkptSaver, final_ctx_saver, OffPolicyLearner, StepCollector, \ wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, gae_estimator -from ding.envs import BaseEnv, BaseEnvManagerV2, SubprocessEnvManagerV2 +from ding.envs import BaseEnv +from ding.envs import setup_ding_env_manager from ding.policy import PPOOffPolicy from ding.utils import set_pkg_seed -from ding.config import Config, save_config_py, compile_config +from ding.config import save_config_py, compile_config from ding.model import VAC from ding.model import model_wrap from ding.data import DequeBuffer -from ding.bonus.config import get_instance_config, get_instance_env from ding.bonus.common import TrainingReturn, EvalReturn +from ding.bonus.cfg.PPOOffPolicy import supported_env_cfg +from ding.bonus.cfg.PPOOffPolicy import supported_env class PPOOffPolicyAgent: - supported_env_list = [ - 'lunarlander_discrete', - 'PongNoFrameskip', - 'SpaceInvadersNoFrameskip', - 'QbertNoFrameskip', - ] - algorithm = 'PPOOffPolicy' + supported_env_list = list(supported_env_cfg.keys()) def __init__( self, - env: Union[str, BaseEnv], + env_id: str = None, + env: BaseEnv = None, seed: int = 0, exp_name: str = None, model: Optional[torch.nn.Module] = None, - cfg: Optional[Union[EasyDict, dict, str]] = None, + cfg: Optional[Union[EasyDict, dict]] = None, policy_state_dict: str = None, ) -> None: - if isinstance(env, str): - assert env in PPOOffPolicyAgent.supported_env_list, "Please use supported envs: {}".format( + assert env_id is not None or cfg is not None, "Please specify env_id or cfg." + + if cfg is not None and not isinstance(cfg, EasyDict): + cfg = EasyDict(cfg) + + if env_id is not None: + assert env_id in PPOOffPolicyAgent.supported_env_list, "Please use supported envs: {}".format( PPOOffPolicyAgent.supported_env_list ) - self.env = get_instance_env(env) if cfg is None: - # 'It should be default env tuned config' - cfg = get_instance_config(env, algorithm=PPOOffPolicyAgent.algorithm) + cfg = supported_env_cfg[env_id] else: - assert isinstance(cfg, EasyDict), "Please use EasyDict as config data type." - - if exp_name is not None: - cfg.exp_name = exp_name - self.cfg = compile_config(cfg, policy=PPOOffPolicy) - self.exp_name = self.cfg.exp_name - - elif isinstance(env, BaseEnv): - self.cfg = compile_config(cfg, policy=PPOOffPolicy) - raise NotImplementedError + assert cfg.env.env_id == env_id, "env_id in cfg should be the same as env_id in args." + else: + assert hasattr(cfg.env, "env_id"), "Please specify env_id in cfg." 
+ assert cfg.env.env_id in PPOOffPolicyAgent.supported_env_list, "Please use supported envs: {}".format( + PPOOffPolicyAgent.supported_env_list + ) + default_policy_config = EasyDict({"policy": PPOOffPolicy.default_config()}) + default_policy_config.update(cfg) + cfg = default_policy_config + + if exp_name is not None: + cfg.exp_name = exp_name + self.cfg = compile_config(cfg, policy=PPOOffPolicy) + self.exp_name = self.cfg.exp_name + if env is None: + self.env = supported_env[cfg.env.env_id](cfg=cfg.env) else: - raise TypeError("not support env type: {}, only strings and instances of `BaseEnv` now".format(type(env))) + assert isinstance(env, BaseEnv), "Please use BaseEnv as env data type." + self.env = env + logging.getLogger().setLevel(logging.INFO) self.seed = seed set_pkg_seed(self.seed, use_cuda=self.cfg.policy.cuda) @@ -76,9 +82,8 @@ def __init__( def train( self, step: int = int(1e7), - collector_env_num: int = 4, - evaluator_env_num: int = 4, - n_iter_log_show: int = 500, + collector_env_num: int = None, + evaluator_env_num: int = None, n_iter_save_ckpt: int = 1000, context: Optional[str] = None, debug: bool = False, @@ -88,8 +93,10 @@ def train( logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) # define env and policy - collector_env = self._setup_env_manager(collector_env_num, context, debug, 'collector') - evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug, 'evaluator') + collector_env_num = collector_env_num if collector_env_num else self.cfg.env.collector_env_num + evaluator_env_num = evaluator_env_num if evaluator_env_num else self.cfg.env.evaluator_env_num + collector_env = setup_ding_env_manager(self.env, collector_env_num, context, debug, 'collector') + evaluator_env = setup_ding_env_manager(self.env, evaluator_env_num, context, debug, 'evaluator') with task.start(ctx=OnlineRLContext()): task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) @@ -165,6 +172,8 @@ def _forward(obs): break logging.info(f'PPOOffPolicy deploy is finished, final episode return with {step} steps is: {return_}') + env.close() + return return_ def collect_data( @@ -181,7 +190,8 @@ def collect_data( if n_episode is not None: raise NotImplementedError # define env and policy - env = self._setup_env_manager(env_num, context, debug, 'collector') + env_num = env_num if env_num else self.cfg.env.collector_env_num + env = setup_ding_env_manager(self.env, env_num, context, debug, 'collector') if save_data_path is None: save_data_path = os.path.join(self.exp_name, 'demo_data') @@ -209,7 +219,8 @@ def batch_evaluate( if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy - env = self._setup_env_manager(env_num, context, debug, 'evaluator') + env_num = env_num if env_num else self.cfg.env.evaluator_env_num + env = setup_ding_env_manager(self.env, env_num, context, debug, 'evaluator') # reset first to make sure the env is in the initial state # env will be reset again in the main loop @@ -226,24 +237,6 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) - def _setup_env_manager( - self, - env_num: int, - context: Optional[str] = None, - debug: bool = False, - caller: str = 'collector' - ) -> BaseEnvManagerV2: - assert caller in ['evaluator', 'collector'] - if debug: - env_cls = BaseEnvManagerV2 - manager_cfg = env_cls.default_config() - else: - env_cls = SubprocessEnvManagerV2 - manager_cfg = env_cls.default_config() - if context is not None: - 
manager_cfg.context = context - return env_cls([partial(self.env.clone, caller) for _ in range(env_num)], manager_cfg) - @property def best(self): best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py index c0d158dabe..def970995e 100644 --- a/ding/bonus/ppof.py +++ b/ding/bonus/ppof.py @@ -1,9 +1,7 @@ -from dataclasses import dataclass from typing import Optional, Union from ditk import logging from easydict import EasyDict from functools import partial -import random import os import gym import gymnasium @@ -187,6 +185,8 @@ def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, break logging.info(f'PPOF deploy is finished, final episode return with {step} steps is: {return_}') + env.close() + return return_ def collect_data( diff --git a/ding/bonus/sac.py b/ding/bonus/sac.py index b430674f65..89349c4d9f 100644 --- a/ding/bonus/sac.py +++ b/ding/bonus/sac.py @@ -1,67 +1,71 @@ -from dataclasses import dataclass from typing import Optional, Union from ditk import logging from easydict import EasyDict import os -from functools import partial import torch import treetensor.torch as ttorch from ding.framework import task, OnlineRLContext -from ding.framework.middleware import CkptSaver, multistep_trainer, \ +from ding.framework.middleware import CkptSaver, \ wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, data_pusher, \ OffPolicyLearner, final_ctx_saver -from ding.envs import BaseEnv, BaseEnvManagerV2, SubprocessEnvManagerV2 +from ding.envs import BaseEnv +from ding.envs import setup_ding_env_manager from ding.policy import SACPolicy from ding.utils import set_pkg_seed -from ding.config import Config, save_config_py, compile_config +from ding.config import save_config_py, compile_config from ding.model import QAC from ding.model import model_wrap from ding.data import DequeBuffer -from ding.bonus.config import get_instance_config, get_instance_env from ding.bonus.common import TrainingReturn, EvalReturn +from ding.bonus.cfg.SAC import supported_env_cfg +from ding.bonus.cfg.SAC import supported_env class SACAgent: - supported_env_list = [ - 'hopper', - 'HalfCheetah', - 'Walker2d', - 'lunarlander_continuous', - 'bipedalwalker', - 'pendulum', - ] - algorithm = 'SAC' + supported_env_list = list(supported_env_cfg.keys()) def __init__( self, - env: Union[str, BaseEnv], + env_id: str = None, + env: BaseEnv = None, seed: int = 0, exp_name: str = None, model: Optional[torch.nn.Module] = None, - cfg: Optional[Union[EasyDict, dict, str]] = None, + cfg: Optional[Union[EasyDict, dict]] = None, policy_state_dict: str = None, ) -> None: - if isinstance(env, str): - assert env in SACAgent.supported_env_list, "Please use supported envs: {}".format( + assert env_id is not None or cfg is not None, "Please specify env_id or cfg." + + if cfg is not None and not isinstance(cfg, EasyDict): + cfg = EasyDict(cfg) + + if env_id is not None: + assert env_id in SACAgent.supported_env_list, "Please use supported envs: {}".format( SACAgent.supported_env_list ) - self.env = get_instance_env(env) if cfg is None: - # 'It should be default env tuned config' - cfg = get_instance_config(env, algorithm=SACAgent.algorithm) + cfg = supported_env_cfg[env_id] else: - assert isinstance(cfg, EasyDict), "Please use EasyDict as config data type." 
- - if exp_name is not None: - cfg.exp_name = exp_name - self.cfg = compile_config(cfg, policy=SACPolicy) - self.exp_name = self.cfg.exp_name - - elif isinstance(env, BaseEnv): - self.cfg = compile_config(cfg, policy=SACPolicy) - raise NotImplementedError + assert cfg.env.env_id == env_id, "env_id in cfg should be the same as env_id in args." + else: + assert hasattr(cfg.env, "env_id"), "Please specify env_id in cfg." + assert cfg.env.env_id in SACAgent.supported_env_list, "Please use supported envs: {}".format( + SACAgent.supported_env_list + ) + default_policy_config = EasyDict({"policy": SACPolicy.default_config()}) + default_policy_config.update(cfg) + cfg = default_policy_config + + if exp_name is not None: + cfg.exp_name = exp_name + self.cfg = compile_config(cfg, policy=SACPolicy) + self.exp_name = self.cfg.exp_name + if env is None: + self.env = supported_env[cfg.env.env_id](cfg=cfg.env) else: - raise TypeError("not support env type: {}, only strings and instances of `BaseEnv` now".format(type(env))) + assert isinstance(env, BaseEnv), "Please use BaseEnv as env data type." + self.env = env + logging.getLogger().setLevel(logging.INFO) self.seed = seed set_pkg_seed(self.seed, use_cuda=self.cfg.policy.cuda) @@ -79,9 +83,8 @@ def __init__( def train( self, step: int = int(1e7), - collector_env_num: int = 4, - evaluator_env_num: int = 4, - n_iter_log_show: int = 500, + collector_env_num: int = None, + evaluator_env_num: int = None, n_iter_save_ckpt: int = 1000, context: Optional[str] = None, debug: bool = False, @@ -91,8 +94,10 @@ def train( logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) # define env and policy - collector_env = self._setup_env_manager(collector_env_num, context, debug, 'collector') - evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug, 'evaluator') + collector_env_num = collector_env_num if collector_env_num else self.cfg.env.collector_env_num + evaluator_env_num = evaluator_env_num if evaluator_env_num else self.cfg.env.evaluator_env_num + collector_env = setup_ding_env_manager(self.env, collector_env_num, context, debug, 'collector') + evaluator_env = setup_ding_env_manager(self.env, evaluator_env_num, context, debug, 'evaluator') with task.start(ctx=OnlineRLContext()): task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) @@ -170,6 +175,8 @@ def _forward(obs): break logging.info(f'SAC deploy is finished, final episode return with {step} steps is: {return_}') + env.close() + return return_ def collect_data( @@ -186,7 +193,8 @@ def collect_data( if n_episode is not None: raise NotImplementedError # define env and policy - env = self._setup_env_manager(env_num, context, debug, 'collector') + env_num = env_num if env_num else self.cfg.env.collector_env_num + env = setup_ding_env_manager(self.env, env_num, context, debug, 'collector') if save_data_path is None: save_data_path = os.path.join(self.exp_name, 'demo_data') @@ -214,7 +222,8 @@ def batch_evaluate( if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy - env = self._setup_env_manager(env_num, context, debug, 'evaluator') + env_num = env_num if env_num else self.cfg.env.evaluator_env_num + env = setup_ding_env_manager(self.env, env_num, context, debug, 'evaluator') # reset first to make sure the env is in the initial state # env will be reset again in the main loop @@ -231,24 +240,6 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) - def 
_setup_env_manager( - self, - env_num: int, - context: Optional[str] = None, - debug: bool = False, - caller: str = 'collector' - ) -> BaseEnvManagerV2: - assert caller in ['evaluator', 'collector'] - if debug: - env_cls = BaseEnvManagerV2 - manager_cfg = env_cls.default_config() - else: - env_cls = SubprocessEnvManagerV2 - manager_cfg = env_cls.default_config() - if context is not None: - manager_cfg.context = context - return env_cls([partial(self.env.clone, caller) for _ in range(env_num)], manager_cfg) - @property def best(self): best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 88cbbaf644..e9f18df681 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -1,66 +1,70 @@ -from dataclasses import dataclass from typing import Optional, Union from ditk import logging from easydict import EasyDict import os -from functools import partial import torch import treetensor.torch as ttorch from ding.framework import task, OnlineRLContext -from ding.framework.middleware import CkptSaver, multistep_trainer, \ +from ding.framework.middleware import CkptSaver, \ wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, data_pusher, \ OffPolicyLearner, final_ctx_saver -from ding.envs import BaseEnv, BaseEnvManagerV2, SubprocessEnvManagerV2 +from ding.envs import BaseEnv +from ding.envs import setup_ding_env_manager from ding.policy import TD3Policy from ding.utils import set_pkg_seed -from ding.config import Config, save_config_py, compile_config +from ding.config import save_config_py, compile_config from ding.model import QAC from ding.data import DequeBuffer -from ding.bonus.config import get_instance_config, get_instance_env from ding.bonus.common import TrainingReturn, EvalReturn +from ding.bonus.cfg.TD3 import supported_env_cfg +from ding.bonus.cfg.TD3 import supported_env class TD3Agent: - supported_env_list = [ - 'hopper', - 'HalfCheetah', - 'Walker2d', - 'lunarlander_continuous', - 'bipedalwalker', - 'pendulum', - ] - algorithm = 'TD3' + supported_env_list = list(supported_env_cfg.keys()) def __init__( self, - env: Union[str, BaseEnv], + env_id: str = None, + env: BaseEnv = None, seed: int = 0, exp_name: str = None, model: Optional[torch.nn.Module] = None, - cfg: Optional[Union[EasyDict, dict, str]] = None, + cfg: Optional[Union[EasyDict, dict]] = None, policy_state_dict: str = None, ) -> None: - if isinstance(env, str): - assert env in TD3Agent.supported_env_list, "Please use supported envs: {}".format( + assert env_id is not None or cfg is not None, "Please specify env_id or cfg." + + if cfg is not None and not isinstance(cfg, EasyDict): + cfg = EasyDict(cfg) + + if env_id is not None: + assert env_id in TD3Agent.supported_env_list, "Please use supported envs: {}".format( TD3Agent.supported_env_list ) - self.env = get_instance_env(env) if cfg is None: - # 'It should be default env tuned config' - cfg = get_instance_config(env, algorithm=TD3Agent.algorithm) + cfg = supported_env_cfg[env_id] else: - assert isinstance(cfg, EasyDict), "Please use EasyDict as config data type." - - if exp_name is not None: - cfg.exp_name = exp_name - self.cfg = compile_config(cfg, policy=TD3Policy) - self.exp_name = self.cfg.exp_name - - elif isinstance(env, BaseEnv): - self.cfg = compile_config(cfg, policy=TD3Policy) - raise NotImplementedError + assert cfg.env.env_id == env_id, "env_id in cfg should be the same as env_id in args." 
+ else: + assert hasattr(cfg.env, "env_id"), "Please specify env_id in cfg." + assert cfg.env.env_id in TD3Agent.supported_env_list, "Please use supported envs: {}".format( + TD3Agent.supported_env_list + ) + default_policy_config = EasyDict({"policy": TD3Policy.default_config()}) + default_policy_config.update(cfg) + cfg = default_policy_config + + if exp_name is not None: + cfg.exp_name = exp_name + self.cfg = compile_config(cfg, policy=TD3Policy) + self.exp_name = self.cfg.exp_name + if env is None: + self.env = supported_env[cfg.env.env_id](cfg=cfg.env) else: - raise TypeError("not support env type: {}, only strings and instances of `BaseEnv` now".format(type(env))) + assert isinstance(env, BaseEnv), "Please use BaseEnv as env data type." + self.env = env + logging.getLogger().setLevel(logging.INFO) self.seed = seed set_pkg_seed(self.seed, use_cuda=self.cfg.policy.cuda) @@ -78,9 +82,8 @@ def __init__( def train( self, step: int = int(1e7), - collector_env_num: int = 4, - evaluator_env_num: int = 4, - n_iter_log_show: int = 500, + collector_env_num: int = None, + evaluator_env_num: int = None, n_iter_save_ckpt: int = 1000, context: Optional[str] = None, debug: bool = False, @@ -90,8 +93,10 @@ def train( logging.getLogger().setLevel(logging.DEBUG) logging.debug(self.policy._model) # define env and policy - collector_env = self._setup_env_manager(collector_env_num, context, debug, 'collector') - evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug, 'evaluator') + collector_env_num = collector_env_num if collector_env_num else self.cfg.env.collector_env_num + evaluator_env_num = evaluator_env_num if evaluator_env_num else self.cfg.env.evaluator_env_num + collector_env = setup_ding_env_manager(self.env, collector_env_num, context, debug, 'collector') + evaluator_env = setup_ding_env_manager(self.env, evaluator_env_num, context, debug, 'evaluator') with task.start(ctx=OnlineRLContext()): task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) @@ -168,6 +173,8 @@ def _forward(obs): break logging.info(f'TD3 deploy is finished, final episode return with {step} steps is: {return_}') + env.close() + return return_ def collect_data( @@ -184,7 +191,8 @@ def collect_data( if n_episode is not None: raise NotImplementedError # define env and policy - env = self._setup_env_manager(env_num, context, debug, 'collector') + env_num = env_num if env_num else self.cfg.env.collector_env_num + env = setup_ding_env_manager(self.env, env_num, context, debug, 'collector') if save_data_path is None: save_data_path = os.path.join(self.exp_name, 'demo_data') @@ -212,7 +220,8 @@ def batch_evaluate( if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy - env = self._setup_env_manager(env_num, context, debug, 'evaluator') + env_num = env_num if env_num else self.cfg.env.evaluator_env_num + env = setup_ding_env_manager(self.env, env_num, context, debug, 'evaluator') # reset first to make sure the env is in the initial state # env will be reset again in the main loop @@ -229,24 +238,6 @@ def batch_evaluate( return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) - def _setup_env_manager( - self, - env_num: int, - context: Optional[str] = None, - debug: bool = False, - caller: str = 'collector' - ) -> BaseEnvManagerV2: - assert caller in ['evaluator', 'collector'] - if debug: - env_cls = BaseEnvManagerV2 - manager_cfg = env_cls.default_config() - else: - env_cls = SubprocessEnvManagerV2 - manager_cfg = 
env_cls.default_config() - if context is not None: - manager_cfg.context = context - return env_cls([partial(self.env.clone, caller) for _ in range(env_num)], manager_cfg) - @property def best(self): best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") diff --git a/ding/envs/__init__.py b/ding/envs/__init__.py index 5c9ecbf1b4..f659b3a590 100644 --- a/ding/envs/__init__.py +++ b/ding/envs/__init__.py @@ -1,3 +1,5 @@ from .env import * from .env_wrappers import * from .env_manager import * +from .ding_env_manager import setup_ding_env_manager +from . import gym_env diff --git a/ding/envs/ding_env_manager.py b/ding/envs/ding_env_manager.py new file mode 100644 index 0000000000..8dbffd4042 --- /dev/null +++ b/ding/envs/ding_env_manager.py @@ -0,0 +1,23 @@ +from .env_manager import BaseEnvManagerV2, SubprocessEnvManagerV2 +from .env import DingEnvWrapper +from typing import Optional +from functools import partial + + +def setup_ding_env_manager( + env: DingEnvWrapper, + env_num: int, + context: Optional[str] = None, + debug: bool = False, + caller: str = 'collector' +) -> BaseEnvManagerV2: + assert caller in ['evaluator', 'collector'] + if debug: + env_cls = BaseEnvManagerV2 + manager_cfg = env_cls.default_config() + else: + env_cls = SubprocessEnvManagerV2 + manager_cfg = env_cls.default_config() + if context is not None: + manager_cfg.context = context + return env_cls([partial(env.clone, caller) for _ in range(env_num)], manager_cfg) diff --git a/ding/envs/gym_env.py b/ding/envs/gym_env.py new file mode 100644 index 0000000000..b0f3e34dd2 --- /dev/null +++ b/ding/envs/gym_env.py @@ -0,0 +1,6 @@ +from ding.envs import BaseEnv, DingEnvWrapper + + +def env(cfg, seed_api=True, caller='collector', **kwargs) -> BaseEnv: + import gym + return DingEnvWrapper(gym.make(cfg.env_id, **kwargs), cfg=cfg, seed_api=seed_api, caller=caller) From 41786e31ca7875ccc4cc0ed0af6350113e374f39 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Mon, 10 Jul 2023 16:17:05 +0000 Subject: [PATCH 154/244] polish code; add SQL --- ding/bonus/__init__.py | 22 +- ding/bonus/cfg/A2C/gym_bipedalwalker_v3.py | 2 +- ding/bonus/cfg/A2C/gym_halfcheetah_v3.py | 1 - ding/bonus/cfg/DDPG/gym_bipedalwalker_v3.py | 3 +- .../cfg/DDPG/gym_lunarlandercontinuous_v2.py | 2 +- ding/bonus/cfg/DDPG/gym_pendulum_v1.py | 3 +- ding/bonus/cfg/PG/gym_bipedalwalker_v3.py | 2 +- ding/bonus/cfg/PG/gym_pendulum_v1.py | 2 +- .../gym_spaceInvadersnoframeskip_v4.py | 1 - ding/bonus/cfg/SAC/gym_bipedalwalker_v3.py | 2 +- ding/bonus/cfg/SAC/gym_halfcheetah_v3.py | 4 +- .../cfg/SAC/gym_lunarlandercontinuous_v2.py | 2 +- ding/bonus/cfg/SAC/gym_pendulum_v1.py | 4 +- ding/bonus/cfg/SAC/gym_walker2d_v3.py | 2 - ding/bonus/cfg/SQL/__init__.py | 14 + ding/bonus/cfg/SQL/gym_lunarlander_v2.py | 43 +++ ding/bonus/cfg/TD3/gym_bipedalwalker_v3.py | 2 +- ding/bonus/cfg/TD3/gym_halfcheetah_v3.py | 2 - .../cfg/TD3/gym_lunarlandercontinuous_v2.py | 2 +- ding/bonus/cfg/TD3/gym_pendulum_v1.py | 2 +- ding/bonus/cfg/__init__.py | 1 + ding/bonus/sql.py | 253 ++++++++++++++++++ 22 files changed, 339 insertions(+), 32 deletions(-) create mode 100644 ding/bonus/cfg/SQL/__init__.py create mode 100644 ding/bonus/cfg/SQL/gym_lunarlander_v2.py create mode 100644 ding/bonus/sql.py diff --git a/ding/bonus/__init__.py b/ding/bonus/__init__.py index a9571dd342..24e094a9c2 100644 --- a/ding/bonus/__init__.py +++ b/ding/bonus/__init__.py @@ -1,24 +1,26 @@ from . 
import cfg from .a2c import A2CAgent -from .ppof import PPOF -from .ppo_offpolicy import PPOOffPolicyAgent from .c51 import C51Agent -from .td3 import TD3Agent from .ddpg import DDPGAgent from .dqn import DQNAgent -from .sac import SACAgent from .pg import PGAgent +from .ppof import PPOF +from .ppo_offpolicy import PPOOffPolicyAgent +from .sac import SACAgent +from .sql import SQLAgent +from .td3 import TD3Agent supported_algo = dict( A2C=A2CAgent, - PPOF=PPOF, - PPOOffPolicy=PPOOffPolicyAgent, C51=C51Agent, - TD3=TD3Agent, DDPG=DDPGAgent, DQN=DQNAgent, - SAC=SACAgent, PG=PGAgent, + PPOF=PPOF, + PPOOffPolicy=PPOOffPolicyAgent, + SAC=SACAgent, + SQL=SQLAgent, + TD3=TD3Agent, ) supported_algo_list = list(supported_algo.keys()) @@ -46,6 +48,8 @@ def env_supported(algo: str = None) -> list: return list(cfg.PPOOffPolicy.supported_env.keys()) elif algo.upper() == "SAC": return list(cfg.SAC.supported_env.keys()) + elif algo.upper() == "SQL": + return list(cfg.SQL.supported_env.keys()) elif algo.upper() == "TD3": return list(cfg.TD3.supported_env.keys()) else: @@ -82,6 +86,8 @@ def algo_supported(env_id: str = None) -> list: algo.append("PPOOffPolicy") if env_id.upper() in [item.upper() for item in cfg.SAC.supported_env.keys()]: algo.append("SAC") + if env_id.upper() in [item.upper() for item in cfg.SQL.supported_env.keys()]: + algo.append("SQL") if env_id.upper() in [item.upper() for item in cfg.TD3.supported_env.keys()]: algo.append("TD3") diff --git a/ding/bonus/cfg/A2C/gym_bipedalwalker_v3.py b/ding/bonus/cfg/A2C/gym_bipedalwalker_v3.py index 3ab43bd168..b53159ece3 100644 --- a/ding/bonus/cfg/A2C/gym_bipedalwalker_v3.py +++ b/ding/bonus/cfg/A2C/gym_bipedalwalker_v3.py @@ -7,8 +7,8 @@ env_id='BipedalWalker-v3', collector_env_num=8, evaluator_env_num=8, - act_scale=True, n_evaluator_episode=8, + act_scale=True, rew_clip=True, ), policy=dict( diff --git a/ding/bonus/cfg/A2C/gym_halfcheetah_v3.py b/ding/bonus/cfg/A2C/gym_halfcheetah_v3.py index 5b4e1e181d..3f38fe610a 100644 --- a/ding/bonus/cfg/A2C/gym_halfcheetah_v3.py +++ b/ding/bonus/cfg/A2C/gym_halfcheetah_v3.py @@ -9,7 +9,6 @@ collector_env_num=1, evaluator_env_num=8, n_evaluator_episode=8, - act_scale=True, stop_value=12000, env_wrapper='mujoco_default', act_scale=True, diff --git a/ding/bonus/cfg/DDPG/gym_bipedalwalker_v3.py b/ding/bonus/cfg/DDPG/gym_bipedalwalker_v3.py index 227c773891..e444c3a570 100644 --- a/ding/bonus/cfg/DDPG/gym_bipedalwalker_v3.py +++ b/ding/bonus/cfg/DDPG/gym_bipedalwalker_v3.py @@ -7,9 +7,8 @@ env_id='BipedalWalker-v3', collector_env_num=8, evaluator_env_num=5, - # (bool) Scale output action into legal range. - act_scale=True, n_evaluator_episode=5, + act_scale=True, rew_clip=True, ), policy=dict( diff --git a/ding/bonus/cfg/DDPG/gym_lunarlandercontinuous_v2.py b/ding/bonus/cfg/DDPG/gym_lunarlandercontinuous_v2.py index dd902cd191..d4b6510c8c 100644 --- a/ding/bonus/cfg/DDPG/gym_lunarlandercontinuous_v2.py +++ b/ding/bonus/cfg/DDPG/gym_lunarlandercontinuous_v2.py @@ -8,8 +8,8 @@ collector_env_num=8, evaluator_env_num=8, n_evaluator_episode=8, - act_scale=True, stop_value=240, + act_scale=True, ), policy=dict( cuda=True, diff --git a/ding/bonus/cfg/DDPG/gym_pendulum_v1.py b/ding/bonus/cfg/DDPG/gym_pendulum_v1.py index 34e76c7750..fe1e69e265 100644 --- a/ding/bonus/cfg/DDPG/gym_pendulum_v1.py +++ b/ding/bonus/cfg/DDPG/gym_pendulum_v1.py @@ -6,10 +6,9 @@ env=dict( collector_env_num=8, evaluator_env_num=5, - # (bool) Scale output action into legal range. 
- act_scale=True, n_evaluator_episode=5, stop_value=-250, + act_scale=True, ), policy=dict( cuda=False, diff --git a/ding/bonus/cfg/PG/gym_bipedalwalker_v3.py b/ding/bonus/cfg/PG/gym_bipedalwalker_v3.py index e3c194f4b9..9b7cde76b4 100644 --- a/ding/bonus/cfg/PG/gym_bipedalwalker_v3.py +++ b/ding/bonus/cfg/PG/gym_bipedalwalker_v3.py @@ -7,9 +7,9 @@ env_id='BipedalWalker-v3', collector_env_num=8, evaluator_env_num=8, - act_scale=True, n_evaluator_episode=8, stop_value=300, + act_scale=True, rew_clip=True, ), policy=dict( diff --git a/ding/bonus/cfg/PG/gym_pendulum_v1.py b/ding/bonus/cfg/PG/gym_pendulum_v1.py index 08baf62322..0372f016b5 100644 --- a/ding/bonus/cfg/PG/gym_pendulum_v1.py +++ b/ding/bonus/cfg/PG/gym_pendulum_v1.py @@ -6,9 +6,9 @@ env=dict( collector_env_num=8, evaluator_env_num=5, - act_scale=True, n_evaluator_episode=5, stop_value=-200, + act_scale=True, ), policy=dict( cuda=False, diff --git a/ding/bonus/cfg/PPOOffPolicy/gym_spaceInvadersnoframeskip_v4.py b/ding/bonus/cfg/PPOOffPolicy/gym_spaceInvadersnoframeskip_v4.py index 7aa6909b35..a7a12cb5e7 100644 --- a/ding/bonus/cfg/PPOOffPolicy/gym_spaceInvadersnoframeskip_v4.py +++ b/ding/bonus/cfg/PPOOffPolicy/gym_spaceInvadersnoframeskip_v4.py @@ -9,7 +9,6 @@ stop_value=10000000000, env_id='SpaceInvadersNoFrameskip-v4', frame_stack=4, - manager=dict(shared_memory=False, ) ), policy=dict( cuda=True, diff --git a/ding/bonus/cfg/SAC/gym_bipedalwalker_v3.py b/ding/bonus/cfg/SAC/gym_bipedalwalker_v3.py index 89443e451b..0bd3b8c5c4 100644 --- a/ding/bonus/cfg/SAC/gym_bipedalwalker_v3.py +++ b/ding/bonus/cfg/SAC/gym_bipedalwalker_v3.py @@ -7,8 +7,8 @@ env_id='BipedalWalker-v3', collector_env_num=8, evaluator_env_num=5, - act_scale=True, n_evaluator_episode=5, + act_scale=True, rew_clip=True, ), policy=dict( diff --git a/ding/bonus/cfg/SAC/gym_halfcheetah_v3.py b/ding/bonus/cfg/SAC/gym_halfcheetah_v3.py index 9d1b445834..ba22c115cf 100644 --- a/ding/bonus/cfg/SAC/gym_halfcheetah_v3.py +++ b/ding/bonus/cfg/SAC/gym_halfcheetah_v3.py @@ -5,8 +5,6 @@ seed=0, env=dict( env_id='HalfCheetah-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), collector_env_num=1, evaluator_env_num=8, n_evaluator_episode=8, @@ -16,7 +14,7 @@ rew_clip=True, ), policy=dict( - cuda=False, + cuda=True, random_collect_size=10000, model=dict( obs_shape=17, diff --git a/ding/bonus/cfg/SAC/gym_lunarlandercontinuous_v2.py b/ding/bonus/cfg/SAC/gym_lunarlandercontinuous_v2.py index 3aa9f30d79..b34504c6a5 100644 --- a/ding/bonus/cfg/SAC/gym_lunarlandercontinuous_v2.py +++ b/ding/bonus/cfg/SAC/gym_lunarlandercontinuous_v2.py @@ -7,9 +7,9 @@ env_id='LunarLanderContinuous-v2', collector_env_num=4, evaluator_env_num=8, - act_scale=True, n_evaluator_episode=8, stop_value=240, + act_scale=True, ), policy=dict( cuda=True, diff --git a/ding/bonus/cfg/SAC/gym_pendulum_v1.py b/ding/bonus/cfg/SAC/gym_pendulum_v1.py index 9c2f713314..ea18844c85 100644 --- a/ding/bonus/cfg/SAC/gym_pendulum_v1.py +++ b/ding/bonus/cfg/SAC/gym_pendulum_v1.py @@ -6,12 +6,12 @@ env=dict( collector_env_num=10, evaluator_env_num=8, - act_scale=True, n_evaluator_episode=8, stop_value=-250, + act_scale=True, ), policy=dict( - cuda=False, + cuda=True, priority=False, random_collect_size=1000, model=dict( diff --git a/ding/bonus/cfg/SAC/gym_walker2d_v3.py b/ding/bonus/cfg/SAC/gym_walker2d_v3.py index 32a2bafac8..1d2668c2fc 100644 --- a/ding/bonus/cfg/SAC/gym_walker2d_v3.py +++ b/ding/bonus/cfg/SAC/gym_walker2d_v3.py @@ -5,8 +5,6 @@ seed=0, env=dict( env_id='Walker2d-v3', - 
norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), collector_env_num=1, evaluator_env_num=8, n_evaluator_episode=8, diff --git a/ding/bonus/cfg/SQL/__init__.py b/ding/bonus/cfg/SQL/__init__.py new file mode 100644 index 0000000000..9637366fb4 --- /dev/null +++ b/ding/bonus/cfg/SQL/__init__.py @@ -0,0 +1,14 @@ +from easydict import EasyDict +from . import gym_lunarlander_v2 + +supported_env_cfg = { + gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.cfg, +} + +supported_env_cfg = EasyDict(supported_env_cfg) + +supported_env = { + gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.env, +} + +supported_env = EasyDict(supported_env) diff --git a/ding/bonus/cfg/SQL/gym_lunarlander_v2.py b/ding/bonus/cfg/SQL/gym_lunarlander_v2.py new file mode 100644 index 0000000000..696a4863dd --- /dev/null +++ b/ding/bonus/cfg/SQL/gym_lunarlander_v2.py @@ -0,0 +1,43 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='LunarLander-v2-SQL', + env=dict( + collector_env_num=8, + evaluator_env_num=8, + env_id='LunarLander-v2', + n_evaluator_episode=8, + stop_value=200, + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=8, + action_shape=4, + encoder_hidden_size_list=[128, 128, 64], + dueling=True, + ), + nstep=1, + discount_factor=0.97, + learn=dict(batch_size=64, learning_rate=0.001, alpha=0.08), + collect=dict(n_sample=64), + eval=dict(evaluator=dict(eval_freq=50, )), # note: this is the times after which you learns to evaluate + other=dict( + eps=dict( + type='exp', + start=0.95, + end=0.1, + decay=10000, + ), + replay_buffer=dict(replay_buffer_size=20000, ), + ), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +env = ding.envs.gym_env.env diff --git a/ding/bonus/cfg/TD3/gym_bipedalwalker_v3.py b/ding/bonus/cfg/TD3/gym_bipedalwalker_v3.py index 7634f43b5e..238222a688 100644 --- a/ding/bonus/cfg/TD3/gym_bipedalwalker_v3.py +++ b/ding/bonus/cfg/TD3/gym_bipedalwalker_v3.py @@ -7,8 +7,8 @@ env_id='BipedalWalker-v3', collector_env_num=8, evaluator_env_num=5, - act_scale=True, n_evaluator_episode=5, + act_scale=True, rew_clip=True, ), policy=dict( diff --git a/ding/bonus/cfg/TD3/gym_halfcheetah_v3.py b/ding/bonus/cfg/TD3/gym_halfcheetah_v3.py index a04c023b8e..8d0c7dc6fd 100644 --- a/ding/bonus/cfg/TD3/gym_halfcheetah_v3.py +++ b/ding/bonus/cfg/TD3/gym_halfcheetah_v3.py @@ -5,8 +5,6 @@ seed=0, env=dict( env_id='HalfCheetah-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), collector_env_num=1, evaluator_env_num=8, n_evaluator_episode=8, diff --git a/ding/bonus/cfg/TD3/gym_lunarlandercontinuous_v2.py b/ding/bonus/cfg/TD3/gym_lunarlandercontinuous_v2.py index b91315a68a..c876ca1c7d 100644 --- a/ding/bonus/cfg/TD3/gym_lunarlandercontinuous_v2.py +++ b/ding/bonus/cfg/TD3/gym_lunarlandercontinuous_v2.py @@ -8,8 +8,8 @@ collector_env_num=4, evaluator_env_num=8, n_evaluator_episode=8, - act_scale=True, stop_value=240, + act_scale=True, ), policy=dict( cuda=True, diff --git a/ding/bonus/cfg/TD3/gym_pendulum_v1.py b/ding/bonus/cfg/TD3/gym_pendulum_v1.py index 64eced070d..7c6128896a 100644 --- a/ding/bonus/cfg/TD3/gym_pendulum_v1.py +++ b/ding/bonus/cfg/TD3/gym_pendulum_v1.py @@ -6,9 +6,9 @@ env=dict( collector_env_num=8, evaluator_env_num=5, - act_scale=True, n_evaluator_episode=5, stop_value=-250, + act_scale=True, ), policy=dict( cuda=False, diff --git a/ding/bonus/cfg/__init__.py b/ding/bonus/cfg/__init__.py index 
71bd7c5a71..7b2c8e750c 100644 --- a/ding/bonus/cfg/__init__.py +++ b/ding/bonus/cfg/__init__.py @@ -6,4 +6,5 @@ from . import PPOF from . import PPOOffPolicy from . import SAC +from . import SQL from . import TD3 diff --git a/ding/bonus/sql.py b/ding/bonus/sql.py new file mode 100644 index 0000000000..4a7c70235e --- /dev/null +++ b/ding/bonus/sql.py @@ -0,0 +1,253 @@ +from typing import Optional, Union +from ditk import logging +from easydict import EasyDict +import os +import torch +import treetensor.torch as ttorch +from ding.framework import task, OnlineRLContext +from ding.framework.middleware import CkptSaver, \ + wandb_online_logger, offline_data_saver, termination_checker, interaction_evaluator, StepCollector, data_pusher, \ + OffPolicyLearner, final_ctx_saver, nstep_reward_enhancer, eps_greedy_handler +from ding.envs import BaseEnv +from ding.envs import setup_ding_env_manager +from ding.policy import SQLPolicy +from ding.utils import set_pkg_seed +from ding.config import save_config_py, compile_config +from ding.model import DQN +from ding.model import model_wrap +from ding.data import DequeBuffer +from ding.bonus.common import TrainingReturn, EvalReturn +from ding.bonus.cfg.SQL import supported_env_cfg +from ding.bonus.cfg.SQL import supported_env + + +class SQLAgent: + supported_env_list = list(supported_env_cfg.keys()) + + def __init__( + self, + env_id: str = None, + env: BaseEnv = None, + seed: int = 0, + exp_name: str = None, + model: Optional[torch.nn.Module] = None, + cfg: Optional[Union[EasyDict, dict]] = None, + policy_state_dict: str = None, + ) -> None: + assert env_id is not None or cfg is not None, "Please specify env_id or cfg." + + if cfg is not None and not isinstance(cfg, EasyDict): + cfg = EasyDict(cfg) + + if env_id is not None: + assert env_id in SQLAgent.supported_env_list, "Please use supported envs: {}".format( + SQLAgent.supported_env_list + ) + if cfg is None: + cfg = supported_env_cfg[env_id] + else: + assert cfg.env.env_id == env_id, "env_id in cfg should be the same as env_id in args." + else: + assert hasattr(cfg.env, "env_id"), "Please specify env_id in cfg." + assert cfg.env.env_id in SQLAgent.supported_env_list, "Please use supported envs: {}".format( + SQLAgent.supported_env_list + ) + default_policy_config = EasyDict({"policy": SQLPolicy.default_config()}) + default_policy_config.update(cfg) + cfg = default_policy_config + + if exp_name is not None: + cfg.exp_name = exp_name + self.cfg = compile_config(cfg, policy=SQLPolicy) + self.exp_name = self.cfg.exp_name + if env is None: + self.env = supported_env[cfg.env.env_id](cfg=cfg.env) + else: + assert isinstance(env, BaseEnv), "Please use BaseEnv as env data type." 
+ self.env = env + + logging.getLogger().setLevel(logging.INFO) + self.seed = seed + set_pkg_seed(self.seed, use_cuda=self.cfg.policy.cuda) + if not os.path.exists(self.exp_name): + os.makedirs(self.exp_name) + save_config_py(self.cfg, os.path.join(self.exp_name, 'policy_config.py')) + if model is None: + model = DQN(**self.cfg.policy.model) + self.buffer_ = DequeBuffer(size=self.cfg.policy.other.replay_buffer.replay_buffer_size) + self.policy = SQLPolicy(self.cfg.policy, model=model) + if policy_state_dict is not None: + self.policy.learn_mode.load_state_dict(policy_state_dict) + self.checkpoint_save_dir = os.path.join(self.exp_name, "ckpt") + + def train( + self, + step: int = int(1e7), + collector_env_num: int = None, + evaluator_env_num: int = None, + n_iter_save_ckpt: int = 1000, + context: Optional[str] = None, + debug: bool = False, + wandb_sweep: bool = False, + ) -> TrainingReturn: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + logging.debug(self.policy._model) + # define env and policy + collector_env_num = collector_env_num if collector_env_num else self.cfg.env.collector_env_num + evaluator_env_num = evaluator_env_num if evaluator_env_num else self.cfg.env.evaluator_env_num + collector_env = setup_ding_env_manager(self.env, collector_env_num, context, debug, 'collector') + evaluator_env = setup_ding_env_manager(self.env, evaluator_env_num, context, debug, 'evaluator') + + with task.start(ctx=OnlineRLContext()): + task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(self.cfg)) + task.use( + StepCollector( + self.cfg, + self.policy.collect_mode, + collector_env, + random_collect_size=self.cfg.policy.random_collect_size \ + if hasattr(self.cfg.policy, 'random_collect_size') else 0, + ) + ) + task.use(data_pusher(self.cfg, self.buffer_)) + task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_)) + task.use(CkptSaver(policy=self.policy, save_dir=self.checkpoint_save_dir, train_freq=n_iter_save_ckpt)) + task.use( + wandb_online_logger( + metric_list=self.policy.monitor_vars(), + model=self.policy._model, + anonymous=True, + project_name=self.exp_name, + wandb_sweep=wandb_sweep, + ) + ) + task.use(termination_checker(max_env_step=step)) + task.use(final_ctx_saver(name=self.exp_name)) + task.run() + + return TrainingReturn(wandb_url=task.ctx.wandb_url) + + def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> float: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + # define env and policy + env = self.env.clone(caller='evaluator') + env.seed(self.seed, dynamic_seed=False) + + if enable_save_replay and replay_save_path: + env.enable_save_replay(replay_path=replay_save_path) + elif enable_save_replay: + env.enable_save_replay(replay_path=os.path.join(self.exp_name, 'videos')) + else: + logging.warning('No video would be generated during the deploy.') + + def single_env_forward_wrapper(forward_fn, cuda=True): + + forward_fn = model_wrap(forward_fn, wrapper_name='argmax_sample').forward + + def _forward(obs): + # unsqueeze means add batch dim, i.e. (O, ) -> (1, O) + obs = ttorch.as_tensor(obs).unsqueeze(0) + if cuda and torch.cuda.is_available(): + obs = obs.cuda() + action = forward_fn(obs)["action"] + # squeeze means delete batch dim, i.e. 
(1, A) -> (A, ) + action = action.squeeze(0).detach().cpu().numpy() + return action + + return _forward + + forward_fn = single_env_forward_wrapper(self.policy._model, self.cfg.policy.cuda) + + # reset first to make sure the env is in the initial state + # env will be reset again in the main loop + env.reset() + + # main loop + return_ = 0. + step = 0 + obs = env.reset() + while True: + action = forward_fn(obs) + obs, rew, done, info = env.step(action) + return_ += rew + step += 1 + if done: + break + logging.info(f'DQN deploy is finished, final episode return with {step} steps is: {return_}') + + env.close() + + return return_ + + def collect_data( + self, + env_num: int = 8, + save_data_path: Optional[str] = None, + n_sample: Optional[int] = None, + n_episode: Optional[int] = None, + context: Optional[str] = None, + debug: bool = False + ) -> None: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + if n_episode is not None: + raise NotImplementedError + # define env and policy + env_num = env_num if env_num else self.cfg.env.collector_env_num + env = setup_ding_env_manager(self.env, env_num, context, debug, 'collector') + + if save_data_path is None: + save_data_path = os.path.join(self.exp_name, 'demo_data') + + # main execution task + with task.start(ctx=OnlineRLContext()): + task.use( + StepCollector( + self.cfg, self.policy.collect_mode, env, random_collect_size=self.cfg.policy.random_collect_size + ) + ) + task.use(offline_data_saver(save_data_path, data_type='hdf5')) + task.run(max_step=1) + logging.info( + f'DQN collecting is finished, more than {n_sample} samples are collected and saved in `{save_data_path}`' + ) + + def batch_evaluate( + self, + env_num: int = 4, + n_evaluator_episode: int = 4, + context: Optional[str] = None, + debug: bool = False + ) -> EvalReturn: + if debug: + logging.getLogger().setLevel(logging.DEBUG) + # define env and policy + env_num = env_num if env_num else self.cfg.env.evaluator_env_num + env = setup_ding_env_manager(self.env, env_num, context, debug, 'evaluator') + + # reset first to make sure the env is in the initial state + # env will be reset again in the main loop + env.launch() + env.reset() + + evaluate_cfg = self.cfg + evaluate_cfg.env.n_evaluator_episode = n_evaluator_episode + + # main execution task + with task.start(ctx=OnlineRLContext()): + task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, env)) + task.run(max_step=1) + + return EvalReturn(eval_value=task.ctx.eval_value, eval_value_std=task.ctx.eval_value_std) + + @property + def best(self): + best_model_file_path = os.path.join(self.checkpoint_save_dir, "eval.pth.tar") + # Load best model if it exists + if os.path.exists(best_model_file_path): + policy_state_dict = torch.load(best_model_file_path, map_location=torch.device("cpu")) + self.policy.learn_mode.load_state_dict(policy_state_dict) + return self From ebcefb4d7c5ccef449c83e576ed267cec7bd3ea0 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Mon, 10 Jul 2023 16:20:54 +0000 Subject: [PATCH 155/244] polish code --- ding/bonus/cfg/DDPG/gym_pendulum_v1.py | 1 + ding/bonus/cfg/PG/gym_pendulum_v1.py | 1 + ding/bonus/cfg/SAC/gym_pendulum_v1.py | 1 + ding/bonus/cfg/TD3/gym_pendulum_v1.py | 1 + 4 files changed, 4 insertions(+) diff --git a/ding/bonus/cfg/DDPG/gym_pendulum_v1.py b/ding/bonus/cfg/DDPG/gym_pendulum_v1.py index fe1e69e265..e6a08cef11 100644 --- a/ding/bonus/cfg/DDPG/gym_pendulum_v1.py +++ b/ding/bonus/cfg/DDPG/gym_pendulum_v1.py @@ -4,6 +4,7 @@ exp_name='Pendulum-v1-DDPG', seed=0, env=dict( + 
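The deploy hunk above wraps the model so a single observation gets a batch dimension before the forward pass and loses it afterwards. A standalone sketch of the same unsqueeze/argmax/squeeze pattern with a stand-in network (not the wrapped DQN model used by the agent):

import torch
import torch.nn as nn

q_net = nn.Sequential(nn.Linear(8, 64), nn.ReLU(), nn.Linear(64, 4))

def act(obs):
    obs = torch.as_tensor(obs, dtype=torch.float32).unsqueeze(0)  # (O,) -> (1, O)
    with torch.no_grad():
        q_values = q_net(obs)                                     # (1, A)
    return q_values.argmax(dim=-1).squeeze(0).item()              # (1,) -> scalar

print(act([0.0] * 8))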
env_id='Pendulum-v1', collector_env_num=8, evaluator_env_num=5, n_evaluator_episode=5, diff --git a/ding/bonus/cfg/PG/gym_pendulum_v1.py b/ding/bonus/cfg/PG/gym_pendulum_v1.py index 0372f016b5..e3cd9db474 100644 --- a/ding/bonus/cfg/PG/gym_pendulum_v1.py +++ b/ding/bonus/cfg/PG/gym_pendulum_v1.py @@ -4,6 +4,7 @@ exp_name='Pendulum-v1-PG', seed=0, env=dict( + env_id='Pendulum-v1', collector_env_num=8, evaluator_env_num=5, n_evaluator_episode=5, diff --git a/ding/bonus/cfg/SAC/gym_pendulum_v1.py b/ding/bonus/cfg/SAC/gym_pendulum_v1.py index ea18844c85..49c81c4a0b 100644 --- a/ding/bonus/cfg/SAC/gym_pendulum_v1.py +++ b/ding/bonus/cfg/SAC/gym_pendulum_v1.py @@ -4,6 +4,7 @@ exp_name='Pendulum-v1-SAC', seed=0, env=dict( + env_id='Pendulum-v1', collector_env_num=10, evaluator_env_num=8, n_evaluator_episode=8, diff --git a/ding/bonus/cfg/TD3/gym_pendulum_v1.py b/ding/bonus/cfg/TD3/gym_pendulum_v1.py index 7c6128896a..305de9ea13 100644 --- a/ding/bonus/cfg/TD3/gym_pendulum_v1.py +++ b/ding/bonus/cfg/TD3/gym_pendulum_v1.py @@ -4,6 +4,7 @@ exp_name='Pendulum-v1-TD3', seed=0, env=dict( + env_id='Pendulum-v1', collector_env_num=8, evaluator_env_num=5, n_evaluator_episode=5, From 420ef72e1c7c272c845e025508011a5648c94c2c Mon Sep 17 00:00:00 2001 From: zjowowen Date: Mon, 10 Jul 2023 16:23:08 +0000 Subject: [PATCH 156/244] polish code --- ding/bonus/cfg/PPOF/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ding/bonus/cfg/PPOF/__init__.py b/ding/bonus/cfg/PPOF/__init__.py index 2fba110833..2adaaf4df2 100644 --- a/ding/bonus/cfg/PPOF/__init__.py +++ b/ding/bonus/cfg/PPOF/__init__.py @@ -3,15 +3,15 @@ from . import gym_lunarlandercontinuous_v2 supported_env_cfg = { - gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.cfg, - gym_lunarlandercontinuous_v2.cfg.env.env_id: gym_lunarlandercontinuous_v2.cfg, + gym_lunarlander_v2.cfg.env_id: gym_lunarlander_v2.cfg, + gym_lunarlandercontinuous_v2.cfg.env_id: gym_lunarlandercontinuous_v2.cfg, } supported_env_cfg = EasyDict(supported_env_cfg) supported_env = { - gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.env, - gym_lunarlandercontinuous_v2.cfg.env.env_id: gym_lunarlandercontinuous_v2.env, + gym_lunarlander_v2.cfg.env_id: gym_lunarlander_v2.env, + gym_lunarlandercontinuous_v2.cfg.env_id: gym_lunarlandercontinuous_v2.env, } supported_env = EasyDict(supported_env) From d9d93ddd6961ef2e4a3cabe62b573a7a52c49440 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Mon, 10 Jul 2023 16:24:45 +0000 Subject: [PATCH 157/244] polish code --- ding/bonus/cfg/PPOF/gym_lunarlander_v2.py | 1 + ding/bonus/cfg/PPOF/gym_lunarlandercontinuous_v2.py | 1 + 2 files changed, 2 insertions(+) diff --git a/ding/bonus/cfg/PPOF/gym_lunarlander_v2.py b/ding/bonus/cfg/PPOF/gym_lunarlander_v2.py index 4f21511e03..6496a505fb 100644 --- a/ding/bonus/cfg/PPOF/gym_lunarlander_v2.py +++ b/ding/bonus/cfg/PPOF/gym_lunarlander_v2.py @@ -2,6 +2,7 @@ cfg = dict( exp_name='LunarLander-v2-PPO', + env_id='LunarLander-v2', n_sample=400, value_norm='popart', ) diff --git a/ding/bonus/cfg/PPOF/gym_lunarlandercontinuous_v2.py b/ding/bonus/cfg/PPOF/gym_lunarlandercontinuous_v2.py index 546e37bb25..b19fb19e4b 100644 --- a/ding/bonus/cfg/PPOF/gym_lunarlandercontinuous_v2.py +++ b/ding/bonus/cfg/PPOF/gym_lunarlandercontinuous_v2.py @@ -2,6 +2,7 @@ cfg = dict( exp_name='LunarLanderContinuous-V2-PPO', + env_id='LunarLanderContinuous-v2', action_space='continuous', n_sample=400, act_scale=True, From aa1f39dda749a3c4d284e866eacb577c87c8e400 Mon Sep 17 00:00:00 2001 From: zjowowen 
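Patches 155-157 above add env_id to each default config because the per-algorithm registries are keyed by it (cfg.env.env_id for the nested configs, cfg.env_id for the flat PPOF configs). A toy illustration of that lookup pattern:

from easydict import EasyDict

pendulum_cfg = EasyDict(dict(env=dict(env_id='Pendulum-v1', collector_env_num=8)))
supported_env_cfg = EasyDict({pendulum_cfg.env.env_id: pendulum_cfg})

# without env_id in the config this key would be missing and the lookup would fail
print('Pendulum-v1' in supported_env_cfg)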
Date: Mon, 10 Jul 2023 16:27:33 +0000 Subject: [PATCH 158/244] polish code --- ding/bonus/cfg/PPOOffPolicy/gym_pongnoframeskip_v4.py | 1 + ding/bonus/cfg/PPOOffPolicy/gym_qbertnoframeskip_v4.py | 3 ++- ding/bonus/cfg/PPOOffPolicy/gym_spaceInvadersnoframeskip_v4.py | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ding/bonus/cfg/PPOOffPolicy/gym_pongnoframeskip_v4.py b/ding/bonus/cfg/PPOOffPolicy/gym_pongnoframeskip_v4.py index 5b2dc8be6a..eebb27bd7e 100644 --- a/ding/bonus/cfg/PPOOffPolicy/gym_pongnoframeskip_v4.py +++ b/ding/bonus/cfg/PPOOffPolicy/gym_pongnoframeskip_v4.py @@ -9,6 +9,7 @@ stop_value=20, env_id='PongNoFrameskip-v4', frame_stack=4, + env_wrapper='atari_default', ), policy=dict( cuda=True, diff --git a/ding/bonus/cfg/PPOOffPolicy/gym_qbertnoframeskip_v4.py b/ding/bonus/cfg/PPOOffPolicy/gym_qbertnoframeskip_v4.py index 4e06e99550..90e07db907 100644 --- a/ding/bonus/cfg/PPOOffPolicy/gym_qbertnoframeskip_v4.py +++ b/ding/bonus/cfg/PPOOffPolicy/gym_qbertnoframeskip_v4.py @@ -8,7 +8,8 @@ n_evaluator_episode=8, stop_value=10000000000, env_id='QbertNoFrameskip-v4', - frame_stack=4 + frame_stack=4, + env_wrapper='atari_default', ), policy=dict( cuda=True, diff --git a/ding/bonus/cfg/PPOOffPolicy/gym_spaceInvadersnoframeskip_v4.py b/ding/bonus/cfg/PPOOffPolicy/gym_spaceInvadersnoframeskip_v4.py index a7a12cb5e7..1ad2f01a1f 100644 --- a/ding/bonus/cfg/PPOOffPolicy/gym_spaceInvadersnoframeskip_v4.py +++ b/ding/bonus/cfg/PPOOffPolicy/gym_spaceInvadersnoframeskip_v4.py @@ -9,6 +9,7 @@ stop_value=10000000000, env_id='SpaceInvadersNoFrameskip-v4', frame_stack=4, + env_wrapper='atari_default', ), policy=dict( cuda=True, From 653a00bbce4fc6374033e599e2b5c2ddd1aa6b40 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 11 Jul 2023 06:19:05 +0000 Subject: [PATCH 159/244] change config path --- ding/bonus/__init__.py | 59 +++++++++++-------- ding/bonus/a2c.py | 4 +- ding/bonus/c51.py | 4 +- ding/bonus/cfg/__init__.py | 10 ---- ding/bonus/ddpg.py | 4 +- ding/bonus/dqn.py | 4 +- ding/bonus/pg.py | 4 +- ding/bonus/ppo_offpolicy.py | 4 +- ding/bonus/sac.py | 4 +- ding/bonus/sql.py | 4 +- ding/bonus/td3.py | 4 +- ding/{bonus/cfg => config}/A2C/__init__.py | 0 .../A2C/gym_bipedalwalker_v3.py | 0 .../cfg => config}/A2C/gym_halfcheetah_v3.py | 0 .../cfg => config}/A2C/gym_hopper_v3.py | 0 .../cfg => config}/A2C/gym_lunarlander_v2.py | 0 .../cfg => config}/A2C/gym_pendulum_v1.py | 0 .../cfg => config}/A2C/gym_walker2d_v3.py | 0 ding/{bonus/cfg => config}/C51/__init__.py | 0 .../cfg => config}/C51/gym_lunarlander_v2.py | 0 .../C51/gym_pongnoframeskip_v4.py | 0 .../C51/gym_qbertnoframeskip_v4.py | 0 .../C51/gym_spaceInvadersnoframeskip_v4.py | 0 ding/{bonus/cfg => config}/DDPG/__init__.py | 0 .../DDPG/gym_bipedalwalker_v3.py | 0 .../cfg => config}/DDPG/gym_halfcheetah_v3.py | 0 .../cfg => config}/DDPG/gym_hopper_v3.py | 0 .../DDPG/gym_lunarlandercontinuous_v2.py | 0 .../cfg => config}/DDPG/gym_pendulum_v1.py | 0 .../cfg => config}/DDPG/gym_walker2d_v3.py | 0 ding/{bonus/cfg => config}/DQN/__init__.py | 0 .../cfg => config}/DQN/gym_lunarlander_v2.py | 0 .../DQN/gym_pongnoframeskip_v4.py | 0 .../DQN/gym_qbertnoframeskip_v4.py | 0 .../DQN/gym_spaceInvadersnoframeskip_v4.py | 0 .../PG/__init__ .py => config/PG/__init__.py} | 1 - .../cfg => config}/PG/gym_bipedalwalker_v3.py | 0 .../cfg => config}/PG/gym_halfcheetah_v3.py | 0 .../{bonus/cfg => config}/PG/gym_hopper_v3.py | 0 .../cfg => config}/PG/gym_lunarlander_v2.py | 0 .../cfg => config}/PG/gym_pendulum_v1.py | 0 .../cfg => 
config}/PG/gym_walker2d_v3.py | 0 ding/{bonus/cfg => config}/PPOF/__init__.py | 0 .../cfg => config}/PPOF/gym_lunarlander_v2.py | 0 .../PPOF/gym_lunarlandercontinuous_v2.py | 0 .../cfg => config}/PPOOffPolicy/__init__.py | 0 .../PPOOffPolicy/gym_lunarlander_v2.py | 0 .../PPOOffPolicy/gym_pongnoframeskip_v4.py | 0 .../PPOOffPolicy/gym_qbertnoframeskip_v4.py | 0 .../gym_spaceInvadersnoframeskip_v4.py | 0 ding/{bonus/cfg => config}/SAC/__init__.py | 0 .../SAC/gym_bipedalwalker_v3.py | 0 .../cfg => config}/SAC/gym_halfcheetah_v3.py | 0 .../cfg => config}/SAC/gym_hopper_v3.py | 0 .../SAC/gym_lunarlandercontinuous_v2.py | 0 .../cfg => config}/SAC/gym_pendulum_v1.py | 0 .../cfg => config}/SAC/gym_walker2d_v3.py | 0 ding/{bonus/cfg => config}/SQL/__init__.py | 0 .../cfg => config}/SQL/gym_lunarlander_v2.py | 0 ding/{bonus/cfg => config}/TD3/__init__.py | 0 .../TD3/gym_bipedalwalker_v3.py | 0 .../cfg => config}/TD3/gym_halfcheetah_v3.py | 0 .../cfg => config}/TD3/gym_hopper_v3.py | 0 .../TD3/gym_lunarlandercontinuous_v2.py | 0 .../cfg => config}/TD3/gym_pendulum_v1.py | 0 .../cfg => config}/TD3/gym_walker2d_v3.py | 0 ding/config/__init__.py | 11 ++++ 67 files changed, 63 insertions(+), 54 deletions(-) delete mode 100644 ding/bonus/cfg/__init__.py rename ding/{bonus/cfg => config}/A2C/__init__.py (100%) rename ding/{bonus/cfg => config}/A2C/gym_bipedalwalker_v3.py (100%) rename ding/{bonus/cfg => config}/A2C/gym_halfcheetah_v3.py (100%) rename ding/{bonus/cfg => config}/A2C/gym_hopper_v3.py (100%) rename ding/{bonus/cfg => config}/A2C/gym_lunarlander_v2.py (100%) rename ding/{bonus/cfg => config}/A2C/gym_pendulum_v1.py (100%) rename ding/{bonus/cfg => config}/A2C/gym_walker2d_v3.py (100%) rename ding/{bonus/cfg => config}/C51/__init__.py (100%) rename ding/{bonus/cfg => config}/C51/gym_lunarlander_v2.py (100%) rename ding/{bonus/cfg => config}/C51/gym_pongnoframeskip_v4.py (100%) rename ding/{bonus/cfg => config}/C51/gym_qbertnoframeskip_v4.py (100%) rename ding/{bonus/cfg => config}/C51/gym_spaceInvadersnoframeskip_v4.py (100%) rename ding/{bonus/cfg => config}/DDPG/__init__.py (100%) rename ding/{bonus/cfg => config}/DDPG/gym_bipedalwalker_v3.py (100%) rename ding/{bonus/cfg => config}/DDPG/gym_halfcheetah_v3.py (100%) rename ding/{bonus/cfg => config}/DDPG/gym_hopper_v3.py (100%) rename ding/{bonus/cfg => config}/DDPG/gym_lunarlandercontinuous_v2.py (100%) rename ding/{bonus/cfg => config}/DDPG/gym_pendulum_v1.py (100%) rename ding/{bonus/cfg => config}/DDPG/gym_walker2d_v3.py (100%) rename ding/{bonus/cfg => config}/DQN/__init__.py (100%) rename ding/{bonus/cfg => config}/DQN/gym_lunarlander_v2.py (100%) rename ding/{bonus/cfg => config}/DQN/gym_pongnoframeskip_v4.py (100%) rename ding/{bonus/cfg => config}/DQN/gym_qbertnoframeskip_v4.py (100%) rename ding/{bonus/cfg => config}/DQN/gym_spaceInvadersnoframeskip_v4.py (100%) rename ding/{bonus/cfg/PG/__init__ .py => config/PG/__init__.py} (96%) rename ding/{bonus/cfg => config}/PG/gym_bipedalwalker_v3.py (100%) rename ding/{bonus/cfg => config}/PG/gym_halfcheetah_v3.py (100%) rename ding/{bonus/cfg => config}/PG/gym_hopper_v3.py (100%) rename ding/{bonus/cfg => config}/PG/gym_lunarlander_v2.py (100%) rename ding/{bonus/cfg => config}/PG/gym_pendulum_v1.py (100%) rename ding/{bonus/cfg => config}/PG/gym_walker2d_v3.py (100%) rename ding/{bonus/cfg => config}/PPOF/__init__.py (100%) rename ding/{bonus/cfg => config}/PPOF/gym_lunarlander_v2.py (100%) rename ding/{bonus/cfg => config}/PPOF/gym_lunarlandercontinuous_v2.py (100%) rename ding/{bonus/cfg 
=> config}/PPOOffPolicy/__init__.py (100%) rename ding/{bonus/cfg => config}/PPOOffPolicy/gym_lunarlander_v2.py (100%) rename ding/{bonus/cfg => config}/PPOOffPolicy/gym_pongnoframeskip_v4.py (100%) rename ding/{bonus/cfg => config}/PPOOffPolicy/gym_qbertnoframeskip_v4.py (100%) rename ding/{bonus/cfg => config}/PPOOffPolicy/gym_spaceInvadersnoframeskip_v4.py (100%) rename ding/{bonus/cfg => config}/SAC/__init__.py (100%) rename ding/{bonus/cfg => config}/SAC/gym_bipedalwalker_v3.py (100%) rename ding/{bonus/cfg => config}/SAC/gym_halfcheetah_v3.py (100%) rename ding/{bonus/cfg => config}/SAC/gym_hopper_v3.py (100%) rename ding/{bonus/cfg => config}/SAC/gym_lunarlandercontinuous_v2.py (100%) rename ding/{bonus/cfg => config}/SAC/gym_pendulum_v1.py (100%) rename ding/{bonus/cfg => config}/SAC/gym_walker2d_v3.py (100%) rename ding/{bonus/cfg => config}/SQL/__init__.py (100%) rename ding/{bonus/cfg => config}/SQL/gym_lunarlander_v2.py (100%) rename ding/{bonus/cfg => config}/TD3/__init__.py (100%) rename ding/{bonus/cfg => config}/TD3/gym_bipedalwalker_v3.py (100%) rename ding/{bonus/cfg => config}/TD3/gym_halfcheetah_v3.py (100%) rename ding/{bonus/cfg => config}/TD3/gym_hopper_v3.py (100%) rename ding/{bonus/cfg => config}/TD3/gym_lunarlandercontinuous_v2.py (100%) rename ding/{bonus/cfg => config}/TD3/gym_pendulum_v1.py (100%) rename ding/{bonus/cfg => config}/TD3/gym_walker2d_v3.py (100%) diff --git a/ding/bonus/__init__.py b/ding/bonus/__init__.py index 24e094a9c2..d8f83f3840 100644 --- a/ding/bonus/__init__.py +++ b/ding/bonus/__init__.py @@ -1,4 +1,4 @@ -from . import cfg +import ding.config from .a2c import A2CAgent from .c51 import C51Agent from .ddpg import DDPGAgent @@ -33,32 +33,41 @@ def env_supported(algo: str = None) -> list: if algo is not None: if algo.upper() == "A2C": - return list(cfg.A2C.supported_env.keys()) + return list(ding.config.A2C.supported_env.keys()) elif algo.upper() == "C51": - return list(cfg.C51.supported_env.keys()) + return list(ding.config.C51.supported_env.keys()) elif algo.upper() == "DDPG": - return list(cfg.DDPG.supported_env.keys()) + return list(ding.config.DDPG.supported_env.keys()) elif algo.upper() == "DQN": - return list(cfg.DQN.supported_env.keys()) + return list(ding.config.DQN.supported_env.keys()) elif algo.upper() == "PG": - return list(cfg.PG.supported_env.keys()) + return list(ding.config.PG.supported_env.keys()) elif algo.upper() == "PPOF": - return list(cfg.PPOF.supported_env.keys()) + return list(ding.config.PPOF.supported_env.keys()) elif algo.upper() == "PPOOFFPOLICY": - return list(cfg.PPOOffPolicy.supported_env.keys()) + return list(ding.config.PPOOffPolicy.supported_env.keys()) elif algo.upper() == "SAC": - return list(cfg.SAC.supported_env.keys()) + return list(ding.config.SAC.supported_env.keys()) elif algo.upper() == "SQL": - return list(cfg.SQL.supported_env.keys()) + return list(ding.config.SQL.supported_env.keys()) elif algo.upper() == "TD3": - return list(cfg.TD3.supported_env.keys()) + return list(ding.config.TD3.supported_env.keys()) else: raise ValueError("The algo {} is not supported by di-engine.".format(algo)) else: - #merge all the supported envs from all the algos - supported_env = [] - supported_env.extend(list(cfg.SAC.supported_env.keys())) - return supported_env + supported_env = set() + supported_env.update(ding.config.A2C.supported_env.keys()) + supported_env.update(ding.config.C51.supported_env.keys()) + supported_env.update(ding.config.DDPG.supported_env.keys()) + 
supported_env.update(ding.config.DQN.supported_env.keys()) + supported_env.update(ding.config.PG.supported_env.keys()) + supported_env.update(ding.config.PPOF.supported_env.keys()) + supported_env.update(ding.config.PPOOffPolicy.supported_env.keys()) + supported_env.update(ding.config.SAC.supported_env.keys()) + supported_env.update(ding.config.SQL.supported_env.keys()) + supported_env.update(ding.config.TD3.supported_env.keys()) + # return the list of the envs + return list(supported_env) supported_env = env_supported() @@ -70,25 +79,25 @@ def algo_supported(env_id: str = None) -> list: """ if env_id is not None: algo = [] - if env_id.upper() in [item.upper() for item in cfg.A2C.supported_env.keys()]: + if env_id.upper() in [item.upper() for item in ding.config.A2C.supported_env.keys()]: algo.append("A2C") - if env_id.upper() in [item.upper() for item in cfg.C51.supported_env.keys()]: + if env_id.upper() in [item.upper() for item in ding.config.C51.supported_env.keys()]: algo.append("C51") - if env_id.upper() in [item.upper() for item in cfg.DDPG.supported_env.keys()]: + if env_id.upper() in [item.upper() for item in ding.config.DDPG.supported_env.keys()]: algo.append("DDPG") - if env_id.upper() in [item.upper() for item in cfg.DQN.supported_env.keys()]: + if env_id.upper() in [item.upper() for item in ding.config.DQN.supported_env.keys()]: algo.append("DQN") - if env_id.upper() in [item.upper() for item in cfg.PG.supported_env.keys()]: + if env_id.upper() in [item.upper() for item in ding.config.PG.supported_env.keys()]: algo.append("PG") - if env_id.upper() in [item.upper() for item in cfg.PPOF.supported_env.keys()]: + if env_id.upper() in [item.upper() for item in ding.config.PPOF.supported_env.keys()]: algo.append("PPOF") - if env_id.upper() in [item.upper() for item in cfg.PPOOffPolicy.supported_env.keys()]: + if env_id.upper() in [item.upper() for item in ding.config.PPOOffPolicy.supported_env.keys()]: algo.append("PPOOffPolicy") - if env_id.upper() in [item.upper() for item in cfg.SAC.supported_env.keys()]: + if env_id.upper() in [item.upper() for item in ding.config.SAC.supported_env.keys()]: algo.append("SAC") - if env_id.upper() in [item.upper() for item in cfg.SQL.supported_env.keys()]: + if env_id.upper() in [item.upper() for item in ding.config.SQL.supported_env.keys()]: algo.append("SQL") - if env_id.upper() in [item.upper() for item in cfg.TD3.supported_env.keys()]: + if env_id.upper() in [item.upper() for item in ding.config.TD3.supported_env.keys()]: algo.append("TD3") if len(algo) == 0: diff --git a/ding/bonus/a2c.py b/ding/bonus/a2c.py index 9b25e42909..d5f7b95a08 100644 --- a/ding/bonus/a2c.py +++ b/ding/bonus/a2c.py @@ -16,8 +16,8 @@ from ding.model import VAC from ding.model import model_wrap from ding.bonus.common import TrainingReturn, EvalReturn -from ding.bonus.cfg.A2C import supported_env_cfg -from ding.bonus.cfg.A2C import supported_env +from ding.config.A2C import supported_env_cfg +from ding.config.A2C import supported_env class A2CAgent: diff --git a/ding/bonus/c51.py b/ding/bonus/c51.py index 820ecf0130..9cca64ae90 100644 --- a/ding/bonus/c51.py +++ b/ding/bonus/c51.py @@ -17,8 +17,8 @@ from ding.model import model_wrap from ding.data import DequeBuffer from ding.bonus.common import TrainingReturn, EvalReturn -from ding.bonus.cfg.C51 import supported_env_cfg -from ding.bonus.cfg.C51 import supported_env +from ding.config.C51 import supported_env_cfg +from ding.config.C51 import supported_env class C51Agent: diff --git a/ding/bonus/cfg/__init__.py 
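With the rewritten helpers above, the supported env/algorithm matrix can be queried directly from ding.bonus. A usage sketch; the exact returned lists depend on which configs are registered:

import ding.bonus

print(ding.bonus.env_supported('TD3'))              # envs with a default TD3 config
print(ding.bonus.env_supported())                   # union over all algorithms
print(ding.bonus.algo_supported('LunarLander-v2'))  # algorithms that support this env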
b/ding/bonus/cfg/__init__.py deleted file mode 100644 index 7b2c8e750c..0000000000 --- a/ding/bonus/cfg/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -from . import A2C -from . import C51 -from . import DDPG -from . import DQN -from . import PG -from . import PPOF -from . import PPOOffPolicy -from . import SAC -from . import SQL -from . import TD3 diff --git a/ding/bonus/ddpg.py b/ding/bonus/ddpg.py index 15cd0d03b5..0dbc5e6ede 100644 --- a/ding/bonus/ddpg.py +++ b/ding/bonus/ddpg.py @@ -16,8 +16,8 @@ from ding.model import QAC from ding.data import DequeBuffer from ding.bonus.common import TrainingReturn, EvalReturn -from ding.bonus.cfg.DDPG import supported_env_cfg -from ding.bonus.cfg.DDPG import supported_env +from ding.config.DDPG import supported_env_cfg +from ding.config.DDPG import supported_env class DDPGAgent: diff --git a/ding/bonus/dqn.py b/ding/bonus/dqn.py index b15bb7184e..c5e3539ab3 100644 --- a/ding/bonus/dqn.py +++ b/ding/bonus/dqn.py @@ -17,8 +17,8 @@ from ding.model import model_wrap from ding.data import DequeBuffer from ding.bonus.common import TrainingReturn, EvalReturn -from ding.bonus.cfg.DQN import supported_env_cfg -from ding.bonus.cfg.DQN import supported_env +from ding.config.DQN import supported_env_cfg +from ding.config.DQN import supported_env class DQNAgent: diff --git a/ding/bonus/pg.py b/ding/bonus/pg.py index d966263efb..1c7ef7eb85 100644 --- a/ding/bonus/pg.py +++ b/ding/bonus/pg.py @@ -15,8 +15,8 @@ from ding.config import save_config_py, compile_config from ding.model import PG from ding.bonus.common import TrainingReturn, EvalReturn -from ding.bonus.cfg.PG import supported_env_cfg -from ding.bonus.cfg.PG import supported_env +from ding.config.PG import supported_env_cfg +from ding.config.PG import supported_env class PGAgent: diff --git a/ding/bonus/ppo_offpolicy.py b/ding/bonus/ppo_offpolicy.py index e37d79c60f..ea7647e899 100644 --- a/ding/bonus/ppo_offpolicy.py +++ b/ding/bonus/ppo_offpolicy.py @@ -16,8 +16,8 @@ from ding.model import model_wrap from ding.data import DequeBuffer from ding.bonus.common import TrainingReturn, EvalReturn -from ding.bonus.cfg.PPOOffPolicy import supported_env_cfg -from ding.bonus.cfg.PPOOffPolicy import supported_env +from ding.config.PPOOffPolicy import supported_env_cfg +from ding.config.PPOOffPolicy import supported_env class PPOOffPolicyAgent: diff --git a/ding/bonus/sac.py b/ding/bonus/sac.py index 89349c4d9f..ab36036eef 100644 --- a/ding/bonus/sac.py +++ b/ding/bonus/sac.py @@ -17,8 +17,8 @@ from ding.model import model_wrap from ding.data import DequeBuffer from ding.bonus.common import TrainingReturn, EvalReturn -from ding.bonus.cfg.SAC import supported_env_cfg -from ding.bonus.cfg.SAC import supported_env +from ding.config.SAC import supported_env_cfg +from ding.config.SAC import supported_env class SACAgent: diff --git a/ding/bonus/sql.py b/ding/bonus/sql.py index 4a7c70235e..8c5dbcf9d8 100644 --- a/ding/bonus/sql.py +++ b/ding/bonus/sql.py @@ -17,8 +17,8 @@ from ding.model import model_wrap from ding.data import DequeBuffer from ding.bonus.common import TrainingReturn, EvalReturn -from ding.bonus.cfg.SQL import supported_env_cfg -from ding.bonus.cfg.SQL import supported_env +from ding.config.SQL import supported_env_cfg +from ding.config.SQL import supported_env class SQLAgent: diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index e9f18df681..12d7d9f92c 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -16,8 +16,8 @@ from ding.model import QAC from ding.data import DequeBuffer from ding.bonus.common 
import TrainingReturn, EvalReturn -from ding.bonus.cfg.TD3 import supported_env_cfg -from ding.bonus.cfg.TD3 import supported_env +from ding.config.TD3 import supported_env_cfg +from ding.config.TD3 import supported_env class TD3Agent: diff --git a/ding/bonus/cfg/A2C/__init__.py b/ding/config/A2C/__init__.py similarity index 100% rename from ding/bonus/cfg/A2C/__init__.py rename to ding/config/A2C/__init__.py diff --git a/ding/bonus/cfg/A2C/gym_bipedalwalker_v3.py b/ding/config/A2C/gym_bipedalwalker_v3.py similarity index 100% rename from ding/bonus/cfg/A2C/gym_bipedalwalker_v3.py rename to ding/config/A2C/gym_bipedalwalker_v3.py diff --git a/ding/bonus/cfg/A2C/gym_halfcheetah_v3.py b/ding/config/A2C/gym_halfcheetah_v3.py similarity index 100% rename from ding/bonus/cfg/A2C/gym_halfcheetah_v3.py rename to ding/config/A2C/gym_halfcheetah_v3.py diff --git a/ding/bonus/cfg/A2C/gym_hopper_v3.py b/ding/config/A2C/gym_hopper_v3.py similarity index 100% rename from ding/bonus/cfg/A2C/gym_hopper_v3.py rename to ding/config/A2C/gym_hopper_v3.py diff --git a/ding/bonus/cfg/A2C/gym_lunarlander_v2.py b/ding/config/A2C/gym_lunarlander_v2.py similarity index 100% rename from ding/bonus/cfg/A2C/gym_lunarlander_v2.py rename to ding/config/A2C/gym_lunarlander_v2.py diff --git a/ding/bonus/cfg/A2C/gym_pendulum_v1.py b/ding/config/A2C/gym_pendulum_v1.py similarity index 100% rename from ding/bonus/cfg/A2C/gym_pendulum_v1.py rename to ding/config/A2C/gym_pendulum_v1.py diff --git a/ding/bonus/cfg/A2C/gym_walker2d_v3.py b/ding/config/A2C/gym_walker2d_v3.py similarity index 100% rename from ding/bonus/cfg/A2C/gym_walker2d_v3.py rename to ding/config/A2C/gym_walker2d_v3.py diff --git a/ding/bonus/cfg/C51/__init__.py b/ding/config/C51/__init__.py similarity index 100% rename from ding/bonus/cfg/C51/__init__.py rename to ding/config/C51/__init__.py diff --git a/ding/bonus/cfg/C51/gym_lunarlander_v2.py b/ding/config/C51/gym_lunarlander_v2.py similarity index 100% rename from ding/bonus/cfg/C51/gym_lunarlander_v2.py rename to ding/config/C51/gym_lunarlander_v2.py diff --git a/ding/bonus/cfg/C51/gym_pongnoframeskip_v4.py b/ding/config/C51/gym_pongnoframeskip_v4.py similarity index 100% rename from ding/bonus/cfg/C51/gym_pongnoframeskip_v4.py rename to ding/config/C51/gym_pongnoframeskip_v4.py diff --git a/ding/bonus/cfg/C51/gym_qbertnoframeskip_v4.py b/ding/config/C51/gym_qbertnoframeskip_v4.py similarity index 100% rename from ding/bonus/cfg/C51/gym_qbertnoframeskip_v4.py rename to ding/config/C51/gym_qbertnoframeskip_v4.py diff --git a/ding/bonus/cfg/C51/gym_spaceInvadersnoframeskip_v4.py b/ding/config/C51/gym_spaceInvadersnoframeskip_v4.py similarity index 100% rename from ding/bonus/cfg/C51/gym_spaceInvadersnoframeskip_v4.py rename to ding/config/C51/gym_spaceInvadersnoframeskip_v4.py diff --git a/ding/bonus/cfg/DDPG/__init__.py b/ding/config/DDPG/__init__.py similarity index 100% rename from ding/bonus/cfg/DDPG/__init__.py rename to ding/config/DDPG/__init__.py diff --git a/ding/bonus/cfg/DDPG/gym_bipedalwalker_v3.py b/ding/config/DDPG/gym_bipedalwalker_v3.py similarity index 100% rename from ding/bonus/cfg/DDPG/gym_bipedalwalker_v3.py rename to ding/config/DDPG/gym_bipedalwalker_v3.py diff --git a/ding/bonus/cfg/DDPG/gym_halfcheetah_v3.py b/ding/config/DDPG/gym_halfcheetah_v3.py similarity index 100% rename from ding/bonus/cfg/DDPG/gym_halfcheetah_v3.py rename to ding/config/DDPG/gym_halfcheetah_v3.py diff --git a/ding/bonus/cfg/DDPG/gym_hopper_v3.py b/ding/config/DDPG/gym_hopper_v3.py similarity index 100% 
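After the move from ding/bonus/cfg to ding/config, the per-algorithm defaults are importable from the public config package, mirroring the import changes above. A sketch of the new layout; the concrete env key is an assumption based on the registered TD3 configs:

from ding.config.TD3 import supported_env_cfg, supported_env

cfg = supported_env_cfg['Hopper-v3']   # default TD3 config for Hopper, if registered
env_fn = supported_env['Hopper-v3']    # matching environment constructor
print(cfg.env.env_id)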
rename from ding/bonus/cfg/DDPG/gym_hopper_v3.py rename to ding/config/DDPG/gym_hopper_v3.py diff --git a/ding/bonus/cfg/DDPG/gym_lunarlandercontinuous_v2.py b/ding/config/DDPG/gym_lunarlandercontinuous_v2.py similarity index 100% rename from ding/bonus/cfg/DDPG/gym_lunarlandercontinuous_v2.py rename to ding/config/DDPG/gym_lunarlandercontinuous_v2.py diff --git a/ding/bonus/cfg/DDPG/gym_pendulum_v1.py b/ding/config/DDPG/gym_pendulum_v1.py similarity index 100% rename from ding/bonus/cfg/DDPG/gym_pendulum_v1.py rename to ding/config/DDPG/gym_pendulum_v1.py diff --git a/ding/bonus/cfg/DDPG/gym_walker2d_v3.py b/ding/config/DDPG/gym_walker2d_v3.py similarity index 100% rename from ding/bonus/cfg/DDPG/gym_walker2d_v3.py rename to ding/config/DDPG/gym_walker2d_v3.py diff --git a/ding/bonus/cfg/DQN/__init__.py b/ding/config/DQN/__init__.py similarity index 100% rename from ding/bonus/cfg/DQN/__init__.py rename to ding/config/DQN/__init__.py diff --git a/ding/bonus/cfg/DQN/gym_lunarlander_v2.py b/ding/config/DQN/gym_lunarlander_v2.py similarity index 100% rename from ding/bonus/cfg/DQN/gym_lunarlander_v2.py rename to ding/config/DQN/gym_lunarlander_v2.py diff --git a/ding/bonus/cfg/DQN/gym_pongnoframeskip_v4.py b/ding/config/DQN/gym_pongnoframeskip_v4.py similarity index 100% rename from ding/bonus/cfg/DQN/gym_pongnoframeskip_v4.py rename to ding/config/DQN/gym_pongnoframeskip_v4.py diff --git a/ding/bonus/cfg/DQN/gym_qbertnoframeskip_v4.py b/ding/config/DQN/gym_qbertnoframeskip_v4.py similarity index 100% rename from ding/bonus/cfg/DQN/gym_qbertnoframeskip_v4.py rename to ding/config/DQN/gym_qbertnoframeskip_v4.py diff --git a/ding/bonus/cfg/DQN/gym_spaceInvadersnoframeskip_v4.py b/ding/config/DQN/gym_spaceInvadersnoframeskip_v4.py similarity index 100% rename from ding/bonus/cfg/DQN/gym_spaceInvadersnoframeskip_v4.py rename to ding/config/DQN/gym_spaceInvadersnoframeskip_v4.py diff --git a/ding/bonus/cfg/PG/__init__ .py b/ding/config/PG/__init__.py similarity index 96% rename from ding/bonus/cfg/PG/__init__ .py rename to ding/config/PG/__init__.py index 63c6804ef6..280e8cef9f 100644 --- a/ding/bonus/cfg/PG/__init__ .py +++ b/ding/config/PG/__init__.py @@ -3,7 +3,6 @@ from . import gym_halfcheetah_v3 from . import gym_hopper_v3 from . import gym_lunarlander_v2 -from . import gym_pendulum_v1 from . 
import gym_walker2d_v3 supported_env_cfg = { diff --git a/ding/bonus/cfg/PG/gym_bipedalwalker_v3.py b/ding/config/PG/gym_bipedalwalker_v3.py similarity index 100% rename from ding/bonus/cfg/PG/gym_bipedalwalker_v3.py rename to ding/config/PG/gym_bipedalwalker_v3.py diff --git a/ding/bonus/cfg/PG/gym_halfcheetah_v3.py b/ding/config/PG/gym_halfcheetah_v3.py similarity index 100% rename from ding/bonus/cfg/PG/gym_halfcheetah_v3.py rename to ding/config/PG/gym_halfcheetah_v3.py diff --git a/ding/bonus/cfg/PG/gym_hopper_v3.py b/ding/config/PG/gym_hopper_v3.py similarity index 100% rename from ding/bonus/cfg/PG/gym_hopper_v3.py rename to ding/config/PG/gym_hopper_v3.py diff --git a/ding/bonus/cfg/PG/gym_lunarlander_v2.py b/ding/config/PG/gym_lunarlander_v2.py similarity index 100% rename from ding/bonus/cfg/PG/gym_lunarlander_v2.py rename to ding/config/PG/gym_lunarlander_v2.py diff --git a/ding/bonus/cfg/PG/gym_pendulum_v1.py b/ding/config/PG/gym_pendulum_v1.py similarity index 100% rename from ding/bonus/cfg/PG/gym_pendulum_v1.py rename to ding/config/PG/gym_pendulum_v1.py diff --git a/ding/bonus/cfg/PG/gym_walker2d_v3.py b/ding/config/PG/gym_walker2d_v3.py similarity index 100% rename from ding/bonus/cfg/PG/gym_walker2d_v3.py rename to ding/config/PG/gym_walker2d_v3.py diff --git a/ding/bonus/cfg/PPOF/__init__.py b/ding/config/PPOF/__init__.py similarity index 100% rename from ding/bonus/cfg/PPOF/__init__.py rename to ding/config/PPOF/__init__.py diff --git a/ding/bonus/cfg/PPOF/gym_lunarlander_v2.py b/ding/config/PPOF/gym_lunarlander_v2.py similarity index 100% rename from ding/bonus/cfg/PPOF/gym_lunarlander_v2.py rename to ding/config/PPOF/gym_lunarlander_v2.py diff --git a/ding/bonus/cfg/PPOF/gym_lunarlandercontinuous_v2.py b/ding/config/PPOF/gym_lunarlandercontinuous_v2.py similarity index 100% rename from ding/bonus/cfg/PPOF/gym_lunarlandercontinuous_v2.py rename to ding/config/PPOF/gym_lunarlandercontinuous_v2.py diff --git a/ding/bonus/cfg/PPOOffPolicy/__init__.py b/ding/config/PPOOffPolicy/__init__.py similarity index 100% rename from ding/bonus/cfg/PPOOffPolicy/__init__.py rename to ding/config/PPOOffPolicy/__init__.py diff --git a/ding/bonus/cfg/PPOOffPolicy/gym_lunarlander_v2.py b/ding/config/PPOOffPolicy/gym_lunarlander_v2.py similarity index 100% rename from ding/bonus/cfg/PPOOffPolicy/gym_lunarlander_v2.py rename to ding/config/PPOOffPolicy/gym_lunarlander_v2.py diff --git a/ding/bonus/cfg/PPOOffPolicy/gym_pongnoframeskip_v4.py b/ding/config/PPOOffPolicy/gym_pongnoframeskip_v4.py similarity index 100% rename from ding/bonus/cfg/PPOOffPolicy/gym_pongnoframeskip_v4.py rename to ding/config/PPOOffPolicy/gym_pongnoframeskip_v4.py diff --git a/ding/bonus/cfg/PPOOffPolicy/gym_qbertnoframeskip_v4.py b/ding/config/PPOOffPolicy/gym_qbertnoframeskip_v4.py similarity index 100% rename from ding/bonus/cfg/PPOOffPolicy/gym_qbertnoframeskip_v4.py rename to ding/config/PPOOffPolicy/gym_qbertnoframeskip_v4.py diff --git a/ding/bonus/cfg/PPOOffPolicy/gym_spaceInvadersnoframeskip_v4.py b/ding/config/PPOOffPolicy/gym_spaceInvadersnoframeskip_v4.py similarity index 100% rename from ding/bonus/cfg/PPOOffPolicy/gym_spaceInvadersnoframeskip_v4.py rename to ding/config/PPOOffPolicy/gym_spaceInvadersnoframeskip_v4.py diff --git a/ding/bonus/cfg/SAC/__init__.py b/ding/config/SAC/__init__.py similarity index 100% rename from ding/bonus/cfg/SAC/__init__.py rename to ding/config/SAC/__init__.py diff --git a/ding/bonus/cfg/SAC/gym_bipedalwalker_v3.py b/ding/config/SAC/gym_bipedalwalker_v3.py similarity index 
100% rename from ding/bonus/cfg/SAC/gym_bipedalwalker_v3.py rename to ding/config/SAC/gym_bipedalwalker_v3.py diff --git a/ding/bonus/cfg/SAC/gym_halfcheetah_v3.py b/ding/config/SAC/gym_halfcheetah_v3.py similarity index 100% rename from ding/bonus/cfg/SAC/gym_halfcheetah_v3.py rename to ding/config/SAC/gym_halfcheetah_v3.py diff --git a/ding/bonus/cfg/SAC/gym_hopper_v3.py b/ding/config/SAC/gym_hopper_v3.py similarity index 100% rename from ding/bonus/cfg/SAC/gym_hopper_v3.py rename to ding/config/SAC/gym_hopper_v3.py diff --git a/ding/bonus/cfg/SAC/gym_lunarlandercontinuous_v2.py b/ding/config/SAC/gym_lunarlandercontinuous_v2.py similarity index 100% rename from ding/bonus/cfg/SAC/gym_lunarlandercontinuous_v2.py rename to ding/config/SAC/gym_lunarlandercontinuous_v2.py diff --git a/ding/bonus/cfg/SAC/gym_pendulum_v1.py b/ding/config/SAC/gym_pendulum_v1.py similarity index 100% rename from ding/bonus/cfg/SAC/gym_pendulum_v1.py rename to ding/config/SAC/gym_pendulum_v1.py diff --git a/ding/bonus/cfg/SAC/gym_walker2d_v3.py b/ding/config/SAC/gym_walker2d_v3.py similarity index 100% rename from ding/bonus/cfg/SAC/gym_walker2d_v3.py rename to ding/config/SAC/gym_walker2d_v3.py diff --git a/ding/bonus/cfg/SQL/__init__.py b/ding/config/SQL/__init__.py similarity index 100% rename from ding/bonus/cfg/SQL/__init__.py rename to ding/config/SQL/__init__.py diff --git a/ding/bonus/cfg/SQL/gym_lunarlander_v2.py b/ding/config/SQL/gym_lunarlander_v2.py similarity index 100% rename from ding/bonus/cfg/SQL/gym_lunarlander_v2.py rename to ding/config/SQL/gym_lunarlander_v2.py diff --git a/ding/bonus/cfg/TD3/__init__.py b/ding/config/TD3/__init__.py similarity index 100% rename from ding/bonus/cfg/TD3/__init__.py rename to ding/config/TD3/__init__.py diff --git a/ding/bonus/cfg/TD3/gym_bipedalwalker_v3.py b/ding/config/TD3/gym_bipedalwalker_v3.py similarity index 100% rename from ding/bonus/cfg/TD3/gym_bipedalwalker_v3.py rename to ding/config/TD3/gym_bipedalwalker_v3.py diff --git a/ding/bonus/cfg/TD3/gym_halfcheetah_v3.py b/ding/config/TD3/gym_halfcheetah_v3.py similarity index 100% rename from ding/bonus/cfg/TD3/gym_halfcheetah_v3.py rename to ding/config/TD3/gym_halfcheetah_v3.py diff --git a/ding/bonus/cfg/TD3/gym_hopper_v3.py b/ding/config/TD3/gym_hopper_v3.py similarity index 100% rename from ding/bonus/cfg/TD3/gym_hopper_v3.py rename to ding/config/TD3/gym_hopper_v3.py diff --git a/ding/bonus/cfg/TD3/gym_lunarlandercontinuous_v2.py b/ding/config/TD3/gym_lunarlandercontinuous_v2.py similarity index 100% rename from ding/bonus/cfg/TD3/gym_lunarlandercontinuous_v2.py rename to ding/config/TD3/gym_lunarlandercontinuous_v2.py diff --git a/ding/bonus/cfg/TD3/gym_pendulum_v1.py b/ding/config/TD3/gym_pendulum_v1.py similarity index 100% rename from ding/bonus/cfg/TD3/gym_pendulum_v1.py rename to ding/config/TD3/gym_pendulum_v1.py diff --git a/ding/bonus/cfg/TD3/gym_walker2d_v3.py b/ding/config/TD3/gym_walker2d_v3.py similarity index 100% rename from ding/bonus/cfg/TD3/gym_walker2d_v3.py rename to ding/config/TD3/gym_walker2d_v3.py diff --git a/ding/config/__init__.py b/ding/config/__init__.py index 1baf5adadf..aa0cd3d08d 100644 --- a/ding/config/__init__.py +++ b/ding/config/__init__.py @@ -1,3 +1,14 @@ from .config import Config, read_config, save_config, compile_config, compile_config_parallel, read_config_directly, \ read_config_with_system, save_config_py from .utils import parallel_transform, parallel_transform_slurm + +from . import A2C +from . import C51 +from . import DDPG +from . import DQN +from . 
import PG +from . import PPOF +from . import PPOOffPolicy +from . import SAC +from . import SQL +from . import TD3 From 57e7325dc6e38040578650dc6a82c645f35a0049 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 11 Jul 2023 07:45:03 +0000 Subject: [PATCH 160/244] add compatibility fix for nstep --- ding/bonus/dqn.py | 3 ++- ding/bonus/sql.py | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ding/bonus/dqn.py b/ding/bonus/dqn.py index c5e3539ab3..bfa6312341 100644 --- a/ding/bonus/dqn.py +++ b/ding/bonus/dqn.py @@ -111,7 +111,8 @@ def train( if hasattr(self.cfg.policy, 'random_collect_size') else 0, ) ) - task.use(nstep_reward_enhancer(self.cfg)) + if "nstep" in self.cfg.policy and self.cfg.policy.nstep > 1: + task.use(nstep_reward_enhancer(self.cfg)) task.use(data_pusher(self.cfg, self.buffer_)) task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_)) task.use(CkptSaver(policy=self.policy, save_dir=self.checkpoint_save_dir, train_freq=n_iter_save_ckpt)) diff --git a/ding/bonus/sql.py b/ding/bonus/sql.py index 8c5dbcf9d8..6b9d436a17 100644 --- a/ding/bonus/sql.py +++ b/ding/bonus/sql.py @@ -111,6 +111,8 @@ def train( if hasattr(self.cfg.policy, 'random_collect_size') else 0, ) ) + if "nstep" in self.cfg.policy and self.cfg.policy.nstep > 1: + task.use(nstep_reward_enhancer(self.cfg)) task.use(data_pusher(self.cfg, self.buffer_)) task.use(OffPolicyLearner(self.cfg, self.policy.learn_mode, self.buffer_)) task.use(CkptSaver(policy=self.policy, save_dir=self.checkpoint_save_dir, train_freq=n_iter_save_ckpt)) From 0754dd97bb2284003dfb5191e41275c26f0191c2 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 11 Jul 2023 08:48:13 +0000 Subject: [PATCH 161/244] polish code --- ding/policy/sql.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ding/policy/sql.py b/ding/policy/sql.py index dc6170dfb7..ff7343b067 100644 --- a/ding/policy/sql.py +++ b/ding/policy/sql.py @@ -294,3 +294,6 @@ def _forward_eval(self, data: dict) -> dict: def _monitor_vars_learn(self) -> List[str]: return super()._monitor_vars_learn() + ['record_value_function'] + + def monitor_vars(self) -> List[str]: + return self._monitor_vars_learn() From 0919f06cdbc5a62d720d0ae2739abf7439b4fd51 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 12 Jul 2023 12:56:31 +0000 Subject: [PATCH 162/244] Add ppo offpolicy continuous policy --- ding/bonus/ppo_offpolicy.py | 1 + ding/config/PPOOffPolicy/__init__.py | 3 + .../gym_lunarlandercontinuous_v2.py | 58 +++++ .../SAC/gym_lunarlandercontinuous_v2.py | 2 +- .../framework/middleware/functional/logger.py | 21 +- ding/policy/ppo.py | 209 +++++++++++++++--- 6 files changed, 261 insertions(+), 33 deletions(-) create mode 100644 ding/config/PPOOffPolicy/gym_lunarlandercontinuous_v2.py diff --git a/ding/bonus/ppo_offpolicy.py b/ding/bonus/ppo_offpolicy.py index ea7647e899..e019ca528d 100644 --- a/ding/bonus/ppo_offpolicy.py +++ b/ding/bonus/ppo_offpolicy.py @@ -110,6 +110,7 @@ def train( task.use(CkptSaver(policy=self.policy, save_dir=self.checkpoint_save_dir, train_freq=n_iter_save_ckpt)) task.use( wandb_online_logger( + cfg=self.cfg.wandb_logger, metric_list=self.policy.monitor_vars(), model=self.policy._model, anonymous=True, diff --git a/ding/config/PPOOffPolicy/__init__.py b/ding/config/PPOOffPolicy/__init__.py index 2704b04c53..e2e296bd7d 100644 --- a/ding/config/PPOOffPolicy/__init__.py +++ b/ding/config/PPOOffPolicy/__init__.py @@ -1,11 +1,13 @@ from easydict import EasyDict from . import gym_lunarlander_v2 +from . 
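Patch 160 above only inserts nstep_reward_enhancer when policy.nstep > 1, so 1-step configs (such as the SQL LunarLander config with nstep=1) skip it. A minimal sketch of the quantity an n-step enhancer prepares; the real middleware additionally handles batching, done flags and the bootstrapped target:

def n_step_return(rewards, gamma=0.99, n=3, bootstrap_value=0.0):
    # discounted sum of the next n rewards plus a discounted bootstrap tail
    ret = sum((gamma ** i) * r for i, r in enumerate(rewards[:n]))
    return ret + (gamma ** n) * bootstrap_value

print(n_step_return([1.0, 0.0, 2.0, 5.0], gamma=0.9, n=3, bootstrap_value=10.0))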
import gym_lunarlandercontinuous_v2 from . import gym_pongnoframeskip_v4 from . import gym_qbertnoframeskip_v4 from . import gym_spaceInvadersnoframeskip_v4 supported_env_cfg = { gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.cfg, + gym_lunarlandercontinuous_v2.cfg.env.env_id: gym_lunarlandercontinuous_v2.cfg, gym_pongnoframeskip_v4.cfg.env.env_id: gym_pongnoframeskip_v4.cfg, gym_qbertnoframeskip_v4.cfg.env.env_id: gym_qbertnoframeskip_v4.cfg, gym_spaceInvadersnoframeskip_v4.cfg.env.env_id: gym_spaceInvadersnoframeskip_v4.cfg, @@ -15,6 +17,7 @@ supported_env = { gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.env, + gym_lunarlandercontinuous_v2.cfg.env.env_id: gym_lunarlandercontinuous_v2.env, gym_pongnoframeskip_v4.cfg.env.env_id: gym_pongnoframeskip_v4.env, gym_qbertnoframeskip_v4.cfg.env.env_id: gym_qbertnoframeskip_v4.env, gym_spaceInvadersnoframeskip_v4.cfg.env.env_id: gym_spaceInvadersnoframeskip_v4.env, diff --git a/ding/config/PPOOffPolicy/gym_lunarlandercontinuous_v2.py b/ding/config/PPOOffPolicy/gym_lunarlandercontinuous_v2.py new file mode 100644 index 0000000000..5a26d6257b --- /dev/null +++ b/ding/config/PPOOffPolicy/gym_lunarlandercontinuous_v2.py @@ -0,0 +1,58 @@ +from easydict import EasyDict + +cfg = dict( + exp_name='LunarLanderContinuous-v2-PPOOffPolicy', + seed=0, + env=dict( + env_id='LunarLanderContinuous-v2', + collector_env_num=4, + evaluator_env_num=8, + n_evaluator_episode=8, + stop_value=240, + act_scale=True, + ), + policy=dict( + cuda=True, + action_space='continuous', + model=dict( + obs_shape=8, + action_shape=2, + action_space='continuous', + sigma_type = 'conditioned', + encoder_hidden_size_list=[64, 64, 128], + actor_head_hidden_size=128, + critic_head_hidden_size=128, + actor_head_layer_num=2, + critic_head_layer_num=2, + share_encoder=False, + ), + learn=dict( + update_per_collect=10, + batch_size=640, + learning_rate=3e-4, + value_weight=0.5, + entropy_weight=0.001, + clip_ratio=0.2, + nstep=1, + nstep_return=False, + adv_norm=True, + value_norm=False, + ), + collect=dict( + n_sample=128, + unroll_len=1, + discount_factor=0.99, + gae_lambda=0.95, + ), + eval=dict(render=True), + ), + wandb_logger=dict( + gradient_logger=True, video_logger=True, plot_logger=True, action_logger=False, return_logger=False + ), +) + +cfg = EasyDict(cfg) + +import ding.envs.gym_env +from functools import partial +env = partial(ding.envs.gym_env.env, continuous=True) diff --git a/ding/config/SAC/gym_lunarlandercontinuous_v2.py b/ding/config/SAC/gym_lunarlandercontinuous_v2.py index b34504c6a5..7c1a1f4183 100644 --- a/ding/config/SAC/gym_lunarlandercontinuous_v2.py +++ b/ding/config/SAC/gym_lunarlandercontinuous_v2.py @@ -1,7 +1,7 @@ from easydict import EasyDict cfg = dict( - exp_name='LunarLander-v2-SAC', + exp_name='LunarLanderContinuous-v2-SAC', seed=0, env=dict( env_id='LunarLanderContinuous-v2', diff --git a/ding/framework/middleware/functional/logger.py b/ding/framework/middleware/functional/logger.py index 638a47228d..1d4b2068e1 100644 --- a/ding/framework/middleware/functional/logger.py +++ b/ding/framework/middleware/functional/logger.py @@ -239,13 +239,20 @@ def _plot(ctx: "OnlineRLContext"): episode_return = episode_return.squeeze(1) if cfg.video_logger: - file_list = [] - for p in os.listdir(record_path): - if os.path.splitext(p)[-1] == ".mp4": - file_list.append(p) - file_list.sort(key=lambda fn: os.path.getmtime(os.path.join(record_path, fn))) - video_path = os.path.join(record_path, file_list[-2]) - info_for_logging.update({"video": 
wandb.Video(video_path, format="mp4")}) + if 'replay_video' in ctx.eval_output: + # save numpy array "images" of shape (N,1212,3,224,320) to N video files in mp4 format + # The numpy tensor must be either 4 dimensional or 5 dimensional. Channels should be (time, channel, height, width) or (batch, time, channel, height width) + video_images = ctx.eval_output['replay_video'] + video_images = video_images.astype(np.uint8) + info_for_logging.update({"replay_video": wandb.Video(video_images, fps=60)}) + elif record_path is not None: + file_list = [] + for p in os.listdir(record_path): + if os.path.splitext(p)[-1] == ".mp4": + file_list.append(p) + file_list.sort(key=lambda fn: os.path.getmtime(os.path.join(record_path, fn))) + video_path = os.path.join(record_path, file_list[-2]) + info_for_logging.update({"video": wandb.Video(video_path, format="mp4")}) if cfg.action_logger: action_path = os.path.join(record_path, (str(ctx.env_step) + "_action.gif")) diff --git a/ding/policy/ppo.py b/ding/policy/ppo.py index 619fdd437c..6d7c7a852c 100644 --- a/ding/policy/ppo.py +++ b/ding/policy/ppo.py @@ -685,6 +685,8 @@ class PPOOffPolicy(Policy): priority=False, # (bool) Whether use Importance Sampling Weight to correct biased update. If True, priority must be True. priority_IS_weight=False, + # (str) Which kind of action space used in PPOPolicy, ['discrete', 'continuous'] + action_space='discrete', # (bool) Whether to use nstep_return for value loss nstep_return=False, nstep=3, @@ -708,6 +710,10 @@ class PPOOffPolicy(Policy): clip_ratio=0.2, # (bool) Whether to use advantage norm in a whole training batch adv_norm=False, + value_norm=True, + ppo_param_init=True, + grad_clip_type='clip_norm', + grad_clip_value=0.5, ignore_done=False, ), collect=dict( @@ -736,14 +742,44 @@ def _init_learn(self) -> None: self._priority = self._cfg.priority self._priority_IS_weight = self._cfg.priority_IS_weight assert not self._priority and not self._priority_IS_weight, "Priority is not implemented in PPO" - # Orthogonal init - for m in self._model.modules(): - if isinstance(m, torch.nn.Conv2d): - torch.nn.init.orthogonal_(m.weight) - if isinstance(m, torch.nn.Linear): - torch.nn.init.orthogonal_(m.weight) + + assert self._cfg.action_space in ["continuous", "discrete", "hybrid"] + self._action_space = self._cfg.action_space + if self._cfg.learn.ppo_param_init: + for n, m in self._model.named_modules(): + if isinstance(m, torch.nn.Linear): + torch.nn.init.orthogonal_(m.weight) + torch.nn.init.zeros_(m.bias) + if self._action_space in ['continuous', 'hybrid']: + # init log sigma + if self._action_space == 'continuous': + if hasattr(self._model.actor_head, 'log_sigma_param'): + torch.nn.init.constant_(self._model.actor_head.log_sigma_param, -0.5) + elif self._action_space == 'hybrid': # actor_head[1]: ReparameterizationHead, for action_args + if hasattr(self._model.actor_head[1], 'log_sigma_param'): + torch.nn.init.constant_(self._model.actor_head[1].log_sigma_param, -0.5) + + for m in list(self._model.critic.modules()) + list(self._model.actor.modules()): + if isinstance(m, torch.nn.Linear): + # orthogonal initialization + torch.nn.init.orthogonal_(m.weight, gain=np.sqrt(2)) + torch.nn.init.zeros_(m.bias) + # do last policy layer scaling, this will make initial actions have (close to) + # 0 mean and std, and will help boost performances, + # see https://arxiv.org/abs/2006.05990, Fig.24 for details + for m in self._model.actor.modules(): + if isinstance(m, torch.nn.Linear): + torch.nn.init.zeros_(m.bias) + 
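The initialization code around this point follows the orthogonal-init recipe referenced in the comment (arxiv 2006.05990): orthogonal weights with gain sqrt(2), zero biases, a down-scaled last policy layer, and log sigma started at -0.5. A standalone sketch on a toy actor; the layer sizes and action dimension are placeholders:

import math
import torch
import torch.nn as nn

actor = nn.Sequential(nn.Linear(8, 64), nn.Tanh(), nn.Linear(64, 2))

for m in actor.modules():
    if isinstance(m, nn.Linear):
        nn.init.orthogonal_(m.weight, gain=math.sqrt(2))
        nn.init.zeros_(m.bias)

# scale the output layer so initial actions have (close to) zero mean and std
actor[-1].weight.data.mul_(0.01)

# counterpart of log_sigma_param for a 2-dim continuous action space
log_sigma = nn.Parameter(torch.full((2,), -0.5))
print(actor[-1].weight.abs().mean().item(), log_sigma)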
m.weight.data.copy_(0.01 * m.weight.data) + # Optimizer - self._optimizer = Adam(self._model.parameters(), lr=self._cfg.learn.learning_rate) + self._optimizer = Adam( + self._model.parameters(), + lr=self._cfg.learn.learning_rate, + grad_clip_type=self._cfg.learn.grad_clip_type, + clip_value=self._cfg.learn.grad_clip_value + ) + self._learn_model = model_wrap(self._model, wrapper_name='base') # Algorithm config @@ -751,6 +787,11 @@ def _init_learn(self) -> None: self._entropy_weight = self._cfg.learn.entropy_weight self._clip_ratio = self._cfg.learn.clip_ratio self._adv_norm = self._cfg.learn.adv_norm + self._value_norm = self._cfg.learn.value_norm + if self._value_norm: + self._running_mean_std = RunningMeanStd(epsilon=1e-4, device=self._device) + self._gamma = self._cfg.collect.discount_factor + self._gae_lambda = self._cfg.collect.gae_lambda self._nstep = self._cfg.nstep self._nstep_return = self._cfg.nstep_return # Main model @@ -770,25 +811,68 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: data = default_preprocess_learn(data, ignore_done=self._cfg.learn.ignore_done, use_nstep=self._nstep_return) if self._cuda: data = to_device(data, self._device) + data['obs'] = to_dtype(data['obs'], torch.float32) + if 'next_obs' in data: + data['next_obs'] = to_dtype(data['next_obs'], torch.float32) # ==================== # PPO forward # ==================== self._learn_model.train() + + with torch.no_grad(): + if self._value_norm: + unnormalized_return = data['adv'] + data['value'] * self._running_mean_std.std + data['return'] = unnormalized_return / self._running_mean_std.std + self._running_mean_std.update(unnormalized_return.cpu().numpy()) + else: + data['return'] = data['adv'] + data['value'] + # normal ppo if not self._nstep_return: output = self._learn_model.forward(data['obs'], mode='compute_actor_critic') adv = data['adv'] - return_ = data['value'] + adv + if self._adv_norm: # Normalize advantage in a total train_batch adv = (adv - adv.mean()) / (adv.std() + 1e-8) # Calculate ppo loss - ppodata = ppo_data( - output['logit'], data['logit'], data['action'], output['value'], data['value'], adv, return_, - data['weight'] - ) - ppo_loss, ppo_info = ppo_error(ppodata, self._clip_ratio) + if self._action_space == 'continuous': + ppodata = ppo_data( + output['logit'], data['logit'], data['action'], output['value'], data['value'], adv, data['return'], + data['weight'] + ) + ppo_loss, ppo_info = ppo_error_continuous(ppodata, self._clip_ratio) + elif self._action_space == 'discrete': + ppodata = ppo_data( + output['logit'], data['logit'], data['action'], output['value'], data['value'], adv, data['return'], + data['weight'] + ) + ppo_loss, ppo_info = ppo_error(ppodata, self._clip_ratio) + elif self._action_space == 'hybrid': + # discrete part (discrete policy loss and entropy loss) + ppo_discrete_batch = ppo_policy_data( + output['logit']['action_type'], data['logit']['action_type'], data['action']['action_type'], + adv, data['weight'] + ) + ppo_discrete_loss, ppo_discrete_info = ppo_policy_error(ppo_discrete_batch, self._clip_ratio) + # continuous part (continuous policy loss and entropy loss, value loss) + ppo_continuous_batch = ppo_data( + output['logit']['action_args'], data['logit']['action_args'], data['action']['action_args'], + output['value'], data['value'], adv, data['return'], data['weight'] + ) + ppo_continuous_loss, ppo_continuous_info = ppo_error_continuous( + ppo_continuous_batch, self._clip_ratio + ) + # sum discrete and continuous loss + ppo_loss = 
type(ppo_continuous_loss)( + ppo_continuous_loss.policy_loss + ppo_discrete_loss.policy_loss, ppo_continuous_loss.value_loss, + ppo_continuous_loss.entropy_loss + ppo_discrete_loss.entropy_loss + ) + ppo_info = type(ppo_continuous_info)( + max(ppo_continuous_info.approx_kl, ppo_discrete_info.approx_kl), + max(ppo_continuous_info.clipfrac, ppo_discrete_info.clipfrac) + ) wv, we = self._value_weight, self._entropy_weight total_loss = ppo_loss.policy_loss + wv * ppo_loss.value_loss - we * ppo_loss.entropy_loss @@ -800,8 +884,35 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: adv = (adv - adv.mean()) / (adv.std() + 1e-8) # Calculate ppo loss - ppodata = ppo_policy_data(output['logit'], data['logit'], data['action'], adv, data['weight']) - ppo_policy_loss, ppo_info = ppo_policy_error(ppodata, self._clip_ratio) + if self._action_space == 'continuous': + ppodata = ppo_policy_data(output['logit'], data['logit'], data['action'], adv, data['weight']) + ppo_policy_loss, ppo_info = ppo_policy_error_continuous(ppodata, self._clip_ratio) + elif self._action_space == 'discrete': + ppodata = ppo_policy_data(output['logit'], data['logit'], data['action'], adv, data['weight']) + ppo_policy_loss, ppo_info = ppo_policy_error(ppodata, self._clip_ratio) + elif self._action_space == 'hybrid': + # discrete part (discrete policy loss and entropy loss) + ppo_discrete_data = ppo_policy_data( + output['logit']['action_type'], data['logit']['action_type'], data['action']['action_type'], + adv, data['weight'] + ) + ppo_discrete_loss, ppo_discrete_info = ppo_policy_error(ppo_discrete_data, self._clip_ratio) + # continuous part (continuous policy loss and entropy loss, value loss) + ppo_continuous_data = ppo_policy_data( + output['logit']['action_args'], data['logit']['action_args'], data['action']['action_args'], + adv, data['weight'] + ) + ppo_continuous_loss, ppo_continuous_info = ppo_policy_error_continuous(ppo_continuous_data, self._clip_ratio) + # sum discrete and continuous loss + ppo_policy_loss = type(ppo_continuous_loss)( + ppo_continuous_loss.policy_loss + ppo_discrete_loss.policy_loss, + ppo_continuous_loss.entropy_loss + ppo_discrete_loss.entropy_loss + ) + ppo_info = type(ppo_continuous_info)( + max(ppo_continuous_info.approx_kl, ppo_discrete_info.approx_kl), + max(ppo_continuous_info.clipfrac, ppo_discrete_info.clipfrac) + ) + wv, we = self._value_weight, self._entropy_weight next_obs = data.get('next_obs') value_gamma = data.get('value_gamma') @@ -828,7 +939,7 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: self._optimizer.zero_grad() total_loss.backward() self._optimizer.step() - return { + return_info = { 'cur_lr': self._optimizer.defaults['lr'], 'total_loss': total_loss.item(), 'policy_loss': ppo_loss.policy_loss.item(), @@ -838,6 +949,15 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: 'approx_kl': ppo_info.approx_kl, 'clipfrac': ppo_info.clipfrac, } + if self._action_space == 'continuous': + return_info.update( + { + 'act': data['action'].float().mean().item(), + 'mu_mean': output['logit']['mu'].mean().item(), + 'sigma_mean': output['logit']['sigma'].mean().item(), + } + ) + return return_info def _state_dict_learn(self) -> Dict[str, Any]: return { @@ -856,7 +976,14 @@ def _init_collect(self) -> None: Init traj and unroll length, collect model. 
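A minimal sketch of the clipped surrogate objective that ppo_error / ppo_error_continuous compute inside the hunk above; the real helpers also return value and entropy losses plus approx_kl / clipfrac statistics:

import torch

def clipped_policy_loss(logp_new, logp_old, adv, clip_ratio=0.2):
    ratio = torch.exp(logp_new - logp_old)
    surrogate = torch.min(ratio * adv, torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv)
    return -surrogate.mean()

logp_new = torch.tensor([-1.0, -0.5, -2.0])
logp_old = torch.tensor([-1.2, -0.4, -1.9])
adv = torch.tensor([1.0, -0.5, 2.0])
print(clipped_policy_loss(logp_new, logp_old, adv))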
""" self._unroll_len = self._cfg.collect.unroll_len - self._collect_model = model_wrap(self._model, wrapper_name='multinomial_sample') + assert self._cfg.action_space in ["continuous", "discrete", "hybrid"] + self._action_space = self._cfg.action_space + if self._action_space == 'continuous': + self._collect_model = model_wrap(self._model, wrapper_name='reparam_sample') + elif self._action_space == 'discrete': + self._collect_model = model_wrap(self._model, wrapper_name='multinomial_sample') + elif self._action_space == 'hybrid': + self._collect_model = model_wrap(self._model, wrapper_name='hybrid_reparam_multinomial_sample') self._collect_model.reset() self._gamma = self._cfg.collect.discount_factor self._gae_lambda = self._cfg.collect.gae_lambda @@ -920,17 +1047,39 @@ def _get_train_sample(self, data: list) -> Union[None, List[Any]]: Returns: - samples (:obj:`dict`): The training samples generated """ - data = get_gae_with_default_last_value( + data = to_device(data, self._device) + for transition in data: + transition['traj_flag'] = copy.deepcopy(transition['done']) + data[-1]['traj_flag'] = True + + if self._cfg.learn.ignore_done: + data[-1]['done'] = False + + if data[-1]['done']: + last_value = torch.zeros_like(data[-1]['value']) + else: + with torch.no_grad(): + last_value = self._collect_model.forward( + unsqueeze(data[-1]['next_obs'], 0), mode='compute_actor_critic' + )['value'] + if len(last_value.shape) == 2: # multi_agent case: + last_value = last_value.squeeze(0) + if self._value_norm: + last_value *= self._running_mean_std.std + for i in range(len(data)): + data[i]['value'] *= self._running_mean_std.std + data = get_gae( data, - data[-1]['done'], + to_device(last_value, self._device), gamma=self._gamma, gae_lambda=self._gae_lambda, cuda=False, ) - if not self._nstep_return: - return get_train_sample(data, self._unroll_len) - else: - return get_nstep_return_data(data, self._nstep) + if self._value_norm: + for i in range(len(data)): + data[i]['value'] /= self._running_mean_std.std + + return get_train_sample(data, self._unroll_len) def _init_eval(self) -> None: r""" @@ -938,7 +1087,14 @@ def _init_eval(self) -> None: Evaluate mode init method. Called by ``self.__init__``. Init eval model with argmax strategy. 
""" - self._eval_model = model_wrap(self._model, wrapper_name='argmax_sample') + assert self._cfg.action_space in ["continuous", "discrete", "hybrid"] + self._action_space = self._cfg.action_space + if self._action_space == 'continuous': + self._eval_model = model_wrap(self._model, wrapper_name='deterministic_sample') + elif self._action_space == 'discrete': + self._eval_model = model_wrap(self._model, wrapper_name='argmax_sample') + elif self._action_space == 'hybrid': + self._eval_model = model_wrap(self._model, wrapper_name='hybrid_deterministic_argmax_sample') self._eval_model.reset() def _forward_eval(self, data: dict) -> dict: @@ -969,9 +1125,12 @@ def default_model(self) -> Tuple[str, List[str]]: return 'vac', ['ding.model.template.vac'] def _monitor_vars_learn(self) -> List[str]: - return super()._monitor_vars_learn() + [ + variables = super()._monitor_vars_learn() + [ 'policy_loss', 'value_loss', 'entropy_loss', 'adv_abs_max', 'approx_kl', 'clipfrac' ] + if self._action_space == 'continuous': + variables += ['mu_mean', 'sigma_mean', 'sigma_grad', 'act'] + return variables def monitor_vars(self) -> List[str]: return self._monitor_vars_learn() From d958f49079d3a7b8e903aaf64f002c53e127f80a Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 14 Jul 2023 07:10:57 +0000 Subject: [PATCH 163/244] polish config --- ding/bonus/ppo_offpolicy.py | 7 ++- ding/config/A2C/gym_bipedalwalker_v3.py | 8 +--- ding/config/A2C/gym_halfcheetah_v3.py | 6 +-- ding/config/A2C/gym_hopper_v3.py | 6 +-- ding/config/A2C/gym_lunarlander_v2.py | 3 +- ding/config/A2C/gym_walker2d_v3.py | 6 +-- ding/config/C51/gym_pongnoframeskip_v4.py | 3 +- ding/config/C51/gym_qbertnoframeskip_v4.py | 3 +- .../C51/gym_spaceInvadersnoframeskip_v4.py | 3 +- ding/config/DDPG/gym_bipedalwalker_v3.py | 8 +--- ding/config/DDPG/gym_halfcheetah_v3.py | 6 +-- ding/config/DDPG/gym_hopper_v3.py | 6 +-- ding/config/DDPG/gym_pendulum_v1.py | 3 +- ding/config/DDPG/gym_walker2d_v3.py | 6 +-- ding/config/DQN/gym_pongnoframeskip_v4.py | 3 +- ding/config/DQN/gym_qbertnoframeskip_v4.py | 3 +- .../DQN/gym_spaceInvadersnoframeskip_v4.py | 3 +- ding/config/PG/gym_bipedalwalker_v3.py | 8 +--- ding/config/PG/gym_halfcheetah_v3.py | 6 +-- ding/config/PG/gym_hopper_v3.py | 6 +-- ding/config/PG/gym_lunarlander_v2.py | 3 +- ding/config/PG/gym_pendulum_v1.py | 3 +- ding/config/PG/gym_walker2d_v3.py | 6 +-- ding/config/PPOF/gym_lunarlander_v2.py | 3 +- .../PPOF/gym_lunarlandercontinuous_v2.py | 2 +- ding/config/PPOOffPolicy/__init__.py | 1 + .../gym_lunarlandercontinuous_v2.py | 21 ++++++--- .../PPOOffPolicy/gym_pongnoframeskip_v4.py | 3 +- .../PPOOffPolicy/gym_qbertnoframeskip_v4.py | 3 +- .../gym_spaceInvadersnoframeskip_v4.py | 3 +- ding/config/SAC/gym_bipedalwalker_v3.py | 8 +--- ding/config/SAC/gym_halfcheetah_v3.py | 6 +-- ding/config/SAC/gym_hopper_v3.py | 6 +-- ding/config/SAC/gym_pendulum_v1.py | 3 +- ding/config/SAC/gym_walker2d_v3.py | 6 +-- ding/config/TD3/gym_bipedalwalker_v3.py | 8 +--- ding/config/TD3/gym_halfcheetah_v3.py | 6 +-- ding/config/TD3/gym_hopper_v3.py | 6 +-- .../TD3/gym_lunarlandercontinuous_v2.py | 2 +- ding/config/TD3/gym_pendulum_v1.py | 3 +- ding/config/TD3/gym_walker2d_v3.py | 6 +-- ding/model/common/__init__.py | 2 +- ding/model/common/utils.py | 25 +++++++++++ ding/model/template/vac.py | 44 +++++++++++++------ ding/policy/ppo.py | 7 +-- 45 files changed, 120 insertions(+), 169 deletions(-) diff --git a/ding/bonus/ppo_offpolicy.py b/ding/bonus/ppo_offpolicy.py index e019ca528d..eb57e63763 100644 --- 
a/ding/bonus/ppo_offpolicy.py +++ b/ding/bonus/ppo_offpolicy.py @@ -99,7 +99,12 @@ def train( evaluator_env = setup_ding_env_manager(self.env, evaluator_env_num, context, debug, 'evaluator') with task.start(ctx=OnlineRLContext()): - task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) + task.use( + interaction_evaluator( + self.cfg, self.policy.eval_mode, evaluator_env, render=self.cfg.policy.eval.render \ + if hasattr(self.cfg.policy.eval, "render") else False + ) + ) task.use(StepCollector( self.cfg, self.policy.collect_mode, diff --git a/ding/config/A2C/gym_bipedalwalker_v3.py b/ding/config/A2C/gym_bipedalwalker_v3.py index b53159ece3..5159b07dfa 100644 --- a/ding/config/A2C/gym_bipedalwalker_v3.py +++ b/ding/config/A2C/gym_bipedalwalker_v3.py @@ -40,10 +40,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial( - ding.envs.gym_env.env, cfg=dict( - act_scale=cfg.env.act_scale, - rew_clip=cfg.env.rew_clip, - ) -) +env = ding.envs.gym_env.env, diff --git a/ding/config/A2C/gym_halfcheetah_v3.py b/ding/config/A2C/gym_halfcheetah_v3.py index 3f38fe610a..ae414bc00e 100644 --- a/ding/config/A2C/gym_halfcheetah_v3.py +++ b/ding/config/A2C/gym_halfcheetah_v3.py @@ -45,8 +45,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial( - ding.envs.gym_env.env, - cfg=dict(env_wrapper=cfg.env.env_wrapper, act_scale=cfg.env.act_scale, rew_clip=cfg.env.rew_clip) -) +env = ding.envs.gym_env.env, diff --git a/ding/config/A2C/gym_hopper_v3.py b/ding/config/A2C/gym_hopper_v3.py index de461a79d7..1c0d81a5b4 100644 --- a/ding/config/A2C/gym_hopper_v3.py +++ b/ding/config/A2C/gym_hopper_v3.py @@ -43,8 +43,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial( - ding.envs.gym_env.env, - cfg=dict(env_wrapper=cfg.env.env_wrapper, act_scale=cfg.env.act_scale, rew_clip=cfg.env.rew_clip) -) +env = ding.envs.gym_env.env, diff --git a/ding/config/A2C/gym_lunarlander_v2.py b/ding/config/A2C/gym_lunarlander_v2.py index cf2e3b50b0..e6916bc132 100644 --- a/ding/config/A2C/gym_lunarlander_v2.py +++ b/ding/config/A2C/gym_lunarlander_v2.py @@ -35,5 +35,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial(ding.envs.gym_env.env, continuous=True) +env = ding.envs.gym_env.env, diff --git a/ding/config/A2C/gym_walker2d_v3.py b/ding/config/A2C/gym_walker2d_v3.py index a714e53f26..8f98c2dc7c 100644 --- a/ding/config/A2C/gym_walker2d_v3.py +++ b/ding/config/A2C/gym_walker2d_v3.py @@ -43,8 +43,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial( - ding.envs.gym_env.env, - cfg=dict(env_wrapper=cfg.env.env_wrapper, act_scale=cfg.env.act_scale, rew_clip=cfg.env.rew_clip) -) +env = ding.envs.gym_env.env, diff --git a/ding/config/C51/gym_pongnoframeskip_v4.py b/ding/config/C51/gym_pongnoframeskip_v4.py index 5cbf8debd3..5260357d8a 100644 --- a/ding/config/C51/gym_pongnoframeskip_v4.py +++ b/ding/config/C51/gym_pongnoframeskip_v4.py @@ -51,5 +51,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial(ding.envs.gym_env.env, cfg=dict(env_id=cfg.env.env_id, env_wrapper=cfg.env.env_wrapper)) +env = ding.envs.gym_env.env, diff --git a/ding/config/C51/gym_qbertnoframeskip_v4.py b/ding/config/C51/gym_qbertnoframeskip_v4.py index 8442414e1e..c9e5412af3 100644 --- a/ding/config/C51/gym_qbertnoframeskip_v4.py +++ b/ding/config/C51/gym_qbertnoframeskip_v4.py @@ -51,5 +51,4 @@ cfg 
= EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial(ding.envs.gym_env.env, cfg=dict(env_id=cfg.env.env_id, env_wrapper=cfg.env.env_wrapper)) +env = ding.envs.gym_env.env, diff --git a/ding/config/C51/gym_spaceInvadersnoframeskip_v4.py b/ding/config/C51/gym_spaceInvadersnoframeskip_v4.py index 42490e4fda..f8e9c7c929 100644 --- a/ding/config/C51/gym_spaceInvadersnoframeskip_v4.py +++ b/ding/config/C51/gym_spaceInvadersnoframeskip_v4.py @@ -51,5 +51,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial(ding.envs.gym_env.env, cfg=dict(env_id=cfg.env.env_id, env_wrapper=cfg.env.env_wrapper)) +env = ding.envs.gym_env.env, diff --git a/ding/config/DDPG/gym_bipedalwalker_v3.py b/ding/config/DDPG/gym_bipedalwalker_v3.py index e444c3a570..8e9babf148 100644 --- a/ding/config/DDPG/gym_bipedalwalker_v3.py +++ b/ding/config/DDPG/gym_bipedalwalker_v3.py @@ -42,10 +42,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial( - ding.envs.gym_env.env, cfg=dict( - act_scale=cfg.env.act_scale, - rew_clip=cfg.env.rew_clip, - ) -) +env = ding.envs.gym_env.env, diff --git a/ding/config/DDPG/gym_halfcheetah_v3.py b/ding/config/DDPG/gym_halfcheetah_v3.py index 66845e5997..bf07473488 100644 --- a/ding/config/DDPG/gym_halfcheetah_v3.py +++ b/ding/config/DDPG/gym_halfcheetah_v3.py @@ -52,8 +52,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial( - ding.envs.gym_env.env, - cfg=dict(env_wrapper=cfg.env.env_wrapper, act_scale=cfg.env.act_scale, rew_clip=cfg.env.rew_clip) -) +env = ding.envs.gym_env.env, diff --git a/ding/config/DDPG/gym_hopper_v3.py b/ding/config/DDPG/gym_hopper_v3.py index 865571c285..fa1791316e 100644 --- a/ding/config/DDPG/gym_hopper_v3.py +++ b/ding/config/DDPG/gym_hopper_v3.py @@ -52,8 +52,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial( - ding.envs.gym_env.env, - cfg=dict(env_wrapper=cfg.env.env_wrapper, act_scale=cfg.env.act_scale, rew_clip=cfg.env.rew_clip) -) +env = ding.envs.gym_env.env, diff --git a/ding/config/DDPG/gym_pendulum_v1.py b/ding/config/DDPG/gym_pendulum_v1.py index e6a08cef11..45ebd9bc33 100644 --- a/ding/config/DDPG/gym_pendulum_v1.py +++ b/ding/config/DDPG/gym_pendulum_v1.py @@ -49,5 +49,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial(ding.envs.gym_env.env, cfg=dict(act_scale=cfg.env.act_scale)) +env = ding.envs.gym_env.env, diff --git a/ding/config/DDPG/gym_walker2d_v3.py b/ding/config/DDPG/gym_walker2d_v3.py index 1611ca7d62..0461253d17 100644 --- a/ding/config/DDPG/gym_walker2d_v3.py +++ b/ding/config/DDPG/gym_walker2d_v3.py @@ -52,8 +52,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial( - ding.envs.gym_env.env, - cfg=dict(env_wrapper=cfg.env.env_wrapper, act_scale=cfg.env.act_scale, rew_clip=cfg.env.rew_clip) -) +env = ding.envs.gym_env.env, diff --git a/ding/config/DQN/gym_pongnoframeskip_v4.py b/ding/config/DQN/gym_pongnoframeskip_v4.py index 5777aae121..d88ef6e348 100644 --- a/ding/config/DQN/gym_pongnoframeskip_v4.py +++ b/ding/config/DQN/gym_pongnoframeskip_v4.py @@ -47,5 +47,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial(ding.envs.gym_env.env, cfg=dict(env_id=cfg.env.env_id, env_wrapper=cfg.env.env_wrapper)) +env = ding.envs.gym_env.env, diff --git a/ding/config/DQN/gym_qbertnoframeskip_v4.py 
b/ding/config/DQN/gym_qbertnoframeskip_v4.py index 94c0a48c57..7e688cfcee 100644 --- a/ding/config/DQN/gym_qbertnoframeskip_v4.py +++ b/ding/config/DQN/gym_qbertnoframeskip_v4.py @@ -47,5 +47,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial(ding.envs.gym_env.env, cfg=dict(env_id=cfg.env.env_id, env_wrapper=cfg.env.env_wrapper)) +env = ding.envs.gym_env.env, diff --git a/ding/config/DQN/gym_spaceInvadersnoframeskip_v4.py b/ding/config/DQN/gym_spaceInvadersnoframeskip_v4.py index 17a4b5cad5..2dfc298c97 100644 --- a/ding/config/DQN/gym_spaceInvadersnoframeskip_v4.py +++ b/ding/config/DQN/gym_spaceInvadersnoframeskip_v4.py @@ -48,5 +48,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial(ding.envs.gym_env.env, cfg=dict(env_id=cfg.env.env_id, env_wrapper=cfg.env.env_wrapper)) +env = ding.envs.gym_env.env, diff --git a/ding/config/PG/gym_bipedalwalker_v3.py b/ding/config/PG/gym_bipedalwalker_v3.py index 9b7cde76b4..c62e5b8178 100644 --- a/ding/config/PG/gym_bipedalwalker_v3.py +++ b/ding/config/PG/gym_bipedalwalker_v3.py @@ -40,10 +40,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial( - ding.envs.gym_env.env, cfg=dict( - act_scale=cfg.env.act_scale, - rew_clip=cfg.env.rew_clip, - ) -) +env = ding.envs.gym_env.env, diff --git a/ding/config/PG/gym_halfcheetah_v3.py b/ding/config/PG/gym_halfcheetah_v3.py index 2a869e84a3..f5309bedc0 100644 --- a/ding/config/PG/gym_halfcheetah_v3.py +++ b/ding/config/PG/gym_halfcheetah_v3.py @@ -43,8 +43,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial( - ding.envs.gym_env.env, - cfg=dict(env_wrapper=cfg.env.env_wrapper, act_scale=cfg.env.act_scale, rew_clip=cfg.env.rew_clip) -) +env = ding.envs.gym_env.env, diff --git a/ding/config/PG/gym_hopper_v3.py b/ding/config/PG/gym_hopper_v3.py index 2c81292094..1eb4925ee1 100644 --- a/ding/config/PG/gym_hopper_v3.py +++ b/ding/config/PG/gym_hopper_v3.py @@ -43,8 +43,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial( - ding.envs.gym_env.env, - cfg=dict(env_wrapper=cfg.env.env_wrapper, act_scale=cfg.env.act_scale, rew_clip=cfg.env.rew_clip) -) +env = ding.envs.gym_env.env, diff --git a/ding/config/PG/gym_lunarlander_v2.py b/ding/config/PG/gym_lunarlander_v2.py index ea1e855430..607f6d3fbf 100644 --- a/ding/config/PG/gym_lunarlander_v2.py +++ b/ding/config/PG/gym_lunarlander_v2.py @@ -35,5 +35,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial(ding.envs.gym_env.env, continuous=True) +env = ding.envs.gym_env.env, diff --git a/ding/config/PG/gym_pendulum_v1.py b/ding/config/PG/gym_pendulum_v1.py index e3cd9db474..cd68f13b8e 100644 --- a/ding/config/PG/gym_pendulum_v1.py +++ b/ding/config/PG/gym_pendulum_v1.py @@ -39,5 +39,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial(ding.envs.gym_env.env, cfg=dict(act_scale=cfg.env.act_scale)) +env = ding.envs.gym_env.env, diff --git a/ding/config/PG/gym_walker2d_v3.py b/ding/config/PG/gym_walker2d_v3.py index d71b0fdc5f..a572b348ad 100644 --- a/ding/config/PG/gym_walker2d_v3.py +++ b/ding/config/PG/gym_walker2d_v3.py @@ -43,8 +43,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial( - ding.envs.gym_env.env, - cfg=dict(env_wrapper=cfg.env.env_wrapper, act_scale=cfg.env.act_scale, rew_clip=cfg.env.rew_clip) -) +env = 
ding.envs.gym_env.env, diff --git a/ding/config/PPOF/gym_lunarlander_v2.py b/ding/config/PPOF/gym_lunarlander_v2.py index 6496a505fb..352f12a495 100644 --- a/ding/config/PPOF/gym_lunarlander_v2.py +++ b/ding/config/PPOF/gym_lunarlander_v2.py @@ -10,5 +10,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial(ding.envs.gym_env.env, continuous=True) +env = ding.envs.gym_env.env, diff --git a/ding/config/PPOF/gym_lunarlandercontinuous_v2.py b/ding/config/PPOF/gym_lunarlandercontinuous_v2.py index b19fb19e4b..67603f7997 100644 --- a/ding/config/PPOF/gym_lunarlandercontinuous_v2.py +++ b/ding/config/PPOF/gym_lunarlandercontinuous_v2.py @@ -12,4 +12,4 @@ import ding.envs.gym_env from functools import partial -env = partial(ding.envs.gym_env.env, cfg=dict(act_scale=cfg.act_scale)) +env = partial(ding.envs.gym_env.env, continuous=True) diff --git a/ding/config/PPOOffPolicy/__init__.py b/ding/config/PPOOffPolicy/__init__.py index e2e296bd7d..6fd395d632 100644 --- a/ding/config/PPOOffPolicy/__init__.py +++ b/ding/config/PPOOffPolicy/__init__.py @@ -7,6 +7,7 @@ supported_env_cfg = { gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.cfg, + gym_lunarlandercontinuous_v2.cfg.env.env_id: gym_lunarlandercontinuous_v2.cfg, gym_pongnoframeskip_v4.cfg.env.env_id: gym_pongnoframeskip_v4.cfg, gym_qbertnoframeskip_v4.cfg.env.env_id: gym_qbertnoframeskip_v4.cfg, diff --git a/ding/config/PPOOffPolicy/gym_lunarlandercontinuous_v2.py b/ding/config/PPOOffPolicy/gym_lunarlandercontinuous_v2.py index 5a26d6257b..460368a52d 100644 --- a/ding/config/PPOOffPolicy/gym_lunarlandercontinuous_v2.py +++ b/ding/config/PPOOffPolicy/gym_lunarlandercontinuous_v2.py @@ -5,9 +5,9 @@ seed=0, env=dict( env_id='LunarLanderContinuous-v2', - collector_env_num=4, - evaluator_env_num=8, - n_evaluator_episode=8, + collector_env_num=1, + evaluator_env_num=1, + n_evaluator_episode=1, stop_value=240, act_scale=True, ), @@ -18,25 +18,29 @@ obs_shape=8, action_shape=2, action_space='continuous', - sigma_type = 'conditioned', + sigma_type='independent', + # sigma_type='conditioned', encoder_hidden_size_list=[64, 64, 128], actor_head_hidden_size=128, critic_head_hidden_size=128, actor_head_layer_num=2, critic_head_layer_num=2, share_encoder=False, + policy_activation="tanh", + value_activation="relu", ), learn=dict( update_per_collect=10, batch_size=640, learning_rate=3e-4, value_weight=0.5, - entropy_weight=0.001, - clip_ratio=0.2, + entropy_weight=0.0, + clip_ratio=0.1, nstep=1, nstep_return=False, adv_norm=True, value_norm=False, + ppo_param_init=True, ), collect=dict( n_sample=128, @@ -44,7 +48,10 @@ discount_factor=0.99, gae_lambda=0.95, ), - eval=dict(render=True), + eval=dict( + evaluator=dict(eval_freq=10,), + render=True, + ), ), wandb_logger=dict( gradient_logger=True, video_logger=True, plot_logger=True, action_logger=False, return_logger=False diff --git a/ding/config/PPOOffPolicy/gym_pongnoframeskip_v4.py b/ding/config/PPOOffPolicy/gym_pongnoframeskip_v4.py index eebb27bd7e..cc13c5653b 100644 --- a/ding/config/PPOOffPolicy/gym_pongnoframeskip_v4.py +++ b/ding/config/PPOOffPolicy/gym_pongnoframeskip_v4.py @@ -51,5 +51,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial(ding.envs.gym_env.env, cfg=dict(env_id=cfg.env.env_id, env_wrapper=cfg.env.env_wrapper)) +env = ding.envs.gym_env.env, diff --git a/ding/config/PPOOffPolicy/gym_qbertnoframeskip_v4.py b/ding/config/PPOOffPolicy/gym_qbertnoframeskip_v4.py index 90e07db907..b0e3a4ce20 100644 --- 
a/ding/config/PPOOffPolicy/gym_qbertnoframeskip_v4.py +++ b/ding/config/PPOOffPolicy/gym_qbertnoframeskip_v4.py @@ -45,5 +45,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial(ding.envs.gym_env.env, cfg=dict(env_id=cfg.env.env_id, env_wrapper=cfg.env.env_wrapper)) +env = ding.envs.gym_env.env, diff --git a/ding/config/PPOOffPolicy/gym_spaceInvadersnoframeskip_v4.py b/ding/config/PPOOffPolicy/gym_spaceInvadersnoframeskip_v4.py index 1ad2f01a1f..218ea0a727 100644 --- a/ding/config/PPOOffPolicy/gym_spaceInvadersnoframeskip_v4.py +++ b/ding/config/PPOOffPolicy/gym_spaceInvadersnoframeskip_v4.py @@ -45,5 +45,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial(ding.envs.gym_env.env, cfg=dict(env_id=cfg.env.env_id, env_wrapper=cfg.env.env_wrapper)) +env = ding.envs.gym_env.env, diff --git a/ding/config/SAC/gym_bipedalwalker_v3.py b/ding/config/SAC/gym_bipedalwalker_v3.py index 0bd3b8c5c4..9ec1a9d0af 100644 --- a/ding/config/SAC/gym_bipedalwalker_v3.py +++ b/ding/config/SAC/gym_bipedalwalker_v3.py @@ -44,10 +44,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial( - ding.envs.gym_env.env, cfg=dict( - act_scale=cfg.env.act_scale, - rew_clip=cfg.env.rew_clip, - ) -) +env = ding.envs.gym_env.env, diff --git a/ding/config/SAC/gym_halfcheetah_v3.py b/ding/config/SAC/gym_halfcheetah_v3.py index ba22c115cf..1a1e094e77 100644 --- a/ding/config/SAC/gym_halfcheetah_v3.py +++ b/ding/config/SAC/gym_halfcheetah_v3.py @@ -53,8 +53,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial( - ding.envs.gym_env.env, - cfg=dict(env_wrapper=cfg.env.env_wrapper, act_scale=cfg.env.act_scale, rew_clip=cfg.env.rew_clip) -) +env = ding.envs.gym_env.env, diff --git a/ding/config/SAC/gym_hopper_v3.py b/ding/config/SAC/gym_hopper_v3.py index db7b90e8ab..1bb0d87e47 100644 --- a/ding/config/SAC/gym_hopper_v3.py +++ b/ding/config/SAC/gym_hopper_v3.py @@ -40,8 +40,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial( - ding.envs.gym_env.env, - cfg=dict(env_wrapper=cfg.env.env_wrapper, act_scale=cfg.env.act_scale, rew_clip=cfg.env.rew_clip) -) +env = ding.envs.gym_env.env, diff --git a/ding/config/SAC/gym_pendulum_v1.py b/ding/config/SAC/gym_pendulum_v1.py index 49c81c4a0b..2c7acb462c 100644 --- a/ding/config/SAC/gym_pendulum_v1.py +++ b/ding/config/SAC/gym_pendulum_v1.py @@ -46,5 +46,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial(ding.envs.gym_env.env, cfg=dict(act_scale=cfg.env.act_scale)) +env = ding.envs.gym_env.env, diff --git a/ding/config/SAC/gym_walker2d_v3.py b/ding/config/SAC/gym_walker2d_v3.py index 1d2668c2fc..c476e2e54e 100644 --- a/ding/config/SAC/gym_walker2d_v3.py +++ b/ding/config/SAC/gym_walker2d_v3.py @@ -53,8 +53,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial( - ding.envs.gym_env.env, - cfg=dict(env_wrapper=cfg.env.env_wrapper, act_scale=cfg.env.act_scale, rew_clip=cfg.env.rew_clip) -) +env = ding.envs.gym_env.env, diff --git a/ding/config/TD3/gym_bipedalwalker_v3.py b/ding/config/TD3/gym_bipedalwalker_v3.py index 238222a688..2459148f52 100644 --- a/ding/config/TD3/gym_bipedalwalker_v3.py +++ b/ding/config/TD3/gym_bipedalwalker_v3.py @@ -49,10 +49,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial( - ding.envs.gym_env.env, cfg=dict( - 
act_scale=cfg.env.act_scale, - rew_clip=cfg.env.rew_clip, - ) -) +env = ding.envs.gym_env.env, diff --git a/ding/config/TD3/gym_halfcheetah_v3.py b/ding/config/TD3/gym_halfcheetah_v3.py index 8d0c7dc6fd..a53b1889ac 100644 --- a/ding/config/TD3/gym_halfcheetah_v3.py +++ b/ding/config/TD3/gym_halfcheetah_v3.py @@ -55,8 +55,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial( - ding.envs.gym_env.env, - cfg=dict(env_wrapper=cfg.env.env_wrapper, act_scale=cfg.env.act_scale, rew_clip=cfg.env.rew_clip) -) +env = ding.envs.gym_env.env, diff --git a/ding/config/TD3/gym_hopper_v3.py b/ding/config/TD3/gym_hopper_v3.py index 62791007aa..f1458f54c5 100644 --- a/ding/config/TD3/gym_hopper_v3.py +++ b/ding/config/TD3/gym_hopper_v3.py @@ -34,8 +34,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial( - ding.envs.gym_env.env, - cfg=dict(env_wrapper=cfg.env.env_wrapper, act_scale=cfg.env.act_scale, rew_clip=cfg.env.rew_clip) -) +env = ding.envs.gym_env.env, diff --git a/ding/config/TD3/gym_lunarlandercontinuous_v2.py b/ding/config/TD3/gym_lunarlandercontinuous_v2.py index c876ca1c7d..7586c3ffc8 100644 --- a/ding/config/TD3/gym_lunarlandercontinuous_v2.py +++ b/ding/config/TD3/gym_lunarlandercontinuous_v2.py @@ -47,4 +47,4 @@ import ding.envs.gym_env from functools import partial -env = partial(ding.envs.gym_env.env, cfg=dict(act_scale=cfg.env.act_scale)) +env = partial(ding.envs.gym_env.env, continuous=True) diff --git a/ding/config/TD3/gym_pendulum_v1.py b/ding/config/TD3/gym_pendulum_v1.py index 305de9ea13..18e43a2831 100644 --- a/ding/config/TD3/gym_pendulum_v1.py +++ b/ding/config/TD3/gym_pendulum_v1.py @@ -51,5 +51,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial(ding.envs.gym_env.env, cfg=dict(act_scale=cfg.env.act_scale)) +env = ding.envs.gym_env.env, diff --git a/ding/config/TD3/gym_walker2d_v3.py b/ding/config/TD3/gym_walker2d_v3.py index 155d9332d0..650cbffd1f 100644 --- a/ding/config/TD3/gym_walker2d_v3.py +++ b/ding/config/TD3/gym_walker2d_v3.py @@ -57,8 +57,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -from functools import partial -env = partial( - ding.envs.gym_env.env, - cfg=dict(env_wrapper=cfg.env.env_wrapper, act_scale=cfg.env.act_scale, rew_clip=cfg.env.rew_clip) -) +env = ding.envs.gym_env.env, diff --git a/ding/model/common/__init__.py b/ding/model/common/__init__.py index ea30de717e..d14a4659fb 100755 --- a/ding/model/common/__init__.py +++ b/ding/model/common/__init__.py @@ -2,4 +2,4 @@ QuantileHead, FQFHead, RegressionHead, ReparameterizationHead, MultiHead, BranchingHead, head_cls_map, \ independent_normal_dist, AttentionPolicyHead, PopArtVHead, EnsembleHead from .encoder import ConvEncoder, FCEncoder, IMPALAConvEncoder -from .utils import create_model +from .utils import create_model, get_activation diff --git a/ding/model/common/utils.py b/ding/model/common/utils.py index 494481fa8b..ffe6234ce0 100644 --- a/ding/model/common/utils.py +++ b/ding/model/common/utils.py @@ -1,7 +1,32 @@ import torch +import torch.nn as nn from easydict import EasyDict from ding.utils import import_module, MODEL_REGISTRY +class Lambda(nn.Module): + + def __init__(self, f): + super(Lambda, self).__init__() + self.f = f + + def forward(self, x): + return self.f(x) + + +NONLINEARITIES = { + "tanh": nn.Tanh(), + "relu": nn.ReLU(), + "softplus": nn.Softplus(), + "elu": nn.ELU(), + "square": Lambda(lambda x: x**2), + "identity": Lambda(lambda x: x), +} + +def 
get_activation(name:str): + name=name.lower() + if name not in NONLINEARITIES: + raise ValueError("Unknown activation function {}".format(name)) + return NONLINEARITIES[name] def create_model(cfg: EasyDict) -> torch.nn.Module: """ diff --git a/ding/model/template/vac.py b/ding/model/template/vac.py index cf026e0192..2d277a0da8 100644 --- a/ding/model/template/vac.py +++ b/ding/model/template/vac.py @@ -5,7 +5,7 @@ from copy import deepcopy from ding.utils import SequenceType, squeeze, MODEL_REGISTRY from ..common import ReparameterizationHead, RegressionHead, DiscreteHead, MultiHead, \ - FCEncoder, ConvEncoder, IMPALAConvEncoder + FCEncoder, ConvEncoder, IMPALAConvEncoder, get_activation @MODEL_REGISTRY.register('vac') @@ -29,7 +29,9 @@ def __init__( actor_head_layer_num: int = 1, critic_head_hidden_size: int = 64, critic_head_layer_num: int = 1, - activation: Optional[nn.Module] = nn.ReLU(), + activation: Optional[Union[str, nn.Module]] = nn.ReLU(), + policy_activation: Optional[Union[str, nn.Module]] = None, + value_activation: Optional[Union[str, nn.Module]] = None, norm_type: Optional[str] = None, sigma_type: Optional[str] = 'independent', fixed_sigma_value: Optional[int] = 0.3, @@ -52,9 +54,15 @@ def __init__( - critic_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to critic-nn's ``Head``. - critic_head_layer_num (:obj:`int`): The num of layers used in the network to compute Q value output for critic's nn. - - activation (:obj:`Optional[nn.Module]`): + - activation (:obj:`Optional[Union[str, nn.Module]]`): The type of activation function to use in ``MLP`` the after ``layer_fn``, if ``None`` then default set to ``nn.ReLU()`` + - policy_activation (:obj:`Optional[Union[str, nn.Module]]`): + The type of activation function to use in ``MLP`` the after ``layer_fn`` in actor's nn, + if ``None`` then default set to ``activation`` + - value_activation (:obj:`Optional[Union[str, nn.Module]]`): + The type of activation function to use in ``MLP`` the after ``layer_fn`` in critic's nn, + if ``None`` then default set to ``activation`` - norm_type (:obj:`Optional[str]`): The type of normalization to use, see ``ding.torch_utils.fc_block`` for more details` """ @@ -64,9 +72,19 @@ def __init__( self.obs_shape, self.action_shape = obs_shape, action_shape self.impala_cnn_encoder = impala_cnn_encoder self.share_encoder = share_encoder + if isinstance(activation, str): + activation = get_activation(activation) + if policy_activation is not None and isinstance(policy_activation, str): + policy_activation = get_activation(policy_activation) + else: + policy_activation = activation + if value_activation is not None and isinstance(value_activation, str): + value_activation = get_activation(value_activation) + else: + value_activation = activation # Encoder Type - def new_encoder(outsize): + def new_encoder(outsize, activation): if impala_cnn_encoder: return IMPALAConvEncoder(obs_shape=obs_shape, channels=encoder_hidden_size_list, outsize=outsize) else: @@ -99,7 +117,7 @@ def new_encoder(outsize): else: raise ValueError("illegal encoder instance.") else: - self.encoder = new_encoder(actor_head_hidden_size) + self.encoder = new_encoder(actor_head_hidden_size, activation) else: if encoder: if isinstance(encoder, torch.nn.Module): @@ -108,12 +126,12 @@ def new_encoder(outsize): else: raise ValueError("illegal encoder instance.") else: - self.actor_encoder = new_encoder(actor_head_hidden_size) - self.critic_encoder = new_encoder(critic_head_hidden_size) + self.actor_encoder = 
new_encoder(actor_head_hidden_size, policy_activation) + self.critic_encoder = new_encoder(critic_head_hidden_size, value_activation) # Head Type self.critic_head = RegressionHead( - critic_head_hidden_size, 1, critic_head_layer_num, activation=activation, norm_type=norm_type + critic_head_hidden_size, 1, critic_head_layer_num, activation=value_activation, norm_type=norm_type ) self.action_space = action_space assert self.action_space in ['discrete', 'continuous', 'hybrid'], self.action_space @@ -124,7 +142,7 @@ def new_encoder(outsize): action_shape, actor_head_layer_num, sigma_type=sigma_type, - activation=activation, + activation=policy_activation, norm_type=norm_type, bound_type=bound_type ) @@ -138,7 +156,7 @@ def new_encoder(outsize): actor_head_hidden_size, action_shape, layer_num=actor_head_layer_num, - activation=activation, + activation=policy_activation, norm_type=norm_type ) else: @@ -146,7 +164,7 @@ def new_encoder(outsize): actor_head_hidden_size, action_shape, actor_head_layer_num, - activation=activation, + activation=policy_activation, norm_type=norm_type ) elif self.action_space == 'hybrid': # HPPO @@ -160,7 +178,7 @@ def new_encoder(outsize): actor_head_layer_num, sigma_type=sigma_type, fixed_sigma_value=fixed_sigma_value, - activation=activation, + activation=policy_activation, norm_type=norm_type, bound_type=bound_type, ) @@ -168,7 +186,7 @@ def new_encoder(outsize): actor_head_hidden_size, action_shape.action_type_shape, actor_head_layer_num, - activation=activation, + activation=policy_activation, norm_type=norm_type, ) self.actor_head = nn.ModuleList([actor_action_type, actor_action_args]) diff --git a/ding/policy/ppo.py b/ding/policy/ppo.py index 6d7c7a852c..b491127273 100644 --- a/ding/policy/ppo.py +++ b/ding/policy/ppo.py @@ -685,7 +685,7 @@ class PPOOffPolicy(Policy): priority=False, # (bool) Whether use Importance Sampling Weight to correct biased update. If True, priority must be True. 
priority_IS_weight=False, - # (str) Which kind of action space used in PPOPolicy, ['discrete', 'continuous'] + # (str) Which kind of action space used in PPOPolicy, ["general", "continuous", "discrete", "hybrid"] action_space='discrete', # (bool) Whether to use nstep_return for value loss nstep_return=False, @@ -743,7 +743,7 @@ def _init_learn(self) -> None: self._priority_IS_weight = self._cfg.priority_IS_weight assert not self._priority and not self._priority_IS_weight, "Priority is not implemented in PPO" - assert self._cfg.action_space in ["continuous", "discrete", "hybrid"] + assert self._cfg.action_space in ["general", "continuous", "discrete", "hybrid"] self._action_space = self._cfg.action_space if self._cfg.learn.ppo_param_init: for n, m in self._model.named_modules(): @@ -943,6 +943,7 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: 'cur_lr': self._optimizer.defaults['lr'], 'total_loss': total_loss.item(), 'policy_loss': ppo_loss.policy_loss.item(), + 'value': data['value'].mean().item(), 'value_loss': ppo_loss.value_loss.item(), 'entropy_loss': ppo_loss.entropy_loss.item(), 'adv_abs_max': adv.abs().max().item(), @@ -1126,7 +1127,7 @@ def default_model(self) -> Tuple[str, List[str]]: def _monitor_vars_learn(self) -> List[str]: variables = super()._monitor_vars_learn() + [ - 'policy_loss', 'value_loss', 'entropy_loss', 'adv_abs_max', 'approx_kl', 'clipfrac' + 'policy_loss', 'value', 'value_loss', 'entropy_loss', 'adv_abs_max', 'approx_kl', 'clipfrac' ] if self._action_space == 'continuous': variables += ['mu_mean', 'sigma_mean', 'sigma_grad', 'act'] From ab0fdda3c178731a6b65b8a906d880f443b4cf22 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Mon, 17 Jul 2023 08:27:24 +0000 Subject: [PATCH 164/244] add ppo offpolicy general action modeling --- ding/bonus/ppo_offpolicy.py | 11 +- ding/config/PPOOffPolicy/__init__.py | 1 - .../gym_lunarlandercontinuous_v2.py | 96 ++++++++--- .../functional/advantage_estimator.py | 10 +- .../middleware/functional/collector.py | 3 - .../middleware/functional/evaluator.py | 4 + .../framework/middleware/functional/logger.py | 33 +++- ding/model/common/__init__.py | 2 +- ding/model/common/utils.py | 24 --- ding/model/template/__init__.py | 4 +- ding/model/template/stochastic_policy.py | 35 ++++ ding/model/template/vac.py | 56 ++++++- ding/model/template/value_network.py | 41 +++++ ding/policy/common_utils.py | 9 ++ ding/policy/ppo.py | 149 ++++++++++++----- ding/rl_utils/__init__.py | 3 +- ding/rl_utils/ppo.py | 70 ++++++++ ding/torch_utils/__init__.py | 2 + ding/torch_utils/activation.py | 29 ++++ ding/torch_utils/modules/__init__.py | 6 + ding/torch_utils/modules/distribution.py | 34 ++++ ding/torch_utils/modules/function.py | 25 +++ ding/torch_utils/modules/gaussian.py | 150 ++++++++++++++++++ ding/torch_utils/modules/matrix.py | 52 ++++++ ding/torch_utils/modules/parameter.py | 37 +++++ ding/torch_utils/modules/perceptron.py | 52 ++++++ 26 files changed, 832 insertions(+), 106 deletions(-) create mode 100644 ding/model/template/stochastic_policy.py create mode 100644 ding/model/template/value_network.py create mode 100644 ding/torch_utils/activation.py create mode 100644 ding/torch_utils/modules/__init__.py create mode 100644 ding/torch_utils/modules/distribution.py create mode 100644 ding/torch_utils/modules/function.py create mode 100644 ding/torch_utils/modules/gaussian.py create mode 100644 ding/torch_utils/modules/matrix.py create mode 100644 ding/torch_utils/modules/parameter.py create mode 100644 ding/torch_utils/modules/perceptron.py 
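Note on the diffs below: for the new 'general' action space, the policy itself returns actions together with their log-probabilities, and the PPO losses are computed from those log-probabilities through the ppo_error_general / ppo_policy_error_general helpers added in ding/rl_utils/ppo.py (their bodies are not part of this excerpt). The snippet that follows is a minimal, hypothetical sketch of the clipped-surrogate computation such helpers are assumed to perform, written here only for orientation; the function name clipped_surrogate_loss and the toy tensors are illustrative, not DI-engine APIs.

import torch

def clipped_surrogate_loss(logp_new: torch.Tensor, logp_old: torch.Tensor, adv: torch.Tensor, clip_ratio: float = 0.2) -> torch.Tensor:
    # Importance ratio pi_new(a|s) / pi_old(a|s), computed in log space for numerical stability.
    ratio = torch.exp(logp_new - logp_old)
    # Take the pessimistic (minimum) of the unclipped and clipped surrogate terms, negated to give a loss.
    surr1 = ratio * adv
    surr2 = torch.clamp(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio) * adv
    return -torch.min(surr1, surr2).mean()

# Toy usage with random tensors (batch of 8 transitions).
logp_old = torch.randn(8)
logp_new = logp_old + 0.05 * torch.randn(8)
adv = torch.randn(8)
loss = clipped_surrogate_loss(logp_new, logp_old, adv)
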
diff --git a/ding/bonus/ppo_offpolicy.py b/ding/bonus/ppo_offpolicy.py index eb57e63763..cf9334660c 100644 --- a/ding/bonus/ppo_offpolicy.py +++ b/ding/bonus/ppo_offpolicy.py @@ -12,7 +12,8 @@ from ding.policy import PPOOffPolicy from ding.utils import set_pkg_seed from ding.config import save_config_py, compile_config -from ding.model import VAC +from ding.model import BaseVAC, VAC +from ding.model import StochasticPolicy, VModel from ding.model import model_wrap from ding.data import DequeBuffer from ding.bonus.common import TrainingReturn, EvalReturn @@ -72,7 +73,12 @@ def __init__( os.makedirs(self.exp_name) save_config_py(self.cfg, os.path.join(self.exp_name, 'policy_config.py')) if model is None: - model = VAC(**self.cfg.policy.model) + if hasattr(self.cfg.policy.model, "customized_model") and self.cfg.policy.model.customized_model: + actor = StochasticPolicy(self.cfg.policy.model.actor) + critic = VModel(self.cfg.policy.model.critic) + model = BaseVAC(actor=actor, critic=critic, action_space=self.cfg.policy.action_space) + else: + model = VAC(**self.cfg.policy.model) self.buffer_ = DequeBuffer(size=self.cfg.policy.other.replay_buffer.replay_buffer_size) self.policy = PPOOffPolicy(self.cfg.policy, model=model) if policy_state_dict is not None: @@ -116,6 +122,7 @@ def train( task.use( wandb_online_logger( cfg=self.cfg.wandb_logger, + exp_config=self.cfg, metric_list=self.policy.monitor_vars(), model=self.policy._model, anonymous=True, diff --git a/ding/config/PPOOffPolicy/__init__.py b/ding/config/PPOOffPolicy/__init__.py index 6fd395d632..e2e296bd7d 100644 --- a/ding/config/PPOOffPolicy/__init__.py +++ b/ding/config/PPOOffPolicy/__init__.py @@ -7,7 +7,6 @@ supported_env_cfg = { gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.cfg, - gym_lunarlandercontinuous_v2.cfg.env.env_id: gym_lunarlandercontinuous_v2.cfg, gym_pongnoframeskip_v4.cfg.env.env_id: gym_pongnoframeskip_v4.cfg, gym_qbertnoframeskip_v4.cfg.env.env_id: gym_qbertnoframeskip_v4.cfg, diff --git a/ding/config/PPOOffPolicy/gym_lunarlandercontinuous_v2.py b/ding/config/PPOOffPolicy/gym_lunarlandercontinuous_v2.py index 460368a52d..408db5b7fe 100644 --- a/ding/config/PPOOffPolicy/gym_lunarlandercontinuous_v2.py +++ b/ding/config/PPOOffPolicy/gym_lunarlandercontinuous_v2.py @@ -1,57 +1,101 @@ from easydict import EasyDict +action_shape = 2 +obs_shape = 8 + cfg = dict( exp_name='LunarLanderContinuous-v2-PPOOffPolicy', seed=0, env=dict( env_id='LunarLanderContinuous-v2', - collector_env_num=1, - evaluator_env_num=1, - n_evaluator_episode=1, + collector_env_num=16, + evaluator_env_num=4, + n_evaluator_episode=4, stop_value=240, act_scale=True, ), policy=dict( cuda=True, - action_space='continuous', + action_space='general', model=dict( obs_shape=8, - action_shape=2, - action_space='continuous', - sigma_type='independent', - # sigma_type='conditioned', - encoder_hidden_size_list=[64, 64, 128], - actor_head_hidden_size=128, - critic_head_hidden_size=128, - actor_head_layer_num=2, - critic_head_layer_num=2, - share_encoder=False, - policy_activation="tanh", - value_activation="relu", + action_shape=action_shape, + action_space='general', + customized_model=True, + actor=dict( + model_type='GaussianTanh', + model=dict( + mu_model=dict( + hidden_sizes=[obs_shape, 256, 256], + activation=['relu', 'relu', 'tanh'], + output_size=action_shape, + dropout=0, + layernorm=False, + final_activation='tanh', + scale=5.0, + shrink=0.01, + ), + cov=dict( + dim=action_shape, + functional=True, + random_init=False, + sigma_lambda=dict( + 
hidden_sizes=[obs_shape, 128], + activation='tanh', + output_size=action_shape, + dropout=0, + layernorm=False, + final_activation='tanh', + scale=5.0, + offset=-5.0, + ), + sigma_offdiag=dict( + hidden_sizes=[obs_shape, 128], + activation='tanh', + output_size=int(action_shape * (action_shape - 1) // 2), + dropout=0, + layernorm=False, + ), + ), + ), + ), + critic=dict( + model_num=1, + model=dict( + hidden_sizes=[obs_shape, 512, 256], + activation=['relu', 'softplus', 'softplus'], + output_size=1, + dropout=0, + layernorm=False, + ), + ), ), learn=dict( - update_per_collect=10, - batch_size=640, + update_per_collect=1, + batch_size=1024, learning_rate=3e-4, value_weight=0.5, - entropy_weight=0.0, - clip_ratio=0.1, + entropy_weight=0.01, + clip_ratio=0.05, nstep=1, nstep_return=False, - adv_norm=True, + adv_norm=False, value_norm=False, - ppo_param_init=True, + ppo_param_init=False, + separate_optimizer=True, + weight_decay=0.0, ), collect=dict( - n_sample=128, + n_sample=1024, unroll_len=1, discount_factor=0.99, - gae_lambda=0.95, + gae_lambda=1.0, ), eval=dict( - evaluator=dict(eval_freq=10,), + evaluator=dict(eval_freq=100, ), render=True, - ), + ), + other=dict(replay_buffer=dict(replay_buffer_size=int(128), ), ), ), wandb_logger=dict( gradient_logger=True, video_logger=True, plot_logger=True, action_logger=False, return_logger=False diff --git a/ding/framework/middleware/functional/advantage_estimator.py b/ding/framework/middleware/functional/advantage_estimator.py index 69e6125947..9acafe2f39 100644 --- a/ding/framework/middleware/functional/advantage_estimator.py +++ b/ding/framework/middleware/functional/advantage_estimator.py @@ -82,11 +82,19 @@ def _gae(ctx: "OnlineRLContext"): for d in data: d['obs'] = d['obs'].squeeze(0) d['next_obs'] = d['next_obs'].squeeze(0) - if hasattr(data[0], 'logit'): + if 'logit' in data[0]: for d in data: d['logit'] = d['logit'].squeeze(0) + if 'log_prob' in data[0]: + for d in data: + d['log_prob'] = d['log_prob'].squeeze(0) else: raise RuntimeError("The shape of obs is {}, which is not same as config.".format(data[0]['obs'].shape)) + + if data[0]['action'].dtype in [torch.float16,torch.float32,torch.double] \ + and data[0]['action'].dim() == 2: + for d in data: + d['action'] = d['action'].squeeze(0) for d in data: buffer_.push(d) ctx.trajectories = None diff --git a/ding/framework/middleware/functional/collector.py b/ding/framework/middleware/functional/collector.py index f530bfabd4..62d183e6d8 100644 --- a/ding/framework/middleware/functional/collector.py +++ b/ding/framework/middleware/functional/collector.py @@ -77,9 +77,6 @@ def _inference(ctx: "OnlineRLContext"): obs = {i: obs[i] for i in range(get_shape0(obs))} # TBD inference_output = policy.forward(obs, **ctx.collect_kwargs) - # for key, value in inference_output.items(): - # if value['action'].dim() == 0: - # inference_output[key]['action'] = value['action'].unsqueeze(0) ctx.action = [to_ndarray(v['action']) for v in inference_output.values()] # TBD ctx.inference_output = inference_output diff --git a/ding/framework/middleware/functional/evaluator.py b/ding/framework/middleware/functional/evaluator.py index 7aa654afe6..46e4c76e96 100644 --- a/ding/framework/middleware/functional/evaluator.py +++ b/ding/framework/middleware/functional/evaluator.py @@ -268,6 +268,8 @@ def _evaluate(ctx: Union["OnlineRLContext", "OfflineRLContext"]): if 'episode_info' in timestep.info: eval_monitor.update_info(env_id, timestep.info.episode_info) episode_return = eval_monitor.get_episode_return() + 
episode_return_min = np.min(episode_return) + episode_return_max = np.max(episode_return) episode_return_std = np.std(episode_return) episode_return = np.mean(episode_return) stop_flag = episode_return >= cfg.env.stop_value and ctx.train_iter > 0 @@ -283,6 +285,8 @@ def _evaluate(ctx: Union["OnlineRLContext", "OfflineRLContext"]): raise TypeError("not supported ctx type: {}".format(type(ctx))) ctx.last_eval_iter = ctx.train_iter ctx.eval_value = episode_return + ctx.eval_value_min = episode_return_min + ctx.eval_value_max = episode_return_max ctx.eval_value_std = episode_return_std ctx.last_eval_value = ctx.eval_value ctx.eval_output = {'episode_return': episode_return} diff --git a/ding/framework/middleware/functional/logger.py b/ding/framework/middleware/functional/logger.py index 1d4b2068e1..69dc527ca0 100644 --- a/ding/framework/middleware/functional/logger.py +++ b/ding/framework/middleware/functional/logger.py @@ -120,6 +120,7 @@ def _logger(ctx: "OfflineRLContext"): def wandb_online_logger( record_path: str = None, cfg: Union[dict, EasyDict] = None, + exp_config: Union[dict, EasyDict] = None, metric_list: Optional[List[str]] = None, env: Optional[BaseEnvManagerV2] = None, model: Optional[torch.nn.Module] = None, @@ -152,16 +153,28 @@ def wandb_online_logger( metric_list = ["q_value", "target q_value", "loss", "lr", "entropy", "target_q_value", "td_error"] # Initialize wandb with default settings # Settings can be covered by calling wandb.init() at the top of the script - if not wandb_sweep: - if anonymous: - wandb.init(project=project_name, reinit=True, anonymous="must") + if exp_config: + if not wandb_sweep: + if anonymous: + wandb.init(project=project_name, config=exp_config, reinit=True, anonymous="must") + else: + wandb.init(project=project_name, config=exp_config, reinit=True) else: - wandb.init(project=project_name, reinit=True) + if anonymous: + wandb.init(project=project_name, config=exp_config, anonymous="must") + else: + wandb.init(project=project_name, config=exp_config) else: - if anonymous: - wandb.init(project=project_name, anonymous="must") + if not wandb_sweep: + if anonymous: + wandb.init(project=project_name, reinit=True, anonymous="must") + else: + wandb.init(project=project_name, reinit=True) else: - wandb.init(project=project_name) + if anonymous: + wandb.init(project=project_name, anonymous="must") + else: + wandb.init(project=project_name) plt.switch_backend('agg') if cfg is None: cfg = EasyDict( @@ -185,7 +198,7 @@ def wandb_online_logger( if env is not None and cfg.video_logger is True and record_path is not None: env.enable_save_replay(replay_path=record_path) if cfg.gradient_logger: - wandb.watch(model) + wandb.watch(model, log="all", log_freq=100, log_graph=True) else: one_time_warning( "If you want to use wandb to visualize the gradient, please set gradient_logger = True in the config."
@@ -225,6 +238,8 @@ def _plot(ctx: "OnlineRLContext"): if ctx.eval_value != -np.inf: info_for_logging.update( { + "episode return min": ctx.eval_value_min, + "episode return max": ctx.eval_value_max, "episode return mean": ctx.eval_value, "episode return std": ctx.eval_value_std, "train iter": ctx.train_iter, @@ -472,6 +487,8 @@ def _plot(ctx: "OnlineRLContext"): if ctx.eval_value != -np.inf: info_for_logging.update( { + "episode return min": ctx.eval_value_min, + "episode return max": ctx.eval_value_max, "episode return mean": ctx.eval_value, "episode return std": ctx.eval_value_std, "train iter": ctx.train_iter, diff --git a/ding/model/common/__init__.py b/ding/model/common/__init__.py index d14a4659fb..ea30de717e 100755 --- a/ding/model/common/__init__.py +++ b/ding/model/common/__init__.py @@ -2,4 +2,4 @@ QuantileHead, FQFHead, RegressionHead, ReparameterizationHead, MultiHead, BranchingHead, head_cls_map, \ independent_normal_dist, AttentionPolicyHead, PopArtVHead, EnsembleHead from .encoder import ConvEncoder, FCEncoder, IMPALAConvEncoder -from .utils import create_model, get_activation +from .utils import create_model diff --git a/ding/model/common/utils.py b/ding/model/common/utils.py index ffe6234ce0..4314c49c94 100644 --- a/ding/model/common/utils.py +++ b/ding/model/common/utils.py @@ -3,30 +3,6 @@ from easydict import EasyDict from ding.utils import import_module, MODEL_REGISTRY -class Lambda(nn.Module): - - def __init__(self, f): - super(Lambda, self).__init__() - self.f = f - - def forward(self, x): - return self.f(x) - - -NONLINEARITIES = { - "tanh": nn.Tanh(), - "relu": nn.ReLU(), - "softplus": nn.Softplus(), - "elu": nn.ELU(), - "square": Lambda(lambda x: x**2), - "identity": Lambda(lambda x: x), -} - -def get_activation(name:str): - name=name.lower() - if name not in NONLINEARITIES: - raise ValueError("Unknown activation function {}".format(name)) - return NONLINEARITIES[name] def create_model(cfg: EasyDict) -> torch.nn.Module: """ diff --git a/ding/model/template/__init__.py b/ding/model/template/__init__.py index dc3ff9d5b4..6cf750aae7 100755 --- a/ding/model/template/__init__.py +++ b/ding/model/template/__init__.py @@ -2,7 +2,7 @@ from .q_learning import DQN, RainbowDQN, QRDQN, IQN, FQF, DRQN, C51DQN, BDQ from .qac import QAC, DiscreteQAC from .pdqn import PDQN -from .vac import VAC +from .vac import BaseVAC, VAC from .bc import DiscreteBC, ContinuousBC from .pg import PG # algorithm-specific @@ -25,3 +25,5 @@ from .procedure_cloning import ProcedureCloningMCTS, ProcedureCloningBFS from .bcq import BCQ from .edac import QACEnsemble +from .value_network import QModel, VModel +from .stochastic_policy import StochasticPolicy diff --git a/ding/model/template/stochastic_policy.py b/ding/model/template/stochastic_policy.py new file mode 100644 index 0000000000..2a2eb0411e --- /dev/null +++ b/ding/model/template/stochastic_policy.py @@ -0,0 +1,35 @@ +import torch +from torch import nn +from ding.torch_utils import Gaussian, GaussianTanh + + +class StochasticPolicy(nn.Module): + + def __init__(self, cfg): + super().__init__() + self.cfg = cfg + if cfg.model_type == 'Gaussian': + self.model = Gaussian(cfg.model) + elif cfg.model_type == 'GaussianTanh': + self.model = GaussianTanh(cfg.model) + else: + raise NotImplementedError + + def forward(self, obs): + action, log_prob = self.model(obs) + return action, log_prob + + def log_prob(self, action, obs): + return self.model.log_prob(action, obs) + + def sample(self, obs, sample_shape=torch.Size()): + return self.model.sample(obs, 
sample_shape) + + def rsample(self, obs, sample_shape=torch.Size()): + return self.model.rsample(obs, sample_shape) + + def entropy(self, obs): + return self.model.entropy(obs) + + def dist(self, obs): + return self.model.dist(obs) diff --git a/ding/model/template/vac.py b/ding/model/template/vac.py index 2d277a0da8..113097e128 100644 --- a/ding/model/template/vac.py +++ b/ding/model/template/vac.py @@ -3,9 +3,63 @@ import torch import torch.nn as nn from copy import deepcopy +from ding.torch_utils import get_activation from ding.utils import SequenceType, squeeze, MODEL_REGISTRY from ..common import ReparameterizationHead, RegressionHead, DiscreteHead, MultiHead, \ - FCEncoder, ConvEncoder, IMPALAConvEncoder, get_activation + FCEncoder, ConvEncoder, IMPALAConvEncoder + + +@MODEL_REGISTRY.register('base_vac') +class BaseVAC(nn.Module): + r""" + Overview: + The VAC model. + Interfaces: + ``__init__``, ``forward``, ``compute_actor``, ``compute_critic`` + """ + mode = ['compute_actor', 'compute_critic', 'compute_actor_critic'] + + def __init__( + self, + actor: nn.Module, + critic: nn.Module, + action_space: str, + ) -> None: + super(BaseVAC, self).__init__() + self.actor = actor + self.critic = critic + self.action_space = action_space + + def forward(self, inputs: Union[torch.Tensor, Dict], mode: str): + assert mode in self.mode, "not support forward mode: {}/{}".format(mode, self.mode) + return getattr(self, mode)(inputs) + + def compute_actor(self, x: torch.Tensor): + if self.action_space == 'discrete': + raise NotImplementedError + elif self.action_space == 'continuous': + raise NotImplementedError + elif self.action_space == 'general': + action, log_prob = self.actor(x) + return {'action': action, 'log_prob': log_prob} + else: + raise NotImplementedError + + def compute_critic(self, x: torch.Tensor): + value = self.critic(x) + return {'value': value} + + def compute_actor_critic(self, x: torch.Tensor): + if self.action_space == 'discrete': + raise NotImplementedError + elif self.action_space == 'continuous': + raise NotImplementedError + elif self.action_space == 'general': + action, log_prob = self.actor(x) + value = self.critic(x) + return {'action': action, 'log_prob': log_prob, 'value': value} + else: + raise NotImplementedError @MODEL_REGISTRY.register('vac') diff --git a/ding/model/template/value_network.py b/ding/model/template/value_network.py new file mode 100644 index 0000000000..958f066505 --- /dev/null +++ b/ding/model/template/value_network.py @@ -0,0 +1,41 @@ +import torch +from torch import nn +from ding.torch_utils import multilayer_perceptron + + +class QModel(nn.Module): + + def __init__(self, cfg): + super().__init__() + self.cfg = cfg + self.model_num = cfg.model_num if hasattr(cfg, 'model_num') else 1 + self.models = nn.ModuleList([multilayer_perceptron(cfg.model) for _ in range(self.model_num)]) + + def forward(self, obs, action): + if self.model_num == 1: + return self.models[0](torch.cat((obs, action), dim=1)).squeeze(dim=1) + else: + return torch.cat([model(torch.cat((obs, action), dim=1)) for model in self.models], dim=1) + + def min_q(self, obs, action): + return torch.min( + input=torch.cat([model(torch.cat((obs, action), dim=1)) for model in self.models], dim=1), dim=1 + ).values + + +class VModel(nn.Module): + + def __init__(self, cfg): + super().__init__() + self.cfg = cfg + self.model_num = cfg.model_num if hasattr(cfg, 'model_num') else 1 + self.models = nn.ModuleList([multilayer_perceptron(cfg.model) for _ in range(self.model_num)]) + + def 
forward(self, obs): + if self.model_num == 1: + return self.models[0](obs).squeeze(dim=1) + else: + return torch.cat([model(obs) for model in self.models], dim=1) + + def min_q(self, obs): + return torch.min(input=torch.cat([model(obs) for model in self.models], dim=1), dim=1).values diff --git a/ding/policy/common_utils.py b/ding/policy/common_utils.py index 714583e63f..4f5193803a 100644 --- a/ding/policy/common_utils.py +++ b/ding/policy/common_utils.py @@ -17,10 +17,19 @@ def default_preprocess_learn( data = default_collate(data, cat_1dim=True) # for discrete action else: data = default_collate(data, cat_1dim=False) # for continuous action + if 'value' in data and data['value'].dim() == 2 and data['value'].shape[1] == 1: + data['value'] = data['value'].squeeze(-1) + if 'adv' in data and data['adv'].dim() == 2 and data['adv'].shape[1] == 1: + data['adv'] = data['adv'].squeeze(-1) + if ignore_done: data['done'] = torch.zeros_like(data['done']).float() else: data['done'] = data['done'].float() + + if data['done'].dim() == 2 and data['done'].shape[1] == 1: + data['done'] = data['done'].squeeze(-1) + if use_priority_IS_weight: assert use_priority, "Use IS Weight correction, but Priority is not used." if use_priority and use_priority_IS_weight: diff --git a/ding/policy/ppo.py b/ding/policy/ppo.py index b491127273..7969f06717 100644 --- a/ding/policy/ppo.py +++ b/ding/policy/ppo.py @@ -7,7 +7,8 @@ from ding.torch_utils import Adam, to_device, to_dtype, unsqueeze, ContrastiveLoss from ding.rl_utils import ppo_data, ppo_error, ppo_policy_error, ppo_policy_data, get_gae_with_default_last_value, \ v_nstep_td_data, v_nstep_td_error, get_nstep_return_data, get_train_sample, gae, gae_data, ppo_error_continuous, \ - get_gae, ppo_policy_error_continuous + get_gae, ppo_policy_error_continuous, ppo_error_general, ppo_policy_error_general, ppo_data_general, \ + ppo_policy_data_general from ding.model import model_wrap from ding.utils import POLICY_REGISTRY, split_data_generator, RunningMeanStd from ding.utils.data import default_collate, default_decollate @@ -699,6 +700,7 @@ class PPOOffPolicy(Policy): update_per_collect=5, batch_size=64, learning_rate=0.001, + separate_optimizer=False, # ============================================================== # The following configs is algorithm-specific # ============================================================== @@ -715,6 +717,7 @@ class PPOOffPolicy(Policy): grad_clip_type='clip_norm', grad_clip_value=0.5, ignore_done=False, + weight_decay=0.0, ), collect=dict( # (int) Only one of [n_sample, n_episode] shoule be set @@ -742,10 +745,11 @@ def _init_learn(self) -> None: self._priority = self._cfg.priority self._priority_IS_weight = self._cfg.priority_IS_weight assert not self._priority and not self._priority_IS_weight, "Priority is not implemented in PPO" - + assert self._cfg.action_space in ["general", "continuous", "discrete", "hybrid"] self._action_space = self._cfg.action_space - if self._cfg.learn.ppo_param_init: + + if self._action_space != "general" and self._cfg.learn.ppo_param_init: for n, m in self._model.named_modules(): if isinstance(m, torch.nn.Linear): torch.nn.init.orthogonal_(m.weight) @@ -754,7 +758,7 @@ def _init_learn(self) -> None: # init log sigma if self._action_space == 'continuous': if hasattr(self._model.actor_head, 'log_sigma_param'): - torch.nn.init.constant_(self._model.actor_head.log_sigma_param, -0.5) + torch.nn.init.constant_(self._model.actor_head.log_sigma_param, -2.0) elif self._action_space == 'hybrid': # actor_head[1]: 
ReparameterizationHead, for action_args if hasattr(self._model.actor_head[1], 'log_sigma_param'): torch.nn.init.constant_(self._model.actor_head[1].log_sigma_param, -0.5) @@ -773,12 +777,28 @@ def _init_learn(self) -> None: m.weight.data.copy_(0.01 * m.weight.data) # Optimizer - self._optimizer = Adam( - self._model.parameters(), - lr=self._cfg.learn.learning_rate, - grad_clip_type=self._cfg.learn.grad_clip_type, - clip_value=self._cfg.learn.grad_clip_value - ) + if self._cfg.learn.separate_optimizer: + self._actor_optimizer = Adam( + self._model.actor.parameters(), + lr=self._cfg.learn.learning_rate, + grad_clip_type=self._cfg.learn.grad_clip_type, + clip_value=self._cfg.learn.grad_clip_value, + weight_decay=self._cfg.learn.weight_decay, + ) + self._critic_optimizer = Adam( + self._model.critic.parameters(), + lr=self._cfg.learn.learning_rate, + grad_clip_type=self._cfg.learn.grad_clip_type, + clip_value=self._cfg.learn.grad_clip_value, + ) + else: + self._optimizer = Adam( + self._model.parameters(), + lr=self._cfg.learn.learning_rate, + grad_clip_type=self._cfg.learn.grad_clip_type, + clip_value=self._cfg.learn.grad_clip_value, + weight_decay=self._cfg.learn.weight_decay, + ) self._learn_model = model_wrap(self._model, wrapper_name='base') @@ -830,9 +850,10 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: # normal ppo if not self._nstep_return: - output = self._learn_model.forward(data['obs'], mode='compute_actor_critic') + if self._action_space != 'general': + output = self._learn_model.forward(data['obs'], mode='compute_actor_critic') adv = data['adv'] - + if self._adv_norm: # Normalize advantage in a total train_batch adv = (adv - adv.mean()) / (adv.std() + 1e-8) @@ -852,8 +873,8 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: elif self._action_space == 'hybrid': # discrete part (discrete policy loss and entropy loss) ppo_discrete_batch = ppo_policy_data( - output['logit']['action_type'], data['logit']['action_type'], data['action']['action_type'], - adv, data['weight'] + output['logit']['action_type'], data['logit']['action_type'], data['action']['action_type'], adv, + data['weight'] ) ppo_discrete_loss, ppo_discrete_info = ppo_policy_error(ppo_discrete_batch, self._clip_ratio) # continuous part (continuous policy loss and entropy loss, value loss) @@ -861,9 +882,7 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: output['logit']['action_args'], data['logit']['action_args'], data['action']['action_args'], output['value'], data['value'], adv, data['return'], data['weight'] ) - ppo_continuous_loss, ppo_continuous_info = ppo_error_continuous( - ppo_continuous_batch, self._clip_ratio - ) + ppo_continuous_loss, ppo_continuous_info = ppo_error_continuous(ppo_continuous_batch, self._clip_ratio) # sum discrete and continuous loss ppo_loss = type(ppo_continuous_loss)( ppo_continuous_loss.policy_loss + ppo_discrete_loss.policy_loss, ppo_continuous_loss.value_loss, @@ -873,11 +892,28 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: max(ppo_continuous_info.approx_kl, ppo_discrete_info.approx_kl), max(ppo_continuous_info.clipfrac, ppo_discrete_info.clipfrac) ) + elif self._action_space == 'general': + entropy = self._learn_model.actor.entropy(data['obs']) + log_prob = self._learn_model.actor.log_prob(action=data['action'], obs=data['obs']) + value = self._learn_model.critic(data['obs']) + ppodata = ppo_data_general( + log_prob, data['log_prob'], value, data['value'], data['adv'], data['return'], data['weight'] + ) + ppo_loss, ppo_info = ppo_error_general( 
+ data=ppodata, entropy=entropy, clip_ratio=self._clip_ratio, use_value_clip=False + ) + wv, we = self._value_weight, self._entropy_weight total_loss = ppo_loss.policy_loss + wv * ppo_loss.value_loss - we * ppo_loss.entropy_loss + if self._cfg.learn.separate_optimizer: + actor_loss = ppo_loss.policy_loss - we * ppo_loss.entropy_loss + print(f"actor_loss:[{actor_loss}]") + critic_loss = ppo_loss.value_loss + print(f"critic_loss:[{critic_loss}]") else: - output = self._learn_model.forward(data['obs'], mode='compute_actor') + if self._action_space != 'general': + output = self._learn_model.forward(data['obs'], mode='compute_actor') adv = data['adv'] if self._adv_norm: # Normalize advantage in a total train_batch @@ -893,16 +929,18 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: elif self._action_space == 'hybrid': # discrete part (discrete policy loss and entropy loss) ppo_discrete_data = ppo_policy_data( - output['logit']['action_type'], data['logit']['action_type'], data['action']['action_type'], - adv, data['weight'] + output['logit']['action_type'], data['logit']['action_type'], data['action']['action_type'], adv, + data['weight'] ) ppo_discrete_loss, ppo_discrete_info = ppo_policy_error(ppo_discrete_data, self._clip_ratio) # continuous part (continuous policy loss and entropy loss, value loss) ppo_continuous_data = ppo_policy_data( - output['logit']['action_args'], data['logit']['action_args'], data['action']['action_args'], - adv, data['weight'] + output['logit']['action_args'], data['logit']['action_args'], data['action']['action_args'], adv, + data['weight'] + ) + ppo_continuous_loss, ppo_continuous_info = ppo_policy_error_continuous( + ppo_continuous_data, self._clip_ratio ) - ppo_continuous_loss, ppo_continuous_info = ppo_policy_error_continuous(ppo_continuous_data, self._clip_ratio) # sum discrete and continuous loss ppo_policy_loss = type(ppo_continuous_loss)( ppo_continuous_loss.policy_loss + ppo_discrete_loss.policy_loss, @@ -912,7 +950,14 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: max(ppo_continuous_info.approx_kl, ppo_discrete_info.approx_kl), max(ppo_continuous_info.clipfrac, ppo_discrete_info.clipfrac) ) - + elif self._action_space == 'general': + entropy = self._learn_model.actor.entropy(data['obs']) + log_prob = self._learn_model.actor.log_prob(action=data['action'], obs=data['obs']) + ppodata = ppo_policy_data_general(log_prob, data['log_prob'], adv, data['weight']) + ppo_policy_loss, ppo_info = ppo_policy_error_general( + data=ppodata, entropy=entropy, clip_ratio=self._clip_ratio + ) + wv, we = self._value_weight, self._entropy_weight next_obs = data.get('next_obs') value_gamma = data.get('value_gamma') @@ -932,15 +977,26 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: ppo_loss_data = namedtuple('ppo_loss', ['policy_loss', 'value_loss', 'entropy_loss']) ppo_loss = ppo_loss_data(ppo_policy_loss.policy_loss, critic_loss, ppo_policy_loss.entropy_loss) total_loss = ppo_policy_loss.policy_loss + wv * critic_loss - we * ppo_policy_loss.entropy_loss + if self._cfg.learn.separate_optimizer: + actor_loss = ppo_policy_loss.policy_loss - we * ppo_policy_loss.entropy_loss # ==================== # PPO update # ==================== - self._optimizer.zero_grad() - total_loss.backward() - self._optimizer.step() + if self._cfg.learn.separate_optimizer: + self._actor_optimizer.zero_grad() + actor_loss.backward() + self._actor_optimizer.step() + self._critic_optimizer.zero_grad() + critic_loss.backward() + self._critic_optimizer.step() + else: + 
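When separate_optimizer is enabled, the branch above splits the PPO objective into an actor term (policy loss minus the weighted entropy bonus) and a critic term (the value loss), each stepped by its own Adam over the actor or critic parameters. Below is a minimal, self-contained sketch of that two-optimizer update pattern, using toy stand-in modules and surrogate losses rather than the policy's real PPO terms.

import torch
import torch.nn as nn

# Toy stand-ins for the policy's actor and critic networks.
actor = nn.Linear(4, 2)
critic = nn.Linear(4, 1)
actor_optimizer = torch.optim.Adam(actor.parameters(), lr=3e-4)
critic_optimizer = torch.optim.Adam(critic.parameters(), lr=3e-4)

obs, adv, ret = torch.randn(8, 4), torch.randn(8), torch.randn(8)
entropy_weight = 0.01

# Surrogate losses standing in for the PPO policy/entropy/value terms computed above.
dist = torch.distributions.Categorical(logits=actor(obs))
actions = dist.sample()
policy_loss = -(dist.log_prob(actions) * adv).mean()
entropy_loss = dist.entropy().mean()
value_loss = (critic(obs).squeeze(-1) - ret).pow(2).mean()

# Actor and critic are updated independently, as in the separate_optimizer branch.
actor_loss = policy_loss - entropy_weight * entropy_loss
actor_optimizer.zero_grad()
actor_loss.backward()
actor_optimizer.step()

critic_optimizer.zero_grad()
value_loss.backward()
critic_optimizer.step()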
self._optimizer.zero_grad() + total_loss.backward() + self._optimizer.step() return_info = { - 'cur_lr': self._optimizer.defaults['lr'], + 'cur_lr': self._optimizer.defaults['lr'] + if not self._cfg.learn.separate_optimizer else self._actor_optimizer.defaults['lr'], 'total_loss': total_loss.item(), 'policy_loss': ppo_loss.policy_loss.item(), 'value': data['value'].mean().item(), @@ -961,14 +1017,26 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: return return_info def _state_dict_learn(self) -> Dict[str, Any]: - return { - 'model': self._learn_model.state_dict(), - 'optimizer': self._optimizer.state_dict(), - } + if self._cfg.learn.separate_optimizer: + return { + 'model': self._learn_model.state_dict(), + 'actor_optimizer': self._actor_optimizer.state_dict(), + 'critic_optimizer': self._critic_optimizer.state_dict(), + } + else: + return { + 'model': self._learn_model.state_dict(), + 'optimizer': self._optimizer.state_dict(), + } def _load_state_dict_learn(self, state_dict: Dict[str, Any]) -> None: - self._learn_model.load_state_dict(state_dict['model']) - self._optimizer.load_state_dict(state_dict['optimizer']) + if self._cfg.learn.separate_optimizer: + self._learn_model.load_state_dict(state_dict['model']) + self._actor_optimizer.load_state_dict(state_dict['actor_optimizer']) + self._critic_optimizer.load_state_dict(state_dict['critic_optimizer']) + else: + self._learn_model.load_state_dict(state_dict['model']) + self._optimizer.load_state_dict(state_dict['optimizer']) def _init_collect(self) -> None: r""" @@ -977,7 +1045,7 @@ def _init_collect(self) -> None: Init traj and unroll length, collect model. """ self._unroll_len = self._cfg.collect.unroll_len - assert self._cfg.action_space in ["continuous", "discrete", "hybrid"] + assert self._cfg.action_space in ["general", "continuous", "discrete", "hybrid"] self._action_space = self._cfg.action_space if self._action_space == 'continuous': self._collect_model = model_wrap(self._model, wrapper_name='reparam_sample') @@ -985,6 +1053,8 @@ def _init_collect(self) -> None: self._collect_model = model_wrap(self._model, wrapper_name='multinomial_sample') elif self._action_space == 'hybrid': self._collect_model = model_wrap(self._model, wrapper_name='hybrid_reparam_multinomial_sample') + elif self._action_space == 'general': + self._collect_model = model_wrap(self._model, wrapper_name='base') self._collect_model.reset() self._gamma = self._cfg.collect.discount_factor self._gae_lambda = self._cfg.collect.gae_lambda @@ -1031,12 +1101,15 @@ def _process_transition(self, obs: Any, model_output: dict, timestep: namedtuple transition = { 'obs': obs, 'next_obs': timestep.obs, - 'logit': model_output['logit'], 'action': model_output['action'], 'value': model_output['value'], 'reward': timestep.reward, 'done': timestep.done, } + if model_output.get('logit', None) is not None: + transition['logit'] = model_output['logit'] + if model_output.get('log_prob', None) is not None: + transition['log_prob'] = model_output['log_prob'] return transition def _get_train_sample(self, data: list) -> Union[None, List[Any]]: @@ -1088,7 +1161,7 @@ def _init_eval(self) -> None: Evaluate mode init method. Called by ``self.__init__``. Init eval model with argmax strategy. 
""" - assert self._cfg.action_space in ["continuous", "discrete", "hybrid"] + assert self._cfg.action_space in ["general", "continuous", "discrete", "hybrid"] self._action_space = self._cfg.action_space if self._action_space == 'continuous': self._eval_model = model_wrap(self._model, wrapper_name='deterministic_sample') @@ -1096,6 +1169,8 @@ def _init_eval(self) -> None: self._eval_model = model_wrap(self._model, wrapper_name='argmax_sample') elif self._action_space == 'hybrid': self._eval_model = model_wrap(self._model, wrapper_name='hybrid_deterministic_argmax_sample') + elif self._action_space == 'general': + self._eval_model = model_wrap(self._model, wrapper_name='base') self._eval_model.reset() def _forward_eval(self, data: dict) -> dict: diff --git a/ding/rl_utils/__init__.py b/ding/rl_utils/__init__.py index 381ee601f7..2fa7e77c91 100644 --- a/ding/rl_utils/__init__.py +++ b/ding/rl_utils/__init__.py @@ -1,6 +1,7 @@ from .exploration import get_epsilon_greedy_fn, create_noise_generator from .ppo import ppo_data, ppo_loss, ppo_info, ppo_policy_data, ppo_policy_error, ppo_value_data, ppo_value_error,\ - ppo_error, ppo_error_continuous, ppo_policy_error_continuous + ppo_error, ppo_error_continuous, ppo_policy_error_continuous, ppo_error_general, ppo_policy_error_general, ppo_data_general, \ + ppo_policy_data_general from .ppg import ppg_data, ppg_joint_loss, ppg_joint_error from .gae import gae_data, gae from .a2c import a2c_data, a2c_error, a2c_error_continuous diff --git a/ding/rl_utils/ppo.py b/ding/rl_utils/ppo.py index 5b62d5a489..6cd5da3355 100644 --- a/ding/rl_utils/ppo.py +++ b/ding/rl_utils/ppo.py @@ -1,6 +1,7 @@ from collections import namedtuple from typing import Optional, Tuple import torch +import torch.nn as nn from torch.distributions import Independent, Normal from ding.hpc_rl import hpc_wrapper @@ -12,6 +13,10 @@ ppo_loss = namedtuple('ppo_loss', ['policy_loss', 'value_loss', 'entropy_loss']) ppo_policy_loss = namedtuple('ppo_policy_loss', ['policy_loss', 'entropy_loss']) ppo_info = namedtuple('ppo_info', ['approx_kl', 'clipfrac']) +ppo_data_general = namedtuple( + 'ppo_data_general', ['logp_new', 'logp_old', 'value_new', 'value_old', 'adv', 'return_', 'weight'] +) +ppo_policy_data_general = namedtuple('ppo_policy_data_general', ['logp_new', 'logp_old', 'adv', 'weight']) def shape_fn_ppo(args, kwargs): @@ -260,3 +265,68 @@ def ppo_policy_error_continuous(data: namedtuple, clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) clipfrac = torch.as_tensor(clipped).float().mean().item() return ppo_policy_loss(policy_loss, entropy_loss), ppo_info(approx_kl, clipfrac) + + +def ppo_error_general( + data: namedtuple, + entropy, + clip_ratio: float = 0.2, + use_value_clip: bool = True, + dual_clip: Optional[float] = None +) -> Tuple[namedtuple, namedtuple]: + assert dual_clip is None or dual_clip > 1.0, "dual_clip value must be greater than 1.0, but get value: {}".format( + dual_clip + ) + logp_new, logp_old, value_new, value_old, adv, return_, weight = data + if weight is None: + weight = torch.ones_like(adv) + entropy_loss = (entropy * weight).mean() + # policy_loss + ratio = torch.exp(logp_new - logp_old) + surr1 = ratio * adv + surr2 = ratio.clamp(1 - clip_ratio, 1 + clip_ratio) * adv + if dual_clip is not None: + policy_loss = (-torch.max(torch.min(surr1, surr2), dual_clip * adv) * weight).mean() + else: + policy_loss = (-torch.min(surr1, surr2) * weight).mean() + with torch.no_grad(): + approx_kl = (logp_old - logp_new).mean().item() + clipped = ratio.gt(1 + 
clip_ratio) | ratio.lt(1 - clip_ratio) + clipfrac = torch.as_tensor(clipped).float().mean().item() + # value_loss + if use_value_clip: + value_clip = value_old + (value_new - value_old).clamp(-clip_ratio, clip_ratio) + v1 = (return_ - value_new).pow(2) + v2 = (return_ - value_clip).pow(2) + value_loss = 0.5 * (torch.max(v1, v2) * weight).mean() + else: + value_loss = 0.5 * ((return_ - value_new).pow(2) * weight).mean() + + return ppo_loss(policy_loss, value_loss, entropy_loss), ppo_info(approx_kl, clipfrac) + + +def ppo_policy_error_general(data: namedtuple, + entropy, + clip_ratio: float = 0.2, + dual_clip: Optional[float] = None) -> Tuple[namedtuple, namedtuple]: + assert dual_clip is None or dual_clip > 1.0, "dual_clip value must be greater than 1.0, but get value: {}".format( + dual_clip + ) + logp_new, logp_old, adv, weight = data + if weight is None: + weight = torch.ones_like(adv) + entropy_loss = (entropy * weight).mean() + # policy_loss + ratio = torch.exp(logp_new - logp_old) + surr1 = ratio * adv + surr2 = ratio.clamp(1 - clip_ratio, 1 + clip_ratio) * adv + if dual_clip is not None: + policy_loss = (-torch.max(torch.min(surr1, surr2), dual_clip * adv) * weight).mean() + else: + policy_loss = (-torch.min(surr1, surr2) * weight).mean() + with torch.no_grad(): + approx_kl = (logp_old - logp_new).mean().item() + clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) + clipfrac = torch.as_tensor(clipped).float().mean().item() + + return ppo_policy_loss(policy_loss, entropy_loss), ppo_info(approx_kl, clipfrac) diff --git a/ding/torch_utils/__init__.py b/ding/torch_utils/__init__.py index 9c7b677143..c28d593725 100755 --- a/ding/torch_utils/__init__.py +++ b/ding/torch_utils/__init__.py @@ -1,3 +1,4 @@ +from .activation import get_activation from .checkpoint_helper import build_checkpoint_helper, CountVar, auto_checkpoint from .data_helper import to_device, to_tensor, to_ndarray, to_list, to_dtype, same_shape, tensor_to_list, \ build_log_buffer, CudaFetcher, get_tensor_data, unsqueeze, squeeze, get_null_data, get_shape0, to_item @@ -10,3 +11,4 @@ from .math_helper import cov from .dataparallel import DataParallel from .reshape_helper import fold_batch, unfold_batch, unsqueeze_repeat +from .modules import * \ No newline at end of file diff --git a/ding/torch_utils/activation.py b/ding/torch_utils/activation.py new file mode 100644 index 0000000000..c0434e27af --- /dev/null +++ b/ding/torch_utils/activation.py @@ -0,0 +1,29 @@ +import torch +import torch.nn as nn + + +class Lambda(nn.Module): + + def __init__(self, f): + super(Lambda, self).__init__() + self.f = f + + def forward(self, x): + return self.f(x) + + +NONLINEARITIES = { + "tanh": nn.Tanh(), + "relu": nn.ReLU(), + "softplus": nn.Softplus(), + "elu": nn.ELU(), + "square": Lambda(lambda x: x ** 2), + "identity": Lambda(lambda x: x), +} + + +def get_activation(name: str): + name = name.lower() + if name not in NONLINEARITIES: + raise ValueError("Unknown activation function {}".format(name)) + return NONLINEARITIES[name] diff --git a/ding/torch_utils/modules/__init__.py b/ding/torch_utils/modules/__init__.py new file mode 100644 index 0000000000..66c6778ab7 --- /dev/null +++ b/ding/torch_utils/modules/__init__.py @@ -0,0 +1,6 @@ +from .parameter import NonegativeParameter, TanhParameter +from .perceptron import multilayer_perceptron +from .distribution import Distribution +from .gaussian import StandardGaussian, Gaussian, GaussianTanh +from .function import NonegativeFunction, TanhFunction +from .matrix import 
CovarianceMatrix diff --git a/ding/torch_utils/modules/distribution.py b/ding/torch_utils/modules/distribution.py new file mode 100644 index 0000000000..e17aabb211 --- /dev/null +++ b/ding/torch_utils/modules/distribution.py @@ -0,0 +1,34 @@ +import torch +from torch import nn + + +class Distribution(nn.Module): + + def __init__(self): + super().__init__() + + def forward(self, *args, **kwargs): + raise RuntimeError("Forward method cannot be called for a Distribution object.") + + def log_prob(self, x, condition=None, **kwargs): + raise NotImplementedError + + def sample(self, num=1, condition=None, **kwargs): + with torch.no_grad(): + return self.rsample(num, condition, **kwargs) + + def rsample(self, num=1, condition=None, **kwargs): + raise NotImplementedError + + def entropy(self, *args, **kwargs): + raise NotImplementedError + + def dist(self, *args, **kwargs): + raise NotImplementedError + + def sample_and_log_prob(self, num=1, condition=None, **kwargs): + with torch.no_grad(): + return self.rsample_and_log_prob(num, condition, **kwargs) + + def rsample_and_log_prob(self, num=1, condition=None, **kwargs): + raise NotImplementedError diff --git a/ding/torch_utils/modules/function.py b/ding/torch_utils/modules/function.py new file mode 100644 index 0000000000..9a971e3cc3 --- /dev/null +++ b/ding/torch_utils/modules/function.py @@ -0,0 +1,25 @@ +import torch +from torch import nn +from torch.distributions.transforms import TanhTransform +from .perceptron import multilayer_perceptron + + +class NonegativeFunction(nn.Module): + + def __init__(self, cfg): + super().__init__() + self.model = multilayer_perceptron(cfg) + + def forward(self, x): + return torch.exp(self.model(x)) + + +class TanhFunction(nn.Module): + + def __init__(self, cfg): + super().__init__() + self.transform = TanhTransform(cache_size=1) + self.model = multilayer_perceptron(cfg) + + def forward(self, x): + return self.transform(self.model(x)) diff --git a/ding/torch_utils/modules/gaussian.py b/ding/torch_utils/modules/gaussian.py new file mode 100644 index 0000000000..26cf9b5ca6 --- /dev/null +++ b/ding/torch_utils/modules/gaussian.py @@ -0,0 +1,150 @@ +import torch +from torch import nn +from .perceptron import multilayer_perceptron +from .parameter import NonegativeParameter +from .matrix import CovarianceMatrix +from torch.distributions import TransformedDistribution, MultivariateNormal, Independent +from torch.distributions.transforms import TanhTransform +from .distribution import Distribution + + +class StandardGaussian(Distribution): + + def __init__(self, dim) -> None: + super().__init__() + self.dim = dim + self.dist = MultivariateNormal(torch.zeros(dim), torch.eye(dim)) + + def log_prob(self, x, condition=None, **kwargs): + return self.dist.log_prob(x) + + def rsample_and_log_prob(self, condition=None, sample_shape=torch.Size(), **kwargs): + if condition is not None: + sample_shape = condition.shape[0] + x = self.dist.rsample(sample_shape=sample_shape) + log_prob = self.dist.log_prob(x) + return x, log_prob + + def sample_and_log_prob(self, condition=None, sample_shape=torch.Size(), **kwargs): + with torch.no_grad(): + return self.rsample_and_log_prob(condition, sample_shape, **kwargs) + + def rsample(self, condition=None, sample_shape=torch.Size(), **kwargs): + if condition is not None: + sample_shape = condition.shape[0] + return self.dist.rsample(sample_shape=sample_shape) + + def sample(self, condition=None, sample_shape=torch.Size(), **kwargs): + with torch.no_grad(): + return 
self.rsample(condition=condition, sample_shape=sample_shape, **kwargs) + + def entropy(self): + return self.dist.entropy() + + def dist(self): + return self.dist + + +class Gaussian(Distribution): + + def __init__(self, cfg): + super().__init__() + self.cfg = cfg + self.mu_model = multilayer_perceptron(cfg.mu_model) + self.cov = CovarianceMatrix(cfg.cov) + self.functional_cov = cfg.cov.functional + + def dist(self, conditioning): + mu = self.mu_model(conditioning) + # repeat the sigma to match the shape of mu + if self.functional_cov: + scale_tril = self.cov.low_triangle_matrix(conditioning) + else: + scale_tril = self.cov.low_triangle_matrix().unsqueeze(0).repeat(mu.shape[0], 1, 1) + return MultivariateNormal(loc=mu, scale_tril=scale_tril) + + def log_prob(self, x, conditioning): + return self.dist(conditioning).log_prob(x) + + def sample(self, conditioning, sample_shape=torch.Size()): + return self.dist(conditioning).sample(sample_shape=sample_shape) + + def rsample(self, conditioning, sample_shape=torch.Size()): + return self.dist(conditioning).rsample(sample_shape=sample_shape) + + def entropy(self, conditioning): + return self.dist(conditioning).entropy() + + def rsample_and_log_prob(self, conditioning, sample_shape=torch.Size()): + dist = self.dist(conditioning) + x = dist.rsample(sample_shape=sample_shape) + log_prob = dist.log_prob(x) + return x, log_prob + + def sample_and_log_prob(self, conditioning, sample_shape=torch.Size()): + with torch.no_grad(): + return self.rsample_and_log_prob(conditioning, sample_shape) + + def forward(self, conditioning): + dist = self.dist(conditioning) + x = dist.rsample() + log_prob = dist.log_prob(x) + return x, log_prob + + +class GaussianTanh(Distribution): + + def __init__(self, cfg): + super().__init__() + self.cfg = cfg + self.mu_model = multilayer_perceptron(cfg.mu_model) + self.cov = CovarianceMatrix(cfg.cov) + self.functional_cov = cfg.cov.functional + + def dist(self, conditioning): + mu = self.mu_model(conditioning) + # repeat the sigma to match the shape of mu + if self.functional_cov: + scale_tril = self.cov.low_triangle_matrix(conditioning) + else: + scale_tril = self.cov.low_triangle_matrix().unsqueeze(0).repeat(mu.shape[0], 1, 1) + return TransformedDistribution( + base_distribution=MultivariateNormal(loc=mu, scale_tril=scale_tril), + transforms=[TanhTransform(cache_size=1)] + ) + + def log_prob(self, x, conditioning): + return self.dist(conditioning).log_prob(x) + + def sample(self, conditioning, sample_shape=torch.Size()): + return self.dist(conditioning).sample(sample_shape=sample_shape) + + def rsample(self, conditioning, sample_shape=torch.Size()): + return self.dist(conditioning).rsample(sample_shape=sample_shape) + + def rsample_and_log_prob(self, conditioning, sample_shape=torch.Size()): + dist = self.dist(conditioning) + x = dist.rsample(sample_shape=sample_shape) + log_prob = dist.log_prob(x) + return x, log_prob + + def sample_and_log_prob(self, conditioning, sample_shape=torch.Size()): + with torch.no_grad(): + return self.rsample_and_log_prob(conditioning, sample_shape) + + def entropy(self, conditioning): + mu = self.mu_model(conditioning) + # repeat the sigma to match the shape of mu + if self.functional_cov: + scale_tril = self.cov.low_triangle_matrix(conditioning) + else: + scale_tril = self.cov.low_triangle_matrix().unsqueeze(0).repeat(mu.shape[0], 1, 1) + base_distribution = MultivariateNormal(loc=mu, scale_tril=scale_tril) + x = base_distribution.rsample(sample_shape=torch.Size([1000])) + return 
base_distribution.entropy() + torch.sum(torch.log(1.0 - torch.tanh(x) ** 2), dim=(0, 2)) / 1000 + + def forward(self, conditioning): + dist = self.dist(conditioning) + x = dist.rsample() + log_prob = dist.log_prob(x) + return x, log_prob diff --git a/ding/torch_utils/modules/matrix.py b/ding/torch_utils/modules/matrix.py new file mode 100644 index 0000000000..a287735dc8 --- /dev/null +++ b/ding/torch_utils/modules/matrix.py @@ -0,0 +1,52 @@ +import torch +from torch import nn +from .parameter import NonegativeParameter, TanhParameter +from .function import NonegativeFunction, TanhFunction + + +class CovarianceMatrix(nn.Module): + + def __init__(self, cfg=None, delta=1e-8): + super().__init__() + self.dim = cfg.dim + if cfg.functional: + self.functional = True + self.sigma_lambda = NonegativeFunction(cfg.sigma_lambda) + self.sigma_offdiag = TanhFunction(cfg.sigma_offdiag) + else: + self.functional = False + if cfg.random_init: + self.sigma_lambda = NonegativeParameter(torch.abs(nn.init.normal_(torch.Tensor(self.dim)))) + self.sigma_offdiag = TanhParameter( + torch.tanh(nn.init.normal_(torch.Tensor(self.dim * (self.dim - 1) // 2))) + ) + else: + self.sigma_lambda = NonegativeParameter(torch.ones(self.dim)) + self.sigma_offdiag = TanhParameter(torch.tanh(torch.zeros(self.dim * (self.dim - 1) // 2))) + # register eye matrix + self.eye = nn.Parameter(torch.eye(self.dim), requires_grad=False) + self.delta = delta + + def low_triangle_matrix(self, x=None): + low_t_m = self.eye.clone() + if self.functional: + low_t_m = low_t_m.repeat(x.shape[0], 1, 1) + low_t_m[torch.cat( + ( + torch.reshape(torch.arange(x.shape[0]).repeat(self.dim * (self.dim - 1) // 2, 1).T, + (1, -1)), torch.tril_indices(self.dim, self.dim, offset=-1).repeat(1, x.shape[0]) + ) + ).tolist()] = torch.reshape(self.sigma_offdiag(x), (-1, 1)).squeeze(-1) + low_t_m = torch.einsum( + "bj,bjk,bk->bjk", self.delta + self.sigma_lambda(x), low_t_m, self.delta + self.sigma_lambda(x) + ) + else: + low_t_m[torch.tril_indices(self.dim, self.dim, offset=-1).tolist()] = self.sigma_offdiag.data + low_t_m = torch.mul( + self.delta + self.sigma_lambda.data, + torch.mul(low_t_m, self.delta + self.sigma_lambda.data).T + ).T + return low_t_m + + def forward(self, x=None): + return torch.matmul(self.low_triangle_matrix(x), self.low_triangle_matrix(x).T) diff --git a/ding/torch_utils/modules/parameter.py b/ding/torch_utils/modules/parameter.py new file mode 100644 index 0000000000..e5879933a8 --- /dev/null +++ b/ding/torch_utils/modules/parameter.py @@ -0,0 +1,37 @@ +import torch +from torch import nn +from torch.distributions.transforms import TanhTransform + + +class NonegativeParameter(nn.Module): + + def __init__(self, data=None, requires_grad=True, delta=1e-8): + super().__init__() + if data is None: + data = torch.zeros(1) + self.log_data = nn.Parameter(torch.log(data + delta), requires_grad=requires_grad) + + def forward(self): + return torch.exp(self.log_data) + + @property + def data(self): + return torch.exp(self.log_data) + + +class TanhParameter(nn.Module): + + def __init__(self, data=None, requires_grad=True): + super().__init__() + if data is None: + data = torch.zeros(1) + self.transform = TanhTransform(cache_size=1) + + self.data_inv = nn.Parameter(self.transform.inv(data), requires_grad=requires_grad) + + def forward(self): + return self.transform(self.data_inv) + + @property + def data(self): + return self.transform(self.data_inv) diff --git a/ding/torch_utils/modules/perceptron.py b/ding/torch_utils/modules/perceptron.py new file 
mode 100644 index 0000000000..c8640b2ca2 --- /dev/null +++ b/ding/torch_utils/modules/perceptron.py @@ -0,0 +1,52 @@ +import torch +from torch import nn +from ding.torch_utils.activation import get_activation + + +class multilayer_perceptron(nn.Module): + + def __init__(self, cfg): + super(multilayer_perceptron, self).__init__() + + self.model = nn.Sequential() + + for i in range(len(cfg.hidden_sizes) - 1): + self.model.add_module('linear' + str(i), nn.Linear(cfg.hidden_sizes[i], cfg.hidden_sizes[i + 1])) + + if isinstance(cfg.activation, list): + self.model.add_module('activation' + str(i), get_activation(cfg.activation[i])) + else: + self.model.add_module('activation' + str(i), get_activation(cfg.activation)) + if hasattr(cfg, "dropout") and cfg.dropout > 0: + self.model.add_module('dropout', nn.Dropout(cfg.dropout)) + if hasattr(cfg, "layernorm") and cfg.layernorm: + self.model.add_module('layernorm', nn.LayerNorm(cfg.hidden_sizes[i])) + + self.model.add_module( + 'linear' + str(len(cfg.hidden_sizes) - 1), nn.Linear(cfg.hidden_sizes[-1], cfg.output_size) + ) + + if hasattr(cfg, 'final_activation'): + self.model.add_module('final_activation', get_activation(cfg.final_activation)) + + if hasattr(cfg, 'scale'): + self.scale = nn.Parameter(torch.tensor(cfg.scale), requires_grad=False) + else: + self.scale = 1.0 + + if hasattr(cfg, 'offset'): + self.offset = nn.Parameter(torch.tensor(cfg.offset), requires_grad=False) + else: + self.offset = 0.0 + + # shrink the weight of linear layer 'linear'+str(len(cfg.hidden_sizes) to it's origin 0.01 + if hasattr(cfg, 'shrink'): + if hasattr(cfg, 'final_activation'): + self.model[-2].weight.data.normal_(0, cfg.shrink) + self.model[-2].bias.data.normal_(0, cfg.shrink) + else: + self.model[-1].weight.data.normal_(0, cfg.shrink) + self.model[-1].bias.data.normal_(0, cfg.shrink) + + def forward(self, x): + return self.scale * self.model(x) + self.offset From 0c1f2b60ce686286041d0c51a3c90c72f6df58cd Mon Sep 17 00:00:00 2001 From: zjowowen Date: Mon, 17 Jul 2023 09:14:11 +0000 Subject: [PATCH 165/244] add dependencies --- setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.py b/setup.py index 57a1064f7c..5911b312e6 100644 --- a/setup.py +++ b/setup.py @@ -61,6 +61,8 @@ 'trueskill', 'tensorboardX>=2.2', 'wandb', + 'moviepy', + 'imageio', 'matplotlib', 'easydict==1.9', 'pyyaml', From 9336a0ac3130d7c31d0bc56081c5355932dec44f Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 18 Jul 2023 06:20:54 +0000 Subject: [PATCH 166/244] polish config --- .../PPOOffPolicy/gym_lunarlandercontinuous_v2.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ding/config/PPOOffPolicy/gym_lunarlandercontinuous_v2.py b/ding/config/PPOOffPolicy/gym_lunarlandercontinuous_v2.py index 408db5b7fe..f8949535c1 100644 --- a/ding/config/PPOOffPolicy/gym_lunarlandercontinuous_v2.py +++ b/ding/config/PPOOffPolicy/gym_lunarlandercontinuous_v2.py @@ -8,7 +8,7 @@ seed=0, env=dict( env_id='LunarLanderContinuous-v2', - collector_env_num=16, + collector_env_num=8, evaluator_env_num=4, n_evaluator_episode=4, stop_value=240, @@ -72,7 +72,7 @@ ), learn=dict( update_per_collect=1, - batch_size=1024, + batch_size=512, learning_rate=3e-4, value_weight=0.5, entropy_weight=0.01, @@ -86,16 +86,16 @@ weight_decay=0.0, ), collect=dict( - n_sample=1024, + n_sample=512, unroll_len=1, - discount_factor=0.99, + discount_factor=0.999, gae_lambda=1.0, ), eval=dict( evaluator=dict(eval_freq=100, ), render=True, ), - other=dict(replay_buffer=dict(replay_buffer_size=int(128), ), 
), + other=dict(replay_buffer=dict(replay_buffer_size=int(512), ), ), ), wandb_logger=dict( gradient_logger=True, video_logger=True, plot_logger=True, action_logger=False, return_logger=False From ced06f8b7558b6523398650e5d3fb657902e0ff2 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 18 Jul 2023 06:42:48 +0000 Subject: [PATCH 167/244] polish deploy --- ding/bonus/ppo_offpolicy.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/ding/bonus/ppo_offpolicy.py b/ding/bonus/ppo_offpolicy.py index cf9334660c..ce0c4cafe2 100644 --- a/ding/bonus/ppo_offpolicy.py +++ b/ding/bonus/ppo_offpolicy.py @@ -151,8 +151,17 @@ def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, logging.warning('No video would be generated during the deploy.') def single_env_forward_wrapper(forward_fn, cuda=True): - - forward_fn = model_wrap(forward_fn, wrapper_name='argmax_sample').forward + + if self.cfg.policy.action_space=='discrete': + forward_fn = model_wrap(forward_fn, wrapper_name='argmax_sample').forward + elif self.cfg.policy.action_space=='continuous': + forward_fn = model_wrap(forward_fn, wrapper_name='deterministic_sample').forward + elif self.cfg.policy.action_space=='hybrid': + forward_fn = model_wrap(forward_fn, wrapper_name='hybrid_deterministic_argmax_sample').forward + elif self.cfg.policy.action_space=='general': + forward_fn = model_wrap(forward_fn, wrapper_name='base').forward + else: + raise NotImplementedError def _forward(obs): # unsqueeze means add batch dim, i.e. (O, ) -> (1, O) From a8822fdded6b97ebe76b28eb3230d062bf94cd2a Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 18 Jul 2023 07:32:38 +0000 Subject: [PATCH 168/244] Add array video helper --- ding/utils/__init__.py | 1 + ding/utils/tests/test_video_helper.py | 19 +++++++++++++++++++ ding/utils/video_helper.py | 19 +++++++++++++++++++ 3 files changed, 39 insertions(+) create mode 100644 ding/utils/tests/test_video_helper.py create mode 100644 ding/utils/video_helper.py diff --git a/ding/utils/__init__.py b/ding/utils/__init__.py index 2980747f6a..eb65f792d8 100644 --- a/ding/utils/__init__.py +++ b/ding/utils/__init__.py @@ -30,6 +30,7 @@ from .render_helper import render, fps from .fast_copy import fastcopy from .bfs_helper import get_vi_sequence +from .video_helper import numpy_array_to_video if ding.enable_linklink: from .linklink_dist_helper import get_rank, get_world_size, dist_mode, dist_init, dist_finalize, \ diff --git a/ding/utils/tests/test_video_helper.py b/ding/utils/tests/test_video_helper.py new file mode 100644 index 0000000000..a6f4e7e2df --- /dev/null +++ b/ding/utils/tests/test_video_helper.py @@ -0,0 +1,19 @@ +import os +import shutil +import tempfile +import pytest +import numpy as np +from ding.utils.video_helper import numpy_array_to_video + +@pytest.mark.unittest +class TestVideoHelper: + + def test_numpy_array_to_video(self): + + # create a numpy array + frames = np.random.randint(0, 255, size=(100, 100, 100, 3), dtype=np.uint8) + + with tempfile.TemporaryDirectory() as tmpdir: + temp_file = os.path.join(tmpdir, 'temp_file.mp4') + numpy_array_to_video(frames, temp_file, fps=30.0) + assert os.path.exists(temp_file) diff --git a/ding/utils/video_helper.py b/ding/utils/video_helper.py new file mode 100644 index 0000000000..e90c8db575 --- /dev/null +++ b/ding/utils/video_helper.py @@ -0,0 +1,19 @@ +import cv2 +import numpy as np + +def numpy_array_to_video(numpy_array, output_file, fps=30.0, codec='mp4v'): + height, width, channels = numpy_array.shape[1:] + 
fourcc = cv2.VideoWriter_fourcc(*codec) + out = cv2.VideoWriter(output_file, fourcc, fps, (width, height)) + + for frame in numpy_array: + out.write(frame) + + out.release() + +# # Example usage +# # Assuming you have a numpy array called 'frames' with shape (num_frames, height, width, channels) +# numpy_array_to_video(frames, 'output_video.mp4', fps=30.0) + +# pytest for function numpy_array_to_video +# use virtual directory From 8d152e019bd6d708d1d2b4f88018a8759d2358aa Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 18 Jul 2023 14:52:51 +0000 Subject: [PATCH 169/244] polish deploy --- ding/bonus/a2c.py | 70 ++++++++++++++++++------- ding/bonus/c51.py | 70 ++++++++++++++++++------- ding/bonus/ddpg.py | 70 ++++++++++++++++++------- ding/bonus/dqn.py | 73 ++++++++++++++++++-------- ding/bonus/pg.py | 70 ++++++++++++++++++------- ding/bonus/ppo_offpolicy.py | 74 ++++++++++++++++++--------- ding/bonus/sac.py | 70 ++++++++++++++++++------- ding/bonus/sql.py | 69 ++++++++++++++++++------- ding/bonus/td3.py | 69 ++++++++++++++++++------- ding/utils/__init__.py | 3 +- ding/utils/render_helper.py | 56 ++++++++++++++------ ding/utils/tests/test_video_helper.py | 19 ------- ding/utils/video_helper.py | 19 ------- setup.py | 6 ++- 14 files changed, 505 insertions(+), 233 deletions(-) delete mode 100644 ding/utils/tests/test_video_helper.py delete mode 100644 ding/utils/video_helper.py diff --git a/ding/bonus/a2c.py b/ding/bonus/a2c.py index d5f7b95a08..92f1a0d0c9 100644 --- a/ding/bonus/a2c.py +++ b/ding/bonus/a2c.py @@ -2,6 +2,7 @@ from ditk import logging from easydict import EasyDict import os +import numpy as np import torch import treetensor.torch as ttorch from ding.framework import task, OnlineRLContext @@ -12,6 +13,7 @@ from ding.envs import setup_ding_env_manager from ding.policy import A2CPolicy from ding.utils import set_pkg_seed +from ding.utils import get_env_fps, render from ding.config import save_config_py, compile_config from ding.model import VAC from ding.model import model_wrap @@ -97,7 +99,11 @@ def train( evaluator_env = self._setup_env_manager(evaluator_env_num, context, debug, 'evaluator') with task.start(ctx=OnlineRLContext()): - task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) + task.use(interaction_evaluator( + self.cfg, self.policy.eval_mode, evaluator_env, render=self.cfg.policy.eval.render \ + if hasattr(self.cfg.policy.eval, "render") else False + ) + ) task.use(StepCollector(self.cfg, self.policy.collect_mode, collector_env)) task.use(gae_estimator(self.cfg, self.policy.collect_mode)) task.use(trainer(self.cfg, self.policy.learn_mode)) @@ -117,19 +123,36 @@ def train( return TrainingReturn(wandb_url=task.ctx.wandb_url) - def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> float: + def deploy( + self, + enable_save_replay: bool = False, + concatenate_all_replay: bool = False, + replay_save_path: str = None, + seed: Optional[Union[int, List]] = None, + debug: bool = False + ) -> EvalReturn: if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy env = self.env.clone(caller='evaluator') - env.seed(self.seed, dynamic_seed=False) - if enable_save_replay and replay_save_path: + if seed is not None and isinstance(seed, int): + seeds = [seed] + elif seed is not None and isinstance(seed, list): + seeds = seed + else: + seeds = [self.seed] + + returns = [] + images = [] + if enable_save_replay: + replay_save_path = os.path.join(self.exp_name, 'videos') if replay_save_path 
is None else replay_save_path env.enable_save_replay(replay_path=replay_save_path) - elif enable_save_replay: - env.enable_save_replay(replay_path=os.path.join(self.exp_name, 'videos')) else: logging.warning('No video would be generated during the deploy.') + if concatenate_all_replay: + logging.warning('concatenate_all_replay is set to False because enable_save_replay is False.') + concatenate_all_replay = False def single_env_forward_wrapper(forward_fn, cuda=True): @@ -158,22 +181,31 @@ def _forward(obs): # env will be reset again in the main loop env.reset() - # main loop - return_ = 0. - step = 0 - obs = env.reset() - while True: - action = forward_fn(obs) - obs, rew, done, info = env.step(action) - return_ += rew - step += 1 - if done: - break - logging.info(f'A2C deploy is finished, final episode return with {step} steps is: {return_}') + for seed in seeds: + env.seed(seed, dynamic_seed=False) + return_ = 0. + step = 0 + obs = env.reset() + images.append(render(env)[None]) if concatenate_all_replay else None + while True: + action = forward_fn(obs) + obs, rew, done, info = env.step(action) + images.append(render(env)[None]) if concatenate_all_replay else None + return_ += rew + step += 1 + if done: + break + logging.info(f'DQN deploy is finished, final episode return with {step} steps is: {return_}') + returns.append(return_) env.close() - return return_ + if concatenate_all_replay: + images = np.concatenate(images, axis=0) + import imageio + imageio.mimwrite(os.path.join(replay_save_path, 'deploy.mp4'), images, fps=get_env_fps(env)) + + return EvalReturn(eval_value=np.mean(returns), eval_value_std=np.std(returns)) def collect_data( self, diff --git a/ding/bonus/c51.py b/ding/bonus/c51.py index 9cca64ae90..0f868179dd 100644 --- a/ding/bonus/c51.py +++ b/ding/bonus/c51.py @@ -2,6 +2,7 @@ from ditk import logging from easydict import EasyDict import os +import numpy as np import torch import treetensor.torch as ttorch from ding.framework import task, OnlineRLContext @@ -12,6 +13,7 @@ from ding.envs import setup_ding_env_manager from ding.policy import C51Policy from ding.utils import set_pkg_seed +from ding.utils import get_env_fps, render from ding.config import save_config_py, compile_config from ding.model import C51DQN from ding.model import model_wrap @@ -100,7 +102,11 @@ def train( evaluator_env = setup_ding_env_manager(self.env, evaluator_env_num, context, debug, 'evaluator') with task.start(ctx=OnlineRLContext()): - task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) + task.use(interaction_evaluator( + self.cfg, self.policy.eval_mode, evaluator_env, render=self.cfg.policy.eval.render \ + if hasattr(self.cfg.policy.eval, "render") else False + ) + ) task.use(eps_greedy_handler(self.cfg)) task.use(StepCollector(self.cfg, self.policy.collect_mode, collector_env)) task.use(nstep_reward_enhancer(self.cfg)) @@ -122,19 +128,36 @@ def train( return TrainingReturn(wandb_url=task.ctx.wandb_url) - def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> float: + def deploy( + self, + enable_save_replay: bool = False, + concatenate_all_replay: bool = False, + replay_save_path: str = None, + seed: Optional[Union[int, List]] = None, + debug: bool = False + ) -> EvalReturn: if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy env = self.env.clone(caller='evaluator') - env.seed(self.seed, dynamic_seed=False) - if enable_save_replay and replay_save_path: + if seed is not None and 
isinstance(seed, int): + seeds = [seed] + elif seed is not None and isinstance(seed, list): + seeds = seed + else: + seeds = [self.seed] + + returns = [] + images = [] + if enable_save_replay: + replay_save_path = os.path.join(self.exp_name, 'videos') if replay_save_path is None else replay_save_path env.enable_save_replay(replay_path=replay_save_path) - elif enable_save_replay: - env.enable_save_replay(replay_path=os.path.join(self.exp_name, 'videos')) else: logging.warning('No video would be generated during the deploy.') + if concatenate_all_replay: + logging.warning('concatenate_all_replay is set to False because enable_save_replay is False.') + concatenate_all_replay = False def single_env_forward_wrapper(forward_fn, cuda=True): @@ -158,22 +181,31 @@ def _forward(obs): # env will be reset again in the main loop env.reset() - # main loop - return_ = 0. - step = 0 - obs = env.reset() - while True: - action = forward_fn(obs) - obs, rew, done, info = env.step(action) - return_ += rew - step += 1 - if done: - break - logging.info(f'C51 deploy is finished, final episode return with {step} steps is: {return_}') + for seed in seeds: + env.seed(seed, dynamic_seed=False) + return_ = 0. + step = 0 + obs = env.reset() + images.append(render(env)[None]) if concatenate_all_replay else None + while True: + action = forward_fn(obs) + obs, rew, done, info = env.step(action) + images.append(render(env)[None]) if concatenate_all_replay else None + return_ += rew + step += 1 + if done: + break + logging.info(f'DQN deploy is finished, final episode return with {step} steps is: {return_}') + returns.append(return_) env.close() - return return_ + if concatenate_all_replay: + images = np.concatenate(images, axis=0) + import imageio + imageio.mimwrite(os.path.join(replay_save_path, 'deploy.mp4'), images, fps=get_env_fps(env)) + + return EvalReturn(eval_value=np.mean(returns), eval_value_std=np.std(returns)) def collect_data( self, diff --git a/ding/bonus/ddpg.py b/ding/bonus/ddpg.py index 0dbc5e6ede..5f30398b65 100644 --- a/ding/bonus/ddpg.py +++ b/ding/bonus/ddpg.py @@ -2,6 +2,7 @@ from ditk import logging from easydict import EasyDict import os +import numpy as np import torch import treetensor.torch as ttorch from ding.framework import task, OnlineRLContext @@ -12,6 +13,7 @@ from ding.envs import setup_ding_env_manager from ding.policy import DDPGPolicy from ding.utils import set_pkg_seed +from ding.utils import get_env_fps, render from ding.config import save_config_py, compile_config from ding.model import QAC from ding.data import DequeBuffer @@ -100,7 +102,11 @@ def train( evaluator_env = setup_ding_env_manager(self.env, evaluator_env_num, context, debug, 'evaluator') with task.start(ctx=OnlineRLContext()): - task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) + task.use(interaction_evaluator( + self.cfg, self.policy.eval_mode, evaluator_env, render=self.cfg.policy.eval.render \ + if hasattr(self.cfg.policy.eval, "render") else False + ) + ) task.use( StepCollector( self.cfg, @@ -127,19 +133,36 @@ def train( return TrainingReturn(wandb_url=task.ctx.wandb_url) - def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> float: + def deploy( + self, + enable_save_replay: bool = False, + concatenate_all_replay: bool = False, + replay_save_path: str = None, + seed: Optional[Union[int, List]] = None, + debug: bool = False + ) -> EvalReturn: if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy env = 
self.env.clone(caller='evaluator') - env.seed(self.seed, dynamic_seed=False) - if enable_save_replay and replay_save_path: + if seed is not None and isinstance(seed, int): + seeds = [seed] + elif seed is not None and isinstance(seed, list): + seeds = seed + else: + seeds = [self.seed] + + returns = [] + images = [] + if enable_save_replay: + replay_save_path = os.path.join(self.exp_name, 'videos') if replay_save_path is None else replay_save_path env.enable_save_replay(replay_path=replay_save_path) - elif enable_save_replay: - env.enable_save_replay(replay_path=os.path.join(self.exp_name, 'videos')) else: logging.warning('No video would be generated during the deploy.') + if concatenate_all_replay: + logging.warning('concatenate_all_replay is set to False because enable_save_replay is False.') + concatenate_all_replay = False def single_env_forward_wrapper(forward_fn, cuda=True): @@ -161,22 +184,31 @@ def _forward(obs): # env will be reset again in the main loop env.reset() - # main loop - return_ = 0. - step = 0 - obs = env.reset() - while True: - action = forward_fn(obs) - obs, rew, done, info = env.step(action) - return_ += rew - step += 1 - if done: - break - logging.info(f'DDPG deploy is finished, final episode return with {step} steps is: {return_}') + for seed in seeds: + env.seed(seed, dynamic_seed=False) + return_ = 0. + step = 0 + obs = env.reset() + images.append(render(env)[None]) if concatenate_all_replay else None + while True: + action = forward_fn(obs) + obs, rew, done, info = env.step(action) + images.append(render(env)[None]) if concatenate_all_replay else None + return_ += rew + step += 1 + if done: + break + logging.info(f'DQN deploy is finished, final episode return with {step} steps is: {return_}') + returns.append(return_) env.close() - return return_ + if concatenate_all_replay: + images = np.concatenate(images, axis=0) + import imageio + imageio.mimwrite(os.path.join(replay_save_path, 'deploy.mp4'), images, fps=get_env_fps(env)) + + return EvalReturn(eval_value=np.mean(returns), eval_value_std=np.std(returns)) def collect_data( self, diff --git a/ding/bonus/dqn.py b/ding/bonus/dqn.py index bfa6312341..26a2195340 100644 --- a/ding/bonus/dqn.py +++ b/ding/bonus/dqn.py @@ -1,7 +1,8 @@ -from typing import Optional, Union +from typing import Optional, Union, List from ditk import logging from easydict import EasyDict import os +import numpy as np import torch import treetensor.torch as ttorch from ding.framework import task, OnlineRLContext @@ -12,6 +13,7 @@ from ding.envs import setup_ding_env_manager from ding.policy import DQNPolicy from ding.utils import set_pkg_seed +from ding.utils import get_env_fps, render from ding.config import save_config_py, compile_config from ding.model import DQN from ding.model import model_wrap @@ -100,7 +102,12 @@ def train( evaluator_env = setup_ding_env_manager(self.env, evaluator_env_num, context, debug, 'evaluator') with task.start(ctx=OnlineRLContext()): - task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) + task.use( + interaction_evaluator( + self.cfg, self.policy.eval_mode, evaluator_env, render=self.cfg.policy.eval.render \ + if hasattr(self.cfg.policy.eval, "render") else False + ) + ) task.use(eps_greedy_handler(self.cfg)) task.use( StepCollector( @@ -131,19 +138,36 @@ def train( return TrainingReturn(wandb_url=task.ctx.wandb_url) - def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> float: + def deploy( + self, + enable_save_replay: bool = 
False, + concatenate_all_replay: bool = False, + replay_save_path: str = None, + seed: Optional[Union[int, List]] = None, + debug: bool = False + ) -> EvalReturn: if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy env = self.env.clone(caller='evaluator') - env.seed(self.seed, dynamic_seed=False) - if enable_save_replay and replay_save_path: + if seed is not None and isinstance(seed, int): + seeds = [seed] + elif seed is not None and isinstance(seed, list): + seeds = seed + else: + seeds = [self.seed] + + returns = [] + images = [] + if enable_save_replay: + replay_save_path = os.path.join(self.exp_name, 'videos') if replay_save_path is None else replay_save_path env.enable_save_replay(replay_path=replay_save_path) - elif enable_save_replay: - env.enable_save_replay(replay_path=os.path.join(self.exp_name, 'videos')) else: logging.warning('No video would be generated during the deploy.') + if concatenate_all_replay: + logging.warning('concatenate_all_replay is set to False because enable_save_replay is False.') + concatenate_all_replay = False def single_env_forward_wrapper(forward_fn, cuda=True): @@ -167,22 +191,31 @@ def _forward(obs): # env will be reset again in the main loop env.reset() - # main loop - return_ = 0. - step = 0 - obs = env.reset() - while True: - action = forward_fn(obs) - obs, rew, done, info = env.step(action) - return_ += rew - step += 1 - if done: - break - logging.info(f'DQN deploy is finished, final episode return with {step} steps is: {return_}') + for seed in seeds: + env.seed(seed, dynamic_seed=False) + return_ = 0. + step = 0 + obs = env.reset() + images.append(render(env)[None]) if concatenate_all_replay else None + while True: + action = forward_fn(obs) + obs, rew, done, info = env.step(action) + images.append(render(env)[None]) if concatenate_all_replay else None + return_ += rew + step += 1 + if done: + break + logging.info(f'DQN deploy is finished, final episode return with {step} steps is: {return_}') + returns.append(return_) env.close() - return return_ + if concatenate_all_replay: + images = np.concatenate(images, axis=0) + import imageio + imageio.mimwrite(os.path.join(replay_save_path, 'deploy.mp4'), images, fps=get_env_fps(env)) + + return EvalReturn(eval_value=np.mean(returns), eval_value_std=np.std(returns)) def collect_data( self, diff --git a/ding/bonus/pg.py b/ding/bonus/pg.py index 1c7ef7eb85..e7e172859d 100644 --- a/ding/bonus/pg.py +++ b/ding/bonus/pg.py @@ -2,6 +2,7 @@ from ditk import logging from easydict import EasyDict import os +import numpy as np import torch import treetensor.torch as ttorch from ding.framework import task, OnlineRLContext @@ -12,6 +13,7 @@ from ding.envs import setup_ding_env_manager from ding.policy import PGPolicy from ding.utils import set_pkg_seed +from ding.utils import get_env_fps, render from ding.config import save_config_py, compile_config from ding.model import PG from ding.bonus.common import TrainingReturn, EvalReturn @@ -97,7 +99,11 @@ def train( evaluator_env = setup_ding_env_manager(self.env, evaluator_env_num, context, debug, 'evaluator') with task.start(ctx=OnlineRLContext()): - task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) + task.use(interaction_evaluator( + self.cfg, self.policy.eval_mode, evaluator_env, render=self.cfg.policy.eval.render \ + if hasattr(self.cfg.policy.eval, "render") else False + ) + ) task.use(EpisodeCollector(self.cfg, self.policy.collect_mode, collector_env)) task.use(pg_estimator(self.policy.collect_mode)) 
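As a usage note, here is a hedged sketch of how the reworked deploy() interface might be called. The agent class name, import path, and constructor arguments (DQNAgent, env_id, exp_name) are illustrative assumptions not shown in this patch; the keyword arguments and the EvalReturn fields follow the code above.

from ding.bonus import DQNAgent  # assumed entry point, not part of this patch

agent = DQNAgent(env_id='LunarLander-v2', exp_name='lunarlander_dqn_deploy')
# Evaluate over several seeds; EvalReturn aggregates the per-seed episode returns.
result = agent.deploy(
    enable_save_replay=True,
    concatenate_all_replay=True,  # also writes a single deploy.mp4 from the rendered frames
    seed=[0, 1, 2],
)
print(result.eval_value, result.eval_value_std)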
task.use(trainer(self.cfg, self.policy.learn_mode)) @@ -117,19 +123,36 @@ def train( return TrainingReturn(wandb_url=task.ctx.wandb_url) - def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> float: + def deploy( + self, + enable_save_replay: bool = False, + concatenate_all_replay: bool = False, + replay_save_path: str = None, + seed: Optional[Union[int, List]] = None, + debug: bool = False + ) -> EvalReturn: if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy env = self.env.clone(caller='evaluator') - env.seed(self.seed, dynamic_seed=False) - if enable_save_replay and replay_save_path: + if seed is not None and isinstance(seed, int): + seeds = [seed] + elif seed is not None and isinstance(seed, list): + seeds = seed + else: + seeds = [self.seed] + + returns = [] + images = [] + if enable_save_replay: + replay_save_path = os.path.join(self.exp_name, 'videos') if replay_save_path is None else replay_save_path env.enable_save_replay(replay_path=replay_save_path) - elif enable_save_replay: - env.enable_save_replay(replay_path=os.path.join(self.exp_name, 'videos')) else: logging.warning('No video would be generated during the deploy.') + if concatenate_all_replay: + logging.warning('concatenate_all_replay is set to False because enable_save_replay is False.') + concatenate_all_replay = False def single_env_forward_wrapper(forward_fn, cuda=True): @@ -160,22 +183,31 @@ def _forward(obs): # env will be reset again in the main loop env.reset() - # main loop - return_ = 0. - step = 0 - obs = env.reset() - while True: - action = forward_fn(obs) - obs, rew, done, info = env.step(action) - return_ += rew - step += 1 - if done: - break - logging.info(f'PG deploy is finished, final episode return with {step} steps is: {return_}') + for seed in seeds: + env.seed(seed, dynamic_seed=False) + return_ = 0. 
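The per-seed loop here gathers frames via render(env)[None] and, when concatenate_all_replay is set, stacks them and writes a single mp4 with imageio. Below is a standalone sketch of that final step, with dummy frames and a fixed 30 fps placeholder in place of get_env_fps(env).

import numpy as np
import imageio

# Dummy frames standing in for the render(env)[None] outputs collected each step.
frames = [np.zeros((64, 64, 3), dtype=np.uint8)[None] for _ in range(30)]
video = np.concatenate(frames, axis=0)  # shape (num_frames, H, W, 3)
imageio.mimwrite('deploy.mp4', video, fps=30)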
+ step = 0 + obs = env.reset() + images.append(render(env)[None]) if concatenate_all_replay else None + while True: + action = forward_fn(obs) + obs, rew, done, info = env.step(action) + images.append(render(env)[None]) if concatenate_all_replay else None + return_ += rew + step += 1 + if done: + break + logging.info(f'DQN deploy is finished, final episode return with {step} steps is: {return_}') + returns.append(return_) env.close() - return return_ + if concatenate_all_replay: + images = np.concatenate(images, axis=0) + import imageio + imageio.mimwrite(os.path.join(replay_save_path, 'deploy.mp4'), images, fps=get_env_fps(env)) + + return EvalReturn(eval_value=np.mean(returns), eval_value_std=np.std(returns)) def collect_data( self, diff --git a/ding/bonus/ppo_offpolicy.py b/ding/bonus/ppo_offpolicy.py index ce0c4cafe2..8f86c1890a 100644 --- a/ding/bonus/ppo_offpolicy.py +++ b/ding/bonus/ppo_offpolicy.py @@ -2,6 +2,7 @@ from ditk import logging from easydict import EasyDict import os +import numpy as np import torch import treetensor.torch as ttorch from ding.framework import task, OnlineRLContext @@ -11,6 +12,7 @@ from ding.envs import setup_ding_env_manager from ding.policy import PPOOffPolicy from ding.utils import set_pkg_seed +from ding.utils import get_env_fps, render from ding.config import save_config_py, compile_config from ding.model import BaseVAC, VAC from ding.model import StochasticPolicy, VModel @@ -136,29 +138,46 @@ def train( return TrainingReturn(wandb_url=task.ctx.wandb_url) - def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> float: + def deploy( + self, + enable_save_replay: bool = False, + concatenate_all_replay: bool = False, + replay_save_path: str = None, + seed: Optional[Union[int, List]] = None, + debug: bool = False + ) -> EvalReturn: if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy env = self.env.clone(caller='evaluator') - env.seed(self.seed, dynamic_seed=False) - if enable_save_replay and replay_save_path: + if seed is not None and isinstance(seed, int): + seeds = [seed] + elif seed is not None and isinstance(seed, list): + seeds = seed + else: + seeds = [self.seed] + + returns = [] + images = [] + if enable_save_replay: + replay_save_path = os.path.join(self.exp_name, 'videos') if replay_save_path is None else replay_save_path env.enable_save_replay(replay_path=replay_save_path) - elif enable_save_replay: - env.enable_save_replay(replay_path=os.path.join(self.exp_name, 'videos')) else: logging.warning('No video would be generated during the deploy.') + if concatenate_all_replay: + logging.warning('concatenate_all_replay is set to False because enable_save_replay is False.') + concatenate_all_replay = False def single_env_forward_wrapper(forward_fn, cuda=True): - - if self.cfg.policy.action_space=='discrete': + + if self.cfg.policy.action_space == 'discrete': forward_fn = model_wrap(forward_fn, wrapper_name='argmax_sample').forward - elif self.cfg.policy.action_space=='continuous': + elif self.cfg.policy.action_space == 'continuous': forward_fn = model_wrap(forward_fn, wrapper_name='deterministic_sample').forward - elif self.cfg.policy.action_space=='hybrid': + elif self.cfg.policy.action_space == 'hybrid': forward_fn = model_wrap(forward_fn, wrapper_name='hybrid_deterministic_argmax_sample').forward - elif self.cfg.policy.action_space=='general': + elif self.cfg.policy.action_space == 'general': forward_fn = model_wrap(forward_fn, wrapper_name='base').forward else: raise 
NotImplementedError @@ -181,22 +200,31 @@ def _forward(obs): # env will be reset again in the main loop env.reset() - # main loop - return_ = 0. - step = 0 - obs = env.reset() - while True: - action = forward_fn(obs) - obs, rew, done, info = env.step(action) - return_ += rew - step += 1 - if done: - break - logging.info(f'PPOOffPolicy deploy is finished, final episode return with {step} steps is: {return_}') + for seed in seeds: + env.seed(seed, dynamic_seed=False) + return_ = 0. + step = 0 + obs = env.reset() + images.append(render(env)[None]) if concatenate_all_replay else None + while True: + action = forward_fn(obs) + obs, rew, done, info = env.step(action) + images.append(render(env)[None]) if concatenate_all_replay else None + return_ += rew + step += 1 + if done: + break + logging.info(f'DQN deploy is finished, final episode return with {step} steps is: {return_}') + returns.append(return_) env.close() - return return_ + if concatenate_all_replay: + images = np.concatenate(images, axis=0) + import imageio + imageio.mimwrite(os.path.join(replay_save_path, 'deploy.mp4'), images, fps=get_env_fps(env)) + + return EvalReturn(eval_value=np.mean(returns), eval_value_std=np.std(returns)) def collect_data( self, diff --git a/ding/bonus/sac.py b/ding/bonus/sac.py index ab36036eef..778b85bcc1 100644 --- a/ding/bonus/sac.py +++ b/ding/bonus/sac.py @@ -2,6 +2,7 @@ from ditk import logging from easydict import EasyDict import os +import numpy as np import torch import treetensor.torch as ttorch from ding.framework import task, OnlineRLContext @@ -12,6 +13,7 @@ from ding.envs import setup_ding_env_manager from ding.policy import SACPolicy from ding.utils import set_pkg_seed +from ding.utils import get_env_fps, render from ding.config import save_config_py, compile_config from ding.model import QAC from ding.model import model_wrap @@ -100,7 +102,11 @@ def train( evaluator_env = setup_ding_env_manager(self.env, evaluator_env_num, context, debug, 'evaluator') with task.start(ctx=OnlineRLContext()): - task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) + task.use(interaction_evaluator( + self.cfg, self.policy.eval_mode, evaluator_env, render=self.cfg.policy.eval.render \ + if hasattr(self.cfg.policy.eval, "render") else False + ) + ) task.use( StepCollector( self.cfg, @@ -127,19 +133,36 @@ def train( return TrainingReturn(wandb_url=task.ctx.wandb_url) - def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> float: + def deploy( + self, + enable_save_replay: bool = False, + concatenate_all_replay: bool = False, + replay_save_path: str = None, + seed: Optional[Union[int, List]] = None, + debug: bool = False + ) -> EvalReturn: if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy env = self.env.clone(caller='evaluator') - env.seed(self.seed, dynamic_seed=False) - if enable_save_replay and replay_save_path: + if seed is not None and isinstance(seed, int): + seeds = [seed] + elif seed is not None and isinstance(seed, list): + seeds = seed + else: + seeds = [self.seed] + + returns = [] + images = [] + if enable_save_replay: + replay_save_path = os.path.join(self.exp_name, 'videos') if replay_save_path is None else replay_save_path env.enable_save_replay(replay_path=replay_save_path) - elif enable_save_replay: - env.enable_save_replay(replay_path=os.path.join(self.exp_name, 'videos')) else: logging.warning('No video would be generated during the deploy.') + if concatenate_all_replay: + 
logging.warning('concatenate_all_replay is set to False because enable_save_replay is False.') + concatenate_all_replay = False def single_env_forward_wrapper(forward_fn, cuda=True): @@ -162,22 +185,31 @@ def _forward(obs): # env will be reset again in the main loop env.reset() - # main loop - return_ = 0. - step = 0 - obs = env.reset() - while True: - action = forward_fn(obs) - obs, rew, done, info = env.step(action) - return_ += rew - step += 1 - if done: - break - logging.info(f'SAC deploy is finished, final episode return with {step} steps is: {return_}') + for seed in seeds: + env.seed(seed, dynamic_seed=False) + return_ = 0. + step = 0 + obs = env.reset() + images.append(render(env)[None]) if concatenate_all_replay else None + while True: + action = forward_fn(obs) + obs, rew, done, info = env.step(action) + images.append(render(env)[None]) if concatenate_all_replay else None + return_ += rew + step += 1 + if done: + break + logging.info(f'SAC deploy is finished, final episode return with {step} steps is: {return_}') + returns.append(return_) env.close() - return return_ + if concatenate_all_replay: + images = np.concatenate(images, axis=0) + import imageio + imageio.mimwrite(os.path.join(replay_save_path, 'deploy.mp4'), images, fps=get_env_fps(env)) + + return EvalReturn(eval_value=np.mean(returns), eval_value_std=np.std(returns)) def collect_data( self, diff --git a/ding/bonus/sql.py b/ding/bonus/sql.py index 6b9d436a17..e4e3f89bdb 100644 --- a/ding/bonus/sql.py +++ b/ding/bonus/sql.py @@ -2,6 +2,7 @@ from ditk import logging from easydict import EasyDict import os +import numpy as np import torch import treetensor.torch as ttorch from ding.framework import task, OnlineRLContext @@ -12,6 +13,7 @@ from ding.envs import setup_ding_env_manager from ding.policy import SQLPolicy from ding.utils import set_pkg_seed +from ding.utils import get_env_fps, render from ding.config import save_config_py, compile_config from ding.model import DQN from ding.model import model_wrap @@ -100,7 +102,10 @@ def train( evaluator_env = setup_ding_env_manager(self.env, evaluator_env_num, context, debug, 'evaluator') with task.start(ctx=OnlineRLContext()): - task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) + task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env, render=self.cfg.policy.eval.render \ + if hasattr(self.cfg.policy.eval, "render") else False + ) + ) task.use(eps_greedy_handler(self.cfg)) task.use( StepCollector( self.cfg, @@ -131,19 +136,36 @@ def train( return TrainingReturn(wandb_url=task.ctx.wandb_url) - def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> float: + def deploy( + self, + enable_save_replay: bool = False, + concatenate_all_replay: bool = False, + replay_save_path: str = None, + seed: Optional[Union[int, List]] = None, + debug: bool = False + ) -> EvalReturn: if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy env = self.env.clone(caller='evaluator') - env.seed(self.seed, dynamic_seed=False) - if enable_save_replay and replay_save_path: + if seed is not None and isinstance(seed, int): + seeds = [seed] + elif seed is not None and isinstance(seed, list): + seeds = seed + else: + seeds = [self.seed] + + returns = [] + images = [] + if enable_save_replay: + replay_save_path = os.path.join(self.exp_name, 'videos') if replay_save_path is None else replay_save_path env.enable_save_replay(replay_path=replay_save_path) - elif enable_save_replay: - 
env.enable_save_replay(replay_path=os.path.join(self.exp_name, 'videos')) else: logging.warning('No video would be generated during the deploy.') + if concatenate_all_replay: + logging.warning('concatenate_all_replay is set to False because enable_save_replay is False.') + concatenate_all_replay = False def single_env_forward_wrapper(forward_fn, cuda=True): @@ -167,22 +189,31 @@ def _forward(obs): # env will be reset again in the main loop env.reset() - # main loop - return_ = 0. - step = 0 - obs = env.reset() - while True: - action = forward_fn(obs) - obs, rew, done, info = env.step(action) - return_ += rew - step += 1 - if done: - break - logging.info(f'DQN deploy is finished, final episode return with {step} steps is: {return_}') + for seed in seeds: + env.seed(seed, dynamic_seed=False) + return_ = 0. + step = 0 + obs = env.reset() + images.append(render(env)[None]) if concatenate_all_replay else None + while True: + action = forward_fn(obs) + obs, rew, done, info = env.step(action) + images.append(render(env)[None]) if concatenate_all_replay else None + return_ += rew + step += 1 + if done: + break + logging.info(f'DQN deploy is finished, final episode return with {step} steps is: {return_}') + returns.append(return_) env.close() - return return_ + if concatenate_all_replay: + images = np.concatenate(images, axis=0) + import imageio + imageio.mimwrite(os.path.join(replay_save_path, 'deploy.mp4'), images, fps=get_env_fps(env)) + + return EvalReturn(eval_value=np.mean(returns), eval_value_std=np.std(returns)) def collect_data( self, diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 12d7d9f92c..32392734c4 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -2,6 +2,7 @@ from ditk import logging from easydict import EasyDict import os +import numpy as np import torch import treetensor.torch as ttorch from ding.framework import task, OnlineRLContext @@ -12,6 +13,7 @@ from ding.envs import setup_ding_env_manager from ding.policy import TD3Policy from ding.utils import set_pkg_seed +from ding.utils import get_env_fps, render from ding.config import save_config_py, compile_config from ding.model import QAC from ding.data import DequeBuffer @@ -99,7 +101,10 @@ def train( evaluator_env = setup_ding_env_manager(self.env, evaluator_env_num, context, debug, 'evaluator') with task.start(ctx=OnlineRLContext()): - task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env)) + task.use(interaction_evaluator(self.cfg, self.policy.eval_mode, evaluator_env, render=self.cfg.policy.eval.render \ + if hasattr(self.cfg.policy.eval, "render") else False + ) + ) task.use( StepCollector( self.cfg, @@ -126,19 +131,36 @@ def train( return TrainingReturn(wandb_url=task.ctx.wandb_url) - def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> float: + def deploy( + self, + enable_save_replay: bool = False, + concatenate_all_replay: bool = False, + replay_save_path: str = None, + seed: Optional[Union[int, List]] = None, + debug: bool = False + ) -> EvalReturn: if debug: logging.getLogger().setLevel(logging.DEBUG) # define env and policy env = self.env.clone(caller='evaluator') - env.seed(self.seed, dynamic_seed=False) - if enable_save_replay and replay_save_path: + if seed is not None and isinstance(seed, int): + seeds = [seed] + elif seed is not None and isinstance(seed, list): + seeds = seed + else: + seeds = [self.seed] + + returns = [] + images = [] + if enable_save_replay: + replay_save_path = os.path.join(self.exp_name, 'videos') if 
replay_save_path is None else replay_save_path env.enable_save_replay(replay_path=replay_save_path) - elif enable_save_replay: - env.enable_save_replay(replay_path=os.path.join(self.exp_name, 'videos')) else: logging.warning('No video would be generated during the deploy.') + if concatenate_all_replay: + logging.warning('concatenate_all_replay is set to False because enable_save_replay is False.') + concatenate_all_replay = False def single_env_forward_wrapper(forward_fn, cuda=True): @@ -160,22 +182,31 @@ def _forward(obs): # env will be reset again in the main loop env.reset() - # main loop - return_ = 0. - step = 0 - obs = env.reset() - while True: - action = forward_fn(obs) - obs, rew, done, info = env.step(action) - return_ += rew - step += 1 - if done: - break - logging.info(f'TD3 deploy is finished, final episode return with {step} steps is: {return_}') + for seed in seeds: + env.seed(seed, dynamic_seed=False) + return_ = 0. + step = 0 + obs = env.reset() + images.append(render(env)[None]) if concatenate_all_replay else None + while True: + action = forward_fn(obs) + obs, rew, done, info = env.step(action) + images.append(render(env)[None]) if concatenate_all_replay else None + return_ += rew + step += 1 + if done: + break + logging.info(f'TD3 deploy is finished, final episode return with {step} steps is: {return_}') + returns.append(return_) env.close() - return return_ + if concatenate_all_replay: + images = np.concatenate(images, axis=0) + import imageio + imageio.mimwrite(os.path.join(replay_save_path, 'deploy.mp4'), images, fps=get_env_fps(env)) + + return EvalReturn(eval_value=np.mean(returns), eval_value_std=np.std(returns)) def collect_data( self, diff --git a/ding/utils/__init__.py b/ding/utils/__init__.py index eb65f792d8..44b4ed0c43 100644 --- a/ding/utils/__init__.py +++ b/ding/utils/__init__.py @@ -27,10 +27,9 @@ from .system_helper import get_ip, get_pid, get_task_uid, PropagatingThread, find_free_port from .time_helper import build_time_helper, EasyTimer, WatchDog from .type_helper import SequenceType -from .render_helper import render, fps +from .render_helper import render, fps, get_env_fps, render_env from .fast_copy import fastcopy from .bfs_helper import get_vi_sequence -from .video_helper import numpy_array_to_video if ding.enable_linklink: from .linklink_dist_helper import get_rank, get_world_size, dist_mode, dist_init, dist_finalize, \ diff --git a/ding/utils/render_helper.py b/ding/utils/render_helper.py index 5535b290f5..679263eca1 100644 --- a/ding/utils/render_helper.py +++ b/ding/utils/render_helper.py @@ -5,6 +5,24 @@ from ding.envs import BaseEnv, BaseEnvManager +def render_env(env, render_mode: Optional[str] = 'rgb_array') -> "ndarray": + ''' + Overview: + Render the environment's current frame. + Arguments: + - env (:obj:`gym.Env`): DI-engine env instance. + - render_mode (:obj:`str`): Render mode. 
+ Returns: + - frame (:obj:`numpy.ndarray`): [H * W * C] + ''' + if hasattr(env, 'sim'): + # mujoco: mujoco frame is unside-down by default + return env.sim.render(camera_name='track', height=128, width=128)[::-1] + else: + # other envs + return env.render(mode=render_mode) + + def render(env: "BaseEnv", render_mode: Optional[str] = 'rgb_array') -> "ndarray": ''' Overview: @@ -16,12 +34,29 @@ def render(env: "BaseEnv", render_mode: Optional[str] = 'rgb_array') -> "ndarray - frame (:obj:`numpy.ndarray`): [H * W * C] ''' gym_env = env._env - if hasattr(gym_env, 'sim'): - # mujoco: mujoco frame is unside-down by default - return gym_env.sim.render(camera_name='track', height=128, width=128)[::-1] + return render_env(gym_env, render_mode=render_mode) + + +def get_env_fps(env) -> "int": + ''' + Overview: + Get the environment's fps. + Arguments: + - env (:obj:`gym.Env`): DI-engine env instance. + Returns: + - fps (:obj:`int`). + ''' + + if hasattr(env, 'model'): + # mujoco + fps = 1 / env.model.opt.timestep + elif hasattr(env, 'env') and 'video.frames_per_second' in env.env.metadata.keys(): + # classic control + fps = env.env.metadata['video.frames_per_second'] else: - # other envs - return gym_env.render(mode=render_mode) + # atari and other envs + fps = 30 + return fps def fps(env_manager: "BaseEnvManager") -> "int": @@ -36,15 +71,6 @@ def fps(env_manager: "BaseEnvManager") -> "int": try: # env_ref is a ding gym environment gym_env = env_manager.env_ref._env - if hasattr(gym_env, 'model'): - # mujoco - fps = 1 / gym_env.model.opt.timestep - elif hasattr(gym_env, 'env') and 'video.frames_per_second' in gym_env.env.metadata.keys(): - # classic control - fps = gym_env.env.metadata['video.frames_per_second'] - else: - # atari and other envs - fps = 30 - return fps + return get_env_fps(gym_env) except: return 30 diff --git a/ding/utils/tests/test_video_helper.py b/ding/utils/tests/test_video_helper.py deleted file mode 100644 index a6f4e7e2df..0000000000 --- a/ding/utils/tests/test_video_helper.py +++ /dev/null @@ -1,19 +0,0 @@ -import os -import shutil -import tempfile -import pytest -import numpy as np -from ding.utils.video_helper import numpy_array_to_video - -@pytest.mark.unittest -class TestVideoHelper: - - def test_numpy_array_to_video(self): - - # create a numpy array - frames = np.random.randint(0, 255, size=(100, 100, 100, 3), dtype=np.uint8) - - with tempfile.TemporaryDirectory() as tmpdir: - temp_file = os.path.join(tmpdir, 'temp_file.mp4') - numpy_array_to_video(frames, temp_file, fps=30.0) - assert os.path.exists(temp_file) diff --git a/ding/utils/video_helper.py b/ding/utils/video_helper.py deleted file mode 100644 index e90c8db575..0000000000 --- a/ding/utils/video_helper.py +++ /dev/null @@ -1,19 +0,0 @@ -import cv2 -import numpy as np - -def numpy_array_to_video(numpy_array, output_file, fps=30.0, codec='mp4v'): - height, width, channels = numpy_array.shape[1:] - fourcc = cv2.VideoWriter_fourcc(*codec) - out = cv2.VideoWriter(output_file, fourcc, fps, (width, height)) - - for frame in numpy_array: - out.write(frame) - - out.release() - -# # Example usage -# # Assuming you have a numpy array called 'frames' with shape (num_frames, height, width, channels) -# numpy_array_to_video(frames, 'output_video.mp4', fps=30.0) - -# pytest for function numpy_array_to_video -# use virtual directory diff --git a/setup.py b/setup.py index 5911b312e6..b125b4c1f8 100644 --- a/setup.py +++ b/setup.py @@ -61,8 +61,6 @@ 'trueskill', 'tensorboardX>=2.2', 'wandb', - 'moviepy', - 'imageio', 
'matplotlib', 'easydict==1.9', 'pyyaml', @@ -109,6 +107,10 @@ 'numpy-stl', 'numba>=0.53.0', ], + 'video':[ + 'moviepy', + 'imageio', + ], 'dist': [ 'redis-py-cluster==2.1.0', ], From e063d77704f6f36a859671c221f1fcd02f4ecedd Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 19 Jul 2023 05:41:08 +0000 Subject: [PATCH 170/244] polish config --- ding/bonus/a2c.py | 2 +- ding/bonus/c51.py | 2 +- ding/bonus/ddpg.py | 2 +- ding/bonus/dqn.py | 2 +- ding/bonus/pg.py | 2 +- ding/bonus/ppo_offpolicy.py | 2 +- ding/bonus/ppof.py | 2 +- ding/bonus/sac.py | 2 +- ding/bonus/sql.py | 2 +- ding/bonus/td3.py | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/ding/bonus/a2c.py b/ding/bonus/a2c.py index 92f1a0d0c9..28d0c9d2fd 100644 --- a/ding/bonus/a2c.py +++ b/ding/bonus/a2c.py @@ -1,4 +1,4 @@ -from typing import Optional, Union +from typing import Optional, Union, List from ditk import logging from easydict import EasyDict import os diff --git a/ding/bonus/c51.py b/ding/bonus/c51.py index 0f868179dd..046ecdd81a 100644 --- a/ding/bonus/c51.py +++ b/ding/bonus/c51.py @@ -1,4 +1,4 @@ -from typing import Optional, Union +from typing import Optional, Union, List from ditk import logging from easydict import EasyDict import os diff --git a/ding/bonus/ddpg.py b/ding/bonus/ddpg.py index 5f30398b65..a971f7f872 100644 --- a/ding/bonus/ddpg.py +++ b/ding/bonus/ddpg.py @@ -1,4 +1,4 @@ -from typing import Optional, Union +from typing import Optional, Union, List from ditk import logging from easydict import EasyDict import os diff --git a/ding/bonus/dqn.py b/ding/bonus/dqn.py index 26a2195340..f3ba3b61f4 100644 --- a/ding/bonus/dqn.py +++ b/ding/bonus/dqn.py @@ -1,4 +1,4 @@ -from typing import Optional, Union, List +from typing import Optional, Union, List, List from ditk import logging from easydict import EasyDict import os diff --git a/ding/bonus/pg.py b/ding/bonus/pg.py index e7e172859d..44b51324c6 100644 --- a/ding/bonus/pg.py +++ b/ding/bonus/pg.py @@ -1,4 +1,4 @@ -from typing import Optional, Union +from typing import Optional, Union, List from ditk import logging from easydict import EasyDict import os diff --git a/ding/bonus/ppo_offpolicy.py b/ding/bonus/ppo_offpolicy.py index 8f86c1890a..9fcd0dfa0a 100644 --- a/ding/bonus/ppo_offpolicy.py +++ b/ding/bonus/ppo_offpolicy.py @@ -1,4 +1,4 @@ -from typing import Optional, Union +from typing import Optional, Union, List from ditk import logging from easydict import EasyDict import os diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py index def970995e..45be360e96 100644 --- a/ding/bonus/ppof.py +++ b/ding/bonus/ppof.py @@ -1,4 +1,4 @@ -from typing import Optional, Union +from typing import Optional, Union, List from ditk import logging from easydict import EasyDict from functools import partial diff --git a/ding/bonus/sac.py b/ding/bonus/sac.py index 778b85bcc1..97ee3172e4 100644 --- a/ding/bonus/sac.py +++ b/ding/bonus/sac.py @@ -1,4 +1,4 @@ -from typing import Optional, Union +from typing import Optional, Union, List from ditk import logging from easydict import EasyDict import os diff --git a/ding/bonus/sql.py b/ding/bonus/sql.py index e4e3f89bdb..db23698550 100644 --- a/ding/bonus/sql.py +++ b/ding/bonus/sql.py @@ -1,4 +1,4 @@ -from typing import Optional, Union +from typing import Optional, Union, List from ditk import logging from easydict import EasyDict import os diff --git a/ding/bonus/td3.py b/ding/bonus/td3.py index 32392734c4..d5a700432d 100644 --- a/ding/bonus/td3.py +++ b/ding/bonus/td3.py @@ -1,4 +1,4 @@ -from typing import 
Optional, Union +from typing import Optional, Union, List from ditk import logging from easydict import EasyDict import os From afb63552d20cab8a94c5689dd14c8919d63fead8 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 20 Jul 2023 11:49:18 +0000 Subject: [PATCH 171/244] polish setup --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b125b4c1f8..823165d6e7 100644 --- a/setup.py +++ b/setup.py @@ -109,7 +109,7 @@ ], 'video':[ 'moviepy', - 'imageio', + 'imageio[ffmpeg]', ], 'dist': [ 'redis-py-cluster==2.1.0', From 0863b0b5c166d2a8723310d56bd9a11a52129ef4 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Sat, 22 Jul 2023 11:58:43 +0000 Subject: [PATCH 172/244] fix config bug --- ding/config/A2C/gym_bipedalwalker_v3.py | 2 +- ding/config/A2C/gym_halfcheetah_v3.py | 2 +- ding/config/A2C/gym_hopper_v3.py | 2 +- ding/config/A2C/gym_lunarlander_v2.py | 2 +- ding/config/A2C/gym_walker2d_v3.py | 2 +- ding/config/C51/gym_pongnoframeskip_v4.py | 2 +- ding/config/C51/gym_qbertnoframeskip_v4.py | 2 +- ding/config/C51/gym_spaceInvadersnoframeskip_v4.py | 2 +- ding/config/DDPG/gym_bipedalwalker_v3.py | 2 +- ding/config/DDPG/gym_halfcheetah_v3.py | 2 +- ding/config/DDPG/gym_hopper_v3.py | 2 +- ding/config/DDPG/gym_pendulum_v1.py | 2 +- ding/config/DDPG/gym_walker2d_v3.py | 2 +- ding/config/DQN/gym_pongnoframeskip_v4.py | 2 +- ding/config/DQN/gym_qbertnoframeskip_v4.py | 2 +- ding/config/DQN/gym_spaceInvadersnoframeskip_v4.py | 2 +- ding/config/PG/gym_bipedalwalker_v3.py | 2 +- ding/config/PG/gym_halfcheetah_v3.py | 2 +- ding/config/PG/gym_hopper_v3.py | 2 +- ding/config/PG/gym_lunarlander_v2.py | 2 +- ding/config/PG/gym_pendulum_v1.py | 2 +- ding/config/PG/gym_walker2d_v3.py | 2 +- ding/config/PPOF/gym_lunarlander_v2.py | 2 +- ding/config/PPOOffPolicy/gym_pongnoframeskip_v4.py | 2 +- ding/config/PPOOffPolicy/gym_qbertnoframeskip_v4.py | 2 +- ding/config/PPOOffPolicy/gym_spaceInvadersnoframeskip_v4.py | 2 +- ding/config/SAC/gym_bipedalwalker_v3.py | 2 +- ding/config/SAC/gym_halfcheetah_v3.py | 2 +- ding/config/SAC/gym_hopper_v3.py | 2 +- ding/config/SAC/gym_pendulum_v1.py | 2 +- ding/config/SAC/gym_walker2d_v3.py | 2 +- ding/config/TD3/gym_bipedalwalker_v3.py | 2 +- ding/config/TD3/gym_halfcheetah_v3.py | 2 +- ding/config/TD3/gym_hopper_v3.py | 2 +- ding/config/TD3/gym_pendulum_v1.py | 2 +- ding/config/TD3/gym_walker2d_v3.py | 2 +- 36 files changed, 36 insertions(+), 36 deletions(-) diff --git a/ding/config/A2C/gym_bipedalwalker_v3.py b/ding/config/A2C/gym_bipedalwalker_v3.py index 5159b07dfa..8293baed12 100644 --- a/ding/config/A2C/gym_bipedalwalker_v3.py +++ b/ding/config/A2C/gym_bipedalwalker_v3.py @@ -40,4 +40,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/A2C/gym_halfcheetah_v3.py b/ding/config/A2C/gym_halfcheetah_v3.py index ae414bc00e..4f06bab30d 100644 --- a/ding/config/A2C/gym_halfcheetah_v3.py +++ b/ding/config/A2C/gym_halfcheetah_v3.py @@ -45,4 +45,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/A2C/gym_hopper_v3.py b/ding/config/A2C/gym_hopper_v3.py index 1c0d81a5b4..bcefe401b5 100644 --- a/ding/config/A2C/gym_hopper_v3.py +++ b/ding/config/A2C/gym_hopper_v3.py @@ -43,4 +43,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/A2C/gym_lunarlander_v2.py b/ding/config/A2C/gym_lunarlander_v2.py index 
e6916bc132..0edf158ea7 100644 --- a/ding/config/A2C/gym_lunarlander_v2.py +++ b/ding/config/A2C/gym_lunarlander_v2.py @@ -35,4 +35,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/A2C/gym_walker2d_v3.py b/ding/config/A2C/gym_walker2d_v3.py index 8f98c2dc7c..287e9b0fe3 100644 --- a/ding/config/A2C/gym_walker2d_v3.py +++ b/ding/config/A2C/gym_walker2d_v3.py @@ -43,4 +43,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/C51/gym_pongnoframeskip_v4.py b/ding/config/C51/gym_pongnoframeskip_v4.py index 5260357d8a..d3d6ec2eec 100644 --- a/ding/config/C51/gym_pongnoframeskip_v4.py +++ b/ding/config/C51/gym_pongnoframeskip_v4.py @@ -51,4 +51,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/C51/gym_qbertnoframeskip_v4.py b/ding/config/C51/gym_qbertnoframeskip_v4.py index c9e5412af3..b68231f1b1 100644 --- a/ding/config/C51/gym_qbertnoframeskip_v4.py +++ b/ding/config/C51/gym_qbertnoframeskip_v4.py @@ -51,4 +51,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/C51/gym_spaceInvadersnoframeskip_v4.py b/ding/config/C51/gym_spaceInvadersnoframeskip_v4.py index f8e9c7c929..e635005b75 100644 --- a/ding/config/C51/gym_spaceInvadersnoframeskip_v4.py +++ b/ding/config/C51/gym_spaceInvadersnoframeskip_v4.py @@ -51,4 +51,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/DDPG/gym_bipedalwalker_v3.py b/ding/config/DDPG/gym_bipedalwalker_v3.py index 8e9babf148..4a09a2070c 100644 --- a/ding/config/DDPG/gym_bipedalwalker_v3.py +++ b/ding/config/DDPG/gym_bipedalwalker_v3.py @@ -42,4 +42,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/DDPG/gym_halfcheetah_v3.py b/ding/config/DDPG/gym_halfcheetah_v3.py index bf07473488..197c633db6 100644 --- a/ding/config/DDPG/gym_halfcheetah_v3.py +++ b/ding/config/DDPG/gym_halfcheetah_v3.py @@ -52,4 +52,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/DDPG/gym_hopper_v3.py b/ding/config/DDPG/gym_hopper_v3.py index fa1791316e..e84dd7aa8a 100644 --- a/ding/config/DDPG/gym_hopper_v3.py +++ b/ding/config/DDPG/gym_hopper_v3.py @@ -52,4 +52,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/DDPG/gym_pendulum_v1.py b/ding/config/DDPG/gym_pendulum_v1.py index 45ebd9bc33..41cc09933c 100644 --- a/ding/config/DDPG/gym_pendulum_v1.py +++ b/ding/config/DDPG/gym_pendulum_v1.py @@ -49,4 +49,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/DDPG/gym_walker2d_v3.py b/ding/config/DDPG/gym_walker2d_v3.py index 0461253d17..e510bc05be 100644 --- a/ding/config/DDPG/gym_walker2d_v3.py +++ b/ding/config/DDPG/gym_walker2d_v3.py @@ -52,4 +52,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/DQN/gym_pongnoframeskip_v4.py b/ding/config/DQN/gym_pongnoframeskip_v4.py index d88ef6e348..5ecb00071c 100644 --- a/ding/config/DQN/gym_pongnoframeskip_v4.py +++ 
b/ding/config/DQN/gym_pongnoframeskip_v4.py @@ -47,4 +47,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/DQN/gym_qbertnoframeskip_v4.py b/ding/config/DQN/gym_qbertnoframeskip_v4.py index 7e688cfcee..15f2b818e6 100644 --- a/ding/config/DQN/gym_qbertnoframeskip_v4.py +++ b/ding/config/DQN/gym_qbertnoframeskip_v4.py @@ -47,4 +47,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/DQN/gym_spaceInvadersnoframeskip_v4.py b/ding/config/DQN/gym_spaceInvadersnoframeskip_v4.py index 2dfc298c97..ea71f743be 100644 --- a/ding/config/DQN/gym_spaceInvadersnoframeskip_v4.py +++ b/ding/config/DQN/gym_spaceInvadersnoframeskip_v4.py @@ -48,4 +48,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/PG/gym_bipedalwalker_v3.py b/ding/config/PG/gym_bipedalwalker_v3.py index c62e5b8178..21cff070a2 100644 --- a/ding/config/PG/gym_bipedalwalker_v3.py +++ b/ding/config/PG/gym_bipedalwalker_v3.py @@ -40,4 +40,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/PG/gym_halfcheetah_v3.py b/ding/config/PG/gym_halfcheetah_v3.py index f5309bedc0..a2e9b00db4 100644 --- a/ding/config/PG/gym_halfcheetah_v3.py +++ b/ding/config/PG/gym_halfcheetah_v3.py @@ -43,4 +43,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/PG/gym_hopper_v3.py b/ding/config/PG/gym_hopper_v3.py index 1eb4925ee1..7851e7f316 100644 --- a/ding/config/PG/gym_hopper_v3.py +++ b/ding/config/PG/gym_hopper_v3.py @@ -43,4 +43,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/PG/gym_lunarlander_v2.py b/ding/config/PG/gym_lunarlander_v2.py index 607f6d3fbf..38783facd9 100644 --- a/ding/config/PG/gym_lunarlander_v2.py +++ b/ding/config/PG/gym_lunarlander_v2.py @@ -35,4 +35,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/PG/gym_pendulum_v1.py b/ding/config/PG/gym_pendulum_v1.py index cd68f13b8e..3ec9c68904 100644 --- a/ding/config/PG/gym_pendulum_v1.py +++ b/ding/config/PG/gym_pendulum_v1.py @@ -39,4 +39,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/PG/gym_walker2d_v3.py b/ding/config/PG/gym_walker2d_v3.py index a572b348ad..db516070d0 100644 --- a/ding/config/PG/gym_walker2d_v3.py +++ b/ding/config/PG/gym_walker2d_v3.py @@ -43,4 +43,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/PPOF/gym_lunarlander_v2.py b/ding/config/PPOF/gym_lunarlander_v2.py index 352f12a495..2844a177b6 100644 --- a/ding/config/PPOF/gym_lunarlander_v2.py +++ b/ding/config/PPOF/gym_lunarlander_v2.py @@ -10,4 +10,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/PPOOffPolicy/gym_pongnoframeskip_v4.py b/ding/config/PPOOffPolicy/gym_pongnoframeskip_v4.py index cc13c5653b..83ed2a50bf 100644 --- a/ding/config/PPOOffPolicy/gym_pongnoframeskip_v4.py +++ b/ding/config/PPOOffPolicy/gym_pongnoframeskip_v4.py @@ -51,4 +51,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = 
ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/PPOOffPolicy/gym_qbertnoframeskip_v4.py b/ding/config/PPOOffPolicy/gym_qbertnoframeskip_v4.py index b0e3a4ce20..6f07200b48 100644 --- a/ding/config/PPOOffPolicy/gym_qbertnoframeskip_v4.py +++ b/ding/config/PPOOffPolicy/gym_qbertnoframeskip_v4.py @@ -45,4 +45,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/PPOOffPolicy/gym_spaceInvadersnoframeskip_v4.py b/ding/config/PPOOffPolicy/gym_spaceInvadersnoframeskip_v4.py index 218ea0a727..492ed090fe 100644 --- a/ding/config/PPOOffPolicy/gym_spaceInvadersnoframeskip_v4.py +++ b/ding/config/PPOOffPolicy/gym_spaceInvadersnoframeskip_v4.py @@ -45,4 +45,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/SAC/gym_bipedalwalker_v3.py b/ding/config/SAC/gym_bipedalwalker_v3.py index 9ec1a9d0af..8f427083b3 100644 --- a/ding/config/SAC/gym_bipedalwalker_v3.py +++ b/ding/config/SAC/gym_bipedalwalker_v3.py @@ -44,4 +44,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/SAC/gym_halfcheetah_v3.py b/ding/config/SAC/gym_halfcheetah_v3.py index 1a1e094e77..add0a8c636 100644 --- a/ding/config/SAC/gym_halfcheetah_v3.py +++ b/ding/config/SAC/gym_halfcheetah_v3.py @@ -53,4 +53,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/SAC/gym_hopper_v3.py b/ding/config/SAC/gym_hopper_v3.py index 1bb0d87e47..9ee256973d 100644 --- a/ding/config/SAC/gym_hopper_v3.py +++ b/ding/config/SAC/gym_hopper_v3.py @@ -40,4 +40,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/SAC/gym_pendulum_v1.py b/ding/config/SAC/gym_pendulum_v1.py index 2c7acb462c..ddda04197d 100644 --- a/ding/config/SAC/gym_pendulum_v1.py +++ b/ding/config/SAC/gym_pendulum_v1.py @@ -46,4 +46,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/SAC/gym_walker2d_v3.py b/ding/config/SAC/gym_walker2d_v3.py index c476e2e54e..6936603247 100644 --- a/ding/config/SAC/gym_walker2d_v3.py +++ b/ding/config/SAC/gym_walker2d_v3.py @@ -53,4 +53,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/TD3/gym_bipedalwalker_v3.py b/ding/config/TD3/gym_bipedalwalker_v3.py index 2459148f52..e2949f5ff9 100644 --- a/ding/config/TD3/gym_bipedalwalker_v3.py +++ b/ding/config/TD3/gym_bipedalwalker_v3.py @@ -49,4 +49,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/TD3/gym_halfcheetah_v3.py b/ding/config/TD3/gym_halfcheetah_v3.py index a53b1889ac..6aba1bcefd 100644 --- a/ding/config/TD3/gym_halfcheetah_v3.py +++ b/ding/config/TD3/gym_halfcheetah_v3.py @@ -55,4 +55,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/TD3/gym_hopper_v3.py b/ding/config/TD3/gym_hopper_v3.py index f1458f54c5..02773b5a9f 100644 --- a/ding/config/TD3/gym_hopper_v3.py +++ b/ding/config/TD3/gym_hopper_v3.py @@ -34,4 +34,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git 
a/ding/config/TD3/gym_pendulum_v1.py b/ding/config/TD3/gym_pendulum_v1.py index 18e43a2831..7e1fb80965 100644 --- a/ding/config/TD3/gym_pendulum_v1.py +++ b/ding/config/TD3/gym_pendulum_v1.py @@ -51,4 +51,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env diff --git a/ding/config/TD3/gym_walker2d_v3.py b/ding/config/TD3/gym_walker2d_v3.py index 650cbffd1f..9cdd4a6885 100644 --- a/ding/config/TD3/gym_walker2d_v3.py +++ b/ding/config/TD3/gym_walker2d_v3.py @@ -57,4 +57,4 @@ cfg = EasyDict(cfg) import ding.envs.gym_env -env = ding.envs.gym_env.env, +env = ding.envs.gym_env.env From c934ef67c91eb62da3a23a064167d74404618455 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 25 Jul 2023 07:43:34 +0000 Subject: [PATCH 173/244] polish code --- .../framework/middleware/functional/logger.py | 55 +++++++++++++------ 1 file changed, 37 insertions(+), 18 deletions(-) diff --git a/ding/framework/middleware/functional/logger.py b/ding/framework/middleware/functional/logger.py index 69dc527ca0..ff3c9aea4f 100644 --- a/ding/framework/middleware/functional/logger.py +++ b/ding/framework/middleware/functional/logger.py @@ -318,9 +318,9 @@ def _plot(ctx: "OnlineRLContext"): def wandb_offline_logger( - dataset_path: str, record_path: str = None, cfg: Union[dict, EasyDict] = None, + exp_config: Union[dict, EasyDict] = None, metric_list: Optional[List[str]] = None, env: Optional[BaseEnvManagerV2] = None, model: Optional[torch.nn.Module] = None, @@ -332,7 +332,6 @@ def wandb_offline_logger( Overview: Wandb visualizer to track the experiment. Arguments: - - datasetpath (:obj:`str`): The path to save the replay of simulation. - record_path (:obj:`str`): The path to save the replay of simulation. - cfg (:obj:`Union[dict, EasyDict]`): Config, a dict of following settings: - gradient_logger: boolean. Whether to track the gradient. @@ -340,6 +339,7 @@ def wandb_offline_logger( - video_logger: boolean. Whether to upload the rendering video replay. - action_logger: boolean. `q_value` or `action probability`. - return_logger: boolean. Whether to track the return value. + - vis_dataset: boolean. Whether to visualize the dataset. - metric_list (:obj:`Optional[List[str]]`): Logged metric list, specialized by different policies. - env (:obj:`BaseEnvManagerV2`): Evaluator environment. - model (:obj:`nn.Module`): Policy neural network model. 
@@ -354,16 +354,28 @@ def wandb_offline_logger( metric_list = ["q_value", "target q_value", "loss", "lr", "entropy", "target_q_value", "td_error"] # Initialize wandb with default settings # Settings can be covered by calling wandb.init() at the top of the script - if not wandb_sweep: - if anonymous: - wandb.init(project=project_name, reinit=True, anonymous="must") + if exp_config: + if not wandb_sweep: + if anonymous: + wandb.init(project=project_name, config=exp_config, reinit=True, anonymous="must") + else: + wandb.init(project=project_name, config=exp_config, reinit=True) else: - wandb.init(project=project_name, reinit=True) + if anonymous: + wandb.init(project=project_name, config=exp_config, anonymous="must") + else: + wandb.init(project=project_name, config=exp_config) else: - if anonymous: - wandb.init(project=project_name, anonymous="must") + if not wandb_sweep: + if anonymous: + wandb.init(project=project_name, reinit=True, anonymous="must") + else: + wandb.init(project=project_name, reinit=True) else: - wandb.init(project=project_name) + if anonymous: + wandb.init(project=project_name, anonymous="must") + else: + wandb.init(project=project_name) plt.switch_backend('agg') if cfg is None: cfg = EasyDict( @@ -373,6 +385,7 @@ def wandb_offline_logger( video_logger=False, action_logger=False, return_logger=False, + vis_dataset=True, ) ) else: @@ -387,14 +400,13 @@ def wandb_offline_logger( if env is not None and cfg.video_logger is True and record_path is not None: env.enable_save_replay(replay_path=record_path) if cfg.gradient_logger: - wandb.watch(model) + wandb.watch(model, log="all", log_freq=100, log_graph=True) else: one_time_warning( "If you want to use wandb to visualize the gradient, please set gradient_logger = True in the config." ) first_plot = True - def _vis_dataset(datasetpath: str): try: from sklearn.manifold import TSNE @@ -503,13 +515,20 @@ def _plot(ctx: "OnlineRLContext"): episode_return = episode_return.squeeze(1) if cfg.video_logger: - file_list = [] - for p in os.listdir(record_path): - if os.path.splitext(p)[-1] == ".mp4": - file_list.append(p) - file_list.sort(key=lambda fn: os.path.getmtime(os.path.join(record_path, fn))) - video_path = os.path.join(record_path, file_list[-2]) - info_for_logging.update({"video": wandb.Video(video_path, format="mp4")}) + if 'replay_video' in ctx.eval_output: + # save numpy array "images" of shape (N,1212,3,224,320) to N video files in mp4 format + # The numpy tensor must be either 4 dimensional or 5 dimensional. 
Channels should be (time, channel, height, width) or (batch, time, channel, height width) + video_images = ctx.eval_output['replay_video'] + video_images = video_images.astype(np.uint8) + info_for_logging.update({"replay_video": wandb.Video(video_images, fps=60)}) + elif record_path is not None: + file_list = [] + for p in os.listdir(record_path): + if os.path.splitext(p)[-1] == ".mp4": + file_list.append(p) + file_list.sort(key=lambda fn: os.path.getmtime(os.path.join(record_path, fn))) + video_path = os.path.join(record_path, file_list[-2]) + info_for_logging.update({"video": wandb.Video(video_path, format="mp4")}) if cfg.action_logger: action_path = os.path.join(record_path, (str(ctx.env_step) + "_action.gif")) From a1f3e940138f42f6ccff8d22aa7b911ae6d402a9 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 25 Jul 2023 08:06:16 +0000 Subject: [PATCH 174/244] polish code --- .../framework/middleware/functional/logger.py | 103 ++++++++++++++---- 1 file changed, 79 insertions(+), 24 deletions(-) diff --git a/ding/framework/middleware/functional/logger.py b/ding/framework/middleware/functional/logger.py index ff3c9aea4f..0c7d0a8b47 100644 --- a/ding/framework/middleware/functional/logger.py +++ b/ding/framework/middleware/functional/logger.py @@ -126,6 +126,7 @@ def wandb_online_logger( model: Optional[torch.nn.Module] = None, anonymous: bool = False, project_name: str = 'default-project', + run_name: str = None, wandb_sweep: bool = False, ) -> Callable: ''' @@ -145,6 +146,8 @@ def wandb_online_logger( - anonymous (:obj:`bool`): Open the anonymous mode of wandb or not. The anonymous mode allows visualization of data without wandb count. - project_name (:obj:`str`): The name of wandb project. + - run_name (:obj:`str`): The name of wandb run. + - wandb_sweep (:obj:`bool`): Whether to use wandb sweep. 
''' if task.router.is_active and not task.has_role(task.role.LEARNER): return task.void() @@ -155,26 +158,50 @@ def wandb_online_logger( # Settings can be covered by calling wandb.init() at the top of the script if exp_config: if not wandb_sweep: - if anonymous: - wandb.init(project=project_name, config=exp_config, reinit=True, anonymous="must") + if run_name is not None: + if anonymous: + wandb.init(project=project_name, config=exp_config, reinit=True, name=run_name, anonymous="must") + else: + wandb.init(project=project_name, config=exp_config, reinit=True, name=run_name) else: - wandb.init(project=project_name, config=exp_config, reinit=True) + if anonymous: + wandb.init(project=project_name, config=exp_config, reinit=True, anonymous="must") + else: + wandb.init(project=project_name, config=exp_config, reinit=True) else: - if anonymous: - wandb.init(project=project_name, config=exp_config, anonymous="must") + if run_name is not None: + if anonymous: + wandb.init(project=project_name, config=exp_config, name=run_name, anonymous="must") + else: + wandb.init(project=project_name, config=exp_config, name=run_name) else: - wandb.init(project=project_name, config=exp_config) + if anonymous: + wandb.init(project=project_name, config=exp_config, anonymous="must") + else: + wandb.init(project=project_name, config=exp_config) else: if not wandb_sweep: - if anonymous: - wandb.init(project=project_name, reinit=True, anonymous="must") + if run_name is not None: + if anonymous: + wandb.init(project=project_name, reinit=True, name=run_name, anonymous="must") + else: + wandb.init(project=project_name, reinit=True, name=run_name) else: - wandb.init(project=project_name, reinit=True) + if anonymous: + wandb.init(project=project_name, reinit=True, anonymous="must") + else: + wandb.init(project=project_name, reinit=True) else: - if anonymous: - wandb.init(project=project_name, anonymous="must") + if run_name is not None: + if anonymous: + wandb.init(project=project_name, name=run_name, anonymous="must") + else: + wandb.init(project=project_name, name=run_name) else: - wandb.init(project=project_name) + if anonymous: + wandb.init(project=project_name, anonymous="must") + else: + wandb.init(project=project_name) plt.switch_backend('agg') if cfg is None: cfg = EasyDict( @@ -326,6 +353,7 @@ def wandb_offline_logger( model: Optional[torch.nn.Module] = None, anonymous: bool = False, project_name: str = 'default-project', + run_name: str = None, wandb_sweep: bool = False, ) -> Callable: ''' @@ -346,6 +374,8 @@ def wandb_offline_logger( - anonymous (:obj:`bool`): Open the anonymous mode of wandb or not. The anonymous mode allows visualization of data without wandb count. - project_name (:obj:`str`): The name of wandb project. + - run_name (:obj:`str`): The name of wandb run. + - wandb_sweep (:obj:`bool`): Whether to use wandb sweep. 
''' if task.router.is_active and not task.has_role(task.role.LEARNER): return task.void() @@ -356,26 +386,51 @@ def wandb_offline_logger( # Settings can be covered by calling wandb.init() at the top of the script if exp_config: if not wandb_sweep: - if anonymous: - wandb.init(project=project_name, config=exp_config, reinit=True, anonymous="must") + if run_name is not None: + if anonymous: + wandb.init(project=project_name, config=exp_config, reinit=True, name=run_name, anonymous="must") + else: + wandb.init(project=project_name, config=exp_config, reinit=True, name=run_name) else: - wandb.init(project=project_name, config=exp_config, reinit=True) + if anonymous: + wandb.init(project=project_name, config=exp_config, reinit=True, anonymous="must") + else: + wandb.init(project=project_name, config=exp_config, reinit=True) else: - if anonymous: - wandb.init(project=project_name, config=exp_config, anonymous="must") + if run_name is not None: + if anonymous: + wandb.init(project=project_name, config=exp_config, name=run_name, anonymous="must") + else: + wandb.init(project=project_name, config=exp_config, name=run_name) else: - wandb.init(project=project_name, config=exp_config) + if anonymous: + wandb.init(project=project_name, config=exp_config, anonymous="must") + else: + wandb.init(project=project_name, config=exp_config) else: if not wandb_sweep: - if anonymous: - wandb.init(project=project_name, reinit=True, anonymous="must") + if run_name is not None: + if anonymous: + wandb.init(project=project_name, reinit=True, name=run_name, anonymous="must") + else: + wandb.init(project=project_name, reinit=True, name=run_name) else: - wandb.init(project=project_name, reinit=True) + if anonymous: + wandb.init(project=project_name, reinit=True, anonymous="must") + else: + wandb.init(project=project_name, reinit=True) else: - if anonymous: - wandb.init(project=project_name, anonymous="must") + if run_name is not None: + if anonymous: + wandb.init(project=project_name, name=run_name, anonymous="must") + else: + wandb.init(project=project_name, name=run_name) else: - wandb.init(project=project_name) + if anonymous: + wandb.init(project=project_name, anonymous="must") + else: + wandb.init(project=project_name) + plt.switch_backend('agg') plt.switch_backend('agg') if cfg is None: cfg = EasyDict( From da9d2c1f06304635298a46461c04ac77fdc84244 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 25 Jul 2023 08:38:49 +0000 Subject: [PATCH 175/244] polish code --- ding/framework/middleware/functional/logger.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ding/framework/middleware/functional/logger.py b/ding/framework/middleware/functional/logger.py index 0c7d0a8b47..625965a087 100644 --- a/ding/framework/middleware/functional/logger.py +++ b/ding/framework/middleware/functional/logger.py @@ -520,9 +520,9 @@ def _vis_dataset(datasetpath: str): wandb.log({"dataset": wandb.Image("dataset.png")}) if cfg.vis_dataset is True: - _vis_dataset(dataset_path) + _vis_dataset(exp_config.dataset_path) - def _plot(ctx: "OnlineRLContext"): + def _plot(ctx: "OfflineRLContext"): nonlocal first_plot if first_plot: first_plot = False @@ -559,7 +559,7 @@ def _plot(ctx: "OnlineRLContext"): "episode return mean": ctx.eval_value, "episode return std": ctx.eval_value_std, "train iter": ctx.train_iter, - "env step": ctx.env_step + "train_epoch": ctx.train_epoch, } ) From 92d9504169415a2043eff02acde1bfd0cffab8eb Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 26 Jul 2023 05:43:09 +0000 Subject: [PATCH 176/244] 
fix bug in evaluator --- ding/framework/middleware/functional/evaluator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ding/framework/middleware/functional/evaluator.py b/ding/framework/middleware/functional/evaluator.py index 46e4c76e96..611bbcdea6 100644 --- a/ding/framework/middleware/functional/evaluator.py +++ b/ding/framework/middleware/functional/evaluator.py @@ -268,8 +268,8 @@ def _evaluate(ctx: Union["OnlineRLContext", "OfflineRLContext"]): if 'episode_info' in timestep.info: eval_monitor.update_info(env_id, timestep.info.episode_info) episode_return = eval_monitor.get_episode_return() - episode_return_min = np.std(episode_return) - episode_return_max = np.std(episode_return) + episode_return_min = np.min(episode_return) + episode_return_max = np.max(episode_return) episode_return_std = np.std(episode_return) episode_return = np.mean(episode_return) stop_flag = episode_return >= cfg.env.stop_value and ctx.train_iter > 0 From 1f0704c491e87a80e48836a07188b383ab290780 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 27 Jul 2023 04:41:04 +0000 Subject: [PATCH 177/244] polish code --- ding/config/C51/gym_lunarlander_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ding/config/C51/gym_lunarlander_v2.py b/ding/config/C51/gym_lunarlander_v2.py index 97ea8a7abe..6c52cc691a 100644 --- a/ding/config/C51/gym_lunarlander_v2.py +++ b/ding/config/C51/gym_lunarlander_v2.py @@ -8,7 +8,7 @@ evaluator_env_num=8, env_id='LunarLander-v2', n_evaluator_episode=8, - stop_value=200, + stop_value=260, ), policy=dict( cuda=False, From 5a08ec7980323ff9cc85998e01b9bb03640577a5 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 27 Jul 2023 04:46:57 +0000 Subject: [PATCH 178/244] polish code --- ding/config/A2C/gym_lunarlander_v2.py | 2 +- ding/config/C51/gym_pongnoframeskip_v4.py | 2 +- ding/config/DDPG/gym_lunarlandercontinuous_v2.py | 2 +- ding/config/DQN/gym_lunarlander_v2.py | 2 +- ding/config/DQN/gym_pongnoframeskip_v4.py | 2 +- ding/config/PG/gym_lunarlander_v2.py | 2 +- ding/config/PPOOffPolicy/gym_lunarlander_v2.py | 2 +- ding/config/PPOOffPolicy/gym_lunarlandercontinuous_v2.py | 2 +- ding/config/PPOOffPolicy/gym_pongnoframeskip_v4.py | 2 +- ding/config/SAC/gym_lunarlandercontinuous_v2.py | 2 +- ding/config/SQL/gym_lunarlander_v2.py | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/ding/config/A2C/gym_lunarlander_v2.py b/ding/config/A2C/gym_lunarlander_v2.py index 0edf158ea7..8e85171768 100644 --- a/ding/config/A2C/gym_lunarlander_v2.py +++ b/ding/config/A2C/gym_lunarlander_v2.py @@ -7,7 +7,7 @@ evaluator_env_num=8, env_id='LunarLander-v2', n_evaluator_episode=8, - stop_value=240, + stop_value=260, ), policy=dict( cuda=True, diff --git a/ding/config/C51/gym_pongnoframeskip_v4.py b/ding/config/C51/gym_pongnoframeskip_v4.py index d3d6ec2eec..d3dc9ffb9c 100644 --- a/ding/config/C51/gym_pongnoframeskip_v4.py +++ b/ding/config/C51/gym_pongnoframeskip_v4.py @@ -7,7 +7,7 @@ collector_env_num=8, evaluator_env_num=8, n_evaluator_episode=8, - stop_value=20, + stop_value=30, env_id='PongNoFrameskip-v4', frame_stack=4, env_wrapper='atari_default', diff --git a/ding/config/DDPG/gym_lunarlandercontinuous_v2.py b/ding/config/DDPG/gym_lunarlandercontinuous_v2.py index d4b6510c8c..2dca929028 100644 --- a/ding/config/DDPG/gym_lunarlandercontinuous_v2.py +++ b/ding/config/DDPG/gym_lunarlandercontinuous_v2.py @@ -8,7 +8,7 @@ collector_env_num=8, evaluator_env_num=8, n_evaluator_episode=8, - stop_value=240, + stop_value=260, act_scale=True, ), 
policy=dict( diff --git a/ding/config/DQN/gym_lunarlander_v2.py b/ding/config/DQN/gym_lunarlander_v2.py index 510c7b2e18..0307f7031f 100644 --- a/ding/config/DQN/gym_lunarlander_v2.py +++ b/ding/config/DQN/gym_lunarlander_v2.py @@ -8,7 +8,7 @@ collector_env_num=8, evaluator_env_num=8, n_evaluator_episode=8, - stop_value=240, + stop_value=260, ), policy=dict( cuda=True, diff --git a/ding/config/DQN/gym_pongnoframeskip_v4.py b/ding/config/DQN/gym_pongnoframeskip_v4.py index 5ecb00071c..696ee6f50d 100644 --- a/ding/config/DQN/gym_pongnoframeskip_v4.py +++ b/ding/config/DQN/gym_pongnoframeskip_v4.py @@ -8,7 +8,7 @@ collector_env_num=8, evaluator_env_num=8, n_evaluator_episode=8, - stop_value=20, + stop_value=30, fram_stack=4, env_wrapper='atari_default', ), diff --git a/ding/config/PG/gym_lunarlander_v2.py b/ding/config/PG/gym_lunarlander_v2.py index 38783facd9..414e4940c7 100644 --- a/ding/config/PG/gym_lunarlander_v2.py +++ b/ding/config/PG/gym_lunarlander_v2.py @@ -7,7 +7,7 @@ evaluator_env_num=8, env_id='LunarLander-v2', n_evaluator_episode=8, - stop_value=240, + stop_value=260, ), policy=dict( cuda=True, diff --git a/ding/config/PPOOffPolicy/gym_lunarlander_v2.py b/ding/config/PPOOffPolicy/gym_lunarlander_v2.py index 18ba940f3e..e68c3dd285 100644 --- a/ding/config/PPOOffPolicy/gym_lunarlander_v2.py +++ b/ding/config/PPOOffPolicy/gym_lunarlander_v2.py @@ -7,7 +7,7 @@ evaluator_env_num=8, env_id='LunarLander-v2', n_evaluator_episode=8, - stop_value=240, + stop_value=260, ), policy=dict( cuda=True, diff --git a/ding/config/PPOOffPolicy/gym_lunarlandercontinuous_v2.py b/ding/config/PPOOffPolicy/gym_lunarlandercontinuous_v2.py index f8949535c1..7e8d89a609 100644 --- a/ding/config/PPOOffPolicy/gym_lunarlandercontinuous_v2.py +++ b/ding/config/PPOOffPolicy/gym_lunarlandercontinuous_v2.py @@ -11,7 +11,7 @@ collector_env_num=8, evaluator_env_num=4, n_evaluator_episode=4, - stop_value=240, + stop_value=260, act_scale=True, ), policy=dict( diff --git a/ding/config/PPOOffPolicy/gym_pongnoframeskip_v4.py b/ding/config/PPOOffPolicy/gym_pongnoframeskip_v4.py index 83ed2a50bf..93f603f323 100644 --- a/ding/config/PPOOffPolicy/gym_pongnoframeskip_v4.py +++ b/ding/config/PPOOffPolicy/gym_pongnoframeskip_v4.py @@ -6,7 +6,7 @@ collector_env_num=8, evaluator_env_num=8, n_evaluator_episode=8, - stop_value=20, + stop_value=30, env_id='PongNoFrameskip-v4', frame_stack=4, env_wrapper='atari_default', diff --git a/ding/config/SAC/gym_lunarlandercontinuous_v2.py b/ding/config/SAC/gym_lunarlandercontinuous_v2.py index 7c1a1f4183..4af21e86aa 100644 --- a/ding/config/SAC/gym_lunarlandercontinuous_v2.py +++ b/ding/config/SAC/gym_lunarlandercontinuous_v2.py @@ -8,7 +8,7 @@ collector_env_num=4, evaluator_env_num=8, n_evaluator_episode=8, - stop_value=240, + stop_value=260, act_scale=True, ), policy=dict( diff --git a/ding/config/SQL/gym_lunarlander_v2.py b/ding/config/SQL/gym_lunarlander_v2.py index 696a4863dd..271e793c55 100644 --- a/ding/config/SQL/gym_lunarlander_v2.py +++ b/ding/config/SQL/gym_lunarlander_v2.py @@ -7,7 +7,7 @@ evaluator_env_num=8, env_id='LunarLander-v2', n_evaluator_episode=8, - stop_value=200, + stop_value=260, ), policy=dict( cuda=True, From 65b9f08a8382673cccb578e38b7cf9393599e004 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 8 Aug 2023 17:05:06 +0800 Subject: [PATCH 179/244] Add priority in collector --- ding/data/buffer/middleware/priority.py | 5 +- ding/example/apex_dqn.py | 135 ++++++++++++++++++ ding/example/apex_dqn_ddp.py | 54 +++++++ ding/example/apex_dqn_parallel.py | 109 
++++++++++++++ ding/example/apex_dqn_parallel_origin.py | 117 +++++++++++++++ ding/example/apex_dqn_per.py | 76 ++++++++++ ding/example/apex_dqn_per_parallel.py | 130 +++++++++++++++++ ding/framework/middleware/__init__.py | 2 +- ding/framework/middleware/distributer.py | 135 ++++++++++++++++++ .../middleware/functional/__init__.py | 2 +- .../middleware/functional/data_processor.py | 6 +- .../middleware/functional/enhancer.py | 2 +- .../framework/middleware/functional/logger.py | 1 + .../middleware/functional/priority.py | 38 +++++ ding/framework/middleware/learner.py | 3 +- .../middleware/tests/test_distributer.py | 49 ++++++- ding/policy/dqn.py | 48 +++++++ ding/rl_utils/td.py | 1 + 18 files changed, 904 insertions(+), 9 deletions(-) create mode 100644 ding/example/apex_dqn.py create mode 100644 ding/example/apex_dqn_ddp.py create mode 100644 ding/example/apex_dqn_parallel.py create mode 100644 ding/example/apex_dqn_parallel_origin.py create mode 100644 ding/example/apex_dqn_per.py create mode 100644 ding/example/apex_dqn_per_parallel.py create mode 100644 ding/framework/middleware/functional/priority.py diff --git a/ding/data/buffer/middleware/priority.py b/ding/data/buffer/middleware/priority.py index f6df6c1be3..017b302a5f 100644 --- a/ding/data/buffer/middleware/priority.py +++ b/ding/data/buffer/middleware/priority.py @@ -54,7 +54,10 @@ def __init__( def push(self, chain: Callable, data: Any, meta: Optional[dict] = None, *args, **kwargs) -> BufferedData: if meta is None: - meta = {'priority': self.max_priority} + if 'priority' in data: + meta = {'priority': data.pop('priority')} + else: + meta = {'priority': self.max_priority} else: if 'priority' not in meta: meta['priority'] = self.max_priority diff --git a/ding/example/apex_dqn.py b/ding/example/apex_dqn.py new file mode 100644 index 0000000000..6fbd43557d --- /dev/null +++ b/ding/example/apex_dqn.py @@ -0,0 +1,135 @@ +import os +from ditk import logging +from ding.model import DQN +from ding.policy import DQNPolicy +from ding.data import DequeBuffer +from ding.data.buffer.middleware import PriorityExperienceReplay +from ding.envs import setup_ding_env_manager +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework import Parallel +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ModelExchanger, ContextExchanger, online_logger, \ + nstep_reward_enhancer, priority_calculator +from ding.utils import set_pkg_seed + + +def main(): + from ding.config.DQN.gym_lunarlander_v2 import cfg, env + + cfg.exp_name = 'LunarLander-v2-Apex-DQN' + cfg.policy.priority = True + cfg.policy.priority_IS_weight = True + cfg = compile_config(cfg, policy=DQNPolicy, save_cfg=task.router.node_id == 0) + + logging.getLogger().setLevel(logging.INFO) + model_path = os.path.join(cfg.exp_name, 'models') + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + + assert task.router.is_active + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + if task.has_role(task.role.COLLECTOR): + collector_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.collector_env_num, 'collector', debug=True) + evaluator_env = setup_ding_env_manager(env(cfg=cfg.env), 
cfg.env.evaluator_env_num, 'evaluator', debug=True) + elif task.has_role(task.role.EVALUATOR): + collector_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.collector_env_num, 'collector', debug=True) + evaluator_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.evaluator_env_num, 'evaluator', debug=True) + elif task.has_role(task.role.LEARNER): + collector_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.collector_env_num, 'collector', debug=True) + evaluator_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.evaluator_env_num, 'evaluator', debug=True) + + if task.has_role(task.role.LEARNER): + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) + policy = DQNPolicy(cfg.policy, model=model) + task.use(ContextExchanger(skip_n_iter=1)) + #task.use(PeriodicalModelExchanger(model=policy._model, mode="send")) + task.use(ModelExchanger(model)) + + elif task.has_role(task.role.COLLECTOR): + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) + policy = DQNPolicy(cfg.policy, model=model) + #collect_model_loader=FileModelLoader(model=model, dirname=model_path) + task.use(ContextExchanger(skip_n_iter=1)) + #task.use(PeriodicalModelExchanger(model=policy._model, model_loader=collect_model_loader, mode="update")) + task.use(ModelExchanger(model)) + + elif task.has_role(task.role.EVALUATOR): + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) + policy = DQNPolicy(cfg.policy, model=model) + #eval_model_loader=FileModelLoader(model=model, dirname=model_path) + task.use(ContextExchanger(skip_n_iter=1)) + #task.use(PeriodicalModelExchanger(model=policy._model, model_loader=eval_model_loader, mode="update")) + task.use(ModelExchanger(model)) + + # Here is the part of single process pipeline. 
+ task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use(StepCollector(cfg, policy.collect_mode, collector_env)) + print(f"cfg.policy.nstep:{cfg.policy.nstep}") + if "nstep" in cfg.policy and cfg.policy.nstep > 1: + if task.has_role(task.role.COLLECTOR): + task.use(nstep_reward_enhancer(cfg)) + + def dqn_priority_calculation(update_target_model_frequency): + last_update_train_iter = 0 + + def _calculate_priority(data): + nonlocal last_update_train_iter + + if (task.ctx.train_iter - last_update_train_iter) % update_target_model_frequency == 0: + update_target_model = True + else: + update_target_model = False + priority = policy.calculate_priority(data, update_target_model=update_target_model)['priority'] + last_update_train_iter = task.ctx.train_iter + return priority + + return _calculate_priority + + task.use( + priority_calculator( + func_for_priority_calculation=dqn_priority_calculation( + update_target_model_frequency=cfg.policy.learn.target_update_freq + ), + ) + ) + + task.use(data_pusher(cfg, buffer_)) + task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use(CkptSaver(policy, cfg.exp_name, train_freq=100)) + task.run() + + +if __name__ == "__main__": + + Parallel.runner( + n_parallel_workers=3, + ports=50515, + protocol="tcp", + topology="mesh", + attach_to=None, + address=None, + labels=None, + node_ids=None, + mq_type="nng", + redis_host=None, + redis_port=None, + startup_interval=1 + )(main) diff --git a/ding/example/apex_dqn_ddp.py b/ding/example/apex_dqn_ddp.py new file mode 100644 index 0000000000..5491dbb79c --- /dev/null +++ b/ding/example/apex_dqn_ddp.py @@ -0,0 +1,54 @@ +import gym +from ditk import logging +from ding.model import DQN +from ding.policy import DQNPolicy +from ding.envs import DingEnvWrapper, BaseEnvManagerV2 +from ding.data import DequeBuffer +from ding.data.buffer.middleware import PriorityExperienceReplay +from ding.config import compile_config +from ding.framework import task +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ModelExchanger +from ding.utils import set_pkg_seed +from ding.utils import DDPContext, to_ddp_config +from dizoo.classic_control.cartpole.config.cartpole_dqn_config import main_config, create_config + + +def main(main_config, create_config): + logging.getLogger().setLevel(logging.INFO) + main_config.exp_name = 'cartpole_dqn_per' + main_config.policy.priority = True + main_config.policy.priority_IS_weight = True + cfg = compile_config(main_config, create_cfg=create_config, auto=True) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = BaseEnvManagerV2( + env_fn=[lambda: DingEnvWrapper(gym.make("CartPole-v0")) for _ in range(cfg.env.collector_env_num)], + cfg=cfg.env.manager + ) + evaluator_env = BaseEnvManagerV2( + env_fn=[lambda: DingEnvWrapper(gym.make("CartPole-v0")) for _ in range(cfg.env.evaluator_env_num)], + cfg=cfg.env.manager + ) + + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) + policy = DQNPolicy(cfg.policy, model=model) + + task.use(ModelExchanger(model)) + task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) + 
task.use(eps_greedy_handler(cfg)) + task.use(StepCollector(cfg, policy.collect_mode, collector_env)) + task.use(data_pusher(cfg, buffer_)) + task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(CkptSaver(policy, cfg.exp_name, train_freq=100)) + task.run() + + +if __name__ == "__main__": + with DDPContext(): + main_config = to_ddp_config(main_config) + main(main_config, create_config) diff --git a/ding/example/apex_dqn_parallel.py b/ding/example/apex_dqn_parallel.py new file mode 100644 index 0000000000..c050362585 --- /dev/null +++ b/ding/example/apex_dqn_parallel.py @@ -0,0 +1,109 @@ +import os +from ditk import logging +from ding.model import DQN +from ding.policy import DQNPolicy +from ding.envs import setup_ding_env_manager +from ding.data import DequeBuffer, FileModelLoader +from ding.data.buffer.middleware import PriorityExperienceReplay +from ding.config import compile_config +from ding.framework import task +from ding.framework import Parallel +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ModelExchanger, PeriodicalModelExchanger, ContextExchanger, online_logger +from ding.utils import set_pkg_seed + + +def main(cfg, env): + + logging.getLogger().setLevel(logging.INFO) + model_path = os.path.join(cfg.exp_name, 'models') + + with task.start(async_mode=False, ctx=OnlineRLContext()): + + assert task.router.is_active + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + if task.has_role(task.role.COLLECTOR): + collector_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.collector_env_num, 'collector') + evaluator_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.evaluator_env_num, 'evaluator') + elif task.has_role(task.role.EVALUATOR): + collector_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.collector_env_num, 'collector') + evaluator_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.evaluator_env_num, 'evaluator') + + if task.has_role(task.role.LEARNER): + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) + policy = DQNPolicy(cfg.policy, model=model, enable_field=['learn']) + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + task.use(PeriodicalModelExchanger(model=policy._model, mode="send")) + + # Here is the part of single process pipeline. + task.use(data_pusher(cfg, buffer_)) + task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use(CkptSaver(policy, cfg.exp_name, train_freq=100)) + + elif task.has_role(task.role.COLLECTOR): + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) + policy = DQNPolicy(cfg.policy, model=model, enable_field=['collect']) + collect_model_loader = FileModelLoader(model=model, dirname=model_path) + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + #task.use(PeriodicalModelExchanger(model=policy._model, model_loader=collect_model_loader, mode="update")) + + # Here is the part of single process pipeline. 
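+            # Collector-only pipeline: epsilon-greedy exploration, environment stepping
+            # with StepCollector, and pushing the collected transitions into the local
+            # prioritized buffer.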
+ task.use(eps_greedy_handler(cfg)) + task.use(StepCollector(cfg, policy.collect_mode, collector_env)) + task.use(data_pusher(cfg, buffer_)) + + elif task.has_role(task.role.EVALUATOR): + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) + policy = DQNPolicy(cfg.policy, model=model, enable_field=['eval']) + eval_model_loader = FileModelLoader(model=model, dirname=model_path) + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + #task.use(PeriodicalModelExchanger(model=policy._model, model_loader=eval_model_loader, mode="update")) + + # Here is the part of single process pipeline. + task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(CkptSaver(policy, cfg.exp_name, train_freq=100)) + + task.run() + + +if __name__ == "__main__": + + from ding.config.DQN.gym_lunarlander_v2 import cfg, env + cfg.exp_name = 'LunarLander-v2-Apex-DQN' + cfg.policy.priority = True + cfg.policy.priority_IS_weight = True + cfg = compile_config(cfg, policy=DQNPolicy) + + Parallel.runner( + n_parallel_workers=3, + ports=50515, + protocol="tcp", + topology="mesh", + attach_to=None, + address=None, + labels=None, + node_ids=None, + mq_type="nng", + redis_host=None, + redis_port=None, + startup_interval=1 + )(main, cfg, env) diff --git a/ding/example/apex_dqn_parallel_origin.py b/ding/example/apex_dqn_parallel_origin.py new file mode 100644 index 0000000000..c1d909ec19 --- /dev/null +++ b/ding/example/apex_dqn_parallel_origin.py @@ -0,0 +1,117 @@ +""" +# Example of DQN pipeline + +Use the pipeline on a single process: + +> python3 -u ding/example/dqn.py + +Use the pipeline on multiple processes: + +We surpose there are N processes (workers) = 1 learner + 1 evaluator + (N-2) collectors + +## First Example —— Execute on one machine with multi processes. + +Execute 4 processes with 1 learner + 1 evaluator + 2 collectors +Remember to keep them connected by mesh to ensure that they can exchange information with each other. + +> ditask --package . --main ding.example.dqn.main --parallel-workers 4 --topology mesh + +## Second Example —— Execute on multiple machines. + +1. Execute 1 learner + 1 evaluator on one machine. + +> ditask --package . --main ding.example.dqn.main --parallel-workers 2 --topology mesh --node-ids 0 --ports 50515 + +2. Execute 2 collectors on another machine. (Suppose the ip of the first machine is 127.0.0.1). + Here we use `alone` topology instead of `mesh` because the collectors do not need communicate with each other. + Remember the `node_ids` cannot be duplicated with the learner, evaluator processes. + And remember to set the `ports` (should not conflict with others) and `attach_to` parameters. + The value of the `attach_to` parameter should be obtained from the log of the + process started earlier (e.g. 'NNG listen on tcp://10.0.0.4:50515'). + +> ditask --package . --main ding.example.dqn.main --parallel-workers 2 --topology alone --node-ids 2 \ + --ports 50517 --attach-to tcp://10.0.0.4:50515,tcp://127.0.0.1:50516 + +3. You can repeat step 2 to start more collectors on other machines. 
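+   For instance, a third machine could start one extra collector with a command like the
+   one below (the node id, port and attach-to addresses are placeholders in this sketch;
+   adapt them to your own deployment):
+
+> ditask --package . --main ding.example.dqn.main --parallel-workers 1 --topology alone --node-ids 3 \
+    --ports 50518 --attach-to tcp://10.0.0.4:50515,tcp://127.0.0.1:50516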
+""" +import gym +from ditk import logging +from ding.data.model_loader import FileModelLoader +from ding.data.storage_loader import FileStorageLoader +from ding.model import DQN +from ding.policy import DQNPolicy +from ding.envs import DingEnvWrapper, BaseEnvManagerV2 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger +from ding.utils import set_pkg_seed +from dizoo.classic_control.cartpole.config.cartpole_dqn_config import main_config, create_config + + +def main(): + logging.getLogger().setLevel(logging.INFO) + cfg = compile_config(main_config, create_cfg=create_config, auto=True, save_cfg=task.router.node_id == 0) + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = BaseEnvManagerV2( + env_fn=[lambda: DingEnvWrapper(gym.make("CartPole-v0")) for _ in range(cfg.env.collector_env_num)], + cfg=cfg.env.manager + ) + evaluator_env = BaseEnvManagerV2( + env_fn=[lambda: DingEnvWrapper(gym.make("CartPole-v0")) for _ in range(cfg.env.evaluator_env_num)], + cfg=cfg.env.manager + ) + + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNPolicy(cfg.policy, model=model) + + # Consider the case with multiple processes + if task.router.is_active: + # You can use labels to distinguish between workers with different roles, + # here we use node_id to distinguish. + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + # Sync their context and model between each worker. + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + # Here is the part of single process pipeline. 
+ task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use(StepCollector(cfg, policy.collect_mode, collector_env)) + task.use(data_pusher(cfg, buffer_)) + task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use(CkptSaver(policy, cfg.exp_name, train_freq=100)) + + task.run() + + +if __name__ == "__main__": + + from ding.framework import Parallel + Parallel.runner( + n_parallel_workers=3, + ports=50520, + protocol="tcp", + topology="mesh", + attach_to=None, + address=None, + labels=None, + node_ids=None, + mq_type="nng", + redis_host=None, + redis_port=None, + startup_interval=1 + )(main) diff --git a/ding/example/apex_dqn_per.py b/ding/example/apex_dqn_per.py new file mode 100644 index 0000000000..c51a3e3165 --- /dev/null +++ b/ding/example/apex_dqn_per.py @@ -0,0 +1,76 @@ +import gym +from ditk import logging +from ding.model import DQN +from ding.policy import DQNPolicy +from ding.envs import DingEnvWrapper, BaseEnvManagerV2 +from ding.data import DequeBuffer +from ding.data.buffer.middleware import PriorityExperienceReplay +from ding.config import compile_config +from ding.framework import task +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, nstep_reward_enhancer, priority_calculator +from ding.utils import set_pkg_seed +from dizoo.classic_control.cartpole.config.cartpole_dqn_config import main_config, create_config + + +def main(): + logging.getLogger().setLevel(logging.INFO) + main_config.exp_name = 'cartpole_dqn_per' + main_config.policy.priority = True + main_config.policy.priority_IS_weight = True + cfg = compile_config(main_config, create_cfg=create_config, auto=True) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = BaseEnvManagerV2( + env_fn=[lambda: DingEnvWrapper(gym.make("CartPole-v0")) for _ in range(cfg.env.collector_env_num)], + cfg=cfg.env.manager + ) + evaluator_env = BaseEnvManagerV2( + env_fn=[lambda: DingEnvWrapper(gym.make("CartPole-v0")) for _ in range(cfg.env.evaluator_env_num)], + cfg=cfg.env.manager + ) + + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) + policy = DQNPolicy(cfg.policy, model=model) + + task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use(StepCollector(cfg, policy.collect_mode, collector_env)) + if "nstep" in cfg.policy and cfg.policy.nstep > 1: + task.use(nstep_reward_enhancer(cfg)) + + def dqn_priority_calculation(update_target_model_frequency): + last_update_train_iter = 0 + + def _calculate_priority(data): + nonlocal last_update_train_iter + + if (task.ctx.train_iter - last_update_train_iter) % update_target_model_frequency == 0: + update_target_model = True + else: + update_target_model = False + priority = policy.calculate_priority(data, update_target_model=update_target_model)['priority'] + last_update_train_iter = task.ctx.train_iter + return priority + + return _calculate_priority + + task.use( + priority_calculator( + func_for_priority_calculation=dqn_priority_calculation( + update_target_model_frequency=cfg.policy.learn.target_update_freq + ), + ) + ) + task.use(data_pusher(cfg, buffer_)) + 
task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(CkptSaver(policy, cfg.exp_name, train_freq=100)) + task.run() + + +if __name__ == "__main__": + main() diff --git a/ding/example/apex_dqn_per_parallel.py b/ding/example/apex_dqn_per_parallel.py new file mode 100644 index 0000000000..5f15615e71 --- /dev/null +++ b/ding/example/apex_dqn_per_parallel.py @@ -0,0 +1,130 @@ +import os +from ditk import logging +from ding.model import DQN +from ding.policy import DQNPolicy +from ding.data import DequeBuffer +from ding.data.buffer.middleware import PriorityExperienceReplay +from ding.envs import setup_ding_env_manager +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework import Parallel +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ModelExchanger, ContextExchanger, online_logger, \ + nstep_reward_enhancer, priority_calculator +from ding.utils import set_pkg_seed + + +def main(): + from ding.config.DQN.gym_lunarlander_v2 import cfg, env + + cfg.exp_name = 'LunarLander-v2-Apex-DQN' + cfg.policy.priority = True + cfg.policy.priority_IS_weight = True + cfg = compile_config(cfg, policy=DQNPolicy, save_cfg=task.router.node_id == 0) + + logging.getLogger().setLevel(logging.INFO) + model_path = os.path.join(cfg.exp_name, 'models') + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + + assert task.router.is_active + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + if task.has_role(task.role.COLLECTOR): + collector_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.collector_env_num, 'collector', debug=True) + evaluator_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.evaluator_env_num, 'evaluator', debug=True) + elif task.has_role(task.role.EVALUATOR): + collector_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.collector_env_num, 'collector', debug=True) + evaluator_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.evaluator_env_num, 'evaluator', debug=True) + elif task.has_role(task.role.LEARNER): + collector_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.collector_env_num, 'collector', debug=True) + evaluator_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.evaluator_env_num, 'evaluator', debug=True) + + if task.has_role(task.role.LEARNER): + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) + policy = DQNPolicy(cfg.policy, model=model) + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + elif task.has_role(task.role.COLLECTOR): + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) + policy = DQNPolicy(cfg.policy, model=model) + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + elif task.has_role(task.role.EVALUATOR): + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) + policy = DQNPolicy(cfg.policy, model=model) 
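+            # Keep the evaluator in sync with the learner: ContextExchanger synchronizes
+            # the OnlineRLContext across processes and ModelExchanger receives the latest
+            # model weights from the learner.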
+ task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + # Here is the part of single process pipeline. + task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use(StepCollector(cfg, policy.collect_mode, collector_env)) + print(f"cfg.policy.nstep:{cfg.policy.nstep}") + if "nstep" in cfg.policy and cfg.policy.nstep > 1: + if task.has_role(task.role.COLLECTOR): + task.use(nstep_reward_enhancer(cfg)) + + def dqn_priority_calculation(update_target_model_frequency): + last_update_train_iter = 0 + + def _calculate_priority(data): + nonlocal last_update_train_iter + + if (task.ctx.train_iter - last_update_train_iter) % update_target_model_frequency == 0: + update_target_model = True + else: + update_target_model = False + priority = policy.calculate_priority(data, update_target_model=update_target_model)['priority'] + last_update_train_iter = task.ctx.train_iter + return priority + + return _calculate_priority + + task.use( + priority_calculator( + func_for_priority_calculation=dqn_priority_calculation( + update_target_model_frequency=cfg.policy.learn.target_update_freq + ), + ) + ) + + task.use(data_pusher(cfg, buffer_)) + task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use(CkptSaver(policy, cfg.exp_name, train_freq=100)) + task.run() + + +if __name__ == "__main__": + + Parallel.runner( + n_parallel_workers=3, + ports=50515, + protocol="tcp", + topology="mesh", + attach_to=None, + address=None, + labels=None, + node_ids=None, + mq_type="nng", + redis_host=None, + redis_port=None, + startup_interval=1 + )(main) diff --git a/ding/framework/middleware/__init__.py b/ding/framework/middleware/__init__.py index 6ff67d8301..c956be88ae 100644 --- a/ding/framework/middleware/__init__.py +++ b/ding/framework/middleware/__init__.py @@ -2,5 +2,5 @@ from .collector import StepCollector, EpisodeCollector, PPOFStepCollector from .learner import OffPolicyLearner, HERLearner from .ckpt_handler import CkptSaver -from .distributer import ContextExchanger, ModelExchanger +from .distributer import ContextExchanger, ModelExchanger, PeriodicalModelExchanger from .barrier import Barrier, BarrierRuntime diff --git a/ding/framework/middleware/distributer.py b/ding/framework/middleware/distributer.py index c68a4b808f..ad92322bff 100644 --- a/ding/framework/middleware/distributer.py +++ b/ding/framework/middleware/distributer.py @@ -1,3 +1,4 @@ +import numpy as np from time import sleep, time from dataclasses import fields from typing import TYPE_CHECKING, List, Dict, Any, Optional, Union @@ -287,3 +288,137 @@ def _send_callback(self, storage: Storage): def __del__(self): if self._model_loader: self._model_loader.shutdown() + + +class PeriodicalModelExchanger: + + def __init__( + self, + model: "Module", + mode: str, + period: int = 1, + delay_toleration: float = np.inf, + stale_toleration: int = 1, + event_name: str = "model_exchanger", + model_loader: Optional[ModelLoader] = None + ) -> None: + """ + Overview: + Exchange model between processes, only the learner will send the model, + otherwise the model will only be received. + If you are using a shared model on a single host, there is no need to use this middleware. + Arguments: + - model (:obj:`torch.nn.Module`): Pytorch module. + - model_loader (:obj:`ModelLoader`): Encode model in subprocess. 
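+            The sending side (typically the learner) is constructed with ``mode="send"``,
+            while receiving sides (collector/evaluator) use ``mode="receive"``; see
+            ``ding/framework/middleware/tests/test_distributer.py`` for a usage example.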
+ """ + self._model = model + self._model_loader = model_loader + self._event_name = event_name + self._period = period + self.mode = mode + if self.mode == "receive": + self._id_counter = -1 + self._model_id = -1 + else: + self._id_counter = 0 + self.stale_toleration = stale_toleration + self.model_stale = stale_toleration + self.delay_toleration = delay_toleration + self._state_dict_cache: Optional[Union[object, Storage]] = None + + if self.mode == "receive": + task.on(self._event_name, self._cache_state_dict) + if model_loader: + task.once("finish", lambda _: model_loader.shutdown()) + + def _cache_state_dict(self, msg: Dict[str, Any]): + # msg: Dict {'id':id,'model':state_dict: Union[object, Storage]} + print(f"node_id[{task.router.node_id}] get model msg") + if msg['id'] % self._period == 0: + self._state_dict_cache = msg['model'] + self._id_counter = msg['id'] + self._time = msg['time'] + else: + print(f"node_id[{task.router.node_id}] skip save cache") + + def __new__(cls, *args, **kwargs): + return super(PeriodicalModelExchanger, cls).__new__(cls) + + def __call__(self, ctx: "Context") -> Any: + if self._model_loader: + self._model_loader.start() + + if self.mode == "receive": + print(f"node_id[{task.router.node_id}] try receive model") + if ctx.total_step != 0: # Skip first iteration + self._update_model() + else: + print(f"node_id[{task.router.node_id}] skip first iteration") + elif self.mode == "send": + yield + print(f"node_id[{task.router.node_id}] try send model") + if self._id_counter % self._period == 0: + self._send_model(id=self._id_counter) + print(f"node_id[{task.router.node_id}] model send [{self._id_counter}]") + self._id_counter += 1 + else: + raise NotImplementedError + + def _update_model(self): + start = time() + while True: + if task.finish: + return + if time() - start > 60: + logging.warning("Timeout when waiting for new model! 
Node id: {}".format(task.router.node_id)) + self.model_stale += 1 + break + if self._state_dict_cache is None: + if self.model_stale < self.stale_toleration and time() - self._time < self.delay_toleration: + self.model_stale += 1 + break + else: + sleep(0.01) + else: + #print(f"node_id[{task.router.node_id}] time diff {time()-self._time}") + if self._id_counter > self._model_id and time() - self._time < self.delay_toleration: + print(f"node_id[{task.router.node_id}] begin update") + if isinstance(self._state_dict_cache, Storage) and self._model_loader is not None: + try: + self._model.load_state_dict(self._model_loader.load(self._state_dict_cache)) + self._state_dict_cache = None + self._model_id = self._id_counter + self.model_stale = 1 + break + except FileNotFoundError as e: + logging.warning( + "Model file has been deleted on node {}, maybe you can increase the ttl.".format( + task.router.node_id + ) + ) + self._state_dict_cache = None + continue + else: + self._model.load_state_dict(self._state_dict_cache) + self._state_dict_cache = None + self._model_id = self._id_counter + print(f"node_id[{task.router.node_id}] model updated") + self.model_stale = 1 + break + else: + print(f"node_id[{task.router.node_id}] same id skip update") + self.model_stale += 1 + + def _send_model(self, id: int): + if self._model_loader: + self._model_loader.save(self._send_callback) + else: + task.emit(self._event_name, {'id': id, 'model': self._model.state_dict(), 'time': time()}, only_remote=True) + + def _send_callback(self, storage: Storage): + if task.running: + task.emit(self._event_name, storage, only_remote=True) + + def __del__(self): + if self._model_loader: + self._model_loader.shutdown() diff --git a/ding/framework/middleware/functional/__init__.py b/ding/framework/middleware/functional/__init__.py index c87832a856..9676a23591 100644 --- a/ding/framework/middleware/functional/__init__.py +++ b/ding/framework/middleware/functional/__init__.py @@ -11,5 +11,5 @@ from .explorer import eps_greedy_handler, eps_greedy_masker from .advantage_estimator import gae_estimator, ppof_adv_estimator, pg_estimator from .enhancer import reward_estimator, her_data_enhancer, nstep_reward_enhancer - +from .priority import priority_calculator from .timer import epoch_timer diff --git a/ding/framework/middleware/functional/data_processor.py b/ding/framework/middleware/functional/data_processor.py index 1dc9429458..e9387e196f 100644 --- a/ding/framework/middleware/functional/data_processor.py +++ b/ding/framework/middleware/functional/data_processor.py @@ -169,10 +169,10 @@ def _fetch(ctx: "OnlineRLContext"): index = [d.index for d in buffered_data] meta = [d.meta for d in buffered_data] # such as priority - if isinstance(ctx.train_output, List): - priority = ctx.train_output.pop()['priority'] + if isinstance(ctx.train_output_for_post_process, List): + priority = ctx.train_output_for_post_process.pop()['priority'] else: - priority = ctx.train_output['priority'] + priority = ctx.train_output_for_post_process['priority'] for idx, m, p in zip(index, meta, priority): m['priority'] = p buffer_.update(index=idx, data=None, meta=m) diff --git a/ding/framework/middleware/functional/enhancer.py b/ding/framework/middleware/functional/enhancer.py index b983945791..597a086850 100644 --- a/ding/framework/middleware/functional/enhancer.py +++ b/ding/framework/middleware/functional/enhancer.py @@ -73,7 +73,7 @@ def _fetch_and_enhance(ctx: "OnlineRLContext"): def nstep_reward_enhancer(cfg: EasyDict) -> Callable: - if 
task.router.is_active and not task.has_role(task.role.LEARNER): + if task.router.is_active and (not task.has_role(task.role.LEARNER) and not task.has_role(task.role.COLLECTOR)): return task.void() def _enhance(ctx: "OnlineRLContext"): diff --git a/ding/framework/middleware/functional/logger.py b/ding/framework/middleware/functional/logger.py index 625965a087..fe1f3687ae 100644 --- a/ding/framework/middleware/functional/logger.py +++ b/ding/framework/middleware/functional/logger.py @@ -462,6 +462,7 @@ def wandb_offline_logger( ) first_plot = True + def _vis_dataset(datasetpath: str): try: from sklearn.manifold import TSNE diff --git a/ding/framework/middleware/functional/priority.py b/ding/framework/middleware/functional/priority.py new file mode 100644 index 0000000000..661a24c08a --- /dev/null +++ b/ding/framework/middleware/functional/priority.py @@ -0,0 +1,38 @@ +from typing import TYPE_CHECKING, Optional, Callable, Dict, List, Union +from ditk import logging +from easydict import EasyDict +from matplotlib import pyplot as plt +from matplotlib import animation +import os +import numpy as np +import torch +import wandb +import pickle +import treetensor.numpy as tnp +from ding.policy import Policy +from ding.data import Buffer +from ding.rl_utils import gae, gae_data +from ding.framework import task +from ding.utils.data import ttorch_collate +from ding.torch_utils import to_device + + +def priority_calculator(func_for_priority_calculation: Callable) -> Callable: + """ + Overview: + The middleware that calculates the priority of the collected data. + Arguments: + - func_for_priority_calculation (:obj:`Callable`): The function that calculates the priority of the collected data. + """ + + if task.router.is_active and not task.has_role(task.role.COLLECTOR): + return task.void() + + def _priority_calculator(ctx: "OnlineRLContext") -> None: + + priority = func_for_priority_calculation(ctx.trajectories) + for i in range(len(priority)): + ctx.trajectories[i]['priority'] = priority[i] + print(f"node_id:{task.router.node_id}, priority:{priority}") + + return _priority_calculator diff --git a/ding/framework/middleware/learner.py b/ding/framework/middleware/learner.py index 6fe63ccc79..e582ef22d0 100644 --- a/ding/framework/middleware/learner.py +++ b/ding/framework/middleware/learner.py @@ -43,7 +43,7 @@ def __init__( """ self.cfg = cfg self._fetcher = task.wrap(offpolicy_data_fetcher(cfg, buffer_)) - self._trainer = task.wrap(trainer(cfg, policy)) + self._trainer = task.wrap(trainer(cfg, policy, log_freq=log_freq)) if reward_model is not None: self._reward_estimator = task.wrap(reward_estimator(cfg, reward_model)) else: @@ -63,6 +63,7 @@ def __call__(self, ctx: "OnlineRLContext") -> None: self._reward_estimator(ctx) self._trainer(ctx) train_output_queue.append(ctx.train_output) + ctx.train_output_for_post_process = ctx.train_output ctx.train_output = train_output_queue diff --git a/ding/framework/middleware/tests/test_distributer.py b/ding/framework/middleware/tests/test_distributer.py index e75a753c2f..f7c313c827 100644 --- a/ding/framework/middleware/tests/test_distributer.py +++ b/ding/framework/middleware/tests/test_distributer.py @@ -9,7 +9,7 @@ from ding.data.storage_loader import FileStorageLoader from ding.framework import task from ding.framework.context import OnlineRLContext -from ding.framework.middleware.distributer import ContextExchanger, ModelExchanger +from ding.framework.middleware.distributer import ContextExchanger, ModelExchanger, PeriodicalModelExchanger from 
ding.framework.parallel import Parallel from ding.utils.default_helper import set_pkg_seed from os import path @@ -221,3 +221,50 @@ def pred(ctx): @pytest.mark.tmp def test_model_exchanger_with_model_loader(): Parallel.runner(n_parallel_workers=2, startup_interval=0)(model_exchanger_main_with_model_loader) + + +def periodical_model_exchanger_main(): + with task.start(ctx=OnlineRLContext()): + set_pkg_seed(0, use_cuda=False) + policy = MockPolicy() + X = torch.rand(10) + y = torch.rand(10) + + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + task.use(PeriodicalModelExchanger(policy._model, mode="send", period=3)) + else: + task.add_role(task.role.COLLECTOR) + task.use(PeriodicalModelExchanger(policy._model, mode="receive", period=1, stale_toleration=3)) + if task.has_role(task.role.LEARNER): + + def train(ctx): + policy.train(X, y) + sleep(0.3) + + task.use(train) + else: + y_pred1 = policy.predict(X) + stale = 1 + + def pred(ctx): + nonlocal stale + y_pred2 = policy.predict(X) + stale += 1 + assert stale <= 3 or all(y_pred1 == y_pred2) + if any(y_pred1 != y_pred2): + stale = 1 + sleep(0.3) + + task.use(pred) + task.run(8) + + +@pytest.mark.tmp +def test_periodical_model_exchanger(): + Parallel.runner(n_parallel_workers=2, startup_interval=0)(periodical_model_exchanger_main) + + +if __name__ == "__main__": + test_model_exchanger() + test_periodical_model_exchanger() diff --git a/ding/policy/dqn.py b/ding/policy/dqn.py index dd8d2ea1f3..c039230777 100644 --- a/ding/policy/dqn.py +++ b/ding/policy/dqn.py @@ -421,6 +421,54 @@ def _forward_eval(self, data: Dict[int, Any]) -> Dict[int, Any]: def monitor_vars(self) -> List[str]: return ['cur_lr', 'total_loss', 'q_value'] + def calculate_priority(self, data: Dict[int, Any], update_target_model: bool = False) -> Dict[str, Any]: + """ + Overview: + Calculate priority for replay buffer. + Arguments: + - data (:obj:`Dict[str, Any]`): Dict type data, a batch of data for training. + Returns: + - priority (:obj:`Dict[str, Any]`): Dict type priority data, values are python scalar or a list of scalars. + ArgumentsKeys: + - necessary: ``obs``, ``action``, ``reward``, ``next_obs``, ``done`` + - optional: ``value_gamma`` + ReturnsKeys: + - necessary: ``priority`` + """ + + if update_target_model: + self._target_model.load_state_dict(self._learn_model.state_dict()) + + data = default_preprocess_learn( + data, + use_priority=False, + use_priority_IS_weight=False, + ignore_done=self._cfg.learn.ignore_done, + use_nstep=True + ) + if self._cuda: + data = to_device(data, self._device) + # ==================== + # Q-learning forward + # ==================== + self._learn_model.eval() + self._target_model.eval() + with torch.no_grad(): + # Current q value (main model) + q_value = self._learn_model.forward(data['obs'])['logit'] + # Target q value + target_q_value = self._target_model.forward(data['next_obs'])['logit'] + # Max q value action (main model), i.e. 
Double DQN + target_q_action = self._learn_model.forward(data['next_obs'])['action'] + data_n = q_nstep_td_data( + q_value, target_q_value, data['action'], target_q_action, data['reward'], data['done'], data['weight'] + ) + value_gamma = data.get('value_gamma') + loss, td_error_per_sample = q_nstep_td_error( + data_n, self._gamma, nstep=self._nstep, value_gamma=value_gamma + ) + return {'priority': td_error_per_sample.abs().tolist()} + @POLICY_REGISTRY.register('dqn_stdim') class DQNSTDIMPolicy(DQNPolicy): diff --git a/ding/rl_utils/td.py b/ding/rl_utils/td.py index 9a42630702..8682a740f7 100644 --- a/ding/rl_utils/td.py +++ b/ding/rl_utils/td.py @@ -174,6 +174,7 @@ def view_similar(x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: def nstep_return(data: namedtuple, gamma: Union[float, list], nstep: int, value_gamma: Optional[torch.Tensor] = None): reward, next_value, done = data + print(f"reward.shape:{reward.shape}") assert reward.shape[0] == nstep device = reward.device From c9e736acab129b72bb9bbb69cb7a7cc7efba5867 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 8 Aug 2023 09:09:02 +0000 Subject: [PATCH 180/244] polish code --- ding/framework/middleware/functional/logger.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ding/framework/middleware/functional/logger.py b/ding/framework/middleware/functional/logger.py index 625965a087..fe1f3687ae 100644 --- a/ding/framework/middleware/functional/logger.py +++ b/ding/framework/middleware/functional/logger.py @@ -462,6 +462,7 @@ def wandb_offline_logger( ) first_plot = True + def _vis_dataset(datasetpath: str): try: from sklearn.manifold import TSNE From d204c95e8561b6f3bf8e06e9666f8e5dea7fb7b4 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 8 Aug 2023 17:10:31 +0800 Subject: [PATCH 181/244] polish example --- ding/example/apex_dqn_priority.py | 76 ++++++++++++ ding/example/apex_dqn_priority_parallel.py | 130 +++++++++++++++++++++ 2 files changed, 206 insertions(+) create mode 100644 ding/example/apex_dqn_priority.py create mode 100644 ding/example/apex_dqn_priority_parallel.py diff --git a/ding/example/apex_dqn_priority.py b/ding/example/apex_dqn_priority.py new file mode 100644 index 0000000000..c51a3e3165 --- /dev/null +++ b/ding/example/apex_dqn_priority.py @@ -0,0 +1,76 @@ +import gym +from ditk import logging +from ding.model import DQN +from ding.policy import DQNPolicy +from ding.envs import DingEnvWrapper, BaseEnvManagerV2 +from ding.data import DequeBuffer +from ding.data.buffer.middleware import PriorityExperienceReplay +from ding.config import compile_config +from ding.framework import task +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, nstep_reward_enhancer, priority_calculator +from ding.utils import set_pkg_seed +from dizoo.classic_control.cartpole.config.cartpole_dqn_config import main_config, create_config + + +def main(): + logging.getLogger().setLevel(logging.INFO) + main_config.exp_name = 'cartpole_dqn_per' + main_config.policy.priority = True + main_config.policy.priority_IS_weight = True + cfg = compile_config(main_config, create_cfg=create_config, auto=True) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = BaseEnvManagerV2( + env_fn=[lambda: DingEnvWrapper(gym.make("CartPole-v0")) for _ in range(cfg.env.collector_env_num)], + cfg=cfg.env.manager + ) + evaluator_env = BaseEnvManagerV2( + env_fn=[lambda: 
DingEnvWrapper(gym.make("CartPole-v0")) for _ in range(cfg.env.evaluator_env_num)], + cfg=cfg.env.manager + ) + + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) + policy = DQNPolicy(cfg.policy, model=model) + + task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use(StepCollector(cfg, policy.collect_mode, collector_env)) + if "nstep" in cfg.policy and cfg.policy.nstep > 1: + task.use(nstep_reward_enhancer(cfg)) + + def dqn_priority_calculation(update_target_model_frequency): + last_update_train_iter = 0 + + def _calculate_priority(data): + nonlocal last_update_train_iter + + if (task.ctx.train_iter - last_update_train_iter) % update_target_model_frequency == 0: + update_target_model = True + else: + update_target_model = False + priority = policy.calculate_priority(data, update_target_model=update_target_model)['priority'] + last_update_train_iter = task.ctx.train_iter + return priority + + return _calculate_priority + + task.use( + priority_calculator( + func_for_priority_calculation=dqn_priority_calculation( + update_target_model_frequency=cfg.policy.learn.target_update_freq + ), + ) + ) + task.use(data_pusher(cfg, buffer_)) + task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(CkptSaver(policy, cfg.exp_name, train_freq=100)) + task.run() + + +if __name__ == "__main__": + main() diff --git a/ding/example/apex_dqn_priority_parallel.py b/ding/example/apex_dqn_priority_parallel.py new file mode 100644 index 0000000000..5f15615e71 --- /dev/null +++ b/ding/example/apex_dqn_priority_parallel.py @@ -0,0 +1,130 @@ +import os +from ditk import logging +from ding.model import DQN +from ding.policy import DQNPolicy +from ding.data import DequeBuffer +from ding.data.buffer.middleware import PriorityExperienceReplay +from ding.envs import setup_ding_env_manager +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework import Parallel +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ModelExchanger, ContextExchanger, online_logger, \ + nstep_reward_enhancer, priority_calculator +from ding.utils import set_pkg_seed + + +def main(): + from ding.config.DQN.gym_lunarlander_v2 import cfg, env + + cfg.exp_name = 'LunarLander-v2-Apex-DQN' + cfg.policy.priority = True + cfg.policy.priority_IS_weight = True + cfg = compile_config(cfg, policy=DQNPolicy, save_cfg=task.router.node_id == 0) + + logging.getLogger().setLevel(logging.INFO) + model_path = os.path.join(cfg.exp_name, 'models') + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + + assert task.router.is_active + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + if task.has_role(task.role.COLLECTOR): + collector_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.collector_env_num, 'collector', debug=True) + evaluator_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.evaluator_env_num, 'evaluator', debug=True) + elif task.has_role(task.role.EVALUATOR): + collector_env = 
setup_ding_env_manager(env(cfg=cfg.env), cfg.env.collector_env_num, 'collector', debug=True) + evaluator_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.evaluator_env_num, 'evaluator', debug=True) + elif task.has_role(task.role.LEARNER): + collector_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.collector_env_num, 'collector', debug=True) + evaluator_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.evaluator_env_num, 'evaluator', debug=True) + + if task.has_role(task.role.LEARNER): + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) + policy = DQNPolicy(cfg.policy, model=model) + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + elif task.has_role(task.role.COLLECTOR): + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) + policy = DQNPolicy(cfg.policy, model=model) + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + elif task.has_role(task.role.EVALUATOR): + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) + policy = DQNPolicy(cfg.policy, model=model) + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + # Here is the part of single process pipeline. + task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use(StepCollector(cfg, policy.collect_mode, collector_env)) + print(f"cfg.policy.nstep:{cfg.policy.nstep}") + if "nstep" in cfg.policy and cfg.policy.nstep > 1: + if task.has_role(task.role.COLLECTOR): + task.use(nstep_reward_enhancer(cfg)) + + def dqn_priority_calculation(update_target_model_frequency): + last_update_train_iter = 0 + + def _calculate_priority(data): + nonlocal last_update_train_iter + + if (task.ctx.train_iter - last_update_train_iter) % update_target_model_frequency == 0: + update_target_model = True + else: + update_target_model = False + priority = policy.calculate_priority(data, update_target_model=update_target_model)['priority'] + last_update_train_iter = task.ctx.train_iter + return priority + + return _calculate_priority + + task.use( + priority_calculator( + func_for_priority_calculation=dqn_priority_calculation( + update_target_model_frequency=cfg.policy.learn.target_update_freq + ), + ) + ) + + task.use(data_pusher(cfg, buffer_)) + task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use(CkptSaver(policy, cfg.exp_name, train_freq=100)) + task.run() + + +if __name__ == "__main__": + + Parallel.runner( + n_parallel_workers=3, + ports=50515, + protocol="tcp", + topology="mesh", + attach_to=None, + address=None, + labels=None, + node_ids=None, + mq_type="nng", + redis_host=None, + redis_port=None, + startup_interval=1 + )(main) From eea55731cd85bc4b35b3e2f1c52902e35d2457ad Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 8 Aug 2023 17:10:57 +0800 Subject: [PATCH 182/244] polish example --- ding/example/apex_dqn_per.py | 76 --------------- ding/example/apex_dqn_per_parallel.py | 130 -------------------------- 2 files changed, 206 deletions(-) delete mode 100644 ding/example/apex_dqn_per.py delete mode 100644 ding/example/apex_dqn_per_parallel.py diff --git 
a/ding/example/apex_dqn_per.py b/ding/example/apex_dqn_per.py deleted file mode 100644 index c51a3e3165..0000000000 --- a/ding/example/apex_dqn_per.py +++ /dev/null @@ -1,76 +0,0 @@ -import gym -from ditk import logging -from ding.model import DQN -from ding.policy import DQNPolicy -from ding.envs import DingEnvWrapper, BaseEnvManagerV2 -from ding.data import DequeBuffer -from ding.data.buffer.middleware import PriorityExperienceReplay -from ding.config import compile_config -from ding.framework import task -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, nstep_reward_enhancer, priority_calculator -from ding.utils import set_pkg_seed -from dizoo.classic_control.cartpole.config.cartpole_dqn_config import main_config, create_config - - -def main(): - logging.getLogger().setLevel(logging.INFO) - main_config.exp_name = 'cartpole_dqn_per' - main_config.policy.priority = True - main_config.policy.priority_IS_weight = True - cfg = compile_config(main_config, create_cfg=create_config, auto=True) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = BaseEnvManagerV2( - env_fn=[lambda: DingEnvWrapper(gym.make("CartPole-v0")) for _ in range(cfg.env.collector_env_num)], - cfg=cfg.env.manager - ) - evaluator_env = BaseEnvManagerV2( - env_fn=[lambda: DingEnvWrapper(gym.make("CartPole-v0")) for _ in range(cfg.env.evaluator_env_num)], - cfg=cfg.env.manager - ) - - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) - policy = DQNPolicy(cfg.policy, model=model) - - task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use(StepCollector(cfg, policy.collect_mode, collector_env)) - if "nstep" in cfg.policy and cfg.policy.nstep > 1: - task.use(nstep_reward_enhancer(cfg)) - - def dqn_priority_calculation(update_target_model_frequency): - last_update_train_iter = 0 - - def _calculate_priority(data): - nonlocal last_update_train_iter - - if (task.ctx.train_iter - last_update_train_iter) % update_target_model_frequency == 0: - update_target_model = True - else: - update_target_model = False - priority = policy.calculate_priority(data, update_target_model=update_target_model)['priority'] - last_update_train_iter = task.ctx.train_iter - return priority - - return _calculate_priority - - task.use( - priority_calculator( - func_for_priority_calculation=dqn_priority_calculation( - update_target_model_frequency=cfg.policy.learn.target_update_freq - ), - ) - ) - task.use(data_pusher(cfg, buffer_)) - task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(CkptSaver(policy, cfg.exp_name, train_freq=100)) - task.run() - - -if __name__ == "__main__": - main() diff --git a/ding/example/apex_dqn_per_parallel.py b/ding/example/apex_dqn_per_parallel.py deleted file mode 100644 index 5f15615e71..0000000000 --- a/ding/example/apex_dqn_per_parallel.py +++ /dev/null @@ -1,130 +0,0 @@ -import os -from ditk import logging -from ding.model import DQN -from ding.policy import DQNPolicy -from ding.data import DequeBuffer -from ding.data.buffer.middleware import PriorityExperienceReplay -from ding.envs import setup_ding_env_manager -from ding.config import compile_config -from ding.framework import task, ding_init -from 
ding.framework import Parallel -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ModelExchanger, ContextExchanger, online_logger, \ - nstep_reward_enhancer, priority_calculator -from ding.utils import set_pkg_seed - - -def main(): - from ding.config.DQN.gym_lunarlander_v2 import cfg, env - - cfg.exp_name = 'LunarLander-v2-Apex-DQN' - cfg.policy.priority = True - cfg.policy.priority_IS_weight = True - cfg = compile_config(cfg, policy=DQNPolicy, save_cfg=task.router.node_id == 0) - - logging.getLogger().setLevel(logging.INFO) - model_path = os.path.join(cfg.exp_name, 'models') - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - - assert task.router.is_active - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - if task.has_role(task.role.COLLECTOR): - collector_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.collector_env_num, 'collector', debug=True) - evaluator_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.evaluator_env_num, 'evaluator', debug=True) - elif task.has_role(task.role.EVALUATOR): - collector_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.collector_env_num, 'collector', debug=True) - evaluator_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.evaluator_env_num, 'evaluator', debug=True) - elif task.has_role(task.role.LEARNER): - collector_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.collector_env_num, 'collector', debug=True) - evaluator_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.evaluator_env_num, 'evaluator', debug=True) - - if task.has_role(task.role.LEARNER): - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) - policy = DQNPolicy(cfg.policy, model=model) - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - elif task.has_role(task.role.COLLECTOR): - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) - policy = DQNPolicy(cfg.policy, model=model) - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - elif task.has_role(task.role.EVALUATOR): - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) - policy = DQNPolicy(cfg.policy, model=model) - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - # Here is the part of single process pipeline. 
- task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use(StepCollector(cfg, policy.collect_mode, collector_env)) - print(f"cfg.policy.nstep:{cfg.policy.nstep}") - if "nstep" in cfg.policy and cfg.policy.nstep > 1: - if task.has_role(task.role.COLLECTOR): - task.use(nstep_reward_enhancer(cfg)) - - def dqn_priority_calculation(update_target_model_frequency): - last_update_train_iter = 0 - - def _calculate_priority(data): - nonlocal last_update_train_iter - - if (task.ctx.train_iter - last_update_train_iter) % update_target_model_frequency == 0: - update_target_model = True - else: - update_target_model = False - priority = policy.calculate_priority(data, update_target_model=update_target_model)['priority'] - last_update_train_iter = task.ctx.train_iter - return priority - - return _calculate_priority - - task.use( - priority_calculator( - func_for_priority_calculation=dqn_priority_calculation( - update_target_model_frequency=cfg.policy.learn.target_update_freq - ), - ) - ) - - task.use(data_pusher(cfg, buffer_)) - task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use(CkptSaver(policy, cfg.exp_name, train_freq=100)) - task.run() - - -if __name__ == "__main__": - - Parallel.runner( - n_parallel_workers=3, - ports=50515, - protocol="tcp", - topology="mesh", - attach_to=None, - address=None, - labels=None, - node_ids=None, - mq_type="nng", - redis_host=None, - redis_port=None, - startup_interval=1 - )(main) From 0f807d8f8d627ef8cb3e36f6cbdfcd67e7738236 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 8 Aug 2023 19:06:22 +0800 Subject: [PATCH 183/244] add wandb logger --- ding/example/apex_dqn.py | 2 +- ding/example/apex_dqn_priority.py | 2 +- ding/example/apex_dqn_priority_parallel.py | 2 +- .../apex_dqn_priority_parallel_wandb.py | 152 ++++++++++++++++++ .../middleware/functional/priority.py | 7 +- ding/rl_utils/td.py | 1 - 6 files changed, 158 insertions(+), 8 deletions(-) create mode 100644 ding/example/apex_dqn_priority_parallel_wandb.py diff --git a/ding/example/apex_dqn.py b/ding/example/apex_dqn.py index 6fbd43557d..8067ceb14e 100644 --- a/ding/example/apex_dqn.py +++ b/ding/example/apex_dqn.py @@ -104,7 +104,7 @@ def _calculate_priority(data): task.use( priority_calculator( - func_for_priority_calculation=dqn_priority_calculation( + priority_calculation_fn=dqn_priority_calculation( update_target_model_frequency=cfg.policy.learn.target_update_freq ), ) diff --git a/ding/example/apex_dqn_priority.py b/ding/example/apex_dqn_priority.py index c51a3e3165..1101b1e9eb 100644 --- a/ding/example/apex_dqn_priority.py +++ b/ding/example/apex_dqn_priority.py @@ -61,7 +61,7 @@ def _calculate_priority(data): task.use( priority_calculator( - func_for_priority_calculation=dqn_priority_calculation( + priority_calculation_fn=dqn_priority_calculation( update_target_model_frequency=cfg.policy.learn.target_update_freq ), ) diff --git a/ding/example/apex_dqn_priority_parallel.py b/ding/example/apex_dqn_priority_parallel.py index 5f15615e71..f306f95083 100644 --- a/ding/example/apex_dqn_priority_parallel.py +++ b/ding/example/apex_dqn_priority_parallel.py @@ -99,7 +99,7 @@ def _calculate_priority(data): task.use( priority_calculator( - func_for_priority_calculation=dqn_priority_calculation( + priority_calculation_fn=dqn_priority_calculation( update_target_model_frequency=cfg.policy.learn.target_update_freq ), ) diff --git a/ding/example/apex_dqn_priority_parallel_wandb.py 
b/ding/example/apex_dqn_priority_parallel_wandb.py new file mode 100644 index 0000000000..858091e328 --- /dev/null +++ b/ding/example/apex_dqn_priority_parallel_wandb.py @@ -0,0 +1,152 @@ +import os +from ditk import logging +from ding.model import DQN +from ding.policy import DQNPolicy +from ding.data import DequeBuffer +from ding.data.buffer.middleware import PriorityExperienceReplay +from ding.envs import setup_ding_env_manager +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework import Parallel +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ModelExchanger, ContextExchanger, online_logger, \ + nstep_reward_enhancer, priority_calculator, wandb_online_logger +from ding.utils import set_pkg_seed + + +def main(): + from ding.config.DQN.gym_lunarlander_v2 import cfg, env + + cfg.exp_name = 'LunarLander-v2-Apex-DQN' + cfg.policy.priority = True + cfg.policy.priority_IS_weight = True + cfg = compile_config(cfg, policy=DQNPolicy, save_cfg=task.router.node_id == 0) + + logging.getLogger().setLevel(logging.INFO) + model_path = os.path.join(cfg.exp_name, 'models') + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + + assert task.router.is_active + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + if task.has_role(task.role.COLLECTOR): + collector_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.collector_env_num, 'collector', debug=True) + evaluator_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.evaluator_env_num, 'evaluator', debug=True) + elif task.has_role(task.role.EVALUATOR): + collector_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.collector_env_num, 'collector', debug=True) + evaluator_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.evaluator_env_num, 'evaluator', debug=True) + elif task.has_role(task.role.LEARNER): + collector_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.collector_env_num, 'collector', debug=True) + evaluator_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.evaluator_env_num, 'evaluator', debug=True) + + if task.has_role(task.role.LEARNER): + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) + policy = DQNPolicy(cfg.policy, model=model) + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + elif task.has_role(task.role.COLLECTOR): + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) + policy = DQNPolicy(cfg.policy, model=model) + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + elif task.has_role(task.role.EVALUATOR): + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) + policy = DQNPolicy(cfg.policy, model=model) + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + # Here is the part of single process pipeline. 
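+        # Shared single-process pipeline: evaluation rendering and the random warm-up
+        # collection are enabled only when the corresponding config fields exist, and
+        # wandb_online_logger records the policy's monitor variables alongside online_logger.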
+ task.use( + interaction_evaluator( + cfg, policy.eval_mode, evaluator_env, render=cfg.policy.eval.render \ + if hasattr(cfg.policy.eval, "render") else False + ) + ) + task.use(eps_greedy_handler(cfg)) + task.use( + StepCollector( + cfg, + policy.collect_mode, + collector_env, + random_collect_size=cfg.policy.random_collect_size \ + if hasattr(cfg.policy, 'random_collect_size') else 0, + ) + ) + print(f"cfg.policy.nstep:{cfg.policy.nstep}") + if "nstep" in cfg.policy and cfg.policy.nstep > 1: + if task.has_role(task.role.COLLECTOR): + task.use(nstep_reward_enhancer(cfg)) + + def dqn_priority_calculation(update_target_model_frequency): + last_update_train_iter = 0 + + def _calculate_priority(data): + nonlocal last_update_train_iter + + if (task.ctx.train_iter - last_update_train_iter) % update_target_model_frequency == 0: + update_target_model = True + else: + update_target_model = False + priority = policy.calculate_priority(data, update_target_model=update_target_model)['priority'] + last_update_train_iter = task.ctx.train_iter + return priority + + return _calculate_priority + + task.use( + priority_calculator( + priority_calculation_fn=dqn_priority_calculation( + update_target_model_frequency=cfg.policy.learn.target_update_freq + ), + ) + ) + + task.use(data_pusher(cfg, buffer_)) + task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use( + wandb_online_logger( + metric_list=policy.monitor_vars(), + model=policy._model, + anonymous=True, + project_name=cfg.exp_name, + wandb_sweep=False, + ) + ) + task.use(online_logger(train_show_freq=10)) + task.use(CkptSaver(policy, cfg.exp_name, train_freq=100)) + task.run() + + +if __name__ == "__main__": + + Parallel.runner( + n_parallel_workers=3, + ports=50515, + protocol="tcp", + topology="mesh", + attach_to=None, + address=None, + labels=None, + node_ids=None, + mq_type="nng", + redis_host=None, + redis_port=None, + startup_interval=1 + )(main) diff --git a/ding/framework/middleware/functional/priority.py b/ding/framework/middleware/functional/priority.py index 661a24c08a..6df347fea0 100644 --- a/ding/framework/middleware/functional/priority.py +++ b/ding/framework/middleware/functional/priority.py @@ -17,12 +17,12 @@ from ding.torch_utils import to_device -def priority_calculator(func_for_priority_calculation: Callable) -> Callable: +def priority_calculator(priority_calculation_fn: Callable) -> Callable: """ Overview: The middleware that calculates the priority of the collected data. Arguments: - - func_for_priority_calculation (:obj:`Callable`): The function that calculates the priority of the collected data. + - priority_calculation_fn (:obj:`Callable`): The function that calculates the priority of the collected data. 
""" if task.router.is_active and not task.has_role(task.role.COLLECTOR): @@ -30,9 +30,8 @@ def priority_calculator(func_for_priority_calculation: Callable) -> Callable: def _priority_calculator(ctx: "OnlineRLContext") -> None: - priority = func_for_priority_calculation(ctx.trajectories) + priority = priority_calculation_fn(ctx.trajectories) for i in range(len(priority)): ctx.trajectories[i]['priority'] = priority[i] - print(f"node_id:{task.router.node_id}, priority:{priority}") return _priority_calculator diff --git a/ding/rl_utils/td.py b/ding/rl_utils/td.py index 8682a740f7..9a42630702 100644 --- a/ding/rl_utils/td.py +++ b/ding/rl_utils/td.py @@ -174,7 +174,6 @@ def view_similar(x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: def nstep_return(data: namedtuple, gamma: Union[float, list], nstep: int, value_gamma: Optional[torch.Tensor] = None): reward, next_value, done = data - print(f"reward.shape:{reward.shape}") assert reward.shape[0] == nstep device = reward.device From 984c8bac67e74631282854d7c12a8ef39fcb7a45 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 10 Aug 2023 19:29:32 +0800 Subject: [PATCH 184/244] polish code --- ding/framework/middleware/distributer.py | 23 ++++--------- .../middleware/functional/priority.py | 19 ++--------- .../middleware/tests/test_distributer.py | 8 +++-- .../middleware/tests/test_priority.py | 33 +++++++++++++++++++ 4 files changed, 49 insertions(+), 34 deletions(-) create mode 100644 ding/framework/middleware/tests/test_priority.py diff --git a/ding/framework/middleware/distributer.py b/ding/framework/middleware/distributer.py index ad92322bff..b0f3502455 100644 --- a/ding/framework/middleware/distributer.py +++ b/ding/framework/middleware/distributer.py @@ -304,12 +304,16 @@ def __init__( ) -> None: """ Overview: - Exchange model between processes, only the learner will send the model, - otherwise the model will only be received. + Exchange model between processes, set the mode to "send" or "receive" to specify the role of the process. If you are using a shared model on a single host, there is no need to use this middleware. Arguments: - model (:obj:`torch.nn.Module`): Pytorch module. - - model_loader (:obj:`ModelLoader`): Encode model in subprocess. + - mode (:obj:`str`): "send" or "receive". + - period (:obj:`int`): The period of model exchange. + - delay_toleration (:obj:`float`): The permitted time interval for receiving model after being sent. + - stale_toleration (:obj:`int`): The permitted number of iterations for receiving model after being sent. + - event_name (:obj:`str`): The event name for model exchange. + - model_loader (:obj:`ModelLoader`): ModelLoader for this PeriodicalModelExchanger to use. 
""" self._model = model self._model_loader = model_loader @@ -332,14 +336,10 @@ def __init__( task.once("finish", lambda _: model_loader.shutdown()) def _cache_state_dict(self, msg: Dict[str, Any]): - # msg: Dict {'id':id,'model':state_dict: Union[object, Storage]} - print(f"node_id[{task.router.node_id}] get model msg") if msg['id'] % self._period == 0: self._state_dict_cache = msg['model'] self._id_counter = msg['id'] self._time = msg['time'] - else: - print(f"node_id[{task.router.node_id}] skip save cache") def __new__(cls, *args, **kwargs): return super(PeriodicalModelExchanger, cls).__new__(cls) @@ -349,17 +349,12 @@ def __call__(self, ctx: "Context") -> Any: self._model_loader.start() if self.mode == "receive": - print(f"node_id[{task.router.node_id}] try receive model") if ctx.total_step != 0: # Skip first iteration self._update_model() - else: - print(f"node_id[{task.router.node_id}] skip first iteration") elif self.mode == "send": yield - print(f"node_id[{task.router.node_id}] try send model") if self._id_counter % self._period == 0: self._send_model(id=self._id_counter) - print(f"node_id[{task.router.node_id}] model send [{self._id_counter}]") self._id_counter += 1 else: raise NotImplementedError @@ -380,9 +375,7 @@ def _update_model(self): else: sleep(0.01) else: - #print(f"node_id[{task.router.node_id}] time diff {time()-self._time}") if self._id_counter > self._model_id and time() - self._time < self.delay_toleration: - print(f"node_id[{task.router.node_id}] begin update") if isinstance(self._state_dict_cache, Storage) and self._model_loader is not None: try: self._model.load_state_dict(self._model_loader.load(self._state_dict_cache)) @@ -402,11 +395,9 @@ def _update_model(self): self._model.load_state_dict(self._state_dict_cache) self._state_dict_cache = None self._model_id = self._id_counter - print(f"node_id[{task.router.node_id}] model updated") self.model_stale = 1 break else: - print(f"node_id[{task.router.node_id}] same id skip update") self.model_stale += 1 def _send_model(self, id: int): diff --git a/ding/framework/middleware/functional/priority.py b/ding/framework/middleware/functional/priority.py index 6df347fea0..e62afbb5c9 100644 --- a/ding/framework/middleware/functional/priority.py +++ b/ding/framework/middleware/functional/priority.py @@ -1,20 +1,7 @@ -from typing import TYPE_CHECKING, Optional, Callable, Dict, List, Union -from ditk import logging -from easydict import EasyDict -from matplotlib import pyplot as plt -from matplotlib import animation -import os -import numpy as np -import torch -import wandb -import pickle -import treetensor.numpy as tnp -from ding.policy import Policy -from ding.data import Buffer -from ding.rl_utils import gae, gae_data +from typing import TYPE_CHECKING, Callable from ding.framework import task -from ding.utils.data import ttorch_collate -from ding.torch_utils import to_device +if TYPE_CHECKING: + from ding.framework import OnlineRLContext def priority_calculator(priority_calculation_fn: Callable) -> Callable: diff --git a/ding/framework/middleware/tests/test_distributer.py b/ding/framework/middleware/tests/test_distributer.py index f7c313c827..fd6cb75f4e 100644 --- a/ding/framework/middleware/tests/test_distributer.py +++ b/ding/framework/middleware/tests/test_distributer.py @@ -235,7 +235,11 @@ def periodical_model_exchanger_main(): task.use(PeriodicalModelExchanger(policy._model, mode="send", period=3)) else: task.add_role(task.role.COLLECTOR) - task.use(PeriodicalModelExchanger(policy._model, mode="receive", period=1, 
stale_toleration=3)) + task.use( + PeriodicalModelExchanger( + policy._model, mode="receive", period=1, stale_toleration=3, delay_toleration=1.0 + ) + ) if task.has_role(task.role.LEARNER): def train(ctx): @@ -266,5 +270,5 @@ def test_periodical_model_exchanger(): if __name__ == "__main__": - test_model_exchanger() + #test_model_exchanger() test_periodical_model_exchanger() diff --git a/ding/framework/middleware/tests/test_priority.py b/ding/framework/middleware/tests/test_priority.py new file mode 100644 index 0000000000..19261213d6 --- /dev/null +++ b/ding/framework/middleware/tests/test_priority.py @@ -0,0 +1,33 @@ +#unittest for priority_calculator + +import unittest +import pytest +import numpy as np +from unittest.mock import Mock, patch +from ding.framework import OnlineRLContext, OfflineRLContext +from ding.framework import task, Parallel +from ding.framework.middleware.functional import priority_calculator + + +class MockPolicy(Mock): + + def priority_fun(self, data): + return np.random.rand(len(data)) + + +@pytest.mark.unittest +def test_priority_calculator(): + policy = MockPolicy() + ctx = OnlineRLContext() + ctx.trajectories = [ + { + 'obs': np.random.rand(2, 2), + 'next_obs': np.random.rand(2, 2), + 'reward': np.random.rand(1), + 'info': {} + } for _ in range(10) + ] + priority_calculator_middleware = priority_calculator(priority_calculation_fn=policy.priority_fun) + priority_calculator_middleware(ctx) + assert len(ctx.trajectories) == 10 + assert all([isinstance(traj['priority'], float) for traj in ctx.trajectories]) From 584bd7a4e49f46152a34f90c731bee10c2fd1a4c Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 11 Aug 2023 19:15:04 +0800 Subject: [PATCH 185/244] polish code --- ding/data/buffer/tests/test_middleware.py | 32 +++++++++++++++++++++++ ding/framework/middleware/distributer.py | 30 ++++++++++----------- 2 files changed, 47 insertions(+), 15 deletions(-) diff --git a/ding/data/buffer/tests/test_middleware.py b/ding/data/buffer/tests/test_middleware.py index 9e7ca64fe0..cc19866ee3 100644 --- a/ding/data/buffer/tests/test_middleware.py +++ b/ding/data/buffer/tests/test_middleware.py @@ -110,6 +110,38 @@ def test_priority(): assert buffer.count() == 0 +@pytest.mark.unittest +def test_priority_from_collector(): + N = 5 + buffer = DequeBuffer(size=10) + buffer.use(PriorityExperienceReplay(buffer, IS_weight=True)) + for _ in range(N): + tmp_data = get_data() + tmp_data['priority'] = 2.0 + buffer.push(get_data()) + assert buffer.count() == N + for _ in range(N): + tmp_data = get_data() + tmp_data['priority'] = 2.0 + buffer.push(get_data()) + assert buffer.count() == N + N + data = buffer.sample(size=N + N, replace=False) + assert len(data) == N + N + for item in data: + meta = item.meta + assert set(meta.keys()).issuperset(set(['priority', 'priority_idx', 'priority_IS'])) + meta['priority'] = 3.0 + for item in data: + data, index, meta = item.data, item.index, item.meta + buffer.update(index, data, meta) + data = buffer.sample(size=1) + assert data[0].meta['priority'] == 3.0 + buffer.delete(data[0].index) + assert buffer.count() == N + N - 1 + buffer.clear() + assert buffer.count() == 0 + + @pytest.mark.unittest def test_padding(): buffer = DequeBuffer(size=10) diff --git a/ding/framework/middleware/distributer.py b/ding/framework/middleware/distributer.py index b0f3502455..d2f5e36402 100644 --- a/ding/framework/middleware/distributer.py +++ b/ding/framework/middleware/distributer.py @@ -319,18 +319,18 @@ def __init__( self._model_loader = model_loader self._event_name = 
event_name self._period = period - self.mode = mode - if self.mode == "receive": + self._mode = mode + if self._mode == "receive": self._id_counter = -1 self._model_id = -1 else: self._id_counter = 0 - self.stale_toleration = stale_toleration - self.model_stale = stale_toleration - self.delay_toleration = delay_toleration + self._stale_toleration = stale_toleration + self._model_stale = stale_toleration + self._delay_toleration = delay_toleration self._state_dict_cache: Optional[Union[object, Storage]] = None - if self.mode == "receive": + if self._mode == "receive": task.on(self._event_name, self._cache_state_dict) if model_loader: task.once("finish", lambda _: model_loader.shutdown()) @@ -348,10 +348,10 @@ def __call__(self, ctx: "Context") -> Any: if self._model_loader: self._model_loader.start() - if self.mode == "receive": + if self._mode == "receive": if ctx.total_step != 0: # Skip first iteration self._update_model() - elif self.mode == "send": + elif self._mode == "send": yield if self._id_counter % self._period == 0: self._send_model(id=self._id_counter) @@ -366,22 +366,22 @@ def _update_model(self): return if time() - start > 60: logging.warning("Timeout when waiting for new model! Node id: {}".format(task.router.node_id)) - self.model_stale += 1 + self._model_stale += 1 break if self._state_dict_cache is None: - if self.model_stale < self.stale_toleration and time() - self._time < self.delay_toleration: - self.model_stale += 1 + if self._model_stale < self._stale_toleration and time() - self._time < self._delay_toleration: + self._model_stale += 1 break else: sleep(0.01) else: - if self._id_counter > self._model_id and time() - self._time < self.delay_toleration: + if self._id_counter > self._model_id and time() - self._time < self._delay_toleration: if isinstance(self._state_dict_cache, Storage) and self._model_loader is not None: try: self._model.load_state_dict(self._model_loader.load(self._state_dict_cache)) self._state_dict_cache = None self._model_id = self._id_counter - self.model_stale = 1 + self._model_stale = 1 break except FileNotFoundError as e: logging.warning( @@ -395,10 +395,10 @@ def _update_model(self): self._model.load_state_dict(self._state_dict_cache) self._state_dict_cache = None self._model_id = self._id_counter - self.model_stale = 1 + self._model_stale = 1 break else: - self.model_stale += 1 + self._model_stale += 1 def _send_model(self, id: int): if self._model_loader: From 948c99b28b12473cb7a724effd995b107986c1d6 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 15 Aug 2023 16:20:08 +0800 Subject: [PATCH 186/244] polish code --- ding/entry/utils.py | 8 +- ding/envs/env_manager/envpool_env_manager.py | 189 +++++++++++++++++- ding/example/dqn_envpool.py | 100 +++++++++ .../collector/sample_serial_collector.py | 2 +- .../serial/pong/pong_dqn_envpool_config.py | 3 +- .../spaceinvaders_dqn_envpool_config.py | 63 ++++++ 6 files changed, 354 insertions(+), 11 deletions(-) create mode 100644 ding/example/dqn_envpool.py create mode 100644 dizoo/atari/config/serial/spaceinvaders/spaceinvaders_dqn_envpool_config.py diff --git a/ding/entry/utils.py b/ding/entry/utils.py index bbfbaa83bd..a3b66bfe70 100644 --- a/ding/entry/utils.py +++ b/ding/entry/utils.py @@ -1,4 +1,4 @@ -from typing import Optional, Callable, List, Any +from typing import Optional, Callable, List, Any, Dict from ding.policy import PolicyFactory from ding.worker import IMetric, MetricSerialEvaluator @@ -46,7 +46,8 @@ def random_collect( collector_env: 'BaseEnvManager', # noqa commander: 
'BaseSerialCommander', # noqa replay_buffer: 'IBuffer', # noqa - postprocess_data_fn: Optional[Callable] = None + postprocess_data_fn: Optional[Callable] = None, + collect_kwargs: Optional[Dict] = None, ) -> None: # noqa assert policy_cfg.random_collect_size > 0 if policy_cfg.get('transition_with_policy_data', False): @@ -55,7 +56,8 @@ def random_collect( action_space = collector_env.action_space random_policy = PolicyFactory.get_random_policy(policy.collect_mode, action_space=action_space) collector.reset_policy(random_policy) - collect_kwargs = commander.step() + if collect_kwargs is None: + collect_kwargs = commander.step() if policy_cfg.collect.collector.type == 'episode': new_data = collector.collect(n_episode=policy_cfg.random_collect_size, policy_kwargs=collect_kwargs) else: diff --git a/ding/envs/env_manager/envpool_env_manager.py b/ding/envs/env_manager/envpool_env_manager.py index a8d1a4ae03..9fc730d5aa 100644 --- a/ding/envs/env_manager/envpool_env_manager.py +++ b/ding/envs/env_manager/envpool_env_manager.py @@ -2,7 +2,10 @@ from easydict import EasyDict from copy import deepcopy import numpy as np +import torch +import treetensor.numpy as tnp from collections import namedtuple +import enum from typing import Any, Union, List, Tuple, Dict, Callable, Optional from ditk import logging try: @@ -17,8 +20,18 @@ from ding.torch_utils import to_ndarray +class EnvState(enum.IntEnum): + VOID = 0 + INIT = 1 + RUN = 2 + RESET = 3 + DONE = 4 + ERROR = 5 + NEED_RESET = 6 + + @ENV_MANAGER_REGISTRY.register('env_pool') -class PoolEnvManager: +class PoolEnvManager(): ''' Overview: Envpool now supports Atari, Classic Control, Toy Text, ViZDoom. @@ -55,18 +68,29 @@ def launch(self) -> None: seed = 0 else: seed = self._seed + + kwargs = {} + if "episodic_life" in self._cfg: + kwargs["episodic_life"] = self._cfg.episodic_life + if "reward_clip" in self._cfg: + kwargs["reward_clip"] = self._cfg.reward_clip + if "stack_num" in self._cfg: + kwargs["stack_num"] = self._cfg.stack_num + if "gray_scale" in self._cfg: + kwargs["gray_scale"] = self._cfg.gray_scale + if "frame_skip" in self._cfg: + kwargs["frame_skip"] = self._cfg.frame_skip + self._envs = envpool.make( task_id=self._cfg.env_id, env_type="gym", num_envs=self._env_num, batch_size=self._batch_size, seed=seed, - episodic_life=self._cfg.episodic_life, - reward_clip=self._cfg.reward_clip, - stack_num=self._cfg.stack_num, - gray_scale=self._cfg.gray_scale, - frame_skip=self._cfg.frame_skip + **kwargs ) + self._action_space = self._envs.action_space + self._observation_space = self._envs.observation_space self._closed = False self.reset() @@ -77,6 +101,7 @@ def reset(self) -> None: obs, _, _, info = self._envs.recv() env_id = info['env_id'] obs = obs.astype(np.float32) + obs /= 255.0 self._ready_obs = deep_merge_dicts({i: o for i, o in zip(env_id, obs)}, self._ready_obs) if len(self._ready_obs) == self._env_num: break @@ -91,6 +116,7 @@ def step(self, action: dict) -> Dict[int, namedtuple]: obs, rew, done, info = self._envs.recv() obs = obs.astype(np.float32) + obs /= 255.0 rew = rew.astype(np.float32) env_id = info['env_id'] timesteps = {} @@ -124,3 +150,154 @@ def env_num(self) -> int: @property def ready_obs(self) -> Dict[int, Any]: return self._ready_obs + + @property + def observation_space(self) -> 'gym.spaces.Space': # noqa + return self._observation_space + + @property + def action_space(self) -> 'gym.spaces.Space': # noqa + return self._action_space + + +@ENV_MANAGER_REGISTRY.register('env_pool_v2') +class PoolEnvManagerV2(): + ''' + 
Overview: + Envpool now supports Atari, Classic Control, Toy Text, ViZDoom. + Here we list some commonly used env_ids as follows. + For more examples, you can refer to . + + - Atari: "Pong-v5", "SpaceInvaders-v5", "Qbert-v5" + - Classic Control: "CartPole-v0", "CartPole-v1", "Pendulum-v1" + ''' + + @classmethod + def default_config(cls) -> EasyDict: + return EasyDict(deepcopy(cls.config)) + + config = dict( + type='envpool', + # Sync mode: batch_size == env_num + # Async mode: batch_size < env_num + env_num=8, + batch_size=8, + ) + + def __init__(self, cfg: EasyDict) -> None: + super().__init__() + self._cfg = cfg + self._env_num = cfg.env_num + self._batch_size = cfg.batch_size + self._ready_obs = {} + self._closed = True + self._seed = None + + def launch(self) -> None: + assert self._closed, "Please first close the env manager" + if self._seed is None: + seed = 0 + else: + seed = self._seed + + kwargs = {} + if "episodic_life" in self._cfg: + kwargs["episodic_life"] = self._cfg.episodic_life + if "reward_clip" in self._cfg: + kwargs["reward_clip"] = self._cfg.reward_clip + if "stack_num" in self._cfg: + kwargs["stack_num"] = self._cfg.stack_num + if "gray_scale" in self._cfg: + kwargs["gray_scale"] = self._cfg.gray_scale + if "frame_skip" in self._cfg: + kwargs["frame_skip"] = self._cfg.frame_skip + + self._envs = envpool.make( + task_id=self._cfg.env_id, + env_type="gym", + num_envs=self._env_num, + batch_size=self._batch_size, + seed=seed, + **kwargs + ) + self._action_space = self._envs.action_space + self._observation_space = self._envs.observation_space + self._closed = False + self.reset() + + def reset(self) -> None: + self._ready_obs = {} + self._envs.async_reset() + while True: + obs, _, _, info = self._envs.recv() + env_id = info['env_id'] + obs = obs.astype(np.float32) + obs /= 255.0 + self._ready_obs = deep_merge_dicts({i: o for i, o in zip(env_id, obs)}, self._ready_obs) + if len(self._ready_obs) == self._env_num: + break + self._eval_episode_return = [0. for _ in range(self._env_num)] + + def step(self, action: Union[List, np.ndarray]) -> Dict[int, namedtuple]: + env_id = np.array(list(self._ready_obs.keys())) + action = np.array(action) + if len(action.shape) == 2: + action = action.squeeze(1) + self._envs.send(action, env_id) + + obs, rew, done, info = self._envs.recv() + obs = obs.astype(np.float32) + obs /= 255.0 + rew = rew.astype(np.float32) + env_id = info['env_id'] + timesteps = {} + new_data = [] + self._ready_obs = {} + for i in range(len(env_id)): + d = bool(done[i]) + r = to_ndarray([rew[i]]) + self._eval_episode_return[env_id[i]] += r + info_dict = {'env_id': i} + timesteps[env_id[i]] = BaseEnvTimestep(obs[i], r, d, info=info_dict) + if d: + info_dict['eval_episode_return'] = self._eval_episode_return[env_id[i]] + timesteps[env_id[i]].info['eval_episode_return'] = info_dict['eval_episode_return'] + self._eval_episode_return[env_id[i]] = 0. 
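+            # NOTE: step() returns the new_data list of tnp arrays assembled below;
+            # the timesteps dict built in this loop is not returned here, despite the
+            # Dict[int, namedtuple] annotation on the method signature.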
+ new_data.append(tnp.array({'obs': obs[i], 'reward': r, 'done': d, 'info': info_dict, 'env_id': env_id[i]})) + self._ready_obs[env_id[i]] = obs[i] + return new_data + + def close(self) -> None: + if self._closed: + return + # Envpool has no `close` API + self._closed = True + + @property + def closed(self) -> None: + return self._closed + + def seed(self, seed: int, dynamic_seed=False) -> None: + # The i-th environment seed in Envpool will be set with i+seed, so we don't do extra transformation here + self._seed = seed + logging.warning("envpool doesn't support dynamic_seed in different episode") + + @property + def env_num(self) -> int: + return self._env_num + + @property + def ready_obs(self) -> tnp.array: + if isinstance(self._ready_obs, dict): + obs = [tnp.array(o) for k, o in self._ready_obs.items()] + return tnp.stack(obs) + else: + raise NotImplementedError + + @property + def observation_space(self) -> 'gym.spaces.Space': # noqa + return self._observation_space + + @property + def action_space(self) -> 'gym.spaces.Space': # noqa + return self._action_space diff --git a/ding/example/dqn_envpool.py b/ding/example/dqn_envpool.py new file mode 100644 index 0000000000..bef6a72369 --- /dev/null +++ b/ding/example/dqn_envpool.py @@ -0,0 +1,100 @@ +import gym +import datetime +import wandb +import numpy as np +from easydict import EasyDict +from ditk import logging +from ding.data.model_loader import FileModelLoader +from ding.data.storage_loader import FileStorageLoader +from ding.model import DQN +from ding.policy import DQNPolicy +from ding.envs import DingEnvWrapper, BaseEnvManagerV2 +from ding.envs.env_manager.envpool_env_manager import PoolEnvManager, PoolEnvManagerV2 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ + termination_checker +from ding.utils import set_pkg_seed + +from dizoo.atari.config.serial import pong_dqn_envpool_config + + +def main(cfg): + logging.getLogger().setLevel(logging.INFO) + cfg.exp_name = 'pong_dqn_envpool_' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + cfg.env.collector_env_num = 8 + cfg.env.collector_batch_size = 8 + collector_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.collector_env_num, + 'batch_size': cfg.env.collector_batch_size, + # env wrappers + 'episodic_life': True, # collector: True + 'reward_clip': False, # collector: True + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["collector_env_cfg"] = collector_env_cfg + evaluator_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.evaluator_env_num, + 'batch_size': cfg.env.evaluator_batch_size, + # env wrappers + 'episodic_life': False, # evaluator: False + 'reward_clip': False, # evaluator: False + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["evaluator_env_cfg"] = evaluator_env_cfg + cfg = compile_config(cfg, PoolEnvManagerV2, DQNPolicy, save_cfg=task.router.node_id == 0) + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = PoolEnvManagerV2(cfg.env.collector_env_cfg) + evaluator_env = PoolEnvManagerV2(cfg.env.evaluator_env_cfg) + 
collector_env.seed(cfg.seed) + evaluator_env.seed(cfg.seed) + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNPolicy(cfg.policy, model=model) + + # Consider the case with multiple processes + if task.router.is_active: + # You can use labels to distinguish between workers with different roles, + # here we use node_id to distinguish. + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + # Sync their context and model between each worker. + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + # Here is the part of single process pipeline. + task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use(StepCollector(cfg, policy.collect_mode, collector_env)) + if "nstep" in cfg.policy and cfg.policy.nstep > 1: + task.use(nstep_reward_enhancer(cfg)) + task.use(data_pusher(cfg, buffer_)) + task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(online_logger(train_show_freq=10)) + #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) + task.use(termination_checker(max_env_step=10000000)) + + task.run() + + +if __name__ == "__main__": + main(pong_dqn_envpool_config) diff --git a/ding/worker/collector/sample_serial_collector.py b/ding/worker/collector/sample_serial_collector.py index 26db458edb..ccea8806cf 100644 --- a/ding/worker/collector/sample_serial_collector.py +++ b/ding/worker/collector/sample_serial_collector.py @@ -25,7 +25,7 @@ class SampleSerialCollector(ISerialCollector): envstep """ - config = dict(deepcopy_obs=False, transform_obs=False, collect_print_freq=100) + config = dict(type='sample', deepcopy_obs=False, transform_obs=False, collect_print_freq=100) def __init__( self, diff --git a/dizoo/atari/config/serial/pong/pong_dqn_envpool_config.py b/dizoo/atari/config/serial/pong/pong_dqn_envpool_config.py index 0b80e41548..f09cd4c54d 100644 --- a/dizoo/atari/config/serial/pong/pong_dqn_envpool_config.py +++ b/dizoo/atari/config/serial/pong/pong_dqn_envpool_config.py @@ -9,13 +9,14 @@ evaluator_batch_size=8, n_evaluator_episode=8, stop_value=20, - env_id='PongNoFrameskip-v4', + env_id='Pong-v5', #'ALE/Pong-v5' is available. But special setting is needed after gym make. frame_stack=4, ), policy=dict( cuda=True, priority=False, + random_collect_size=50000, model=dict( obs_shape=[4, 84, 84], action_shape=6, diff --git a/dizoo/atari/config/serial/spaceinvaders/spaceinvaders_dqn_envpool_config.py b/dizoo/atari/config/serial/spaceinvaders/spaceinvaders_dqn_envpool_config.py new file mode 100644 index 0000000000..da56810f0c --- /dev/null +++ b/dizoo/atari/config/serial/spaceinvaders/spaceinvaders_dqn_envpool_config.py @@ -0,0 +1,63 @@ +from easydict import EasyDict + +spaceinvaders_dqn_envpool_config = dict( + exp_name='spaceinvaders_dqn_envpool_seed0', + env=dict( + collector_env_num=8, + collector_batch_size=8, + evaluator_env_num=8, + evaluator_batch_size=8, + n_evaluator_episode=8, + stop_value=10000000000, + env_id='SpaceInvaders-v5', + #'ALE/SpaceInvaders-v5' is available. But special setting is needed after gym make. 
+ frame_stack=4, + ), + policy=dict( + cuda=True, + priority=False, + random_collect_size=5000, + model=dict( + obs_shape=[4, 84, 84], + action_shape=6, + encoder_hidden_size_list=[128, 128, 512], + ), + nstep=3, + discount_factor=0.99, + learn=dict( + update_per_collect=10, + batch_size=32, + learning_rate=0.0001, + target_update_freq=500, + ), + collect=dict(n_sample=100, ), + eval=dict(evaluator=dict(eval_freq=4000, )), + other=dict( + eps=dict( + type='exp', + start=1., + end=0.05, + decay=1000000, + ), + replay_buffer=dict(replay_buffer_size=400000, ), + ), + ), +) +spaceinvaders_dqn_envpool_config = EasyDict(spaceinvaders_dqn_envpool_config) +main_config = spaceinvaders_dqn_envpool_config +spaceinvaders_dqn_envpool_create_config = dict( + env=dict( + type='atari', + import_names=['dizoo.atari.envs.atari_env'], + ), + env_manager=dict(type='env_pool'), + policy=dict(type='dqn'), + replay_buffer=dict(type='deque'), +) +spaceinvaders_dqn_envpool_create_config = EasyDict(spaceinvaders_dqn_envpool_create_config) +create_config = spaceinvaders_dqn_envpool_create_config + +if __name__ == '__main__': + # or you can enter `ding -m serial -c spaceinvaders_dqn_envpool_config.py -s 0` + from ding.entry import serial_pipeline + serial_pipeline((main_config, create_config), seed=0) From d0047ed9e309884cbe58bc46590828fa23ff7229 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 15 Aug 2023 19:19:45 +0800 Subject: [PATCH 187/244] change timer gpu to false --- ding/worker/collector/sample_serial_collector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ding/worker/collector/sample_serial_collector.py b/ding/worker/collector/sample_serial_collector.py index ccea8806cf..35667b1ceb 100644 --- a/ding/worker/collector/sample_serial_collector.py +++ b/ding/worker/collector/sample_serial_collector.py @@ -51,7 +51,7 @@ def __init__( self._deepcopy_obs = cfg.deepcopy_obs # whether to deepcopy each data self._transform_obs = cfg.transform_obs self._cfg = cfg - self._timer = EasyTimer() + self._timer = EasyTimer(cuda=False) self._end_flag = False self._rank = get_rank() self._world_size = get_world_size() From c7509cb20f2823ba20d46977c62151c6e5ea01e9 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 15 Aug 2023 19:38:16 +0800 Subject: [PATCH 188/244] polish config --- ding/example/dqn_envpool_wandb.py | 109 ++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 ding/example/dqn_envpool_wandb.py diff --git a/ding/example/dqn_envpool_wandb.py b/ding/example/dqn_envpool_wandb.py new file mode 100644 index 0000000000..f96407bbcb --- /dev/null +++ b/ding/example/dqn_envpool_wandb.py @@ -0,0 +1,109 @@ +import gym +import datetime +import wandb +import numpy as np +from easydict import EasyDict +from ditk import logging +from ding.data.model_loader import FileModelLoader +from ding.data.storage_loader import FileStorageLoader +from ding.model import DQN +from ding.policy import DQNPolicy +from ding.envs import DingEnvWrapper, BaseEnvManagerV2 +from ding.envs.env_manager.envpool_env_manager import PoolEnvManager, PoolEnvManagerV2 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ + termination_checker, wandb_online_logger +from ding.utils 
import set_pkg_seed + +from dizoo.atari.config.serial import pong_dqn_envpool_config + + +def main(cfg): + logging.getLogger().setLevel(logging.INFO) + cfg.exp_name = 'Pong-v5-DQN-envpool-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + cfg.env.collector_env_num = 8 + cfg.env.collector_batch_size = 8 + collector_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.collector_env_num, + 'batch_size': cfg.env.collector_batch_size, + # env wrappers + 'episodic_life': True, # collector: True + 'reward_clip': False, # collector: True + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["collector_env_cfg"] = collector_env_cfg + evaluator_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.evaluator_env_num, + 'batch_size': cfg.env.evaluator_batch_size, + # env wrappers + 'episodic_life': False, # evaluator: False + 'reward_clip': False, # evaluator: False + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["evaluator_env_cfg"] = evaluator_env_cfg + cfg = compile_config(cfg, PoolEnvManagerV2, DQNPolicy, save_cfg=task.router.node_id == 0) + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = PoolEnvManagerV2(cfg.env.collector_env_cfg) + evaluator_env = PoolEnvManagerV2(cfg.env.evaluator_env_cfg) + collector_env.seed(cfg.seed) + evaluator_env.seed(cfg.seed) + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNPolicy(cfg.policy, model=model) + + # Consider the case with multiple processes + if task.router.is_active: + # You can use labels to distinguish between workers with different roles, + # here we use node_id to distinguish. + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + # Sync their context and model between each worker. + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + # Here is the part of single process pipeline. 
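+        # wandb_online_logger is placed after OffPolicyLearner in the chain below,
+        # so the training metrics named by policy.monitor_vars() are logged from the
+        # learner output produced in the same iteration.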
+ task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use(StepCollector(cfg, policy.collect_mode, collector_env)) + if "nstep" in cfg.policy and cfg.policy.nstep > 1: + task.use(nstep_reward_enhancer(cfg)) + task.use(data_pusher(cfg, buffer_)) + task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use( + wandb_online_logger( + metric_list=policy.monitor_vars(), + model=policy._model, + anonymous=True, + project_name=cfg.exp_name, + wandb_sweep=False, + )) + + #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) + task.use(termination_checker(max_env_step=10000000)) + + task.run() + + +if __name__ == "__main__": + main(pong_dqn_envpool_config) From 9ff7d4b4cddd26fe5bcc0286939b6dac1ad56248 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 16 Aug 2023 12:19:38 +0800 Subject: [PATCH 189/244] add sweep main file for new pipeline --- ding/example/dqn_envpool_wandb_sweep_pong.py | 148 +++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 ding/example/dqn_envpool_wandb_sweep_pong.py diff --git a/ding/example/dqn_envpool_wandb_sweep_pong.py b/ding/example/dqn_envpool_wandb_sweep_pong.py new file mode 100644 index 0000000000..c4c519edf3 --- /dev/null +++ b/ding/example/dqn_envpool_wandb_sweep_pong.py @@ -0,0 +1,148 @@ +import shutil +import datetime +import wandb +import numpy as np +from easydict import EasyDict +from ditk import logging +from ding.data.model_loader import FileModelLoader +from ding.data.storage_loader import FileStorageLoader +from ding.model import DQN +from ding.policy import DQNPolicy +from ding.envs import DingEnvWrapper, BaseEnvManagerV2 +from ding.envs.env_manager.envpool_env_manager import PoolEnvManager, PoolEnvManagerV2 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ + termination_checker, wandb_online_logger +from ding.utils import set_pkg_seed + +from dizoo.atari.config.serial import pong_dqn_envpool_config + + +def main(cfg, seed=0, max_env_step=int(1e7)): + logging.getLogger().setLevel(logging.INFO) + cfg.exp_name = 'Pong-v5-DQN-envpool-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + cfg.seed = seed + collector_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.collector_env_num, + 'batch_size': cfg.env.collector_batch_size, + # env wrappers + 'episodic_life': True, # collector: True + 'reward_clip': False, # collector: True + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["collector_env_cfg"] = collector_env_cfg + evaluator_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.evaluator_env_num, + 'batch_size': cfg.env.evaluator_batch_size, + # env wrappers + 'episodic_life': False, # evaluator: False + 'reward_clip': False, # evaluator: False + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["evaluator_env_cfg"] = evaluator_env_cfg + cfg = compile_config(cfg, PoolEnvManagerV2, DQNPolicy, save_cfg=task.router.node_id == 0) + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = 
PoolEnvManagerV2(cfg.env.collector_env_cfg) + evaluator_env = PoolEnvManagerV2(cfg.env.evaluator_env_cfg) + collector_env.seed(cfg.seed) + evaluator_env.seed(cfg.seed) + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNPolicy(cfg.policy, model=model) + + # Consider the case with multiple processes + if task.router.is_active: + # You can use labels to distinguish between workers with different roles, + # here we use node_id to distinguish. + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + # Sync their context and model between each worker. + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + # Here is the part of single process pipeline. + task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use(StepCollector(cfg, policy.collect_mode, collector_env)) + if "nstep" in cfg.policy and cfg.policy.nstep > 1: + task.use(nstep_reward_enhancer(cfg)) + task.use(data_pusher(cfg, buffer_)) + task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use( + wandb_online_logger( + metric_list=policy.monitor_vars(), + model=policy._model, + anonymous=True, + project_name=cfg.exp_name, + wandb_sweep=False, + )) + + #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) + task.use(termination_checker(max_env_step=max_env_step)) + + task.run() + + +def sweep_main(): + wandb.init() + + good_pair=(wandb.config.collector_env_num%wandb.config.collector_batch_size==0) + + if not good_pair: + wandb.log({"time": 0.0}) + else: + import time + start_time = time.time() + pong_dqn_envpool_config.exp_name = f'Pong-v5-envpool-new-pipeline-speed-test-{wandb.config.collector_env_num}-{wandb.config.collector_batch_size}' + pong_dqn_envpool_config.env.collector_env_num=wandb.config.collector_env_num + pong_dqn_envpool_config.env.collector_batch_size=wandb.config.collector_batch_size + main(EasyDict(pong_dqn_envpool_config), max_env_step=10000000) + print(time.time()-start_time) + wandb.log({"time_cost": time.time()-start_time}) + #remove the directory named as exp_name + shutil.rmtree(pong_dqn_envpool_config.exp_name) + + +if __name__ == "__main__": + + sweep_configuration = { + 'method': 'grid', + 'metric': + { + 'goal': 'maximize', + 'name': 'time_cost' + }, + 'parameters': + { + 'collector_env_num': {'values': [64]}, + 'collector_batch_size': {'values': [64, 32, 16, 8]}, + } + } + + sweep_id = wandb.sweep( + sweep=sweep_configuration, + project='Pong-v5-envpool-new-pipeline-speed-test' + ) + + wandb.agent(sweep_id, function=sweep_main) From 0a1a2cc9d3c36d46089c3184920a1a6c66ffa738 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 16 Aug 2023 15:17:55 +0800 Subject: [PATCH 190/244] polish code --- ding/envs/env_manager/envpool_env_manager.py | 28 +++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/ding/envs/env_manager/envpool_env_manager.py b/ding/envs/env_manager/envpool_env_manager.py index 9fc730d5aa..24b7adca7d 100644 --- a/ding/envs/env_manager/envpool_env_manager.py +++ b/ding/envs/env_manager/envpool_env_manager.py @@ -153,11 +153,21 @@ def ready_obs(self) -> Dict[int, Any]: @property def observation_space(self) -> 'gym.spaces.Space': # noqa - return self._observation_space + 
try: + return self._observation_space + except AttributeError: + self.launch() + self.close() + return self._observation_space @property def action_space(self) -> 'gym.spaces.Space': # noqa - return self._action_space + try: + return self._action_space + except AttributeError: + self.launch() + self.close() + return self._action_space @ENV_MANAGER_REGISTRY.register('env_pool_v2') @@ -296,8 +306,18 @@ def ready_obs(self) -> tnp.array: @property def observation_space(self) -> 'gym.spaces.Space': # noqa - return self._observation_space + try: + return self._observation_space + except AttributeError: + self.launch() + self.close() + return self._observation_space @property def action_space(self) -> 'gym.spaces.Space': # noqa - return self._action_space + try: + return self._action_space + except AttributeError: + self.launch() + self.close() + return self._action_space From fb5045b396d6ab23c3fe0daedca8c9bfe6b52ec9 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 16 Aug 2023 15:20:03 +0800 Subject: [PATCH 191/244] polish code --- ding/example/dqn_envpool_wandb.py | 10 +++++++++- ding/example/dqn_envpool_wandb_sweep_pong.py | 10 +++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/ding/example/dqn_envpool_wandb.py b/ding/example/dqn_envpool_wandb.py index f96407bbcb..1fd03374a5 100644 --- a/ding/example/dqn_envpool_wandb.py +++ b/ding/example/dqn_envpool_wandb.py @@ -84,7 +84,15 @@ def main(cfg): # Here is the part of single process pipeline. task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) task.use(eps_greedy_handler(cfg)) - task.use(StepCollector(cfg, policy.collect_mode, collector_env)) + task.use( + StepCollector( + cfg, + policy.collect_mode, + collector_env, + random_collect_size=cfg.policy.random_collect_size \ + if hasattr(cfg.policy, 'random_collect_size') else 0, + ) + ) if "nstep" in cfg.policy and cfg.policy.nstep > 1: task.use(nstep_reward_enhancer(cfg)) task.use(data_pusher(cfg, buffer_)) diff --git a/ding/example/dqn_envpool_wandb_sweep_pong.py b/ding/example/dqn_envpool_wandb_sweep_pong.py index c4c519edf3..730b4d232a 100644 --- a/ding/example/dqn_envpool_wandb_sweep_pong.py +++ b/ding/example/dqn_envpool_wandb_sweep_pong.py @@ -83,7 +83,15 @@ def main(cfg, seed=0, max_env_step=int(1e7)): # Here is the part of single process pipeline. 
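+        # As in the other envpool examples, StepCollector falls back to
+        # random_collect_size=0 when the policy config does not define it, and
+        # nstep_reward_enhancer is only inserted when cfg.policy.nstep > 1.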
task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) task.use(eps_greedy_handler(cfg)) - task.use(StepCollector(cfg, policy.collect_mode, collector_env)) + task.use( + StepCollector( + cfg, + policy.collect_mode, + collector_env, + random_collect_size=cfg.policy.random_collect_size \ + if hasattr(cfg.policy, 'random_collect_size') else 0, + ) + ) if "nstep" in cfg.policy and cfg.policy.nstep > 1: task.use(nstep_reward_enhancer(cfg)) task.use(data_pusher(cfg, buffer_)) From 6a4d83e0b4efc646d5fc5e98fc05e85bfa5023ba Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 17 Aug 2023 02:47:57 +0800 Subject: [PATCH 192/244] polish code --- ding/example/dqn_envpool_wandb_sweep_pong.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ding/example/dqn_envpool_wandb_sweep_pong.py b/ding/example/dqn_envpool_wandb_sweep_pong.py index 730b4d232a..b810c6e16c 100644 --- a/ding/example/dqn_envpool_wandb_sweep_pong.py +++ b/ding/example/dqn_envpool_wandb_sweep_pong.py @@ -122,6 +122,7 @@ def sweep_main(): else: import time start_time = time.time() + pong_dqn_envpool_config.env.stop_value = 2000 pong_dqn_envpool_config.exp_name = f'Pong-v5-envpool-new-pipeline-speed-test-{wandb.config.collector_env_num}-{wandb.config.collector_batch_size}' pong_dqn_envpool_config.env.collector_env_num=wandb.config.collector_env_num pong_dqn_envpool_config.env.collector_batch_size=wandb.config.collector_batch_size From 52ded5a40b0997c96d19d8a977f555bdf7cb44d3 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 18 Aug 2023 12:43:20 +0800 Subject: [PATCH 193/244] Add main file --- ding/example/dqn_envpool_wandb_main.py | 127 +++++++++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100644 ding/example/dqn_envpool_wandb_main.py diff --git a/ding/example/dqn_envpool_wandb_main.py b/ding/example/dqn_envpool_wandb_main.py new file mode 100644 index 0000000000..f8904bd7d1 --- /dev/null +++ b/ding/example/dqn_envpool_wandb_main.py @@ -0,0 +1,127 @@ +import gym +import datetime +import wandb +import numpy as np +from easydict import EasyDict +from ditk import logging +from ding.data.model_loader import FileModelLoader +from ding.data.storage_loader import FileStorageLoader +from ding.model import DQN +from ding.policy import DQNPolicy +from ding.envs import DingEnvWrapper, BaseEnvManagerV2 +from ding.envs.env_manager.envpool_env_manager import PoolEnvManager, PoolEnvManagerV2 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ + termination_checker, wandb_online_logger +from ding.utils import set_pkg_seed + +from dizoo.atari.config.serial import pong_dqn_envpool_config + + +def main(cfg): + logging.getLogger().setLevel(logging.INFO) + cfg.exp_name = 'Pong-v5-DQN-envpool-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + + collector_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.collector_env_num, + 'batch_size': cfg.env.collector_batch_size, + # env wrappers + 'episodic_life': True, # collector: True + 'reward_clip': False, # collector: True + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["collector_env_cfg"] = collector_env_cfg + evaluator_env_cfg = EasyDict( + { + 'env_id': 
cfg.env.env_id, + 'env_num': cfg.env.evaluator_env_num, + 'batch_size': cfg.env.evaluator_batch_size, + # env wrappers + 'episodic_life': False, # evaluator: False + 'reward_clip': False, # evaluator: False + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["evaluator_env_cfg"] = evaluator_env_cfg + cfg = compile_config(cfg, PoolEnvManagerV2, DQNPolicy, save_cfg=task.router.node_id == 0) + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = PoolEnvManagerV2(cfg.env.collector_env_cfg) + evaluator_env = PoolEnvManagerV2(cfg.env.evaluator_env_cfg) + collector_env.seed(cfg.seed) + evaluator_env.seed(cfg.seed) + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNPolicy(cfg.policy, model=model) + + # Consider the case with multiple processes + if task.router.is_active: + # You can use labels to distinguish between workers with different roles, + # here we use node_id to distinguish. + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + # Sync their context and model between each worker. + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + # Here is the part of single process pipeline. + task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use( + StepCollector( + cfg, + policy.collect_mode, + collector_env, + random_collect_size=cfg.policy.random_collect_size \ + if hasattr(cfg.policy, 'random_collect_size') else 0, + ) + ) + if "nstep" in cfg.policy and cfg.policy.nstep > 1: + task.use(nstep_reward_enhancer(cfg)) + task.use(data_pusher(cfg, buffer_)) + task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use( + wandb_online_logger( + metric_list=policy.monitor_vars(), + model=policy._model, + anonymous=True, + project_name=cfg.exp_name, + wandb_sweep=False, + )) + + #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) + task.use(termination_checker(max_env_step=10000000)) + + task.run() + + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=0, help="random seed") + parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") + parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") + arg=parser.parse_args() + + pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num + pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size + pong_dqn_envpool_config.seed = arg.seed + + main(pong_dqn_envpool_config) From 9aa23f7eaf67d277d525ed9201b4ef9f2f2d41df Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 18 Aug 2023 19:53:46 +0800 Subject: [PATCH 194/244] add test --- ding/envs/env_manager/envpool_env_manager.py | 7 ++ ding/example/dqn_envpool_wandb_test.py | 120 +++++++++++++++++++ 2 files changed, 127 insertions(+) create mode 100644 ding/example/dqn_envpool_wandb_test.py diff --git a/ding/envs/env_manager/envpool_env_manager.py b/ding/envs/env_manager/envpool_env_manager.py index 24b7adca7d..163ea0c895 100644 --- a/ding/envs/env_manager/envpool_env_manager.py +++ b/ding/envs/env_manager/envpool_env_manager.py @@ 
-202,6 +202,7 @@ def __init__(self, cfg: EasyDict) -> None: self._ready_obs = {} self._closed = True self._seed = None + self._test = False def launch(self) -> None: assert self._closed, "Please first close the env manager" @@ -221,6 +222,8 @@ def launch(self) -> None: kwargs["gray_scale"] = self._cfg.gray_scale if "frame_skip" in self._cfg: kwargs["frame_skip"] = self._cfg.frame_skip + if "test" in self._cfg: + self._test = self._cfg.test self._envs = envpool.make( task_id=self._cfg.env_id, @@ -256,6 +259,8 @@ def step(self, action: Union[List, np.ndarray]) -> Dict[int, namedtuple]: self._envs.send(action, env_id) obs, rew, done, info = self._envs.recv() + if self._test: + assert all(info['env_id'] == env_id) obs = obs.astype(np.float32) obs /= 255.0 rew = rew.astype(np.float32) @@ -311,6 +316,7 @@ def observation_space(self) -> 'gym.spaces.Space': # noqa except AttributeError: self.launch() self.close() + self._ready_obs = {} return self._observation_space @property @@ -320,4 +326,5 @@ def action_space(self) -> 'gym.spaces.Space': # noqa except AttributeError: self.launch() self.close() + self._ready_obs = {} return self._action_space diff --git a/ding/example/dqn_envpool_wandb_test.py b/ding/example/dqn_envpool_wandb_test.py new file mode 100644 index 0000000000..dac3ac7d33 --- /dev/null +++ b/ding/example/dqn_envpool_wandb_test.py @@ -0,0 +1,120 @@ +import gym +import datetime +import wandb +import numpy as np +from easydict import EasyDict +from ditk import logging +from ding.data.model_loader import FileModelLoader +from ding.data.storage_loader import FileStorageLoader +from ding.model import DQN +from ding.policy import DQNPolicy +from ding.envs import DingEnvWrapper, BaseEnvManagerV2 +from ding.envs.env_manager.envpool_env_manager import PoolEnvManager, PoolEnvManagerV2 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ + termination_checker, wandb_online_logger +from ding.utils import set_pkg_seed + +from dizoo.atari.config.serial import pong_dqn_envpool_config + + +def main(cfg): + logging.getLogger().setLevel(logging.INFO) + cfg.exp_name = 'Pong-v5-DQN-envpool-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + cfg.env.collector_env_num = 8 + cfg.env.collector_batch_size = 8 + cfg.env['test']=True + collector_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.collector_env_num, + 'batch_size': cfg.env.collector_batch_size, + # env wrappers + 'episodic_life': True, # collector: True + 'reward_clip': False, # collector: True + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + 'test': cfg.env.test, + } + ) + cfg.env["collector_env_cfg"] = collector_env_cfg + evaluator_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.evaluator_env_num, + 'batch_size': cfg.env.evaluator_batch_size, + # env wrappers + 'episodic_life': False, # evaluator: False + 'reward_clip': False, # evaluator: False + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + 'test': cfg.env.test, + } + ) + cfg.env["evaluator_env_cfg"] = evaluator_env_cfg + cfg = compile_config(cfg, PoolEnvManagerV2, DQNPolicy, save_cfg=task.router.node_id == 0) + 
ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = PoolEnvManagerV2(cfg.env.collector_env_cfg) + evaluator_env = PoolEnvManagerV2(cfg.env.evaluator_env_cfg) + collector_env.seed(cfg.seed) + evaluator_env.seed(cfg.seed) + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNPolicy(cfg.policy, model=model) + + # Consider the case with multiple processes + if task.router.is_active: + # You can use labels to distinguish between workers with different roles, + # here we use node_id to distinguish. + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + # Sync their context and model between each worker. + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + # Here is the part of single process pipeline. + task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use( + StepCollector( + cfg, + policy.collect_mode, + collector_env, + random_collect_size=cfg.policy.random_collect_size \ + if hasattr(cfg.policy, 'random_collect_size') else 0, + ) + ) + if "nstep" in cfg.policy and cfg.policy.nstep > 1: + task.use(nstep_reward_enhancer(cfg)) + task.use(data_pusher(cfg, buffer_)) + task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use( + wandb_online_logger( + metric_list=policy.monitor_vars(), + model=policy._model, + anonymous=True, + project_name=cfg.exp_name, + wandb_sweep=False, + )) + + #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) + task.use(termination_checker(max_env_step=10000000)) + + task.run() + + +if __name__ == "__main__": + main(pong_dqn_envpool_config) From c6e90a4b34347700faf0058fb916847b2dd4faf8 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 18 Aug 2023 19:58:28 +0800 Subject: [PATCH 195/244] add test --- ding/example/dqn_envpool_wandb_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ding/example/dqn_envpool_wandb_test.py b/ding/example/dqn_envpool_wandb_test.py index dac3ac7d33..bcc34dbfec 100644 --- a/ding/example/dqn_envpool_wandb_test.py +++ b/ding/example/dqn_envpool_wandb_test.py @@ -25,8 +25,8 @@ def main(cfg): logging.getLogger().setLevel(logging.INFO) cfg.exp_name = 'Pong-v5-DQN-envpool-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - cfg.env.collector_env_num = 8 - cfg.env.collector_batch_size = 8 + cfg.env.collector_env_num = 64 + cfg.env.collector_batch_size = 64 cfg.env['test']=True collector_env_cfg = EasyDict( { From ef994345dec515787e70a03cd2e1876897d8d1f9 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Mon, 21 Aug 2023 15:21:24 +0800 Subject: [PATCH 196/244] add time logger --- ding/example/dqn_envpool_wandb_main.py | 4 +++- ding/example/dqn_envpool_wandb_test.py | 4 +++- ding/framework/context.py | 7 +++++++ ding/framework/middleware/collector.py | 4 ++++ .../middleware/functional/data_processor.py | 5 ++++- ding/framework/middleware/functional/enhancer.py | 6 ++++++ ding/framework/middleware/functional/evaluator.py | 4 ++++ ding/framework/middleware/functional/logger.py | 13 +++++++++++++ ding/framework/middleware/functional/timer.py | 1 + ding/framework/middleware/learner.py | 3 +++ 10 files changed, 48 insertions(+), 3 deletions(-) diff --git 
a/ding/example/dqn_envpool_wandb_main.py b/ding/example/dqn_envpool_wandb_main.py index f8904bd7d1..e9420539d8 100644 --- a/ding/example/dqn_envpool_wandb_main.py +++ b/ding/example/dqn_envpool_wandb_main.py @@ -16,7 +16,7 @@ from ding.framework.context import OnlineRLContext from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger + termination_checker, wandb_online_logger, epoch_timer from ding.utils import set_pkg_seed from dizoo.atari.config.serial import pong_dqn_envpool_config @@ -80,6 +80,8 @@ def main(cfg): task.use(ContextExchanger(skip_n_iter=1)) task.use(ModelExchanger(model)) + task.use(epoch_timer()) + # Here is the part of single process pipeline. task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) task.use(eps_greedy_handler(cfg)) diff --git a/ding/example/dqn_envpool_wandb_test.py b/ding/example/dqn_envpool_wandb_test.py index bcc34dbfec..1b9208111e 100644 --- a/ding/example/dqn_envpool_wandb_test.py +++ b/ding/example/dqn_envpool_wandb_test.py @@ -16,7 +16,7 @@ from ding.framework.context import OnlineRLContext from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger + termination_checker, wandb_online_logger, epoch_timer from ding.utils import set_pkg_seed from dizoo.atari.config.serial import pong_dqn_envpool_config @@ -84,6 +84,8 @@ def main(cfg): task.use(ContextExchanger(skip_n_iter=1)) task.use(ModelExchanger(model)) + task.use(epoch_timer()) + # Here is the part of single process pipeline. task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) task.use(eps_greedy_handler(cfg)) diff --git a/ding/framework/context.py b/ding/framework/context.py index 8ef8f764fd..5c269a65c1 100644 --- a/ding/framework/context.py +++ b/ding/framework/context.py @@ -69,12 +69,19 @@ class OnlineRLContext(Context): eval_output: List = dataclasses.field(default_factory=dict) # wandb wandb_url: str = "" + evaluator_time=0.0 + collector_time=0.0 + learner_time=0.0 + data_pusher_time=0.0 + nstep_time=0.0 + total_time=0.0 def __post_init__(self): # This method is called just after __init__ method. Here, concretely speaking, # this method is called just after the object initialize its fields. # We use this method here to keep the fields needed for each iteration. self.keep('env_step', 'env_episode', 'train_iter', 'last_eval_iter', 'last_eval_value', 'wandb_url') + self.keep('evaluator_time', 'collector_time', 'learner_time', 'data_pusher_time', 'nstep_time', 'total_time') @dataclasses.dataclass diff --git a/ding/framework/middleware/collector.py b/ding/framework/middleware/collector.py index beb4894ad9..939316cd97 100644 --- a/ding/framework/middleware/collector.py +++ b/ding/framework/middleware/collector.py @@ -10,6 +10,7 @@ if TYPE_CHECKING: from ding.framework import OnlineRLContext +import time class StepCollector: """ @@ -49,6 +50,7 @@ def __call__(self, ctx: "OnlineRLContext") -> None: Input of ctx: - env_step (:obj:`int`): The env steps which will increase during collection. 
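+        .. note::
+            The whole call is wrapped in a wall-clock timer below; the elapsed seconds
+            are accumulated into ``ctx.collector_time``, which the wandb time logger
+            reports together with the other per-middleware timings.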
""" + start=time.time() old = ctx.env_step if self.random_collect_size > 0 and old < self.random_collect_size: target_size = self.random_collect_size - old @@ -67,6 +69,8 @@ def __call__(self, ctx: "OnlineRLContext") -> None: self._transitions.clear() break + ctx.collector_time += time.time()-start + class PPOFStepCollector: """ diff --git a/ding/framework/middleware/functional/data_processor.py b/ding/framework/middleware/functional/data_processor.py index 542d71ba99..eed2895e04 100644 --- a/ding/framework/middleware/functional/data_processor.py +++ b/ding/framework/middleware/functional/data_processor.py @@ -11,6 +11,7 @@ if TYPE_CHECKING: from ding.framework import OnlineRLContext, OfflineRLContext +import time def data_pusher(cfg: EasyDict, buffer_: Buffer, group_by_env: Optional[bool] = None): """ @@ -31,7 +32,7 @@ def _push(ctx: "OnlineRLContext"): - trajectories (:obj:`List[Dict]`): Trajectories. - episodes (:obj:`List[Dict]`): Episodes. """ - + start=time.time() if ctx.trajectories is not None: # each data in buffer is a transition if group_by_env: for i, t in enumerate(ctx.trajectories): @@ -47,6 +48,8 @@ def _push(ctx: "OnlineRLContext"): else: raise RuntimeError("Either ctx.trajectories or ctx.episodes should be not None.") + ctx.data_pusher_time += time.time()-start + return _push diff --git a/ding/framework/middleware/functional/enhancer.py b/ding/framework/middleware/functional/enhancer.py index 597a086850..20f5f6f5f8 100644 --- a/ding/framework/middleware/functional/enhancer.py +++ b/ding/framework/middleware/functional/enhancer.py @@ -8,6 +8,8 @@ from ding.reward_model import BaseRewardModel, HerRewardModel from ding.data import Buffer +import time + def reward_estimator(cfg: EasyDict, reward_model: "BaseRewardModel") -> Callable: """ @@ -77,6 +79,8 @@ def nstep_reward_enhancer(cfg: EasyDict) -> Callable: return task.void() def _enhance(ctx: "OnlineRLContext"): + + start=time.time() nstep = cfg.policy.nstep gamma = cfg.policy.discount_factor L = len(ctx.trajectories) @@ -99,6 +103,8 @@ def _enhance(ctx: "OnlineRLContext"): ctx.trajectories[i].reward = nstep_rewards[i] ctx.trajectories[i].value_gamma = value_gamma[i] + ctx.nstep_time += time.time()-start + return _enhance diff --git a/ding/framework/middleware/functional/evaluator.py b/ding/framework/middleware/functional/evaluator.py index 611bbcdea6..c07daa31dd 100644 --- a/ding/framework/middleware/functional/evaluator.py +++ b/ding/framework/middleware/functional/evaluator.py @@ -15,6 +15,7 @@ from ding.torch_utils import to_ndarray, get_shape0 from ding.utils import lists_to_dicts +import time class IMetric(ABC): @@ -238,6 +239,7 @@ def _evaluate(ctx: Union["OnlineRLContext", "OfflineRLContext"]): """ # evaluation will be executed if the task begins or enough train_iter after last evaluation + start=time.time() if ctx.last_eval_iter != -1 and \ (ctx.train_iter - ctx.last_eval_iter < cfg.policy.eval.evaluator.eval_freq): return @@ -302,6 +304,8 @@ def _evaluate(ctx: Union["OnlineRLContext", "OfflineRLContext"]): if stop_flag: task.finish = True + ctx.evaluator_time+=time.time()-start + return _evaluate diff --git a/ding/framework/middleware/functional/logger.py b/ding/framework/middleware/functional/logger.py index 27b7f43a72..fe16c63593 100644 --- a/ding/framework/middleware/functional/logger.py +++ b/ding/framework/middleware/functional/logger.py @@ -263,6 +263,19 @@ def _plot(ctx: "OnlineRLContext"): "If you want to use wandb to visualize the result, please set plot_logger = True in the config." 
) + if hasattr(ctx,"evaluator_time"): + info_for_logging.update({"evaluator_time": ctx.evaluator_time}) + if hasattr(ctx,"collector_time"): + info_for_logging.update({"collector_time": ctx.collector_time}) + if hasattr(ctx,"learner_time"): + info_for_logging.update({"learner_time": ctx.learner_time}) + if hasattr(ctx,"data_pusher_time"): + info_for_logging.update({"data_pusher_time": ctx.data_pusher_time}) + if hasattr(ctx,"nstep_time"): + info_for_logging.update({"nstep_time": ctx.nstep_time}) + if hasattr(ctx,"total_time"): + info_for_logging.update({"total_time": ctx.total_time}) + if ctx.eval_value != -np.inf: info_for_logging.update( { diff --git a/ding/framework/middleware/functional/timer.py b/ding/framework/middleware/functional/timer.py index db8a2c0056..7c73b9b809 100644 --- a/ding/framework/middleware/functional/timer.py +++ b/ding/framework/middleware/functional/timer.py @@ -31,5 +31,6 @@ def _epoch_timer(ctx: "Context"): np.mean(records) * 1000 ) ) + ctx.total_time += time_cost return _epoch_timer diff --git a/ding/framework/middleware/learner.py b/ding/framework/middleware/learner.py index e582ef22d0..cf15b7b5e5 100644 --- a/ding/framework/middleware/learner.py +++ b/ding/framework/middleware/learner.py @@ -11,6 +11,7 @@ from ding.policy import Policy from ding.reward_model import BaseRewardModel +import time class OffPolicyLearner: """ @@ -54,6 +55,7 @@ def __call__(self, ctx: "OnlineRLContext") -> None: Output of ctx: - train_output (:obj:`Deque`): The training output in deque. """ + start=time.time() train_output_queue = [] for _ in range(self.cfg.policy.learn.update_per_collect): self._fetcher(ctx) @@ -65,6 +67,7 @@ def __call__(self, ctx: "OnlineRLContext") -> None: train_output_queue.append(ctx.train_output) ctx.train_output_for_post_process = ctx.train_output ctx.train_output = train_output_queue + ctx.learner_time += time.time()-start class HERLearner: From 27cb8bdf650b145482df74a2549c1145ade78f20 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 22 Aug 2023 16:56:14 +0800 Subject: [PATCH 197/244] add new envmanager and collector --- ding/envs/env_manager/envpool_env_manager.py | 258 ++++++++++ ding/example/dqn_envpool_wandb.py | 7 +- ding/example/dqn_envpool_wandb_main.py | 9 +- ding/example/dqn_envpool_wandb_new.py | 129 +++++ ding/example/dqn_envpool_wandb_origin.py | 133 +++++ ding/example/dqn_envpool_wandb_sweep_pong.py | 40 +- ding/example/dqn_envpool_wandb_test.py | 9 +- ding/framework/context.py | 12 +- ding/framework/middleware/__init__.py | 2 +- ding/framework/middleware/collector.py | 49 +- .../middleware/functional/data_processor.py | 5 +- .../middleware/functional/enhancer.py | 4 +- .../middleware/functional/evaluator.py | 5 +- .../framework/middleware/functional/logger.py | 12 +- ding/framework/middleware/learner.py | 5 +- ding/policy/__init__.py | 2 +- ding/policy/common_utils.py | 59 ++- ding/policy/dqn.py | 487 +++++++++++++++++- ding/utils/default_helper.py | 4 +- 19 files changed, 1173 insertions(+), 58 deletions(-) create mode 100644 ding/example/dqn_envpool_wandb_new.py create mode 100644 ding/example/dqn_envpool_wandb_origin.py diff --git a/ding/envs/env_manager/envpool_env_manager.py b/ding/envs/env_manager/envpool_env_manager.py index 163ea0c895..a0f1da4166 100644 --- a/ding/envs/env_manager/envpool_env_manager.py +++ b/ding/envs/env_manager/envpool_env_manager.py @@ -3,6 +3,7 @@ from copy import deepcopy import numpy as np import torch +import treetensor.torch as ttorch import treetensor.numpy as tnp from collections import namedtuple import 
enum @@ -282,6 +283,37 @@ def step(self, action: Union[List, np.ndarray]) -> Dict[int, namedtuple]: self._ready_obs[env_id[i]] = obs[i] return new_data + def step_v2(self, action: Dict[int, np.ndarray]) -> Dict[int, namedtuple]: + # env_id = np.array(list(action.keys())) + # action = np.array(action) + # if len(action.shape) == 2: + # action = action.squeeze(1) + self._envs.send(action) + + obs, rew, done, info = self._envs.recv() + if self._test: + assert all(info['env_id'] == env_id) + obs = obs.astype(np.float32) + obs /= 255.0 + rew = rew.astype(np.float32) + env_id = info['env_id'] + timesteps = {} + new_data = [] + self._ready_obs = {} + for i in range(len(env_id)): + d = bool(done[i]) + r = to_ndarray([rew[i]]) + self._eval_episode_return[env_id[i]] += r + info_dict = {'env_id': i} + timesteps[env_id[i]] = BaseEnvTimestep(obs[i], r, d, info=info_dict) + if d: + info_dict['eval_episode_return'] = self._eval_episode_return[env_id[i]] + timesteps[env_id[i]].info['eval_episode_return'] = info_dict['eval_episode_return'] + self._eval_episode_return[env_id[i]] = 0. + new_data.append(tnp.array({'obs': obs[i], 'reward': r, 'done': d, 'info': info_dict, 'env_id': env_id[i]})) + self._ready_obs[env_id[i]] = obs[i] + return new_data + def close(self) -> None: if self._closed: return @@ -309,6 +341,232 @@ def ready_obs(self) -> tnp.array: else: raise NotImplementedError + @property + def ready_obs_v2(self) -> tnp.array: + if self._ready_obs is not None: + return self._ready_obs + else: + raise ValueError + + @property + def observation_space(self) -> 'gym.spaces.Space': # noqa + try: + return self._observation_space + except AttributeError: + self.launch() + self.close() + self._ready_obs = {} + return self._observation_space + + @property + def action_space(self) -> 'gym.spaces.Space': # noqa + try: + return self._action_space + except AttributeError: + self.launch() + self.close() + self._ready_obs = {} + return self._action_space + + +@ENV_MANAGER_REGISTRY.register('env_pool_v3') +class PoolEnvManagerV3(): + ''' + Overview: + Envpool now supports Atari, Classic Control, Toy Text, ViZDoom. + Here we list some commonly used env_ids as follows. + For more examples, you can refer to . 
+ + - Atari: "Pong-v5", "SpaceInvaders-v5", "Qbert-v5" + - Classic Control: "CartPole-v0", "CartPole-v1", "Pendulum-v1" + ''' + + @classmethod + def default_config(cls) -> EasyDict: + return EasyDict(deepcopy(cls.config)) + + config = dict( + type='envpool', + # Sync mode: batch_size == env_num + # Async mode: batch_size < env_num + env_num=8, + batch_size=8, + ) + + def __init__(self, cfg: EasyDict) -> None: + super().__init__() + self._cfg = cfg + self._env_num = cfg.env_num + self._batch_size = cfg.batch_size + self._ready_obs = {} + self._closed = True + self._seed = None + self._test = False + + def launch(self) -> None: + assert self._closed, "Please first close the env manager" + if self._seed is None: + seed = 0 + else: + seed = self._seed + + kwargs = {} + if "episodic_life" in self._cfg: + kwargs["episodic_life"] = self._cfg.episodic_life + if "reward_clip" in self._cfg: + kwargs["reward_clip"] = self._cfg.reward_clip + if "stack_num" in self._cfg: + kwargs["stack_num"] = self._cfg.stack_num + if "gray_scale" in self._cfg: + kwargs["gray_scale"] = self._cfg.gray_scale + if "frame_skip" in self._cfg: + kwargs["frame_skip"] = self._cfg.frame_skip + if "test" in self._cfg: + self._test = self._cfg.test + + self._envs = envpool.make( + task_id=self._cfg.env_id, + env_type="gym", + num_envs=self._env_num, + batch_size=self._batch_size, + seed=seed, + **kwargs + ) + self._action_space = self._envs.action_space + self._observation_space = self._envs.observation_space + self._closed = False + self.reset() + + def reset(self) -> None: + self._ready_obs = {} + self._ready_obs_send = {} + self._ready_action_send = {} + self._envs.async_reset() + while True: + obs, _, _, info = self._envs.recv() + env_id = info['env_id'] + obs = obs.astype(np.float32) + obs /= 255.0 + self._ready_obs = deep_merge_dicts({i: o for i, o in zip(env_id, obs)}, self._ready_obs) + if len(self._ready_obs) == self._env_num: + break + self._eval_episode_return = [0. for _ in range(self._env_num)] + + def step(self, action: Union[List, np.ndarray]) -> Dict[int, namedtuple]: + env_id = np.array(list(self._ready_obs.keys())) + action = np.array(action) + if len(action.shape) == 2: + action = action.squeeze(1) + self._envs.send(action, env_id) + + obs, rew, done, info = self._envs.recv() + if self._test: + assert all(info['env_id'] == env_id) + obs = obs.astype(np.float32) + obs /= 255.0 + rew = rew.astype(np.float32) + env_id = info['env_id'] + timesteps = {} + new_data = [] + self._ready_obs = {} + for i in range(len(env_id)): + d = bool(done[i]) + r = to_ndarray([rew[i]]) + self._eval_episode_return[env_id[i]] += r + info_dict = {'env_id': i} + timesteps[env_id[i]] = BaseEnvTimestep(obs[i], r, d, info=info_dict) + if d: + info_dict['eval_episode_return'] = self._eval_episode_return[env_id[i]] + timesteps[env_id[i]].info['eval_episode_return'] = info_dict['eval_episode_return'] + self._eval_episode_return[env_id[i]] = 0. 
+ new_data.append(tnp.array({'obs': obs[i], 'reward': r, 'done': d, 'info': info_dict, 'env_id': env_id[i]})) + self._ready_obs[env_id[i]] = obs[i] + return new_data + + def collect_data(self, num, policy=None, policy_forward_kwargs=None): + if self.closed: + self.launch() + + new_data = [] + + while len(new_data) < num: + + obs_to_send = self._ready_obs + env_id_to_send = list(obs_to_send.keys()) + num_to_send = len(obs_to_send.keys()) + if num_to_send > 0: + if policy: + action_to_send = policy.forward(obs_to_send, **policy_forward_kwargs) + else: + #random policy + action_to_send = {i: {"action": np.array([self._action_space.sample()])} for i in env_id_to_send} + self._ready_obs_send.update(obs_to_send) + self._ready_action_send.update(action_to_send) + action = np.array([action_to_send[i]['action'] for i in env_id_to_send]) + if action.ndim == 2 and action.shape[1] == 1: + action = action.squeeze(1) + env_id = np.array(env_id_to_send) + self._envs.send(action, env_id) + + next_obs, rew, done, info = self._envs.recv() + next_obs = next_obs.astype(np.float32) + next_obs /= 255.0 + rew = rew.astype(np.float32) + env_id = info['env_id'] + + self._ready_obs = {} + for i in range(len(env_id)): + # d = bool(done[i]) + # r = to_ndarray([rew[i]]) + new_data.append( + ttorch.tensor( + { + 'obs': self._ready_obs_send[env_id[i]], + 'action': self._ready_action_send[env_id[i]]['action'], + 'next_obs': next_obs[i], + 'reward': np.array([rew[i]]), + 'done': done[i] + } + ) + ) + self._ready_obs[env_id[i]] = next_obs[i] + + return new_data + + def close(self) -> None: + if self._closed: + return + # Envpool has no `close` API + self._closed = True + + @property + def closed(self) -> None: + return self._closed + + def seed(self, seed: int, dynamic_seed=False) -> None: + # The i-th environment seed in Envpool will be set with i+seed, so we don't do extra transformation here + self._seed = seed + logging.warning("envpool doesn't support dynamic_seed in different episode") + + @property + def env_num(self) -> int: + return self._env_num + + @property + def ready_obs(self) -> tnp.array: + if isinstance(self._ready_obs, dict): + obs = [tnp.array(o) for k, o in self._ready_obs.items()] + return tnp.stack(obs) + else: + raise NotImplementedError + + @property + def ready_obs_v2(self) -> tnp.array: + if self._ready_obs is not None: + return self._ready_obs + else: + raise ValueError + @property def observation_space(self) -> 'gym.spaces.Space': # noqa try: diff --git a/ding/example/dqn_envpool_wandb.py b/ding/example/dqn_envpool_wandb.py index 1fd03374a5..c479eff3c5 100644 --- a/ding/example/dqn_envpool_wandb.py +++ b/ding/example/dqn_envpool_wandb.py @@ -86,8 +86,8 @@ def main(cfg): task.use(eps_greedy_handler(cfg)) task.use( StepCollector( - cfg, - policy.collect_mode, + cfg, + policy.collect_mode, collector_env, random_collect_size=cfg.policy.random_collect_size \ if hasattr(cfg.policy, 'random_collect_size') else 0, @@ -105,7 +105,8 @@ def main(cfg): anonymous=True, project_name=cfg.exp_name, wandb_sweep=False, - )) + ) + ) #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) task.use(termination_checker(max_env_step=10000000)) diff --git a/ding/example/dqn_envpool_wandb_main.py b/ding/example/dqn_envpool_wandb_main.py index e9420539d8..54ee1387bd 100644 --- a/ding/example/dqn_envpool_wandb_main.py +++ b/ding/example/dqn_envpool_wandb_main.py @@ -87,8 +87,8 @@ def main(cfg): task.use(eps_greedy_handler(cfg)) task.use( StepCollector( - cfg, - policy.collect_mode, + cfg, + policy.collect_mode, 
collector_env, random_collect_size=cfg.policy.random_collect_size \ if hasattr(cfg.policy, 'random_collect_size') else 0, @@ -106,7 +106,8 @@ def main(cfg): anonymous=True, project_name=cfg.exp_name, wandb_sweep=False, - )) + ) + ) #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) task.use(termination_checker(max_env_step=10000000)) @@ -120,7 +121,7 @@ def main(cfg): parser.add_argument("--seed", type=int, default=0, help="random seed") parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") - arg=parser.parse_args() + arg = parser.parse_args() pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size diff --git a/ding/example/dqn_envpool_wandb_new.py b/ding/example/dqn_envpool_wandb_new.py new file mode 100644 index 0000000000..d6037f6e26 --- /dev/null +++ b/ding/example/dqn_envpool_wandb_new.py @@ -0,0 +1,129 @@ +import datetime +from easydict import EasyDict +from ditk import logging +from ding.model import DQN +from ding.policy import DQNFastPolicy +from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV3 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, interaction_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ + termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollector +from ding.utils import set_pkg_seed + +from dizoo.atari.config.serial import pong_dqn_envpool_config + + +def main(cfg): + logging.getLogger().setLevel(logging.INFO) + cfg.exp_name = 'Pong-v5-DQN-envpool-new-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + + collector_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.collector_env_num, + 'batch_size': cfg.env.collector_batch_size, + # env wrappers + 'episodic_life': True, # collector: True + 'reward_clip': False, # collector: True + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["collector_env_cfg"] = collector_env_cfg + evaluator_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.evaluator_env_num, + 'batch_size': cfg.env.evaluator_batch_size, + # env wrappers + 'episodic_life': False, # evaluator: False + 'reward_clip': False, # evaluator: False + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["evaluator_env_cfg"] = evaluator_env_cfg + cfg = compile_config(cfg, PoolEnvManagerV3, DQNFastPolicy, save_cfg=task.router.node_id == 0) + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = PoolEnvManagerV3(cfg.env.collector_env_cfg) + evaluator_env = PoolEnvManagerV3(cfg.env.evaluator_env_cfg) + collector_env.seed(cfg.seed) + evaluator_env.seed(cfg.seed) + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNFastPolicy(cfg.policy, model=model) + + # Consider the case with multiple processes + if task.router.is_active: + # You can use labels to distinguish between workers with different roles, + # here we use node_id to 
distinguish. + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + # Sync their context and model between each worker. + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + task.use(epoch_timer()) + + # Here is the part of single process pipeline. + task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use( + EnvpoolStepCollector( + cfg, + policy.collect_mode, + collector_env, + random_collect_size=cfg.policy.random_collect_size \ + if hasattr(cfg.policy, 'random_collect_size') else 0, + ) + ) + if "nstep" in cfg.policy and cfg.policy.nstep > 1: + task.use(nstep_reward_enhancer(cfg)) + task.use(data_pusher(cfg, buffer_)) + task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use( + wandb_online_logger( + metric_list=policy.monitor_vars(), + model=policy._model, + anonymous=True, + project_name=cfg.exp_name, + wandb_sweep=False, + ) + ) + + #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) + task.use(termination_checker(max_env_step=10000000)) + + task.run() + + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=0, help="random seed") + parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") + parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") + arg = parser.parse_args() + + pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num + pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size + pong_dqn_envpool_config.seed = arg.seed + pong_dqn_envpool_config.env.stop_value = 2000 + pong_dqn_envpool_config.policy.nstep = 1 + pong_dqn_envpool_config.nstep = 1 + + pong_dqn_envpool_config.seed = arg.seed + + main(pong_dqn_envpool_config) diff --git a/ding/example/dqn_envpool_wandb_origin.py b/ding/example/dqn_envpool_wandb_origin.py new file mode 100644 index 0000000000..bebad77553 --- /dev/null +++ b/ding/example/dqn_envpool_wandb_origin.py @@ -0,0 +1,133 @@ +import gym +import datetime +import wandb +import numpy as np +from easydict import EasyDict +from ditk import logging +from ding.data.model_loader import FileModelLoader +from ding.data.storage_loader import FileStorageLoader +from ding.model import DQN +from ding.policy import DQNPolicy +from ding.envs import DingEnvWrapper, BaseEnvManagerV2 +from ding.envs.env_manager.envpool_env_manager import PoolEnvManager, PoolEnvManagerV2 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ + termination_checker, wandb_online_logger, epoch_timer +from ding.utils import set_pkg_seed + +from dizoo.atari.config.serial import pong_dqn_envpool_config + + +def main(cfg): + logging.getLogger().setLevel(logging.INFO) + cfg.exp_name = 'Pong-v5-DQN-envpool-origin-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + + collector_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.collector_env_num, + 'batch_size': 
cfg.env.collector_batch_size, + # env wrappers + 'episodic_life': True, # collector: True + 'reward_clip': False, # collector: True + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["collector_env_cfg"] = collector_env_cfg + evaluator_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.evaluator_env_num, + 'batch_size': cfg.env.evaluator_batch_size, + # env wrappers + 'episodic_life': False, # evaluator: False + 'reward_clip': False, # evaluator: False + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["evaluator_env_cfg"] = evaluator_env_cfg + cfg = compile_config(cfg, PoolEnvManagerV2, DQNPolicy, save_cfg=task.router.node_id == 0) + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = PoolEnvManagerV2(cfg.env.collector_env_cfg) + evaluator_env = PoolEnvManagerV2(cfg.env.evaluator_env_cfg) + collector_env.seed(cfg.seed) + evaluator_env.seed(cfg.seed) + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNPolicy(cfg.policy, model=model) + + # Consider the case with multiple processes + if task.router.is_active: + # You can use labels to distinguish between workers with different roles, + # here we use node_id to distinguish. + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + # Sync their context and model between each worker. + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + task.use(epoch_timer()) + + # Here is the part of single process pipeline. 
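+        # The middlewares registered below run once per training iteration, in order:
+        # interaction_evaluator -> eps_greedy_handler -> StepCollector ->
+        # (optional) nstep_reward_enhancer -> data_pusher -> OffPolicyLearner ->
+        # online_logger -> wandb_online_logger -> termination_checker.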
+ task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use( + StepCollector( + cfg, + policy.collect_mode, + collector_env, + random_collect_size=cfg.policy.random_collect_size \ + if hasattr(cfg.policy, 'random_collect_size') else 0, + ) + ) + if "nstep" in cfg.policy and cfg.policy.nstep > 1: + task.use(nstep_reward_enhancer(cfg)) + task.use(data_pusher(cfg, buffer_)) + task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use( + wandb_online_logger( + metric_list=policy.monitor_vars(), + model=policy._model, + anonymous=True, + project_name=cfg.exp_name, + wandb_sweep=False, + ) + ) + + #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) + task.use(termination_checker(max_env_step=10000000)) + + task.run() + + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=0, help="random seed") + parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") + parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") + arg = parser.parse_args() + + pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num + pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size + pong_dqn_envpool_config.seed = arg.seed + pong_dqn_envpool_config.env.stop_value = 2000 + pong_dqn_envpool_config.policy.nstep = 1 + pong_dqn_envpool_config.nstep = 1 + + main(pong_dqn_envpool_config) diff --git a/ding/example/dqn_envpool_wandb_sweep_pong.py b/ding/example/dqn_envpool_wandb_sweep_pong.py index b810c6e16c..168c6e7656 100644 --- a/ding/example/dqn_envpool_wandb_sweep_pong.py +++ b/ding/example/dqn_envpool_wandb_sweep_pong.py @@ -85,8 +85,8 @@ def main(cfg, seed=0, max_env_step=int(1e7)): task.use(eps_greedy_handler(cfg)) task.use( StepCollector( - cfg, - policy.collect_mode, + cfg, + policy.collect_mode, collector_env, random_collect_size=cfg.policy.random_collect_size \ if hasattr(cfg.policy, 'random_collect_size') else 0, @@ -104,7 +104,8 @@ def main(cfg, seed=0, max_env_step=int(1e7)): anonymous=True, project_name=cfg.exp_name, wandb_sweep=False, - )) + ) + ) #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) task.use(termination_checker(max_env_step=max_env_step)) @@ -115,8 +116,8 @@ def main(cfg, seed=0, max_env_step=int(1e7)): def sweep_main(): wandb.init() - good_pair=(wandb.config.collector_env_num%wandb.config.collector_batch_size==0) - + good_pair = (wandb.config.collector_env_num % wandb.config.collector_batch_size == 0) + if not good_pair: wandb.log({"time": 0.0}) else: @@ -124,11 +125,11 @@ def sweep_main(): start_time = time.time() pong_dqn_envpool_config.env.stop_value = 2000 pong_dqn_envpool_config.exp_name = f'Pong-v5-envpool-new-pipeline-speed-test-{wandb.config.collector_env_num}-{wandb.config.collector_batch_size}' - pong_dqn_envpool_config.env.collector_env_num=wandb.config.collector_env_num - pong_dqn_envpool_config.env.collector_batch_size=wandb.config.collector_batch_size + pong_dqn_envpool_config.env.collector_env_num = wandb.config.collector_env_num + pong_dqn_envpool_config.env.collector_batch_size = wandb.config.collector_batch_size main(EasyDict(pong_dqn_envpool_config), max_env_step=10000000) - print(time.time()-start_time) - wandb.log({"time_cost": time.time()-start_time}) + print(time.time() - start_time) + wandb.log({"time_cost": time.time() - start_time}) #remove the directory 
named as exp_name shutil.rmtree(pong_dqn_envpool_config.exp_name) @@ -137,21 +138,20 @@ def sweep_main(): sweep_configuration = { 'method': 'grid', - 'metric': - { - 'goal': 'maximize', + 'metric': { + 'goal': 'maximize', 'name': 'time_cost' + }, + 'parameters': { + 'collector_env_num': { + 'values': [64] + }, + 'collector_batch_size': { + 'values': [64, 32, 16, 8] }, - 'parameters': - { - 'collector_env_num': {'values': [64]}, - 'collector_batch_size': {'values': [64, 32, 16, 8]}, } } - sweep_id = wandb.sweep( - sweep=sweep_configuration, - project='Pong-v5-envpool-new-pipeline-speed-test' - ) + sweep_id = wandb.sweep(sweep=sweep_configuration, project='Pong-v5-envpool-new-pipeline-speed-test') wandb.agent(sweep_id, function=sweep_main) diff --git a/ding/example/dqn_envpool_wandb_test.py b/ding/example/dqn_envpool_wandb_test.py index 1b9208111e..7a10828384 100644 --- a/ding/example/dqn_envpool_wandb_test.py +++ b/ding/example/dqn_envpool_wandb_test.py @@ -27,7 +27,7 @@ def main(cfg): cfg.exp_name = 'Pong-v5-DQN-envpool-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") cfg.env.collector_env_num = 64 cfg.env.collector_batch_size = 64 - cfg.env['test']=True + cfg.env['test'] = True collector_env_cfg = EasyDict( { 'env_id': cfg.env.env_id, @@ -91,8 +91,8 @@ def main(cfg): task.use(eps_greedy_handler(cfg)) task.use( StepCollector( - cfg, - policy.collect_mode, + cfg, + policy.collect_mode, collector_env, random_collect_size=cfg.policy.random_collect_size \ if hasattr(cfg.policy, 'random_collect_size') else 0, @@ -110,7 +110,8 @@ def main(cfg): anonymous=True, project_name=cfg.exp_name, wandb_sweep=False, - )) + ) + ) #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) task.use(termination_checker(max_env_step=10000000)) diff --git a/ding/framework/context.py b/ding/framework/context.py index 5c269a65c1..7516922b7d 100644 --- a/ding/framework/context.py +++ b/ding/framework/context.py @@ -69,12 +69,12 @@ class OnlineRLContext(Context): eval_output: List = dataclasses.field(default_factory=dict) # wandb wandb_url: str = "" - evaluator_time=0.0 - collector_time=0.0 - learner_time=0.0 - data_pusher_time=0.0 - nstep_time=0.0 - total_time=0.0 + evaluator_time = 0.0 + collector_time = 0.0 + learner_time = 0.0 + data_pusher_time = 0.0 + nstep_time = 0.0 + total_time = 0.0 def __post_init__(self): # This method is called just after __init__ method. Here, concretely speaking, diff --git a/ding/framework/middleware/__init__.py b/ding/framework/middleware/__init__.py index 3ed37dca20..fa7c5b7200 100644 --- a/ding/framework/middleware/__init__.py +++ b/ding/framework/middleware/__init__.py @@ -1,5 +1,5 @@ from .functional import * -from .collector import StepCollector, EpisodeCollector, PPOFStepCollector +from .collector import StepCollector, EpisodeCollector, PPOFStepCollector, EnvpoolStepCollector from .learner import OffPolicyLearner, HERLearner from .ckpt_handler import CkptSaver from .distributer import ContextExchanger, ModelExchanger, PeriodicalModelExchanger diff --git a/ding/framework/middleware/collector.py b/ding/framework/middleware/collector.py index 939316cd97..e255b4d412 100644 --- a/ding/framework/middleware/collector.py +++ b/ding/framework/middleware/collector.py @@ -12,6 +12,7 @@ import time + class StepCollector: """ Overview: @@ -50,7 +51,7 @@ def __call__(self, ctx: "OnlineRLContext") -> None: Input of ctx: - env_step (:obj:`int`): The env steps which will increase during collection. 
""" - start=time.time() + start = time.time() old = ctx.env_step if self.random_collect_size > 0 and old < self.random_collect_size: target_size = self.random_collect_size - old @@ -69,7 +70,51 @@ def __call__(self, ctx: "OnlineRLContext") -> None: self._transitions.clear() break - ctx.collector_time += time.time()-start + ctx.collector_time += time.time() - start + + +class EnvpoolStepCollector: + + def __new__(cls, *args, **kwargs): + if task.router.is_active and not task.has_role(task.role.COLLECTOR): + return task.void() + return super(EnvpoolStepCollector, cls).__new__(cls) + + def __init__(self, cfg: EasyDict, policy, env: BaseEnvManager, random_collect_size: int = 0) -> None: + """ + Arguments: + - cfg (:obj:`EasyDict`): Config. + - policy (:obj:`Policy`): The policy to be collected. + - env (:obj:`BaseEnvManager`): The env for the collection, the BaseEnvManager object or \ + its derivatives are supported. + - random_collect_size (:obj:`int`): The count of samples that will be collected randomly, \ + typically used in initial runs. + """ + self.cfg = cfg + self.env = env + self.policy = policy + self.random_collect_size = random_collect_size + + def __call__(self, ctx: "OnlineRLContext") -> None: + """ + Overview: + An encapsulation of inference and rollout middleware. Stop when completing \ + the target number of steps. + Input of ctx: + - env_step (:obj:`int`): The env steps which will increase during collection. + """ + start = time.time() + old = ctx.env_step + if self.random_collect_size > 0 and old < self.random_collect_size: + target_size = self.random_collect_size - old + trajectories = self.env.collect_data(target_size) + else: + # compatible with old config, a train sample = unroll_len step + target_size = self.cfg.policy.collect.n_sample * self.cfg.policy.collect.unroll_len + trajectories = self.env.collect_data(target_size, self.policy, policy_forward_kwargs=ctx.collect_kwargs) + ctx.trajectories = trajectories + ctx.env_step += len(ctx.trajectories) + ctx.collector_time += time.time() - start class PPOFStepCollector: diff --git a/ding/framework/middleware/functional/data_processor.py b/ding/framework/middleware/functional/data_processor.py index eed2895e04..c5fb797c25 100644 --- a/ding/framework/middleware/functional/data_processor.py +++ b/ding/framework/middleware/functional/data_processor.py @@ -13,6 +13,7 @@ import time + def data_pusher(cfg: EasyDict, buffer_: Buffer, group_by_env: Optional[bool] = None): """ Overview: @@ -32,7 +33,7 @@ def _push(ctx: "OnlineRLContext"): - trajectories (:obj:`List[Dict]`): Trajectories. - episodes (:obj:`List[Dict]`): Episodes. 
""" - start=time.time() + start = time.time() if ctx.trajectories is not None: # each data in buffer is a transition if group_by_env: for i, t in enumerate(ctx.trajectories): @@ -48,7 +49,7 @@ def _push(ctx: "OnlineRLContext"): else: raise RuntimeError("Either ctx.trajectories or ctx.episodes should be not None.") - ctx.data_pusher_time += time.time()-start + ctx.data_pusher_time += time.time() - start return _push diff --git a/ding/framework/middleware/functional/enhancer.py b/ding/framework/middleware/functional/enhancer.py index 20f5f6f5f8..c96fc3012e 100644 --- a/ding/framework/middleware/functional/enhancer.py +++ b/ding/framework/middleware/functional/enhancer.py @@ -80,7 +80,7 @@ def nstep_reward_enhancer(cfg: EasyDict) -> Callable: def _enhance(ctx: "OnlineRLContext"): - start=time.time() + start = time.time() nstep = cfg.policy.nstep gamma = cfg.policy.discount_factor L = len(ctx.trajectories) @@ -103,7 +103,7 @@ def _enhance(ctx: "OnlineRLContext"): ctx.trajectories[i].reward = nstep_rewards[i] ctx.trajectories[i].value_gamma = value_gamma[i] - ctx.nstep_time += time.time()-start + ctx.nstep_time += time.time() - start return _enhance diff --git a/ding/framework/middleware/functional/evaluator.py b/ding/framework/middleware/functional/evaluator.py index c07daa31dd..548214faa6 100644 --- a/ding/framework/middleware/functional/evaluator.py +++ b/ding/framework/middleware/functional/evaluator.py @@ -17,6 +17,7 @@ import time + class IMetric(ABC): @abstractmethod @@ -239,7 +240,7 @@ def _evaluate(ctx: Union["OnlineRLContext", "OfflineRLContext"]): """ # evaluation will be executed if the task begins or enough train_iter after last evaluation - start=time.time() + start = time.time() if ctx.last_eval_iter != -1 and \ (ctx.train_iter - ctx.last_eval_iter < cfg.policy.eval.evaluator.eval_freq): return @@ -304,7 +305,7 @@ def _evaluate(ctx: Union["OnlineRLContext", "OfflineRLContext"]): if stop_flag: task.finish = True - ctx.evaluator_time+=time.time()-start + ctx.evaluator_time += time.time() - start return _evaluate diff --git a/ding/framework/middleware/functional/logger.py b/ding/framework/middleware/functional/logger.py index fe16c63593..6dc327bfef 100644 --- a/ding/framework/middleware/functional/logger.py +++ b/ding/framework/middleware/functional/logger.py @@ -263,17 +263,17 @@ def _plot(ctx: "OnlineRLContext"): "If you want to use wandb to visualize the result, please set plot_logger = True in the config." 
) - if hasattr(ctx,"evaluator_time"): + if hasattr(ctx, "evaluator_time"): info_for_logging.update({"evaluator_time": ctx.evaluator_time}) - if hasattr(ctx,"collector_time"): + if hasattr(ctx, "collector_time"): info_for_logging.update({"collector_time": ctx.collector_time}) - if hasattr(ctx,"learner_time"): + if hasattr(ctx, "learner_time"): info_for_logging.update({"learner_time": ctx.learner_time}) - if hasattr(ctx,"data_pusher_time"): + if hasattr(ctx, "data_pusher_time"): info_for_logging.update({"data_pusher_time": ctx.data_pusher_time}) - if hasattr(ctx,"nstep_time"): + if hasattr(ctx, "nstep_time"): info_for_logging.update({"nstep_time": ctx.nstep_time}) - if hasattr(ctx,"total_time"): + if hasattr(ctx, "total_time"): info_for_logging.update({"total_time": ctx.total_time}) if ctx.eval_value != -np.inf: diff --git a/ding/framework/middleware/learner.py b/ding/framework/middleware/learner.py index cf15b7b5e5..8b2c414bac 100644 --- a/ding/framework/middleware/learner.py +++ b/ding/framework/middleware/learner.py @@ -13,6 +13,7 @@ import time + class OffPolicyLearner: """ Overview: @@ -55,7 +56,7 @@ def __call__(self, ctx: "OnlineRLContext") -> None: Output of ctx: - train_output (:obj:`Deque`): The training output in deque. """ - start=time.time() + start = time.time() train_output_queue = [] for _ in range(self.cfg.policy.learn.update_per_collect): self._fetcher(ctx) @@ -67,7 +68,7 @@ def __call__(self, ctx: "OnlineRLContext") -> None: train_output_queue.append(ctx.train_output) ctx.train_output_for_post_process = ctx.train_output ctx.train_output = train_output_queue - ctx.learner_time += time.time()-start + ctx.learner_time += time.time() - start class HERLearner: diff --git a/ding/policy/__init__.py b/ding/policy/__init__.py index 9789de56f6..2abbbc1a22 100755 --- a/ding/policy/__init__.py +++ b/ding/policy/__init__.py @@ -1,6 +1,6 @@ from .base_policy import Policy, CommandModePolicy, create_policy, get_policy_cls from .common_utils import single_env_forward_wrapper, single_env_forward_wrapper_ttorch -from .dqn import DQNSTDIMPolicy, DQNPolicy +from .dqn import DQNSTDIMPolicy, DQNPolicy, DQNFastPolicy from .mdqn import MDQNPolicy from .iqn import IQNPolicy from .fqf import FQFPolicy diff --git a/ding/policy/common_utils.py b/ding/policy/common_utils.py index 4f5193803a..cb290a5e5c 100644 --- a/ding/policy/common_utils.py +++ b/ding/policy/common_utils.py @@ -2,7 +2,8 @@ import torch import treetensor.torch as ttorch from ding.utils.data import default_collate -from ding.torch_utils import to_tensor, to_ndarray, unsqueeze, squeeze +from ding.torch_utils import to_tensor, to_ndarray, unsqueeze, squeeze, to_device +import time def default_preprocess_learn( @@ -50,6 +51,62 @@ def default_preprocess_learn( return data +def fast_preprocess_learn( + data: List[Any], + use_priority_IS_weight: bool = False, + use_priority: bool = False, + cuda: bool = False, + device: str = 'cpu', +) -> dict: + # data preprocess + processes_data = {} + + action = torch.stack([data[i]['action'] for i in range(len(data))]) + if cuda: + action = to_device(action, device=device) + if action.ndim == 2 and action.shape[1] == 1: + action = action.squeeze(1) + processes_data['action'] = action + + obs = torch.stack([data[i]['obs'] for i in range(len(data))]) + if cuda: + obs = to_device(obs, device=device) + processes_data['obs'] = obs + + next_obs = torch.stack([data[i]['next_obs'] for i in range(len(data))]) + if cuda: + next_obs = to_device(next_obs, device=device) + processes_data['next_obs'] = next_obs + + 
reward = torch.stack([data[i]['reward'] for i in range(len(data))]) + if cuda: + reward = to_device(reward, device=device) + reward = reward.permute(1, 0).contiguous() + processes_data['reward'] = reward + + done = torch.tensor([data[i]['done'] for i in range(len(data))], dtype=torch.float32) + if cuda: + done = to_device(done, device=device) + processes_data['done'] = done + + if use_priority and use_priority_IS_weight: + if 'priority_IS' in data: + weight = data['priority_IS'] + else: # for compability + weight = data['IS'] + else: + if 'weight' in data[0]: + weight = torch.tensor([data[i]['weight'] for i in range(len(data))]) + else: + weight = None + + if weight and cuda: + weight = to_device(weight, device=device) + processes_data['weight'] = weight + + return processes_data + + def single_env_forward_wrapper(forward_fn): def _forward(obs): diff --git a/ding/policy/dqn.py b/ding/policy/dqn.py index c039230777..f81ba90d45 100644 --- a/ding/policy/dqn.py +++ b/ding/policy/dqn.py @@ -10,7 +10,9 @@ from ding.utils.data import default_collate, default_decollate from .base_policy import Policy -from .common_utils import default_preprocess_learn +from .common_utils import default_preprocess_learn, fast_preprocess_learn + +import time @POLICY_REGISTRY.register('dqn') @@ -218,6 +220,9 @@ def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: - necessary: ``cur_lr``, ``total_loss``, ``priority`` - optional: ``action_distribution`` """ + + start = time.time() + data = default_preprocess_learn( data, use_priority=self._priority, @@ -225,6 +230,10 @@ def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: ignore_done=self._cfg.learn.ignore_done, use_nstep=True ) + + time_data_process = time.time() - start + start = time.time() + if self._cuda: data = to_device(data, self._device) # ==================== @@ -259,6 +268,482 @@ def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: # after update # ============= self._target_model.update(self._learn_model.state_dict()) + + time_learn = time.time() - start + # print("time_data_process:",time_data_process) + # print("time_learn:",time_learn) + + return { + 'cur_lr': self._optimizer.defaults['lr'], + 'total_loss': loss.item(), + 'q_value': q_value.mean().item(), + 'target_q_value': target_q_value.mean().item(), + 'priority': td_error_per_sample.abs().tolist(), + # Only discrete action satisfying len(data['action'])==1 can return this and draw histogram on tensorboard. + # '[histogram]action_distribution': data['action'], + } + + def _monitor_vars_learn(self) -> List[str]: + return ['cur_lr', 'total_loss', 'q_value', 'target_q_value'] + + def _state_dict_learn(self) -> Dict[str, Any]: + """ + Overview: + Return the state_dict of learn mode, usually including model and optimizer. + Returns: + - state_dict (:obj:`Dict[str, Any]`): the dict of current policy learn state, for saving and restoring. + """ + return { + 'model': self._learn_model.state_dict(), + 'target_model': self._target_model.state_dict(), + 'optimizer': self._optimizer.state_dict(), + } + + def _load_state_dict_learn(self, state_dict: Dict[str, Any]) -> None: + """ + Overview: + Load the state_dict variable into policy learn mode. + Arguments: + - state_dict (:obj:`Dict[str, Any]`): the dict of policy learn state saved before. + + .. tip:: + If you want to only load some parts of model, you can simply set the ``strict`` argument in \ + load_state_dict to ``False``, or refer to ``ding.torch_utils.checkpoint_helper`` for more \ + complicated operation. 
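+            An illustrative partial load (the filtered prefix here is hypothetical)
+            could look like::
+
+                partial = {k: v for k, v in state_dict['model'].items() if not k.startswith('head.')}
+                self._learn_model.load_state_dict(partial, strict=False)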
+ """ + self._learn_model.load_state_dict(state_dict['model']) + self._target_model.load_state_dict(state_dict['target_model']) + self._optimizer.load_state_dict(state_dict['optimizer']) + + def _init_collect(self) -> None: + """ + Overview: + Collect mode init method. Called by ``self.__init__``, initialize algorithm arguments and collect_model, \ + enable the eps_greedy_sample for exploration. + """ + self._unroll_len = self._cfg.collect.unroll_len + self._gamma = self._cfg.discount_factor # necessary for parallel + self._nstep = self._cfg.nstep # necessary for parallel + self._collect_model = model_wrap(self._model, wrapper_name='eps_greedy_sample') + self._collect_model.reset() + + def _forward_collect(self, data: Dict[int, Any], eps: float) -> Dict[int, Any]: + """ + Overview: + Forward computation graph of collect mode(collect training data), with eps_greedy for exploration. + Arguments: + - data (:obj:`Dict[str, Any]`): Dict type data, stacked env data for predicting policy_output(action), \ + values are torch.Tensor or np.ndarray or dict/list combinations, keys are env_id indicated by integer. + - eps (:obj:`float`): epsilon value for exploration, which is decayed by collected env step. + Returns: + - output (:obj:`Dict[int, Any]`): The dict of predicting policy_output(action) for the interaction with \ + env and the constructing of transition. + ArgumentsKeys: + - necessary: ``obs`` + ReturnsKeys + - necessary: ``logit``, ``action`` + """ + data_id = list(data.keys()) + data = default_collate(list(data.values())) + if self._cuda: + data = to_device(data, self._device) + self._collect_model.eval() + with torch.no_grad(): + output = self._collect_model.forward(data, eps=eps) + if self._cuda: + output = to_device(output, 'cpu') + output = default_decollate(output) + return {i: d for i, d in zip(data_id, output)} + + def _get_train_sample(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """ + Overview: + For a given trajectory(transitions, a list of transition) data, process it into a list of sample that \ + can be used for training directly. A train sample can be a processed transition(DQN with nstep TD) \ + or some continuous transitions(DRQN). + Arguments: + - data (:obj:`List[Dict[str, Any]`): The trajectory data(a list of transition), each element is the same \ + format as the return value of ``self._process_transition`` method. + Returns: + - samples (:obj:`dict`): The list of training samples. + + .. note:: + We will vectorize ``process_transition`` and ``get_train_sample`` method in the following release version. \ + And the user can customize the this data processing procecure by overriding this two methods and collector \ + itself. + """ + data = get_nstep_return_data(data, self._nstep, gamma=self._gamma) + return get_train_sample(data, self._unroll_len) + + def _process_transition(self, obs: Any, policy_output: Dict[str, Any], timestep: namedtuple) -> Dict[str, Any]: + """ + Overview: + Generate a transition(e.g.: ) for this algorithm training. + Arguments: + - obs (:obj:`Any`): Env observation. + - policy_output (:obj:`Dict[str, Any]`): The output of policy collect mode(``self._forward_collect``),\ + including at least ``action``. + - timestep (:obj:`namedtuple`): The output after env step(execute policy output action), including at \ + least ``obs``, ``reward``, ``done``, (here obs indicates obs after env step). + Returns: + - transition (:obj:`dict`): Dict type transition data. 
+ """ + transition = { + 'obs': obs, + 'next_obs': timestep.obs, + 'action': policy_output['action'], + 'reward': timestep.reward, + 'done': timestep.done, + } + return transition + + def _init_eval(self) -> None: + r""" + Overview: + Evaluate mode init method. Called by ``self.__init__``, initialize eval_model. + """ + self._eval_model = model_wrap(self._model, wrapper_name='argmax_sample') + self._eval_model.reset() + + def _forward_eval(self, data: Dict[int, Any]) -> Dict[int, Any]: + """ + Overview: + Forward computation graph of eval mode(evaluate policy performance), at most cases, it is similar to \ + ``self._forward_collect``. + Arguments: + - data (:obj:`Dict[str, Any]`): Dict type data, stacked env data for predicting policy_output(action), \ + values are torch.Tensor or np.ndarray or dict/list combinations, keys are env_id indicated by integer. + Returns: + - output (:obj:`Dict[int, Any]`): The dict of predicting action for the interaction with env. + ArgumentsKeys: + - necessary: ``obs`` + ReturnsKeys + - necessary: ``action`` + """ + data_id = list(data.keys()) + data = default_collate(list(data.values())) + if self._cuda: + data = to_device(data, self._device) + self._eval_model.eval() + with torch.no_grad(): + output = self._eval_model.forward(data) + if self._cuda: + output = to_device(output, 'cpu') + output = default_decollate(output) + return {i: d for i, d in zip(data_id, output)} + + def monitor_vars(self) -> List[str]: + return ['cur_lr', 'total_loss', 'q_value'] + + def calculate_priority(self, data: Dict[int, Any], update_target_model: bool = False) -> Dict[str, Any]: + """ + Overview: + Calculate priority for replay buffer. + Arguments: + - data (:obj:`Dict[str, Any]`): Dict type data, a batch of data for training. + Returns: + - priority (:obj:`Dict[str, Any]`): Dict type priority data, values are python scalar or a list of scalars. + ArgumentsKeys: + - necessary: ``obs``, ``action``, ``reward``, ``next_obs``, ``done`` + - optional: ``value_gamma`` + ReturnsKeys: + - necessary: ``priority`` + """ + + if update_target_model: + self._target_model.load_state_dict(self._learn_model.state_dict()) + + data = default_preprocess_learn( + data, + use_priority=False, + use_priority_IS_weight=False, + ignore_done=self._cfg.learn.ignore_done, + use_nstep=True + ) + if self._cuda: + data = to_device(data, self._device) + # ==================== + # Q-learning forward + # ==================== + self._learn_model.eval() + self._target_model.eval() + with torch.no_grad(): + # Current q value (main model) + q_value = self._learn_model.forward(data['obs'])['logit'] + # Target q value + target_q_value = self._target_model.forward(data['next_obs'])['logit'] + # Max q value action (main model), i.e. Double DQN + target_q_action = self._learn_model.forward(data['next_obs'])['action'] + data_n = q_nstep_td_data( + q_value, target_q_value, data['action'], target_q_action, data['reward'], data['done'], data['weight'] + ) + value_gamma = data.get('value_gamma') + loss, td_error_per_sample = q_nstep_td_error( + data_n, self._gamma, nstep=self._nstep, value_gamma=value_gamma + ) + return {'priority': td_error_per_sample.abs().tolist()} + + +@POLICY_REGISTRY.register('dqn_fast') +class DQNFastPolicy(Policy): + """ + Overview: + Policy class of DQN algorithm, extended by Double DQN/Dueling DQN/PER/multi-step TD. 
+ + Config: + == ===================== ======== ============== ======================================= ======================= + ID Symbol Type Default Value Description Other(Shape) + == ===================== ======== ============== ======================================= ======================= + 1 ``type`` str dqn | RL policy register name, refer to | This arg is optional, + | registry ``POLICY_REGISTRY`` | a placeholder + 2 ``cuda`` bool False | Whether to use cuda for network | This arg can be diff- + | erent from modes + 3 ``on_policy`` bool False | Whether the RL algorithm is on-policy + | or off-policy + 4 ``priority`` bool False | Whether use priority(PER) | Priority sample, + | update priority + 5 | ``priority_IS`` bool False | Whether use Importance Sampling + | ``_weight`` | Weight to correct biased update. If + | True, priority must be True. + 6 | ``discount_`` float 0.97, | Reward's future discount factor, aka. | May be 1 when sparse + | ``factor`` [0.95, 0.999] | gamma | reward env + 7 ``nstep`` int 1, | N-step reward discount sum for target + [3, 5] | q_value estimation + 8 | ``model.dueling`` bool True | dueling head architecture + 9 | ``model.encoder`` list [32, 64, | Sequence of ``hidden_size`` of | default kernel_size + | ``_hidden`` (int) 64, 128] | subsequent conv layers and the | is [8, 4, 3] + | ``_size_list`` | final dense layer. | default stride is + | [4, 2 ,1] + 10 | ``learn.update`` int 3 | How many updates(iterations) to train | This args can be vary + | ``per_collect`` | after collector's one collection. | from envs. Bigger val + | Only valid in serial training | means more off-policy + 11 | ``learn.batch_`` int 64 | The number of samples of an iteration + | ``size`` + 12 | ``learn.learning`` float 0.001 | Gradient step length of an iteration. + | ``_rate`` + 13 | ``learn.target_`` int 100 | Frequence of target network update. | Hard(assign) update + | ``update_freq`` + 14 | ``learn.target_`` float 0.005 | Frequence of target network update. | Soft(assign) update + | ``theta`` | Only one of [target_update_freq, + | | target_theta] should be set + 15 | ``learn.ignore_`` bool False | Whether ignore done for target value | Enable it for some + | ``done`` | calculation. | fake termination env + 16 ``collect.n_sample`` int [8, 128] | The number of training samples of a | It varies from + | call of collector. | different envs + 17 ``collect.n_episode`` int 8 | The number of training episodes of a | only one of [n_sample + | call of collector | ,n_episode] should + | | be set + 18 | ``collect.unroll`` int 1 | unroll length of an iteration | In RNN, unroll_len>1 + | ``_len`` + 19 | ``other.eps.type`` str exp | exploration rate decay type | Support ['exp', + | 'linear']. + 20 | ``other.eps.`` float 0.95 | start value of exploration rate | [0,1] + | ``start`` + 21 | ``other.eps.`` float 0.1 | end value of exploration rate | [0,1] + | ``end`` + 22 | ``other.eps.`` int 10000 | decay length of exploration | greater than 0. set + | ``decay`` | decay=10000 means + | the exploration rate + | decay from start + | value to end value + | during decay length. + == ===================== ======== ============== ======================================= ======================= + """ + + config = dict( + # (str) RL policy register name (refer to function "POLICY_REGISTRY"). + type='dqn_fast', + # (bool) Whether use cuda in policy. + cuda=False, + # (bool) Whether learning policy is the same as collecting data policy(on-policy). 
+ on_policy=False, + # (bool) Whether enable priority experience sample. + priority=False, + # (bool) Whether use Importance Sampling Weight to correct biased update. If True, priority must be True. + priority_IS_weight=False, + # (float) Discount factor(gamma) for returns. + discount_factor=0.97, + model=dict( + #(list(int)) Sequence of ``hidden_size`` of subsequent conv layers and the final dense layer. + encoder_hidden_size_list=[128, 128, 64], + ), + learn=dict( + # (int) How many updates(iterations) to train after collector's one collection. + # Bigger "update_per_collect" means bigger off-policy. + # collect data -> update policy-> collect data -> ... + update_per_collect=3, + # (int) How many samples in a training batch. + batch_size=64, + # (float) The step size of gradient descent. + learning_rate=0.001, + # (int) Frequence of target network update. + # Only one of [target_update_freq, target_theta] should be set. + target_update_freq=100, + # (float) : Used for soft update of the target network. + # aka. Interpolation factor in EMA update for target network. + # Only one of [target_update_freq, target_theta] should be set. + target_theta=0.005, + # (bool) Whether ignore done(usually for max step termination env). + # Note: Gym wraps the MuJoCo envs by default with TimeLimit environment wrappers. + # These limit HalfCheetah, and several other MuJoCo envs, to max length of 1000. + # However, interaction with HalfCheetah always gets done with done is False, + # Since we inplace done==True with done==False to keep + # TD-error accurate computation(``gamma * (1 - done) * next_v + reward``), + # when the episode step is greater than max episode step. + ignore_done=False, + ), + # collect_mode config + collect=dict( + # (int) How many training samples collected in one collection procedure. + # Only one of [n_sample, n_episode] shoule be set. + n_sample=8, + # (int) Split episodes or trajectories into pieces with length `unroll_len`. + unroll_len=1, + ), + eval=dict(), + # other config + other=dict( + # Epsilon greedy with decay. + eps=dict( + # (str) Decay type. Support ['exp', 'linear']. + type='exp', + # (float) Epsilon start value. + start=0.95, + # (float) Epsilon end value. + end=0.1, + # (int) Decay length(env step). + decay=10000, + ), + replay_buffer=dict( + # (int) Maximum size of replay buffer. Usually, larger buffer size is good. + replay_buffer_size=10000, + ), + ), + ) + + def default_model(self) -> Tuple[str, List[str]]: + """ + Overview: + Return this algorithm default model setting for demonstration. + Returns: + - model_info (:obj:`Tuple[str, List[str]]`): model name and mode import_names + + .. note:: + The user can define and use customized network model but must obey the same inferface definition indicated \ + by import_names path. For DQN, ``ding.model.template.q_learning.DQN`` + """ + return 'dqn', ['ding.model.template.q_learning'] + + def _init_learn(self) -> None: + """ + Overview: + Learn mode init method. Called by ``self.__init__``, initialize the optimizer, algorithm arguments, main \ + and target model. 
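The target model mentioned here is kept in sync with the learned model in one of two mutually exclusive ways: a hard copy every ``target_update_freq`` gradient steps (wrapper ``update_type='assign'``) or an exponential-moving-average blend with factor ``target_theta`` (``update_type='momentum'``). A minimal sketch of the two update rules, assuming ``model`` and ``target`` are plain ``nn.Module`` instances with identical parameter layouts; the actual logic lives in ``model_wrap``:

import torch

@torch.no_grad()
def hard_update(target: torch.nn.Module, model: torch.nn.Module) -> None:
    # 'assign': overwrite the target parameters, typically every `target_update_freq` iterations
    target.load_state_dict(model.state_dict())

@torch.no_grad()
def soft_update(target: torch.nn.Module, model: torch.nn.Module, theta: float = 0.005) -> None:
    # 'momentum' / EMA: target <- (1 - theta) * target + theta * model, applied every iteration
    for t_p, p in zip(target.parameters(), model.parameters()):
        t_p.mul_(1.0 - theta).add_(p, alpha=theta)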
+ """ + self._priority = self._cfg.priority + self._priority_IS_weight = self._cfg.priority_IS_weight + # Optimizer + self._optimizer = Adam(self._model.parameters(), lr=self._cfg.learn.learning_rate) + + self._gamma = self._cfg.discount_factor + + # use model_wrapper for specialized demands of different modes + self._target_model = copy.deepcopy(self._model) + if 'target_update_freq' in self._cfg.learn: + self._target_model = model_wrap( + self._target_model, + wrapper_name='target', + update_type='assign', + update_kwargs={'freq': self._cfg.learn.target_update_freq} + ) + elif 'target_theta' in self._cfg.learn: + self._target_model = model_wrap( + self._target_model, + wrapper_name='target', + update_type='momentum', + update_kwargs={'theta': self._cfg.learn.target_theta} + ) + else: + raise RuntimeError("DQN needs target network, please either indicate target_update_freq or target_theta") + self._learn_model = model_wrap(self._model, wrapper_name='argmax_sample') + self._learn_model.reset() + self._target_model.reset() + + def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: + """ + Overview: + Forward computation graph of learn mode(updating policy). + Arguments: + - data (:obj:`Dict[str, Any]`): Dict type data, a batch of data for training, values are torch.Tensor or \ + np.ndarray or dict/list combinations. + Returns: + - info_dict (:obj:`Dict[str, Any]`): Dict type data, a info dict indicated training result, which will be \ + recorded in text log and tensorboard, values are python scalar or a list of scalars. + ArgumentsKeys: + - necessary: ``obs``, ``action``, ``reward``, ``next_obs``, ``done`` + - optional: ``value_gamma`` + ReturnsKeys: + - necessary: ``cur_lr``, ``total_loss``, ``priority`` + - optional: ``action_distribution`` + """ + + start = time.time() + + data = fast_preprocess_learn( + data, + use_priority=self._priority, + use_priority_IS_weight=self._cfg.priority_IS_weight, + cuda=self._cuda, + device=self._device, + ) + + time_data_process = time.time() - start + start = time.time() + + # if self._cuda: + # for key in data.keys(): + # if isinstance(data[key], torch.Tensor): + # data[key] = to_device(data[key], self._device) + + # ==================== + # Q-learning forward + # ==================== + self._learn_model.train() + self._target_model.train() + # Current q value (main model) + q_value = self._learn_model.forward(data['obs'])['logit'] + # Target q value + with torch.no_grad(): + target_q_value = self._target_model.forward(data['next_obs'])['logit'] + # Max q value action (main model), i.e. 
Double DQN + target_q_action = self._learn_model.forward(data['next_obs'])['action'] + + data_n = q_nstep_td_data( + q_value, target_q_value, data['action'], target_q_action, data['reward'], data['done'], data['weight'] + ) + value_gamma = data.get('value_gamma') if 'value_gamma' in data else self._cfg.discount_factor * torch.ones_like( + data['reward'] + ) + loss, td_error_per_sample = q_nstep_td_error(data_n, self._gamma, nstep=self._nstep, value_gamma=value_gamma) + + # ==================== + # Q-learning update + # ==================== + self._optimizer.zero_grad() + loss.backward() + if self._cfg.multi_gpu: + self.sync_gradients(self._learn_model) + self._optimizer.step() + + # ============= + # after update + # ============= + self._target_model.update(self._learn_model.state_dict()) + + time_learn = time.time() - start + # print("time_data_process:",time_data_process) + # print("time_learn:",time_learn) + return { 'cur_lr': self._optimizer.defaults['lr'], 'total_loss': loss.item(), diff --git a/ding/utils/default_helper.py b/ding/utils/default_helper.py index cce57aea11..54dc6e5dc9 100644 --- a/ding/utils/default_helper.py +++ b/ding/utils/default_helper.py @@ -8,7 +8,7 @@ import treetensor.torch as ttorch -def get_shape0(data: Union[List, Dict, torch.Tensor, ttorch.Tensor]) -> int: +def get_shape0(data: Union[List, Dict, np.ndarray, torch.Tensor, ttorch.Tensor]) -> int: """ Overview: Get shape[0] of data's torch tensor or treetensor @@ -34,6 +34,8 @@ def fn(t): return fn(item) return fn(data.shape) + elif isinstance(data, np.ndarray): + return data.shape[0] else: raise TypeError("Error in getting shape0, not support type: {}".format(data)) From 5a41f632505e5b2fdd2a25a7710788a899570bb5 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Tue, 22 Aug 2023 19:41:42 +0800 Subject: [PATCH 198/244] fix bug in learner --- ding/policy/dqn.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/ding/policy/dqn.py b/ding/policy/dqn.py index f81ba90d45..8966dab157 100644 --- a/ding/policy/dqn.py +++ b/ding/policy/dqn.py @@ -721,9 +721,13 @@ def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: data_n = q_nstep_td_data( q_value, target_q_value, data['action'], target_q_action, data['reward'], data['done'], data['weight'] ) - value_gamma = data.get('value_gamma') if 'value_gamma' in data else self._cfg.discount_factor * torch.ones_like( - data['reward'] - ) + + if self._cfg.nstep==1: + value_gamma=None + else: + value_gamma = data.get('value_gamma') if 'value_gamma' in data else self._cfg.discount_factor * torch.ones_like( + data['done'] + ) loss, td_error_per_sample = q_nstep_td_error(data_n, self._gamma, nstep=self._nstep, value_gamma=value_gamma) # ==================== From 1b7cf2a6a6a11c7a5f5fc9861142681214c77f67 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 6 Sep 2023 18:44:10 +0800 Subject: [PATCH 199/244] add nstep support for fast dqn --- ding/envs/env_manager/envpool_env_manager.py | 185 +++++++++++++++++- ding/example/dqn_envpool_wandb_new.py | 30 +-- ding/example/dqn_envpool_wandb_new_nstep.py | 135 +++++++++++++ ding/example/dqn_envpool_wandb_origin.py | 1 + ding/framework/middleware/__init__.py | 4 +- ding/framework/middleware/collector.py | 161 +++++++++++++++ .../middleware/functional/__init__.py | 4 +- .../middleware/functional/data_processor.py | 102 ++++++++++ .../middleware/functional/enhancer.py | 2 +- .../middleware/functional/evaluator.py | 124 ++++++++++++ ding/framework/middleware/learner.py | 152 +++++++++++++- 
ding/policy/common_utils.py | 12 ++ ding/policy/dqn.py | 25 ++- 13 files changed, 905 insertions(+), 32 deletions(-) create mode 100644 ding/example/dqn_envpool_wandb_new_nstep.py diff --git a/ding/envs/env_manager/envpool_env_manager.py b/ding/envs/env_manager/envpool_env_manager.py index a0f1da4166..7b1451eab3 100644 --- a/ding/envs/env_manager/envpool_env_manager.py +++ b/ding/envs/env_manager/envpool_env_manager.py @@ -489,6 +489,57 @@ def collect_data(self, num, policy=None, policy_forward_kwargs=None): new_data = [] + while len(new_data) < num: + + obs_to_send = self._ready_obs + env_id_to_send = list(obs_to_send.keys()) + num_to_send = len(obs_to_send.keys()) + if num_to_send > 0: + if policy: + action_to_send = policy.forward(obs_to_send, **policy_forward_kwargs) + else: + #random policy + action_to_send = {i: {"action": np.array([self._action_space.sample()])} for i in env_id_to_send} + self._ready_obs_send.update(obs_to_send) + self._ready_action_send.update(action_to_send) + action = np.array([action_to_send[i]['action'] for i in env_id_to_send]) + if action.ndim == 2 and action.shape[1] == 1: + action = action.squeeze(1) + env_id = np.array(env_id_to_send) + self._envs.send(action, env_id) + + + + next_obs, rew, done, info = self._envs.recv() + next_obs = next_obs.astype(np.float32) + next_obs /= 255.0 + rew = rew.astype(np.float32) + env_id = info['env_id'] + + self._ready_obs = {} + for i in range(len(env_id)): + new_data.append( + ttorch.tensor( + { + 'obs': self._ready_obs_send[env_id[i]], + 'action': self._ready_action_send[env_id[i]]['action'], + 'next_obs': next_obs[i], + 'reward': np.array([rew[i]]), + 'done': done[i] + } + ) + ) + self._ready_obs[env_id[i]] = next_obs[i] + + return new_data + + def collect_data_nstep(self, num, n_step=1, policy=None, policy_forward_kwargs=None): + if self.closed: + self.launch() + + new_data = [] + + while len(new_data) < num: obs_to_send = self._ready_obs @@ -516,8 +567,6 @@ def collect_data(self, num, policy=None, policy_forward_kwargs=None): self._ready_obs = {} for i in range(len(env_id)): - # d = bool(done[i]) - # r = to_ndarray([rew[i]]) new_data.append( ttorch.tensor( { @@ -586,3 +635,135 @@ def action_space(self) -> 'gym.spaces.Space': # noqa self.close() self._ready_obs = {} return self._action_space + + +@ENV_MANAGER_REGISTRY.register('env_pool_v4') +class PoolEnvManagerV4(): + ''' + Overview: + Envpool now supports Atari, Classic Control, Toy Text, ViZDoom. + Here we list some commonly used env_ids as follows. + For more examples, you can refer to . 
+ + - Atari: "Pong-v5", "SpaceInvaders-v5", "Qbert-v5" + - Classic Control: "CartPole-v0", "CartPole-v1", "Pendulum-v1" + ''' + + @classmethod + def default_config(cls) -> EasyDict: + return EasyDict(deepcopy(cls.config)) + + config = dict( + type='envpool', + env_num=8, + batch_size=8, + ) + + def __init__(self, cfg: EasyDict) -> None: + super().__init__() + self._cfg = cfg + self._env_num = cfg.env_num + self._batch_size = cfg.batch_size + + self._closed = True + self._seed = None + self._test = False + + def launch(self) -> None: + assert self._closed, "Please first close the env manager" + if self._seed is None: + seed = 0 + else: + seed = self._seed + + kwargs = {} + if "episodic_life" in self._cfg: + kwargs["episodic_life"] = self._cfg.episodic_life + if "reward_clip" in self._cfg: + kwargs["reward_clip"] = self._cfg.reward_clip + if "stack_num" in self._cfg: + kwargs["stack_num"] = self._cfg.stack_num + if "gray_scale" in self._cfg: + kwargs["gray_scale"] = self._cfg.gray_scale + if "frame_skip" in self._cfg: + kwargs["frame_skip"] = self._cfg.frame_skip + if "test" in self._cfg: + self._test = self._cfg.test + + self._envs = envpool.make( + task_id=self._cfg.env_id, + env_type="gym", + num_envs=self._env_num, + batch_size=self._batch_size, + seed=seed, + **kwargs + ) + self._action_space = self._envs.action_space + self._observation_space = self._envs.observation_space + self._closed = False + return self.reset() + + def reset(self) -> None: + self._envs.async_reset() + ready_obs={} + while True: + obs, _, _, info = self._envs.recv() + env_id = info['env_id'] + obs = obs.astype(np.float32) + obs /= 255.0 + ready_obs = deep_merge_dicts({i: o for i, o in zip(env_id, obs)}, ready_obs) + if len(ready_obs) == self._env_num: + break + self._eval_episode_return = [0. 
for _ in range(self._env_num)] + + return ready_obs + + def send_action(self, action, env_id) -> Dict[int, namedtuple]: + self._envs.send(action, env_id) + + def receive_data(self): + next_obs, rew, done, info = self._envs.recv() + next_obs = next_obs.astype(np.float32) + next_obs /= 255.0 + rew = rew.astype(np.float32) + + return next_obs, rew, done, info + + def close(self) -> None: + if self._closed: + return + # Envpool has no `close` API + self._closed = True + + @property + def closed(self) -> None: + return self._closed + + def seed(self, seed: int, dynamic_seed=False) -> None: + # The i-th environment seed in Envpool will be set with i+seed, so we don't do extra transformation here + self._seed = seed + logging.warning("envpool doesn't support dynamic_seed in different episode") + + @property + def env_num(self) -> int: + return self._env_num + + @property + def observation_space(self) -> 'gym.spaces.Space': # noqa + try: + return self._observation_space + except AttributeError: + self.launch() + self.close() + self._ready_obs = {} + return self._observation_space + + @property + def action_space(self) -> 'gym.spaces.Space': # noqa + try: + return self._action_space + except AttributeError: + self.launch() + self.close() + self._ready_obs = {} + return self._action_space diff --git a/ding/example/dqn_envpool_wandb_new.py b/ding/example/dqn_envpool_wandb_new.py index d6037f6e26..379a95a4e5 100644 --- a/ding/example/dqn_envpool_wandb_new.py +++ b/ding/example/dqn_envpool_wandb_new.py @@ -1,4 +1,9 @@ import datetime +import torch +try: + torch.multiprocessing.set_start_method('spawn') +except RuntimeError: + pass from easydict import EasyDict from ditk import logging from ding.model import DQN @@ -10,7 +15,7 @@ from ding.framework.context import OnlineRLContext from ding.framework.middleware import OffPolicyLearner, interaction_evaluator, data_pusher, \ eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollector + termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollector, OffPolicyLearnerV2 from ding.utils import set_pkg_seed from dizoo.atari.config.serial import pong_dqn_envpool_config @@ -91,17 +96,19 @@ def main(cfg): if "nstep" in cfg.policy and cfg.policy.nstep > 1: task.use(nstep_reward_enhancer(cfg)) task.use(data_pusher(cfg, buffer_)) - task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(OffPolicyLearnerV2(cfg, policy, buffer_)) task.use(online_logger(train_show_freq=10)) - task.use( - wandb_online_logger( - metric_list=policy.monitor_vars(), - model=policy._model, - anonymous=True, - project_name=cfg.exp_name, - wandb_sweep=False, - ) - ) + # task.use( + # wandb_online_logger( + # metric_list=policy.monitor_vars(), + # model=policy._model, + # exp_config=cfg, + # anonymous=True, + # project_name=cfg.exp_name, + # wandb_sweep=False, + # ) + # ) #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) task.use(termination_checker(max_env_step=10000000)) @@ -110,6 +117,7 @@ def main(cfg): if __name__ == "__main__": + import argparse parser = argparse.ArgumentParser() parser.add_argument("--seed", type=int, default=0, help="random seed") diff --git a/ding/example/dqn_envpool_wandb_new_nstep.py b/ding/example/dqn_envpool_wandb_new_nstep.py new file mode 100644 index 0000000000..2de943d176 --- /dev/null +++ b/ding/example/dqn_envpool_wandb_new_nstep.py @@ -0,0 
+1,135 @@ +import datetime +import torch +try: + torch.multiprocessing.set_start_method('spawn') +except RuntimeError: + pass +from easydict import EasyDict +from ditk import logging +from ding.model import DQN +from ding.policy import DQNFastPolicy +from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ + termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV2 +from ding.utils import set_pkg_seed + +from dizoo.atari.config.serial import pong_dqn_envpool_config + + +def main(cfg): + logging.getLogger().setLevel(logging.INFO) + cfg.exp_name = 'Pong-v5-DQN-envpool-new-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + + collector_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.collector_env_num, + 'batch_size': cfg.env.collector_batch_size, + # env wrappers + 'episodic_life': True, # collector: True + 'reward_clip': False, # collector: True + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["collector_env_cfg"] = collector_env_cfg + evaluator_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.evaluator_env_num, + 'batch_size': cfg.env.evaluator_batch_size, + # env wrappers + 'episodic_life': False, # evaluator: False + 'reward_clip': False, # evaluator: False + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["evaluator_env_cfg"] = evaluator_env_cfg + cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) + evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) + collector_env.seed(cfg.seed) + evaluator_env.seed(cfg.seed) + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNFastPolicy(cfg.policy, model=model) + + # Consider the case with multiple processes + if task.router.is_active: + # You can use labels to distinguish between workers with different roles, + # here we use node_id to distinguish. + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + # Sync their context and model between each worker. + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + task.use(epoch_timer()) + + # Here is the part of single process pipeline. 
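Each ``task.use(...)`` call below registers one middleware; ``task.run()`` then calls them in registration order on a shared ``OnlineRLContext``, so a single iteration is roughly evaluate (when due) -> set epsilon -> collect -> push to buffer -> train -> log -> check termination. A rough sketch of that control flow, assuming the task object is essentially a chain of callables over a mutable context (the real ``Task`` additionally handles roles and distributed exchange):

def run_pipeline(middlewares, ctx, should_stop) -> None:
    # every middleware reads and mutates the shared context in place
    while not should_stop():
        for middleware in middlewares:
            middleware(ctx)

# e.g. run_pipeline([evaluate, set_eps, collect, push, train, log], ctx, lambda: task.finish)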
+ task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use( + EnvpoolStepCollectorV2( + cfg, + policy.collect_mode, + collector_env, + random_collect_size=cfg.policy.random_collect_size \ + if hasattr(cfg.policy, 'random_collect_size') else 0, + ) + ) + task.use(data_pusher(cfg, buffer_)) + #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(OffPolicyLearnerV2(cfg, policy, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use( + wandb_online_logger( + metric_list=policy.monitor_vars(), + model=policy._model, + exp_config=cfg, + anonymous=True, + project_name=cfg.exp_name, + wandb_sweep=False, + ) + ) + + #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) + task.use(termination_checker(max_env_step=10000000)) + + task.run() + + +if __name__ == "__main__": + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=0, help="random seed") + parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") + parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") + arg = parser.parse_args() + + pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num + pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size + pong_dqn_envpool_config.seed = arg.seed + pong_dqn_envpool_config.env.stop_value = 2000 + pong_dqn_envpool_config.policy.nstep = 1 + pong_dqn_envpool_config.nstep = 3 + pong_dqn_envpool_config.policy.nstep = 3 + pong_dqn_envpool_config.seed = arg.seed + + main(pong_dqn_envpool_config) diff --git a/ding/example/dqn_envpool_wandb_origin.py b/ding/example/dqn_envpool_wandb_origin.py index bebad77553..bdc49e714c 100644 --- a/ding/example/dqn_envpool_wandb_origin.py +++ b/ding/example/dqn_envpool_wandb_origin.py @@ -103,6 +103,7 @@ def main(cfg): wandb_online_logger( metric_list=policy.monitor_vars(), model=policy._model, + exp_config=cfg, anonymous=True, project_name=cfg.exp_name, wandb_sweep=False, diff --git a/ding/framework/middleware/__init__.py b/ding/framework/middleware/__init__.py index fa7c5b7200..72f8e944aa 100644 --- a/ding/framework/middleware/__init__.py +++ b/ding/framework/middleware/__init__.py @@ -1,6 +1,6 @@ from .functional import * -from .collector import StepCollector, EpisodeCollector, PPOFStepCollector, EnvpoolStepCollector -from .learner import OffPolicyLearner, HERLearner +from .collector import StepCollector, EpisodeCollector, PPOFStepCollector, EnvpoolStepCollector, EnvpoolStepCollectorV2 +from .learner import OffPolicyLearner, HERLearner, OffPolicyLearnerV2 from .ckpt_handler import CkptSaver from .distributer import ContextExchanger, ModelExchanger, PeriodicalModelExchanger from .barrier import Barrier, BarrierRuntime diff --git a/ding/framework/middleware/collector.py b/ding/framework/middleware/collector.py index e255b4d412..cb864dad64 100644 --- a/ding/framework/middleware/collector.py +++ b/ding/framework/middleware/collector.py @@ -12,6 +12,7 @@ import time +import numpy as np class StepCollector: """ @@ -117,6 +118,166 @@ def __call__(self, ctx: "OnlineRLContext") -> None: ctx.collector_time += time.time() - start +class EnvpoolStepCollectorV2: + + def __new__(cls, *args, **kwargs): + if task.router.is_active and not task.has_role(task.role.COLLECTOR): + return task.void() + return super(EnvpoolStepCollectorV2, cls).__new__(cls) + + def __init__(self, cfg: EasyDict, policy, env: BaseEnvManager, random_collect_size: 
int = 0) -> None: + """ + Arguments: + - cfg (:obj:`EasyDict`): Config. + - policy (:obj:`Policy`): The policy to be collected. + - env (:obj:`BaseEnvManager`): The env for the collection, the BaseEnvManager object or \ + its derivatives are supported. + - random_collect_size (:obj:`int`): The count of samples that will be collected randomly, \ + typically used in initial runs. + """ + self.cfg = cfg + self.env = env + + self._ready_obs_receive = {} + self._ready_obs_send = {} + self._ready_action_send = {} + self._trajectory = {i:[] for i in range(env.env_num)} + self._nsteps=self.cfg.policy.nstep if hasattr(self.cfg.policy, 'nstep') else 1 + self._discount_ratio_list=[self.cfg.policy.discount_factor**(i+1) for i in range(self._nsteps)] + self._nsteps_range=list(range(1,self._nsteps)) + self.policy = policy + self.random_collect_size = random_collect_size + + def __call__(self, ctx: "OnlineRLContext") -> None: + """ + Overview: + An encapsulation of inference and rollout middleware. Stop when completing \ + the target number of steps. + Input of ctx: + - env_step (:obj:`int`): The env steps which will increase during collection. + """ + start = time.time() + old = ctx.env_step + + if self.random_collect_size > 0 and old < self.random_collect_size: + target_size = self.random_collect_size - old + random=True + else: + target_size = self.cfg.policy.collect.n_sample * self.cfg.policy.collect.unroll_len + random=False + + if self.env.closed: + self._ready_obs_receive = self.env.launch() + + counter=0 + + while True: + + if len(self._ready_obs_receive.keys()) > 0: + if random: + action_to_send = {i: {"action": np.array([self.env.action_space.sample()])} for i in self._ready_obs_receive.keys()} + else: + action_to_send = self.policy.forward(self._ready_obs_receive, **ctx.collect_kwargs) + + self._ready_obs_send.update(self._ready_obs_receive) + self._ready_obs_receive = {} + self._ready_action_send.update(action_to_send) + + action_send = np.array([action_to_send[i]['action'] for i in action_to_send.keys()]) + if action_send.ndim == 2 and action_send.shape[1] == 1: + action_send = action_send.squeeze(1) + env_id_send = np.array(list(action_to_send.keys())) + self.env.send_action(action_send, env_id_send) + + next_obs, rew, done, info = self.env.receive_data() + env_id_receive = info['env_id'] + counter+=len(env_id_receive) + self._ready_obs_receive.update({i: next_obs[i] for i in range(len(next_obs))}) + + #todo + for i in range(len(env_id_receive)): + current_reward=ttorch.tensor(np.array([rew[i]])) + if self._nsteps>1: + self._trajectory[env_id_receive[i]].append( + { + 'obs': ttorch.tensor(self._ready_obs_send[env_id_receive[i]]), + 'action': ttorch.tensor(self._ready_action_send[env_id_receive[i]]['action']), + 'next_obs': ttorch.tensor(next_obs[i]), + # n-step reward + 'reward': [current_reward], + 'done': ttorch.tensor(done[i]) + } + ) + else: + self._trajectory[env_id_receive[i]].append( + { + 'obs': ttorch.tensor(self._ready_obs_send[env_id_receive[i]]), + 'action': ttorch.tensor(self._ready_action_send[env_id_receive[i]]['action']), + 'next_obs': ttorch.tensor(next_obs[i]), + # n-step reward + 'reward': ttorch.tensor(current_reward), + 'done': ttorch.tensor(done[i]) + } + ) + + if self._nsteps>1: + if done[i]==False and counter < target_size: + reverse_record_position=min(self._nsteps,len(self._trajectory[env_id_receive[i]])) + real_reverse_record_position=reverse_record_position + + for j in range(1,reverse_record_position+1): + if j==1: + pass + else: + if 
self._trajectory[env_id_receive[i]][-j]['done']==True: + real_reverse_record_position=j-1 + break + else: + self._trajectory[env_id_receive[i]][-j]['reward'].append(current_reward) + + if real_reverse_record_position==self._nsteps: + self._trajectory[env_id_receive[i]][-real_reverse_record_position]['next_n_obs']=ttorch.tensor(next_obs[i]) + self._trajectory[env_id_receive[i]][-real_reverse_record_position]['value_gamma']=ttorch.tensor(self._discount_ratio_list[real_reverse_record_position-1]) + + else: # done[i] == True or counter >= target_size + + reverse_record_position=min(self._nsteps,len(self._trajectory[env_id_receive[i]])) + real_reverse_record_position=reverse_record_position + + for j in range(1,reverse_record_position+1): + if j==1: + self._trajectory[env_id_receive[i]][-j]['reward'].extend([ttorch.zeros_like(current_reward) for _ in range(self._nsteps-len(self._trajectory[env_id_receive[i]][-j]['reward']))]) + self._trajectory[env_id_receive[i]][-j]['next_n_obs']=ttorch.tensor(next_obs[i]) + self._trajectory[env_id_receive[i]][-j]['value_gamma']=ttorch.tensor(self._discount_ratio_list[j-1]) + else: + if self._trajectory[env_id_receive[i]][-j]['done']==True: + real_reverse_record_position=j + break + else: + self._trajectory[env_id_receive[i]][-j]['reward'].append(current_reward) + self._trajectory[env_id_receive[i]][-j]['reward'].extend([ttorch.zeros_like(current_reward) for _ in range(self._nsteps-len(self._trajectory[env_id_receive[i]][-j]['reward']))]) + self._trajectory[env_id_receive[i]][-j]['next_n_obs']=ttorch.tensor(next_obs[i]) + self._trajectory[env_id_receive[i]][-j]['value_gamma']=ttorch.tensor(self._discount_ratio_list[j-1]) + + + else: + self._trajectory[env_id_receive[i]][-1]['value_gamma']=ttorch.tensor(self._discount_ratio_list[0]) + + if counter >= target_size: + # transform reward to ttorch.tensor + for i in range(self.env.env_num): + for j in range(len(self._trajectory[i])): + self._trajectory[i][j]['reward']=ttorch.concat(self._trajectory[env_id_receive[i]][j]['reward']) + break + + ctx.trajectories=[] + for i in range(self.env.env_num): + ctx.trajectories.extend(self._trajectory[i]) + self._trajectory[i]=[] + ctx.env_step += len(ctx.trajectories) + ctx.collector_time += time.time() - start + + class PPOFStepCollector: """ Overview: diff --git a/ding/framework/middleware/functional/__init__.py b/ding/framework/middleware/functional/__init__.py index 061c27767c..ccd2741460 100644 --- a/ding/framework/middleware/functional/__init__.py +++ b/ding/framework/middleware/functional/__init__.py @@ -1,8 +1,8 @@ from .trainer import trainer, multistep_trainer from .data_processor import offpolicy_data_fetcher, data_pusher, offline_data_fetcher, offline_data_saver, \ - offline_data_fetcher_from_mem, sqil_data_pusher, buffer_saver + offline_data_fetcher_from_mem, sqil_data_pusher, buffer_saver, offpolicy_data_fetcher_v2 from .collector import inferencer, rolloutor, TransitionList -from .evaluator import interaction_evaluator, interaction_evaluator_ttorch +from .evaluator import interaction_evaluator, interaction_evaluator_ttorch, envpool_evaluator from .termination_checker import termination_checker, ddp_termination_checker from .logger import online_logger, offline_logger, wandb_online_logger, wandb_offline_logger from .ctx_helper import final_ctx_saver diff --git a/ding/framework/middleware/functional/data_processor.py b/ding/framework/middleware/functional/data_processor.py index c5fb797c25..bd985ae26a 100644 --- 
a/ding/framework/middleware/functional/data_processor.py +++ b/ding/framework/middleware/functional/data_processor.py @@ -1,4 +1,5 @@ import os +import torch.multiprocessing as mp from typing import TYPE_CHECKING, Callable, List, Union, Tuple, Dict, Optional from easydict import EasyDict from ditk import logging @@ -8,6 +9,8 @@ from ding.framework import task from ding.utils import get_rank +from ding.policy.common_utils import default_preprocess_learn, fast_preprocess_learn + if TYPE_CHECKING: from ding.framework import OnlineRLContext, OfflineRLContext @@ -185,6 +188,105 @@ def _fetch(ctx: "OnlineRLContext"): return _fetch + +def offpolicy_data_fetcher_v2( + cfg: EasyDict, + buffer_: Union[Buffer, List[Tuple[Buffer, float]], Dict[str, Buffer]], + data_shortage_warning: bool = False, +) -> Callable: + """ + Overview: + The return function is a generator which meanly fetch a batch of data from a buffer, \ + a list of buffers, or a dict of buffers. + Arguments: + - cfg (:obj:`EasyDict`): Config which should contain the following keys: `cfg.policy.learn.batch_size`. + - buffer (:obj:`Union[Buffer, List[Tuple[Buffer, float]], Dict[str, Buffer]]`): \ + The buffer where the data is fetched from. \ + ``Buffer`` type means a buffer.\ + ``List[Tuple[Buffer, float]]`` type means a list of tuple. In each tuple there is a buffer and a float. \ + The float defines, how many batch_size is the size of the data \ + which is sampled from the corresponding buffer.\ + ``Dict[str, Buffer]`` type means a dict in which the value of each element is a buffer. \ + For each key-value pair of dict, batch_size of data will be sampled from the corresponding buffer \ + and assigned to the same key of `ctx.train_data`. + - data_shortage_warning (:obj:`bool`): Whether to output warning when data shortage occurs in fetching. + """ + + + + def _fetch(ctx: "OnlineRLContext"): + """ + Input of ctx: + - train_output (:obj:`Union[Dict, Deque[Dict]]`): This attribute should exist \ + if `buffer_` is of type Buffer and if `buffer_` use the middleware `PriorityExperienceReplay`. \ + The meta data `priority` of the sampled data in the `buffer_` will be updated \ + to the `priority` attribute of `ctx.train_output` if `ctx.train_output` is a dict, \ + or the `priority` attribute of `ctx.train_output`'s popped element \ + if `ctx.train_output` is a deque of dicts. + Output of ctx: + - train_data (:obj:`Union[List[Dict], Dict[str, List[Dict]]]`): The fetched data. \ + ``List[Dict]`` type means a list of data. + `train_data` is of this type if the type of `buffer_` is Buffer or List. + ``Dict[str, List[Dict]]]`` type means a dict, in which the value of each key-value pair + is a list of data. `train_data` is of this type if the type of `buffer_` is Dict. 
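For the ``List[Tuple[Buffer, float]]`` case described above, each float scales ``batch_size`` to give the number of samples drawn from the paired buffer (as in SQIL/R2D3-style mixes). A small illustration of that split with two hypothetical buffers, mirroring the sampling loop in the body below; it assumes both buffers already hold enough data:

from ding.data import DequeBuffer

expert_buffer, online_buffer = DequeBuffer(size=1000), DequeBuffer(size=10000)  # hypothetical mix
batch_size = 64
batch = []
for buf, fraction in [(expert_buffer, 0.25), (online_buffer, 0.75)]:
    sampled = buf.sample(int(batch_size * fraction))  # 16 and 48 BufferedData items
    batch += [item.data for item in sampled]          # unwrap to the raw transitions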
+ """ + try: + unroll_len = cfg.policy.collect.unroll_len + if isinstance(buffer_, Buffer): + if unroll_len > 1: + buffered_data = buffer_.sample( + cfg.policy.learn.batch_size, groupby="env", unroll_len=unroll_len, replace=True + ) + ctx.train_data_sample = [[t.data for t in d] for d in buffered_data] # B, unroll_len + else: + buffered_data = buffer_.sample(cfg.policy.learn.batch_size) + ctx.train_data_sample = [d.data for d in buffered_data] + elif isinstance(buffer_, List): # like sqil, r2d3 + assert unroll_len == 1, "not support" + buffered_data = [] + for buffer_elem, p in buffer_: + data_elem = buffer_elem.sample(int(cfg.policy.learn.batch_size * p)) + assert data_elem is not None + buffered_data.append(data_elem) + buffered_data = sum(buffered_data, []) + ctx.train_data_sample = [d.data for d in buffered_data] + elif isinstance(buffer_, Dict): # like ppg_offpolicy + assert unroll_len == 1, "not support" + buffered_data = {k: v.sample(cfg.policy.learn.batch_size) for k, v in buffer_.items()} + ctx.train_data_sample = {k: [d.data for d in v] for k, v in buffered_data.items()} + else: + raise TypeError("not support buffer argument type: {}".format(type(buffer_))) + + assert buffered_data is not None + except (ValueError, AssertionError): + if data_shortage_warning: + # You can modify data collect config to avoid this warning, e.g. increasing n_sample, n_episode. + # Fetcher will skip this this attempt. + logging.warning( + "Replay buffer's data is not enough to support training, so skip this training to wait more data." + ) + ctx.train_data_sample = None + return + + + yield + + if isinstance(buffer_, Buffer): + if any([isinstance(m, PriorityExperienceReplay) for m in buffer_._middleware]): + index = [d.index for d in buffered_data] + meta = [d.meta for d in buffered_data] + # such as priority + if isinstance(ctx.train_output_for_post_process, List): + priority = ctx.train_output_for_post_process.pop()['priority'] + else: + priority = ctx.train_output_for_post_process['priority'] + for idx, m, p in zip(index, meta, priority): + m['priority'] = p + buffer_.update(index=idx, data=None, meta=m) + + return _fetch + + def offline_data_fetcher_from_mem(cfg: EasyDict, dataset: Dataset) -> Callable: from threading import Thread diff --git a/ding/framework/middleware/functional/enhancer.py b/ding/framework/middleware/functional/enhancer.py index c96fc3012e..2804ead95c 100644 --- a/ding/framework/middleware/functional/enhancer.py +++ b/ding/framework/middleware/functional/enhancer.py @@ -84,7 +84,7 @@ def _enhance(ctx: "OnlineRLContext"): nstep = cfg.policy.nstep gamma = cfg.policy.discount_factor L = len(ctx.trajectories) - reward_template = ctx.trajectories[0].reward + reward_template = ctx.trajectories[0]["reward"] nstep_rewards = [] value_gamma = [] for i in range(L): diff --git a/ding/framework/middleware/functional/evaluator.py b/ding/framework/middleware/functional/evaluator.py index 548214faa6..27a73f248a 100644 --- a/ding/framework/middleware/functional/evaluator.py +++ b/ding/framework/middleware/functional/evaluator.py @@ -310,6 +310,130 @@ def _evaluate(ctx: Union["OnlineRLContext", "OfflineRLContext"]): return _evaluate +def envpool_evaluator(cfg: EasyDict, policy: Policy, env: BaseEnvManager, render: bool = False) -> Callable: + """ + Overview: + The middleware that executes the evaluation. + Arguments: + - cfg (:obj:`EasyDict`): Config. + - policy (:obj:`Policy`): The policy to be evaluated. + - env (:obj:`BaseEnvManager`): The env for the evaluation. 
+ - render (:obj:`bool`): Whether to render env images and policy logits. + """ + if task.router.is_active and not task.has_role(task.role.EVALUATOR): + return task.void() + + env.seed(cfg.seed, dynamic_seed=False) + + def _evaluate(ctx: Union["OnlineRLContext", "OfflineRLContext"]): + """ + Overview: + - The evaluation will be executed if the task begins and enough train_iter passed \ + since last evaluation. + Input of ctx: + - last_eval_iter (:obj:`int`): Last evaluation iteration. + - train_iter (:obj:`int`): Current train iteration. + Output of ctx: + - eval_value (:obj:`float`): The average reward in the current evaluation. + """ + + # evaluation will be executed if the task begins or enough train_iter after last evaluation + start = time.time() + if ctx.last_eval_iter != -1 and \ + (ctx.train_iter - ctx.last_eval_iter < cfg.policy.eval.evaluator.eval_freq): + return + + ready_obs_receive = {} + ready_obs_send = {} + ready_action_send = {} + trajectory = {i:[] for i in range(env.env_num)} + + if env.closed: + ready_obs_receive=env.launch() + else: + ready_obs_receive=env.reset() + policy.reset() + eval_monitor = VectorEvalMonitor(env.env_num, cfg.env.n_evaluator_episode) + + while not eval_monitor.is_finished(): + + if len(ready_obs_receive.keys()) > 0: + action_to_send = policy.forward(ready_obs_receive) + output = [v for v in action_to_send.values()] + + ready_obs_send.update(ready_obs_receive) + ready_obs_receive = {} + ready_action_send.update(action_to_send) + + action_send = np.array([action_to_send[i]['action'] for i in action_to_send.keys()]) + if action_send.ndim == 2 and action_send.shape[1] == 1: + action_send = action_send.squeeze(1) + env_id_send = np.array(list(action_to_send.keys())) + env.send_action(action_send, env_id_send) + + next_obs, rew, done, info = env.receive_data() + env_id_receive = info['env_id'] + ready_obs_receive.update({i: next_obs[i] for i in range(len(next_obs))}) + + #todo + for i in range(len(env_id_receive)): + current_reward=ttorch.tensor(np.array([rew[i]])) + trajectory[env_id_receive[i]].append( + { + 'obs': ttorch.tensor(ready_obs_send[env_id_receive[i]]), + 'action': ttorch.tensor(ready_action_send[env_id_receive[i]]['action']), + 'next_obs': ttorch.tensor(next_obs[i]), + # n-step reward + 'reward': [current_reward], + 'done': ttorch.tensor(done[i]) + } + ) + + if done[i]==True: + episode_return_i = 0.0 + for item in trajectory[env_id_receive[i]]: + episode_return_i+=item['reward'][0] + eval_monitor.update_reward(env_id_receive[i], episode_return_i) + policy.reset([env_id_receive[i]]) + trajectory[env_id_receive[i]]=[] + + episode_return = eval_monitor.get_episode_return() + episode_return_min = np.min(episode_return) + episode_return_max = np.max(episode_return) + episode_return_std = np.std(episode_return) + episode_return = np.mean(episode_return) + stop_flag = episode_return >= cfg.env.stop_value and ctx.train_iter > 0 + if isinstance(ctx, OnlineRLContext): + logging.info( + 'Evaluation: Train Iter({})\tEnv Step({})\tEpisode Return({:.3f})'.format( + ctx.train_iter, ctx.env_step, episode_return + ) + ) + elif isinstance(ctx, OfflineRLContext): + logging.info('Evaluation: Train Iter({})\tEval Reward({:.3f})'.format(ctx.train_iter, episode_return)) + else: + raise TypeError("not supported ctx type: {}".format(type(ctx))) + ctx.last_eval_iter = ctx.train_iter + ctx.eval_value = episode_return + ctx.eval_value_min = episode_return_min + ctx.eval_value_max = episode_return_max + ctx.eval_value_std = episode_return_std + ctx.last_eval_value = 
ctx.eval_value + ctx.eval_output = {'episode_return': episode_return} + episode_info = eval_monitor.get_episode_info() + if episode_info is not None: + ctx.eval_output['episode_info'] = episode_info + + ctx.eval_output['output'] = output # for compatibility + + if stop_flag: + task.finish = True + + ctx.evaluator_time += time.time() - start + + return _evaluate + + def interaction_evaluator_ttorch( seed: int, policy: Policy, diff --git a/ding/framework/middleware/learner.py b/ding/framework/middleware/learner.py index 8b2c414bac..6cb46742bb 100644 --- a/ding/framework/middleware/learner.py +++ b/ding/framework/middleware/learner.py @@ -4,15 +4,55 @@ from ding.framework import task from ding.data import Buffer -from .functional import trainer, offpolicy_data_fetcher, reward_estimator, her_data_enhancer +from .functional import trainer, offpolicy_data_fetcher, reward_estimator, her_data_enhancer, offpolicy_data_fetcher_v2 if TYPE_CHECKING: from ding.framework import Context, OnlineRLContext from ding.policy import Policy from ding.reward_model import BaseRewardModel +from queue import Queue import time +import torch.multiprocessing as mp +from threading import Thread +from ding.policy.common_utils import default_preprocess_learn, fast_preprocess_learn +def data_process_func(data_queue_input, data_queue_output): + while True: + data = data_queue_input.get() + if data is None: + break + else: + #print("get one data") + output_data=fast_preprocess_learn( + data, + use_priority=False, #policy._cfg.priority, + use_priority_IS_weight=False, #policy._cfg.priority_IS_weight, + cuda=True, #policy._cuda, + device="cuda:0", #policy._device, + ) + data_queue_output.put(output_data) + #print("put one data, queue size:{}".format(data_queue_output.qsize())) + +def data_process_func_v2(data_queue_input, data_queue_output): + while True: + if data_queue_input.empty(): + time.sleep(0.001) + else: + data = data_queue_input.get() + if data is None: + break + else: + #print("get one data") + output_data=fast_preprocess_learn( + data, + use_priority=False, #policy._cfg.priority, + use_priority_IS_weight=False, #policy._cfg.priority_IS_weight, + cuda=True, #policy._cuda, + device="cuda:0", #policy._device, + ) + data_queue_output.put(output_data) + #print("put one data, queue size:{}".format(data_queue_output.qsize())) class OffPolicyLearner: """ @@ -57,18 +97,128 @@ def __call__(self, ctx: "OnlineRLContext") -> None: - train_output (:obj:`Deque`): The training output in deque. """ start = time.time() + time_fetcher = 0.0 + time_trainer = 0.0 train_output_queue = [] for _ in range(self.cfg.policy.learn.update_per_collect): + start_fetcher = time.time() self._fetcher(ctx) + time_fetcher += time.time() - start_fetcher if ctx.train_data is None: break if self._reward_estimator: self._reward_estimator(ctx) + start_trainer = time.time() self._trainer(ctx) + time_trainer += time.time() - start_trainer train_output_queue.append(ctx.train_output) ctx.train_output_for_post_process = ctx.train_output ctx.train_output = train_output_queue ctx.learner_time += time.time() - start + print("time_trainer:time_fetcher={}:{}={}".format(time_trainer, time_fetcher, time_trainer / time_fetcher)) + + + +class OffPolicyLearnerV2: + """ + Overview: + The class of the off-policy learner, including data fetching and model training. Use \ + the `__call__` method to execute the whole learning process. 
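``OffPolicyLearnerV2`` overlaps CPU-side batch preprocessing with training: ``__init__`` below starts a thread running ``data_process_func_v2`` (defined above), the fetch loop puts raw sampled batches on an input queue, and the train loop pops collated tensors from an output queue. A stripped-down sketch of that producer/consumer pattern, with a placeholder ``preprocess`` standing in for ``fast_preprocess_learn``:

import queue
import threading

def worker(q_in: queue.Queue, q_out: queue.Queue, preprocess) -> None:
    while True:
        raw = q_in.get()
        if raw is None:             # sentinel: shut the worker down
            break
        q_out.put(preprocess(raw))  # e.g. stack numpy arrays and move them to the GPU

q_in, q_out = queue.Queue(), queue.Queue()
threading.Thread(target=worker, args=(q_in, q_out, lambda batch: batch), daemon=True).start()

for raw_batch in ([1, 2], [3, 4]):               # stand-ins for sampled replay batches
    q_in.put(raw_batch)
train_batches = [q_out.get() for _ in range(2)]  # blocks until preprocessing is done
q_in.put(None)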
+ """ + + def __new__(cls, *args, **kwargs): + if task.router.is_active and not task.has_role(task.role.LEARNER): + return task.void() + return super(OffPolicyLearnerV2, cls).__new__(cls) + + def __init__( + self, + cfg: EasyDict, + policy: 'Policy', + buffer_: Union[Buffer, List[Tuple[Buffer, float]], Dict[str, Buffer]], + reward_model: Optional['BaseRewardModel'] = None, + log_freq: int = 100, + ) -> None: + """ + Arguments: + - cfg (:obj:`EasyDict`): Config. + - policy (:obj:`Policy`): The policy to be trained. + - buffer (:obj:`Buffer`): The replay buffer to store the data for training. + - reward_model (:obj:`BaseRewardModel`): Additional reward estimator likes RND, ICM, etc. \ + default to None. + - log_freq (:obj:`int`): The frequency (iteration) of showing log. + """ + self.cfg = cfg + + self._fetcher = task.wrap(offpolicy_data_fetcher_v2(cfg, buffer_)) + #self._data_queue_input = mp.Queue() + #self._data_queue_output = mp.Queue() + + self._data_queue_input = Queue() + self._data_queue_output = Queue() + + self.thread_worker=Thread(target=data_process_func_v2, args=(self._data_queue_input, self._data_queue_output)) + self.thread_worker.start() + + #self._fetcher_worker_process = mp.Process(target=data_process_func, args=(self._data_queue_input, self._data_queue_output)) + #self._fetcher_worker_process.start() + + self._trainer = task.wrap(trainer(cfg, policy.learn_mode, log_freq=log_freq)) + if reward_model is not None: + self._reward_estimator = task.wrap(reward_estimator(cfg, reward_model)) + else: + self._reward_estimator = None + + def __call__(self, ctx: "OnlineRLContext") -> None: + """ + Output of ctx: + - train_output (:obj:`Deque`): The training output in deque. + """ + start = time.time() + time_fetcher = 0.0 + time_trainer = 0.0 + time_fetch_data=0.0 + time_get_data=0.0 + + train_output_queue = [] + data_counter=0 + + + start_fetcher = time.time() + for _ in range(self.cfg.policy.learn.update_per_collect): + start_fetch_data = time.time() + self._fetcher(ctx) + time_fetch_data += time.time() - start_fetch_data + if ctx.train_data_sample is None: + break + self._data_queue_input.put(ctx.train_data_sample) + data_counter+=1 + time_fetcher += time.time() - start_fetcher + + start_trainer = time.time() + for _ in range(data_counter): + start_get_data = time.time() + while True: + if self._data_queue_output.empty(): + time.sleep(0.001) + continue + else: + ctx.train_data=self._data_queue_output.get() + break + time_get_data += time.time() - start_get_data + if self._reward_estimator: + self._reward_estimator(ctx) + self._trainer(ctx) + + train_output_queue.append(ctx.train_output) + ctx.train_output_for_post_process = ctx.train_output + time_trainer += time.time() - start_trainer + + ctx.train_output = train_output_queue + ctx.learner_time += time.time() - start + #print("time_fetcher:time_fetch_data={}:{}={}".format(time_fetcher, time_fetch_data, time_fetcher / time_fetch_data)) + #print("time_trainer:time_get_data={}:{}={}".format(time_trainer, time_get_data, time_trainer / time_get_data)) + #print("time_trainer:time_fetcher={}:{}={}".format(time_trainer, time_fetcher, time_trainer / time_fetcher)) class HERLearner: diff --git a/ding/policy/common_utils.py b/ding/policy/common_utils.py index cb290a5e5c..c53fb336aa 100644 --- a/ding/policy/common_utils.py +++ b/ding/policy/common_utils.py @@ -78,12 +78,24 @@ def fast_preprocess_learn( next_obs = to_device(next_obs, device=device) processes_data['next_obs'] = next_obs + if 'next_n_obs' in data[0]: + next_n_obs = 
torch.stack([data[i]['next_n_obs'] for i in range(len(data))]) + if cuda: + next_n_obs = to_device(next_n_obs, device=device) + processes_data['next_n_obs'] = next_n_obs + reward = torch.stack([data[i]['reward'] for i in range(len(data))]) if cuda: reward = to_device(reward, device=device) reward = reward.permute(1, 0).contiguous() processes_data['reward'] = reward + if 'value_gamma' in data[0]: + value_gamma = torch.stack([data[i]['value_gamma'] for i in range(len(data))]) + if cuda: + value_gamma = to_device(value_gamma, device=device) + processes_data['value_gamma'] = value_gamma + done = torch.tensor([data[i]['done'] for i in range(len(data))], dtype=torch.float32) if cuda: done = to_device(done, device=device) diff --git a/ding/policy/dqn.py b/ding/policy/dqn.py index 8966dab157..f70f12d1d8 100644 --- a/ding/policy/dqn.py +++ b/ding/policy/dqn.py @@ -689,13 +689,13 @@ def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: start = time.time() - data = fast_preprocess_learn( - data, - use_priority=self._priority, - use_priority_IS_weight=self._cfg.priority_IS_weight, - cuda=self._cuda, - device=self._device, - ) + # data = fast_preprocess_learn( + # data, + # use_priority=self._priority, + # use_priority_IS_weight=self._cfg.priority_IS_weight, + # cuda=self._cuda, + # device=self._device, + # ) time_data_process = time.time() - start start = time.time() @@ -714,12 +714,12 @@ def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: q_value = self._learn_model.forward(data['obs'])['logit'] # Target q value with torch.no_grad(): - target_q_value = self._target_model.forward(data['next_obs'])['logit'] + target_next_n_q_value = self._target_model.forward(data['next_obs'])['logit'] # Max q value action (main model), i.e. Double DQN - target_q_action = self._learn_model.forward(data['next_obs'])['action'] + target_next_n_action = self._learn_model.forward(data['next_obs'])['action'] data_n = q_nstep_td_data( - q_value, target_q_value, data['action'], target_q_action, data['reward'], data['done'], data['weight'] + q_value, target_next_n_q_value, data['action'], target_next_n_action, data['reward'], data['done'], data['weight'] ) if self._cfg.nstep==1: @@ -745,14 +745,13 @@ def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: self._target_model.update(self._learn_model.state_dict()) time_learn = time.time() - start - # print("time_data_process:",time_data_process) - # print("time_learn:",time_learn) + # print(f"time_data_process:time_learn={time_data_process}:{time_learn}={time_data_process/time_learn}") return { 'cur_lr': self._optimizer.defaults['lr'], 'total_loss': loss.item(), 'q_value': q_value.mean().item(), - 'target_q_value': target_q_value.mean().item(), + 'target_q_value': target_next_n_q_value.mean().item(), 'priority': td_error_per_sample.abs().tolist(), # Only discrete action satisfying len(data['action'])==1 can return this and draw histogram on tensorboard. 
# '[histogram]action_distribution': data['action'], From aab3847a6c608d81d405808775b775a3d0c2cfb1 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 7 Sep 2023 13:30:48 +0800 Subject: [PATCH 200/244] change data type --- ding/framework/middleware/collector.py | 62 ++++++++++++++--------- ding/policy/common_utils.py | 68 ++++++++++++++++++++++++++ 2 files changed, 106 insertions(+), 24 deletions(-) diff --git a/ding/framework/middleware/collector.py b/ding/framework/middleware/collector.py index cb864dad64..09f7ec5c6b 100644 --- a/ding/framework/middleware/collector.py +++ b/ding/framework/middleware/collector.py @@ -171,8 +171,12 @@ def __call__(self, ctx: "OnlineRLContext") -> None: counter=0 + time_send=0.0 + time_receive=0.0 + time_process=0.0 + while True: - + start_send=time.time() if len(self._ready_obs_receive.keys()) > 0: if random: action_to_send = {i: {"action": np.array([self.env.action_space.sample()])} for i in self._ready_obs_receive.keys()} @@ -188,35 +192,39 @@ def __call__(self, ctx: "OnlineRLContext") -> None: action_send = action_send.squeeze(1) env_id_send = np.array(list(action_to_send.keys())) self.env.send_action(action_send, env_id_send) + time_send+=time.time()-start_send + start_receive=time.time() next_obs, rew, done, info = self.env.receive_data() env_id_receive = info['env_id'] counter+=len(env_id_receive) self._ready_obs_receive.update({i: next_obs[i] for i in range(len(next_obs))}) + time_receive+=time.time()-start_receive + start_process=time.time() #todo for i in range(len(env_id_receive)): - current_reward=ttorch.tensor(np.array([rew[i]])) + current_reward=rew[i] if self._nsteps>1: self._trajectory[env_id_receive[i]].append( { - 'obs': ttorch.tensor(self._ready_obs_send[env_id_receive[i]]), - 'action': ttorch.tensor(self._ready_action_send[env_id_receive[i]]['action']), - 'next_obs': ttorch.tensor(next_obs[i]), + 'obs': self._ready_obs_send[env_id_receive[i]], + 'action': self._ready_action_send[env_id_receive[i]]['action'], + 'next_obs': next_obs[i], # n-step reward 'reward': [current_reward], - 'done': ttorch.tensor(done[i]) + 'done': done[i], } ) else: self._trajectory[env_id_receive[i]].append( { - 'obs': ttorch.tensor(self._ready_obs_send[env_id_receive[i]]), - 'action': ttorch.tensor(self._ready_action_send[env_id_receive[i]]['action']), - 'next_obs': ttorch.tensor(next_obs[i]), + 'obs': self._ready_obs_send[env_id_receive[i]], + 'action': self._ready_action_send[env_id_receive[i]]['action'], + 'next_obs': next_obs[i], # n-step reward - 'reward': ttorch.tensor(current_reward), - 'done': ttorch.tensor(done[i]) + 'reward': current_reward, + 'done': done[i], } ) @@ -236,8 +244,8 @@ def __call__(self, ctx: "OnlineRLContext") -> None: self._trajectory[env_id_receive[i]][-j]['reward'].append(current_reward) if real_reverse_record_position==self._nsteps: - self._trajectory[env_id_receive[i]][-real_reverse_record_position]['next_n_obs']=ttorch.tensor(next_obs[i]) - self._trajectory[env_id_receive[i]][-real_reverse_record_position]['value_gamma']=ttorch.tensor(self._discount_ratio_list[real_reverse_record_position-1]) + self._trajectory[env_id_receive[i]][-real_reverse_record_position]['next_n_obs']=next_obs[i] + self._trajectory[env_id_receive[i]][-real_reverse_record_position]['value_gamma']=self._discount_ratio_list[real_reverse_record_position-1] else: # done[i] == True or counter >= target_size @@ -246,29 +254,32 @@ def __call__(self, ctx: "OnlineRLContext") -> None: for j in range(1,reverse_record_position+1): if j==1: - 
self._trajectory[env_id_receive[i]][-j]['reward'].extend([ttorch.zeros_like(current_reward) for _ in range(self._nsteps-len(self._trajectory[env_id_receive[i]][-j]['reward']))]) - self._trajectory[env_id_receive[i]][-j]['next_n_obs']=ttorch.tensor(next_obs[i]) - self._trajectory[env_id_receive[i]][-j]['value_gamma']=ttorch.tensor(self._discount_ratio_list[j-1]) + self._trajectory[env_id_receive[i]][-j]['reward'].extend([np.zeros_like(current_reward) for _ in range(self._nsteps-len(self._trajectory[env_id_receive[i]][-j]['reward']))]) + self._trajectory[env_id_receive[i]][-j]['next_n_obs']=next_obs[i] + self._trajectory[env_id_receive[i]][-j]['value_gamma']=self._discount_ratio_list[j-1] else: if self._trajectory[env_id_receive[i]][-j]['done']==True: real_reverse_record_position=j break else: self._trajectory[env_id_receive[i]][-j]['reward'].append(current_reward) - self._trajectory[env_id_receive[i]][-j]['reward'].extend([ttorch.zeros_like(current_reward) for _ in range(self._nsteps-len(self._trajectory[env_id_receive[i]][-j]['reward']))]) - self._trajectory[env_id_receive[i]][-j]['next_n_obs']=ttorch.tensor(next_obs[i]) - self._trajectory[env_id_receive[i]][-j]['value_gamma']=ttorch.tensor(self._discount_ratio_list[j-1]) + self._trajectory[env_id_receive[i]][-j]['reward'].extend([np.zeros_like(current_reward) for _ in range(self._nsteps-len(self._trajectory[env_id_receive[i]][-j]['reward']))]) + self._trajectory[env_id_receive[i]][-j]['next_n_obs']=next_obs[i] + self._trajectory[env_id_receive[i]][-j]['value_gamma']=self._discount_ratio_list[j-1] else: - self._trajectory[env_id_receive[i]][-1]['value_gamma']=ttorch.tensor(self._discount_ratio_list[0]) + self._trajectory[env_id_receive[i]][-1]['value_gamma']=self._discount_ratio_list[0] + time_process+=time.time()-start_process if counter >= target_size: - # transform reward to ttorch.tensor - for i in range(self.env.env_num): - for j in range(len(self._trajectory[i])): - self._trajectory[i][j]['reward']=ttorch.concat(self._trajectory[env_id_receive[i]][j]['reward']) + # if self._nsteps>1: + # # transform reward to ttorch.tensor + # for i in range(self.env.env_num): + # for j in range(len(self._trajectory[i])): + # self._trajectory[i][j]['reward']=np.concatenate(self._trajectory[env_id_receive[i]][j]['reward']) break + ctx.trajectories=[] for i in range(self.env.env_num): @@ -277,6 +288,9 @@ def __call__(self, ctx: "OnlineRLContext") -> None: ctx.env_step += len(ctx.trajectories) ctx.collector_time += time.time() - start + print(f'time_send:[{time_send}]') + print(f'time_receive:[{time_receive}]') + print(f'time_process:[{time_process}]') class PPOFStepCollector: """ diff --git a/ding/policy/common_utils.py b/ding/policy/common_utils.py index c53fb336aa..4accd1d0fc 100644 --- a/ding/policy/common_utils.py +++ b/ding/policy/common_utils.py @@ -1,4 +1,5 @@ from typing import List, Any +import numpy as np import torch import treetensor.torch as ttorch from ding.utils.data import default_collate @@ -61,6 +62,73 @@ def fast_preprocess_learn( # data preprocess processes_data = {} + action=torch.tensor(np.array([data[i]['action'] for i in range(len(data))])) + if cuda: + action = to_device(action, device=device) + if action.ndim == 2 and action.shape[1] == 1: + action = action.squeeze(1) + processes_data['action'] = action + + obs = torch.tensor(np.array([data[i]['obs'] for i in range(len(data))])) + if cuda: + obs = to_device(obs, device=device) + processes_data['obs'] = obs + + next_obs = torch.tensor(np.array([data[i]['next_obs'] for i in 
range(len(data))])) + if cuda: + next_obs = to_device(next_obs, device=device) + processes_data['next_obs'] = next_obs + + if 'next_n_obs' in data[0]: + next_n_obs = torch.tensor(np.array([data[i]['next_n_obs'] for i in range(len(data))])) + if cuda: + next_n_obs = to_device(next_n_obs, device=device) + processes_data['next_n_obs'] = next_n_obs + + reward = torch.tensor(np.array([data[i]['reward'] for i in range(len(data))])) + if cuda: + reward = to_device(reward, device=device) + reward = reward.permute(1, 0).contiguous() + processes_data['reward'] = reward + + if 'value_gamma' in data[0]: + value_gamma = torch.tensor(np.array([data[i]['value_gamma'] for i in range(len(data))]), dtype=torch.float32) + if cuda: + value_gamma = to_device(value_gamma, device=device) + processes_data['value_gamma'] = value_gamma + + done = torch.tensor(np.array([data[i]['done'] for i in range(len(data))]), dtype=torch.float32) + if cuda: + done = to_device(done, device=device) + processes_data['done'] = done + + if use_priority and use_priority_IS_weight: + if 'priority_IS' in data: + weight = data['priority_IS'] + else: # for compability + weight = data['IS'] + else: + if 'weight' in data[0]: + weight = torch.tensor(np.array([data[i]['weight'] for i in range(len(data))])) + else: + weight = None + + if weight and cuda: + weight = to_device(weight, device=device) + processes_data['weight'] = weight + + return processes_data + +def fast_preprocess_learn_v2( + data: List[Any], + use_priority_IS_weight: bool = False, + use_priority: bool = False, + cuda: bool = False, + device: str = 'cpu', +) -> dict: + # data preprocess + processes_data = {} + action = torch.stack([data[i]['action'] for i in range(len(data))]) if cuda: action = to_device(action, device=device) From 83ece4f5b01203bcd2a33a51cb7eae07942f7781 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 7 Sep 2023 15:49:41 +0800 Subject: [PATCH 201/244] polish code --- ding/example/dqn_envpool_wandb_new_nstep.py | 1 - ding/policy/dqn.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/ding/example/dqn_envpool_wandb_new_nstep.py b/ding/example/dqn_envpool_wandb_new_nstep.py index 2de943d176..4a20334cd2 100644 --- a/ding/example/dqn_envpool_wandb_new_nstep.py +++ b/ding/example/dqn_envpool_wandb_new_nstep.py @@ -127,7 +127,6 @@ def main(cfg): pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size pong_dqn_envpool_config.seed = arg.seed pong_dqn_envpool_config.env.stop_value = 2000 - pong_dqn_envpool_config.policy.nstep = 1 pong_dqn_envpool_config.nstep = 3 pong_dqn_envpool_config.policy.nstep = 3 pong_dqn_envpool_config.seed = arg.seed diff --git a/ding/policy/dqn.py b/ding/policy/dqn.py index f70f12d1d8..e11810b3ba 100644 --- a/ding/policy/dqn.py +++ b/ding/policy/dqn.py @@ -714,9 +714,9 @@ def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: q_value = self._learn_model.forward(data['obs'])['logit'] # Target q value with torch.no_grad(): - target_next_n_q_value = self._target_model.forward(data['next_obs'])['logit'] + target_next_n_q_value = self._target_model.forward(data['next_n_obs'])['logit'] # Max q value action (main model), i.e. 
Double DQN - target_next_n_action = self._learn_model.forward(data['next_obs'])['action'] + target_next_n_action = self._learn_model.forward(data['next_n_obs'])['action'] data_n = q_nstep_td_data( q_value, target_next_n_q_value, data['action'], target_next_n_action, data['reward'], data['done'], data['weight'] From 1980d51f1fd2fcddd3e28cdd02a193a07196b3a2 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 8 Sep 2023 15:08:16 +0800 Subject: [PATCH 202/244] add spaceinvaders envpool --- ...n_envpool_wandb_new_nstep_spaceinvaders.py | 134 ++++++++++++++++++ 1 file changed, 134 insertions(+) create mode 100644 ding/example/dqn_envpool_wandb_new_nstep_spaceinvaders.py diff --git a/ding/example/dqn_envpool_wandb_new_nstep_spaceinvaders.py b/ding/example/dqn_envpool_wandb_new_nstep_spaceinvaders.py new file mode 100644 index 0000000000..3fae024051 --- /dev/null +++ b/ding/example/dqn_envpool_wandb_new_nstep_spaceinvaders.py @@ -0,0 +1,134 @@ +import datetime +import torch +try: + torch.multiprocessing.set_start_method('spawn') +except RuntimeError: + pass +from easydict import EasyDict +from ditk import logging +from ding.model import DQN +from ding.policy import DQNFastPolicy +from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ + termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV2 +from ding.utils import set_pkg_seed + +from dizoo.atari.config.serial.spaceinvaders import spaceinvaders_dqn_envpool_config + + +def main(cfg): + logging.getLogger().setLevel(logging.INFO) + cfg.exp_name = 'Pong-v5-DQN-envpool-spaceinvaders-new-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + + collector_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.collector_env_num, + 'batch_size': cfg.env.collector_batch_size, + # env wrappers + 'episodic_life': True, # collector: True + 'reward_clip': False, # collector: True + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["collector_env_cfg"] = collector_env_cfg + evaluator_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.evaluator_env_num, + 'batch_size': cfg.env.evaluator_batch_size, + # env wrappers + 'episodic_life': False, # evaluator: False + 'reward_clip': False, # evaluator: False + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["evaluator_env_cfg"] = evaluator_env_cfg + cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) + evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) + collector_env.seed(cfg.seed) + evaluator_env.seed(cfg.seed) + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNFastPolicy(cfg.policy, model=model) + + # Consider the case with multiple processes + if task.router.is_active: + # You can use labels to distinguish between workers with 
different roles, + # here we use node_id to distinguish. + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + # Sync their context and model between each worker. + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + task.use(epoch_timer()) + + # Here is the part of single process pipeline. + task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use( + EnvpoolStepCollectorV2( + cfg, + policy.collect_mode, + collector_env, + random_collect_size=cfg.policy.random_collect_size \ + if hasattr(cfg.policy, 'random_collect_size') else 0, + ) + ) + task.use(data_pusher(cfg, buffer_)) + #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(OffPolicyLearnerV2(cfg, policy, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use( + wandb_online_logger( + metric_list=policy.monitor_vars(), + model=policy._model, + exp_config=cfg, + anonymous=True, + project_name=cfg.exp_name, + wandb_sweep=False, + ) + ) + + #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) + task.use(termination_checker(max_env_step=10000000)) + + task.run() + + +if __name__ == "__main__": + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=0, help="random seed") + parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") + parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") + arg = parser.parse_args() + + spaceinvaders_dqn_envpool_config.env.collector_env_num = arg.collector_env_num + spaceinvaders_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size + spaceinvaders_dqn_envpool_config.seed = arg.seed + spaceinvaders_dqn_envpool_config.env.stop_value = 1000000000 + spaceinvaders_dqn_envpool_config.nstep = 3 + spaceinvaders_dqn_envpool_config.policy.nstep = 3 + spaceinvaders_dqn_envpool_config.seed = arg.seed + + main(spaceinvaders_dqn_envpool_config) From 96c0bbf63c9ff8a592ef7e21bd8029a167a3e804 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 8 Sep 2023 19:57:23 +0800 Subject: [PATCH 203/244] fix import bug --- ding/example/dqn_envpool_wandb_new_nstep_spaceinvaders.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_spaceinvaders.py b/ding/example/dqn_envpool_wandb_new_nstep_spaceinvaders.py index 3fae024051..439c03aaa1 100644 --- a/ding/example/dqn_envpool_wandb_new_nstep_spaceinvaders.py +++ b/ding/example/dqn_envpool_wandb_new_nstep_spaceinvaders.py @@ -18,12 +18,12 @@ termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV2 from ding.utils import set_pkg_seed -from dizoo.atari.config.serial.spaceinvaders import spaceinvaders_dqn_envpool_config +from dizoo.atari.config.serial.spaceinvaders.spaceinvaders_dqn_envpool_config import spaceinvaders_dqn_envpool_config def main(cfg): logging.getLogger().setLevel(logging.INFO) - cfg.exp_name = 'Pong-v5-DQN-envpool-spaceinvaders-new-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + cfg.exp_name = 'Spaceinvaders-v5-DQN-envpool-new-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") collector_env_cfg = EasyDict( { From 7adbc773b27762b46750bd4543cabf062db2cd76 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 11 Oct 2023 12:52:12 +0800 Subject: [PATCH 204/244] merge file from main --- 
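Note on the n-step fields introduced by [PATCH 200/244] and [PATCH 201/244] above: the collector stores a per-transition next_n_obs and value_gamma, and the DQN learn step then bootstraps from next_n_obs rather than next_obs. The snippet below is only a simplified, hypothetical sketch of the kind of n-step double-DQN target those fields feed; the function name and tensor shapes are assumptions for illustration, and the real computation in DI-engine goes through q_nstep_td_data / q_nstep_td_error, whose reward padding and terminal handling may differ in detail.

import torch

def nstep_double_dqn_target(q_target_next, q_online_next, reward, done, value_gamma, gamma=0.99):
    # reward: [nstep, batch] per-step rewards as built by the collector (zero-padded past episode end)
    # q_target_next / q_online_next: [batch, n_action] Q-values at next_n_obs from target / online nets
    # done, value_gamma: [batch]; value_gamma is roughly gamma**k for the actual lookahead length k
    nstep = reward.shape[0]
    discounts = gamma ** torch.arange(nstep, dtype=reward.dtype)      # [nstep]
    nstep_reward = (discounts.unsqueeze(1) * reward).sum(dim=0)       # [batch] discounted reward sum
    next_action = q_online_next.argmax(dim=-1)                        # double DQN: select action with online net
    next_q = q_target_next.gather(-1, next_action.unsqueeze(-1)).squeeze(-1)
    return nstep_reward + value_gamma * (1.0 - done) * next_q

Storing value_gamma per transition (as the collector above does via its discount ratio list) lets trajectories that terminate before n steps use a smaller bootstrap exponent instead of a fixed gamma**nstep.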
ding/config/A2C/__init__.py | 27 ---- ding/config/A2C/gym_bipedalwalker_v3.py | 43 ----- ding/config/A2C/gym_halfcheetah_v3.py | 48 ------ ding/config/A2C/gym_hopper_v3.py | 46 ------ ding/config/A2C/gym_lunarlander_v2.py | 38 ----- ding/config/A2C/gym_pendulum_v1.py | 1 - ding/config/A2C/gym_walker2d_v3.py | 46 ------ ding/config/C51/__init__.py | 23 --- ding/config/C51/gym_lunarlander_v2.py | 52 ------ ding/config/C51/gym_pongnoframeskip_v4.py | 54 ------- ding/config/C51/gym_qbertnoframeskip_v4.py | 54 ------- .../C51/gym_spaceInvadersnoframeskip_v4.py | 54 ------- ding/config/DDPG/__init__.py | 29 ---- ding/config/DDPG/gym_bipedalwalker_v3.py | 45 ------ ding/config/DDPG/gym_halfcheetah_v3.py | 55 ------- ding/config/DDPG/gym_hopper_v3.py | 55 ------- .../DDPG/gym_lunarlandercontinuous_v2.py | 60 ------- ding/config/DDPG/gym_pendulum_v1.py | 52 ------ ding/config/DDPG/gym_walker2d_v3.py | 55 ------- ding/config/DQN/__init__.py | 23 --- ding/config/DQN/gym_lunarlander_v2.py | 53 ------- ding/config/DQN/gym_pongnoframeskip_v4.py | 50 ------ ding/config/DQN/gym_qbertnoframeskip_v4.py | 50 ------ .../DQN/gym_spaceInvadersnoframeskip_v4.py | 51 ------ ding/config/PG/__init__.py | 26 --- ding/config/PG/gym_bipedalwalker_v3.py | 43 ----- ding/config/PG/gym_halfcheetah_v3.py | 46 ------ ding/config/PG/gym_hopper_v3.py | 46 ------ ding/config/PG/gym_lunarlander_v2.py | 38 ----- ding/config/PG/gym_pendulum_v1.py | 42 ----- ding/config/PG/gym_walker2d_v3.py | 46 ------ ding/config/PPOF/__init__.py | 17 -- ding/config/PPOF/gym_lunarlander_v2.py | 13 -- .../PPOF/gym_lunarlandercontinuous_v2.py | 15 -- ding/config/PPOOffPolicy/__init__.py | 26 --- .../config/PPOOffPolicy/gym_lunarlander_v2.py | 44 ----- .../gym_lunarlandercontinuous_v2.py | 109 ------------- .../PPOOffPolicy/gym_pongnoframeskip_v4.py | 54 ------- .../PPOOffPolicy/gym_qbertnoframeskip_v4.py | 48 ------ .../gym_spaceInvadersnoframeskip_v4.py | 48 ------ ding/config/SAC/__init__.py | 29 ---- ding/config/SAC/gym_bipedalwalker_v3.py | 47 ------ ding/config/SAC/gym_halfcheetah_v3.py | 56 ------- ding/config/SAC/gym_hopper_v3.py | 43 ----- .../SAC/gym_lunarlandercontinuous_v2.py | 44 ----- ding/config/SAC/gym_pendulum_v1.py | 49 ------ ding/config/SAC/gym_walker2d_v3.py | 56 ------- ding/config/SQL/__init__.py | 14 -- ding/config/SQL/gym_lunarlander_v2.py | 43 ----- ding/config/TD3/__init__.py | 29 ---- ding/config/TD3/gym_bipedalwalker_v3.py | 52 ------ ding/config/TD3/gym_halfcheetah_v3.py | 58 ------- ding/config/TD3/gym_hopper_v3.py | 37 ----- .../TD3/gym_lunarlandercontinuous_v2.py | 50 ------ ding/config/TD3/gym_pendulum_v1.py | 54 ------- ding/config/TD3/gym_walker2d_v3.py | 60 ------- ding/torch_utils/modules/__init__.py | 6 - ding/torch_utils/modules/distribution.py | 34 ---- ding/torch_utils/modules/function.py | 25 --- ding/torch_utils/modules/gaussian.py | 150 ------------------ ding/torch_utils/modules/matrix.py | 52 ------ ding/torch_utils/modules/parameter.py | 37 ----- ding/torch_utils/modules/perceptron.py | 52 ------ .../pendulum/config/pendulum_pg_config.py | 54 ------- dizoo/mujoco/config/halfcheetah_a2c_config.py | 65 -------- dizoo/mujoco/config/halfcheetah_pg_config.py | 52 ------ dizoo/mujoco/config/hopper_a2c_config.py | 64 -------- dizoo/mujoco/config/hopper_pg_config.py | 52 ------ dizoo/mujoco/config/walker2d_a2c_config.py | 64 -------- dizoo/mujoco/config/walker2d_pg_config.py | 52 ------ 70 files changed, 3205 deletions(-) delete mode 100644 ding/config/A2C/__init__.py delete mode 100644 
ding/config/A2C/gym_bipedalwalker_v3.py delete mode 100644 ding/config/A2C/gym_halfcheetah_v3.py delete mode 100644 ding/config/A2C/gym_hopper_v3.py delete mode 100644 ding/config/A2C/gym_lunarlander_v2.py delete mode 100644 ding/config/A2C/gym_pendulum_v1.py delete mode 100644 ding/config/A2C/gym_walker2d_v3.py delete mode 100644 ding/config/C51/__init__.py delete mode 100644 ding/config/C51/gym_lunarlander_v2.py delete mode 100644 ding/config/C51/gym_pongnoframeskip_v4.py delete mode 100644 ding/config/C51/gym_qbertnoframeskip_v4.py delete mode 100644 ding/config/C51/gym_spaceInvadersnoframeskip_v4.py delete mode 100644 ding/config/DDPG/__init__.py delete mode 100644 ding/config/DDPG/gym_bipedalwalker_v3.py delete mode 100644 ding/config/DDPG/gym_halfcheetah_v3.py delete mode 100644 ding/config/DDPG/gym_hopper_v3.py delete mode 100644 ding/config/DDPG/gym_lunarlandercontinuous_v2.py delete mode 100644 ding/config/DDPG/gym_pendulum_v1.py delete mode 100644 ding/config/DDPG/gym_walker2d_v3.py delete mode 100644 ding/config/DQN/__init__.py delete mode 100644 ding/config/DQN/gym_lunarlander_v2.py delete mode 100644 ding/config/DQN/gym_pongnoframeskip_v4.py delete mode 100644 ding/config/DQN/gym_qbertnoframeskip_v4.py delete mode 100644 ding/config/DQN/gym_spaceInvadersnoframeskip_v4.py delete mode 100644 ding/config/PG/__init__.py delete mode 100644 ding/config/PG/gym_bipedalwalker_v3.py delete mode 100644 ding/config/PG/gym_halfcheetah_v3.py delete mode 100644 ding/config/PG/gym_hopper_v3.py delete mode 100644 ding/config/PG/gym_lunarlander_v2.py delete mode 100644 ding/config/PG/gym_pendulum_v1.py delete mode 100644 ding/config/PG/gym_walker2d_v3.py delete mode 100644 ding/config/PPOF/__init__.py delete mode 100644 ding/config/PPOF/gym_lunarlander_v2.py delete mode 100644 ding/config/PPOF/gym_lunarlandercontinuous_v2.py delete mode 100644 ding/config/PPOOffPolicy/__init__.py delete mode 100644 ding/config/PPOOffPolicy/gym_lunarlander_v2.py delete mode 100644 ding/config/PPOOffPolicy/gym_lunarlandercontinuous_v2.py delete mode 100644 ding/config/PPOOffPolicy/gym_pongnoframeskip_v4.py delete mode 100644 ding/config/PPOOffPolicy/gym_qbertnoframeskip_v4.py delete mode 100644 ding/config/PPOOffPolicy/gym_spaceInvadersnoframeskip_v4.py delete mode 100644 ding/config/SAC/__init__.py delete mode 100644 ding/config/SAC/gym_bipedalwalker_v3.py delete mode 100644 ding/config/SAC/gym_halfcheetah_v3.py delete mode 100644 ding/config/SAC/gym_hopper_v3.py delete mode 100644 ding/config/SAC/gym_lunarlandercontinuous_v2.py delete mode 100644 ding/config/SAC/gym_pendulum_v1.py delete mode 100644 ding/config/SAC/gym_walker2d_v3.py delete mode 100644 ding/config/SQL/__init__.py delete mode 100644 ding/config/SQL/gym_lunarlander_v2.py delete mode 100644 ding/config/TD3/__init__.py delete mode 100644 ding/config/TD3/gym_bipedalwalker_v3.py delete mode 100644 ding/config/TD3/gym_halfcheetah_v3.py delete mode 100644 ding/config/TD3/gym_hopper_v3.py delete mode 100644 ding/config/TD3/gym_lunarlandercontinuous_v2.py delete mode 100644 ding/config/TD3/gym_pendulum_v1.py delete mode 100644 ding/config/TD3/gym_walker2d_v3.py delete mode 100644 ding/torch_utils/modules/__init__.py delete mode 100644 ding/torch_utils/modules/distribution.py delete mode 100644 ding/torch_utils/modules/function.py delete mode 100644 ding/torch_utils/modules/gaussian.py delete mode 100644 ding/torch_utils/modules/matrix.py delete mode 100644 ding/torch_utils/modules/parameter.py delete mode 100644 ding/torch_utils/modules/perceptron.py 
delete mode 100644 dizoo/classic_control/pendulum/config/pendulum_pg_config.py delete mode 100644 dizoo/mujoco/config/halfcheetah_a2c_config.py delete mode 100644 dizoo/mujoco/config/halfcheetah_pg_config.py delete mode 100644 dizoo/mujoco/config/hopper_a2c_config.py delete mode 100644 dizoo/mujoco/config/hopper_pg_config.py delete mode 100644 dizoo/mujoco/config/walker2d_a2c_config.py delete mode 100644 dizoo/mujoco/config/walker2d_pg_config.py diff --git a/ding/config/A2C/__init__.py b/ding/config/A2C/__init__.py deleted file mode 100644 index 63c6804ef6..0000000000 --- a/ding/config/A2C/__init__.py +++ /dev/null @@ -1,27 +0,0 @@ -from easydict import EasyDict -from . import gym_bipedalwalker_v3 -from . import gym_halfcheetah_v3 -from . import gym_hopper_v3 -from . import gym_lunarlander_v2 -from . import gym_pendulum_v1 -from . import gym_walker2d_v3 - -supported_env_cfg = { - gym_bipedalwalker_v3.cfg.env.env_id: gym_bipedalwalker_v3.cfg, - gym_halfcheetah_v3.cfg.env.env_id: gym_halfcheetah_v3.cfg, - gym_hopper_v3.cfg.env.env_id: gym_hopper_v3.cfg, - gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.cfg, - gym_walker2d_v3.cfg.env.env_id: gym_walker2d_v3.cfg, -} - -supported_env_cfg = EasyDict(supported_env_cfg) - -supported_env = { - gym_bipedalwalker_v3.cfg.env.env_id: gym_bipedalwalker_v3.env, - gym_halfcheetah_v3.cfg.env.env_id: gym_halfcheetah_v3.env, - gym_hopper_v3.cfg.env.env_id: gym_hopper_v3.env, - gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.env, - gym_walker2d_v3.cfg.env.env_id: gym_walker2d_v3.env, -} - -supported_env = EasyDict(supported_env) diff --git a/ding/config/A2C/gym_bipedalwalker_v3.py b/ding/config/A2C/gym_bipedalwalker_v3.py deleted file mode 100644 index 8293baed12..0000000000 --- a/ding/config/A2C/gym_bipedalwalker_v3.py +++ /dev/null @@ -1,43 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='Bipedalwalker-v3-A2C', - seed=0, - env=dict( - env_id='BipedalWalker-v3', - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - act_scale=True, - rew_clip=True, - ), - policy=dict( - cuda=True, - action_space='continuous', - model=dict( - action_space='continuous', - obs_shape=24, - action_shape=4, - ), - learn=dict( - batch_size=64, - learning_rate=0.0003, - value_weight=0.7, - entropy_weight=0.0005, - discount_factor=0.99, - adv_norm=True, - ), - collect=dict( - n_sample=64, - discount_factor=0.99, - ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/A2C/gym_halfcheetah_v3.py b/ding/config/A2C/gym_halfcheetah_v3.py deleted file mode 100644 index 4f06bab30d..0000000000 --- a/ding/config/A2C/gym_halfcheetah_v3.py +++ /dev/null @@ -1,48 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='HalfCheetah-v3-A2C', - env=dict( - env_id='HalfCheetah-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=1, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=12000, - env_wrapper='mujoco_default', - act_scale=True, - rew_clip=True, - ), - policy=dict( - cuda=True, - action_space='continuous', - model=dict( - action_space='continuous', - obs_shape=17, - action_shape=6, - ), - learn=dict( - batch_size=256, - learning_rate=0.0003, - value_weight=0.5, - entropy_weight=0.01, - grad_norm=0.5, - ignore_done=True, - adv_norm=True, - ), - collect=dict( - n_sample=256, - discount_factor=0.99, 
- gae_lambda=0.95, - ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/A2C/gym_hopper_v3.py b/ding/config/A2C/gym_hopper_v3.py deleted file mode 100644 index bcefe401b5..0000000000 --- a/ding/config/A2C/gym_hopper_v3.py +++ /dev/null @@ -1,46 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='Hopper-v3-A2C', - env=dict( - env_id='Hopper-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=1, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=12000, - env_wrapper='mujoco_default', - act_scale=True, - rew_clip=True, - ), - policy=dict( - cuda=True, - action_space='continuous', - model=dict( - obs_shape=11, - action_shape=3, - action_space='continuous', - ), - learn=dict( - batch_size=128, - learning_rate=0.0003, - value_weight=0.5, - entropy_weight=0.02, - adv_norm=True, - ), - collect=dict( - n_sample=128, - discount_factor=0.99, - gae_lambda=0.95, - ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/A2C/gym_lunarlander_v2.py b/ding/config/A2C/gym_lunarlander_v2.py deleted file mode 100644 index 8e85171768..0000000000 --- a/ding/config/A2C/gym_lunarlander_v2.py +++ /dev/null @@ -1,38 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='LunarLander-v2-A2C', - env=dict( - collector_env_num=8, - evaluator_env_num=8, - env_id='LunarLander-v2', - n_evaluator_episode=8, - stop_value=260, - ), - policy=dict( - cuda=True, - model=dict( - obs_shape=8, - action_shape=4, - ), - learn=dict( - batch_size=64, - learning_rate=3e-4, - entropy_weight=0.001, - adv_norm=True, - ), - collect=dict( - n_sample=64, - discount_factor=0.99, - gae_lambda=0.95, - ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/A2C/gym_pendulum_v1.py b/ding/config/A2C/gym_pendulum_v1.py deleted file mode 100644 index 464090415c..0000000000 --- a/ding/config/A2C/gym_pendulum_v1.py +++ /dev/null @@ -1 +0,0 @@ -# TODO diff --git a/ding/config/A2C/gym_walker2d_v3.py b/ding/config/A2C/gym_walker2d_v3.py deleted file mode 100644 index 287e9b0fe3..0000000000 --- a/ding/config/A2C/gym_walker2d_v3.py +++ /dev/null @@ -1,46 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='Walker2d-v3-A2C', - env=dict( - env_id='Walker2d-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=1, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=12000, - env_wrapper='mujoco_default', - act_scale=True, - rew_clip=True, - ), - policy=dict( - cuda=True, - action_space='continuous', - model=dict( - action_space='continuous', - obs_shape=17, - action_shape=6, - ), - learn=dict( - batch_size=32, - learning_rate=0.0003, - value_weight=0.5, - entropy_weight=0.005, - adv_norm=True, - ), - collect=dict( - n_sample=32, - discount_factor=0.99, - gae_lambda=0.95, - ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = 
ding.envs.gym_env.env diff --git a/ding/config/C51/__init__.py b/ding/config/C51/__init__.py deleted file mode 100644 index 2704b04c53..0000000000 --- a/ding/config/C51/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -from easydict import EasyDict -from . import gym_lunarlander_v2 -from . import gym_pongnoframeskip_v4 -from . import gym_qbertnoframeskip_v4 -from . import gym_spaceInvadersnoframeskip_v4 - -supported_env_cfg = { - gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.cfg, - gym_pongnoframeskip_v4.cfg.env.env_id: gym_pongnoframeskip_v4.cfg, - gym_qbertnoframeskip_v4.cfg.env.env_id: gym_qbertnoframeskip_v4.cfg, - gym_spaceInvadersnoframeskip_v4.cfg.env.env_id: gym_spaceInvadersnoframeskip_v4.cfg, -} - -supported_env_cfg = EasyDict(supported_env_cfg) - -supported_env = { - gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.env, - gym_pongnoframeskip_v4.cfg.env.env_id: gym_pongnoframeskip_v4.env, - gym_qbertnoframeskip_v4.cfg.env.env_id: gym_qbertnoframeskip_v4.env, - gym_spaceInvadersnoframeskip_v4.cfg.env.env_id: gym_spaceInvadersnoframeskip_v4.env, -} - -supported_env = EasyDict(supported_env) diff --git a/ding/config/C51/gym_lunarlander_v2.py b/ding/config/C51/gym_lunarlander_v2.py deleted file mode 100644 index 6c52cc691a..0000000000 --- a/ding/config/C51/gym_lunarlander_v2.py +++ /dev/null @@ -1,52 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='lunarlander_c51', - seed=0, - env=dict( - collector_env_num=8, - evaluator_env_num=8, - env_id='LunarLander-v2', - n_evaluator_episode=8, - stop_value=260, - ), - policy=dict( - cuda=False, - model=dict( - obs_shape=8, - action_shape=4, - encoder_hidden_size_list=[512, 64], - v_min=-30, - v_max=30, - n_atom=51, - ), - discount_factor=0.99, - nstep=3, - learn=dict( - update_per_collect=10, - batch_size=64, - learning_rate=0.001, - target_update_freq=100, - ), - collect=dict( - n_sample=64, - unroll_len=1, - ), - other=dict( - eps=dict( - type='exp', - start=0.95, - end=0.1, - decay=50000, - ), replay_buffer=dict(replay_buffer_size=100000, ) - ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/C51/gym_pongnoframeskip_v4.py b/ding/config/C51/gym_pongnoframeskip_v4.py deleted file mode 100644 index d3dc9ffb9c..0000000000 --- a/ding/config/C51/gym_pongnoframeskip_v4.py +++ /dev/null @@ -1,54 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='PongNoFrameskip-v4-C51', - seed=0, - env=dict( - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=30, - env_id='PongNoFrameskip-v4', - frame_stack=4, - env_wrapper='atari_default', - ), - policy=dict( - cuda=True, - priority=False, - model=dict( - obs_shape=[4, 84, 84], - action_shape=6, - encoder_hidden_size_list=[128, 128, 512], - v_min=-10, - v_max=10, - n_atom=51, - ), - nstep=3, - discount_factor=0.99, - learn=dict( - update_per_collect=10, - batch_size=32, - learning_rate=0.0001, - target_update_freq=500, - ), - collect=dict(n_sample=100, ), - eval=dict(evaluator=dict(eval_freq=4000, )), - other=dict( - eps=dict( - type='exp', - start=1., - end=0.05, - decay=250000, - ), - replay_buffer=dict(replay_buffer_size=100000, ), - ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff 
--git a/ding/config/C51/gym_qbertnoframeskip_v4.py b/ding/config/C51/gym_qbertnoframeskip_v4.py deleted file mode 100644 index b68231f1b1..0000000000 --- a/ding/config/C51/gym_qbertnoframeskip_v4.py +++ /dev/null @@ -1,54 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='QbertNoFrameskip-v4-C51', - seed=0, - env=dict( - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=30000, - env_id='QbertNoFrameskip-v4', - frame_stack=4, - env_wrapper='atari_default', - ), - policy=dict( - cuda=True, - priority=True, - model=dict( - obs_shape=[4, 84, 84], - action_shape=6, - encoder_hidden_size_list=[128, 128, 512], - v_min=-10, - v_max=10, - n_atom=51, - ), - nstep=3, - discount_factor=0.99, - learn=dict( - update_per_collect=10, - batch_size=32, - learning_rate=0.0001, - target_update_freq=500, - ), - collect=dict(n_sample=100, ), - eval=dict(evaluator=dict(eval_freq=4000, )), - other=dict( - eps=dict( - type='exp', - start=1., - end=0.05, - decay=1000000, - ), - replay_buffer=dict(replay_buffer_size=400000, ), - ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/C51/gym_spaceInvadersnoframeskip_v4.py b/ding/config/C51/gym_spaceInvadersnoframeskip_v4.py deleted file mode 100644 index e635005b75..0000000000 --- a/ding/config/C51/gym_spaceInvadersnoframeskip_v4.py +++ /dev/null @@ -1,54 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='SpaceInvadersNoFrameskip-v4-C51', - seed=0, - env=dict( - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=10000000000, - env_id='SpaceInvadersNoFrameskip-v4', - frame_stack=4, - env_wrapper='atari_default', - ), - policy=dict( - cuda=True, - priority=False, - model=dict( - obs_shape=[4, 84, 84], - action_shape=6, - encoder_hidden_size_list=[128, 128, 512], - v_min=-10, - v_max=10, - n_atom=51, - ), - nstep=3, - discount_factor=0.99, - learn=dict( - update_per_collect=10, - batch_size=32, - learning_rate=0.0001, - target_update_freq=500, - ), - collect=dict(n_sample=100, ), - eval=dict(evaluator=dict(eval_freq=4000, )), - other=dict( - eps=dict( - type='exp', - start=1., - end=0.05, - decay=1000000, - ), - replay_buffer=dict(replay_buffer_size=400000, ), - ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/DDPG/__init__.py b/ding/config/DDPG/__init__.py deleted file mode 100644 index 6e01f29d74..0000000000 --- a/ding/config/DDPG/__init__.py +++ /dev/null @@ -1,29 +0,0 @@ -from easydict import EasyDict -from . import gym_bipedalwalker_v3 -from . import gym_halfcheetah_v3 -from . import gym_hopper_v3 -from . import gym_lunarlandercontinuous_v2 -from . import gym_pendulum_v1 -from . 
import gym_walker2d_v3 - -supported_env_cfg = { - gym_bipedalwalker_v3.cfg.env.env_id: gym_bipedalwalker_v3.cfg, - gym_halfcheetah_v3.cfg.env.env_id: gym_halfcheetah_v3.cfg, - gym_hopper_v3.cfg.env.env_id: gym_hopper_v3.cfg, - gym_lunarlandercontinuous_v2.cfg.env.env_id: gym_lunarlandercontinuous_v2.cfg, - gym_pendulum_v1.cfg.env.env_id: gym_pendulum_v1.cfg, - gym_walker2d_v3.cfg.env.env_id: gym_walker2d_v3.cfg, -} - -supported_env_cfg = EasyDict(supported_env_cfg) - -supported_env = { - gym_bipedalwalker_v3.cfg.env.env_id: gym_bipedalwalker_v3.env, - gym_halfcheetah_v3.cfg.env.env_id: gym_halfcheetah_v3.env, - gym_hopper_v3.cfg.env.env_id: gym_hopper_v3.env, - gym_lunarlandercontinuous_v2.cfg.env.env_id: gym_lunarlandercontinuous_v2.env, - gym_pendulum_v1.cfg.env.env_id: gym_pendulum_v1.env, - gym_walker2d_v3.cfg.env.env_id: gym_walker2d_v3.env, -} - -supported_env = EasyDict(supported_env) diff --git a/ding/config/DDPG/gym_bipedalwalker_v3.py b/ding/config/DDPG/gym_bipedalwalker_v3.py deleted file mode 100644 index 4a09a2070c..0000000000 --- a/ding/config/DDPG/gym_bipedalwalker_v3.py +++ /dev/null @@ -1,45 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='Bipedalwalker-v3-DDPG', - seed=0, - env=dict( - env_id='BipedalWalker-v3', - collector_env_num=8, - evaluator_env_num=5, - n_evaluator_episode=5, - act_scale=True, - rew_clip=True, - ), - policy=dict( - cuda=True, - random_collect_size=10000, - model=dict( - obs_shape=24, - action_shape=4, - twin_critic=False, - action_space='regression', - actor_head_hidden_size=400, - critic_head_hidden_size=400, - ), - learn=dict( - update_per_collect=64, - batch_size=256, - learning_rate_actor=0.0003, - learning_rate_critic=0.0003, - target_theta=0.005, - discount_factor=0.99, - learner=dict(hook=dict(log_show_after_iter=1000, )) - ), - collect=dict(n_sample=64, ), - other=dict(replay_buffer=dict(replay_buffer_size=300000, ), ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/DDPG/gym_halfcheetah_v3.py b/ding/config/DDPG/gym_halfcheetah_v3.py deleted file mode 100644 index 197c633db6..0000000000 --- a/ding/config/DDPG/gym_halfcheetah_v3.py +++ /dev/null @@ -1,55 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='HalfCheetah-v3-DDPG', - seed=0, - env=dict( - env_id='HalfCheetah-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=1, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=11000, - env_wrapper='mujoco_default', - act_scale=True, - rew_clip=True, - ), - policy=dict( - cuda=True, - random_collect_size=25000, - model=dict( - obs_shape=17, - action_shape=6, - twin_critic=False, - actor_head_hidden_size=256, - critic_head_hidden_size=256, - action_space='regression', - ), - learn=dict( - update_per_collect=1, - batch_size=256, - learning_rate_actor=1e-3, - learning_rate_critic=1e-3, - ignore_done=True, - target_theta=0.005, - discount_factor=0.99, - actor_update_freq=1, - noise=False, - ), - collect=dict( - n_sample=1, - unroll_len=1, - noise_sigma=0.1, - ), - other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git 
a/ding/config/DDPG/gym_hopper_v3.py b/ding/config/DDPG/gym_hopper_v3.py deleted file mode 100644 index e84dd7aa8a..0000000000 --- a/ding/config/DDPG/gym_hopper_v3.py +++ /dev/null @@ -1,55 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='Hopper-v3-DDPG', - seed=0, - env=dict( - env_id='Hopper-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=1, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=6000, - env_wrapper='mujoco_default', - act_scale=True, - rew_clip=True, - ), - policy=dict( - cuda=True, - random_collect_size=25000, - model=dict( - obs_shape=11, - action_shape=3, - twin_critic=False, - actor_head_hidden_size=256, - critic_head_hidden_size=256, - action_space='regression', - ), - learn=dict( - update_per_collect=1, - batch_size=256, - learning_rate_actor=1e-3, - learning_rate_critic=1e-3, - ignore_done=False, - target_theta=0.005, - discount_factor=0.99, - actor_update_freq=1, - noise=False, - ), - collect=dict( - n_sample=1, - unroll_len=1, - noise_sigma=0.1, - ), - other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/DDPG/gym_lunarlandercontinuous_v2.py b/ding/config/DDPG/gym_lunarlandercontinuous_v2.py deleted file mode 100644 index 2dca929028..0000000000 --- a/ding/config/DDPG/gym_lunarlandercontinuous_v2.py +++ /dev/null @@ -1,60 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='LunarLanderContinuous-V2-DDPG', - seed=0, - env=dict( - env_id='LunarLanderContinuous-v2', - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=260, - act_scale=True, - ), - policy=dict( - cuda=True, - random_collect_size=0, - model=dict( - obs_shape=8, - action_shape=2, - twin_critic=True, - action_space='regression', - ), - learn=dict( - update_per_collect=2, - batch_size=128, - learning_rate_actor=0.001, - learning_rate_critic=0.001, - ignore_done=False, # TODO(pu) - # (int) When critic network updates once, how many times will actor network update. - # Delayed Policy Updates in original TD3 paper(https://arxiv.org/pdf/1802.09477.pdf). - # Default 1 for DDPG, 2 for TD3. - actor_update_freq=1, - # (bool) Whether to add noise on target network's action. - # Target Policy Smoothing Regularization in original TD3 paper(https://arxiv.org/pdf/1802.09477.pdf). - # Default True for TD3, False for DDPG. 
- noise=False, - noise_sigma=0.1, - noise_range=dict( - min=-0.5, - max=0.5, - ), - ), - collect=dict( - n_sample=48, - noise_sigma=0.1, - collector=dict(collect_print_freq=1000, ), - ), - eval=dict(evaluator=dict(eval_freq=100, ), ), - other=dict(replay_buffer=dict(replay_buffer_size=20000, ), ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -from functools import partial -env = partial(ding.envs.gym_env.env, continuous=True) diff --git a/ding/config/DDPG/gym_pendulum_v1.py b/ding/config/DDPG/gym_pendulum_v1.py deleted file mode 100644 index 41cc09933c..0000000000 --- a/ding/config/DDPG/gym_pendulum_v1.py +++ /dev/null @@ -1,52 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='Pendulum-v1-DDPG', - seed=0, - env=dict( - env_id='Pendulum-v1', - collector_env_num=8, - evaluator_env_num=5, - n_evaluator_episode=5, - stop_value=-250, - act_scale=True, - ), - policy=dict( - cuda=False, - priority=False, - random_collect_size=800, - model=dict( - obs_shape=3, - action_shape=1, - twin_critic=False, - action_space='regression', - ), - learn=dict( - update_per_collect=2, - batch_size=128, - learning_rate_actor=0.001, - learning_rate_critic=0.001, - ignore_done=True, - actor_update_freq=1, - noise=False, - ), - collect=dict( - n_sample=48, - noise_sigma=0.1, - collector=dict(collect_print_freq=1000, ), - ), - eval=dict(evaluator=dict(eval_freq=100, )), - other=dict(replay_buffer=dict( - replay_buffer_size=20000, - max_use=16, - ), ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/DDPG/gym_walker2d_v3.py b/ding/config/DDPG/gym_walker2d_v3.py deleted file mode 100644 index e510bc05be..0000000000 --- a/ding/config/DDPG/gym_walker2d_v3.py +++ /dev/null @@ -1,55 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='Walker2d-v3-DDPG', - seed=0, - env=dict( - env_id='Walker2d-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=1, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=6000, - env_wrapper='mujoco_default', - act_scale=True, - rew_clip=True, - ), - policy=dict( - cuda=True, - random_collect_size=25000, - model=dict( - obs_shape=17, - action_shape=6, - twin_critic=False, - actor_head_hidden_size=256, - critic_head_hidden_size=256, - action_space='regression', - ), - learn=dict( - update_per_collect=1, - batch_size=256, - learning_rate_actor=1e-3, - learning_rate_critic=1e-3, - ignore_done=False, - target_theta=0.005, - discount_factor=0.99, - actor_update_freq=1, - noise=False, - ), - collect=dict( - n_sample=1, - unroll_len=1, - noise_sigma=0.1, - ), - other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/DQN/__init__.py b/ding/config/DQN/__init__.py deleted file mode 100644 index 2704b04c53..0000000000 --- a/ding/config/DQN/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -from easydict import EasyDict -from . import gym_lunarlander_v2 -from . import gym_pongnoframeskip_v4 -from . import gym_qbertnoframeskip_v4 -from . 
import gym_spaceInvadersnoframeskip_v4 - -supported_env_cfg = { - gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.cfg, - gym_pongnoframeskip_v4.cfg.env.env_id: gym_pongnoframeskip_v4.cfg, - gym_qbertnoframeskip_v4.cfg.env.env_id: gym_qbertnoframeskip_v4.cfg, - gym_spaceInvadersnoframeskip_v4.cfg.env.env_id: gym_spaceInvadersnoframeskip_v4.cfg, -} - -supported_env_cfg = EasyDict(supported_env_cfg) - -supported_env = { - gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.env, - gym_pongnoframeskip_v4.cfg.env.env_id: gym_pongnoframeskip_v4.env, - gym_qbertnoframeskip_v4.cfg.env.env_id: gym_qbertnoframeskip_v4.env, - gym_spaceInvadersnoframeskip_v4.cfg.env.env_id: gym_spaceInvadersnoframeskip_v4.env, -} - -supported_env = EasyDict(supported_env) diff --git a/ding/config/DQN/gym_lunarlander_v2.py b/ding/config/DQN/gym_lunarlander_v2.py deleted file mode 100644 index 0307f7031f..0000000000 --- a/ding/config/DQN/gym_lunarlander_v2.py +++ /dev/null @@ -1,53 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='LunarLander-v2-DQN', - seed=0, - env=dict( - env_id='LunarLander-v2', - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=260, - ), - policy=dict( - cuda=True, - random_collect_size=25000, - discount_factor=0.99, - nstep=3, - learn=dict( - update_per_collect=10, - batch_size=64, - learning_rate=0.001, - # Frequency of target network update. - target_update_freq=100, - ), - model=dict( - obs_shape=8, - action_shape=4, - encoder_hidden_size_list=[512, 64], - # Whether to use dueling head. - dueling=True, - ), - collect=dict( - n_sample=64, - unroll_len=1, - ), - other=dict( - eps=dict( - type='exp', - start=0.95, - end=0.1, - decay=50000, - ), replay_buffer=dict(replay_buffer_size=100000, ) - ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/DQN/gym_pongnoframeskip_v4.py b/ding/config/DQN/gym_pongnoframeskip_v4.py deleted file mode 100644 index 696ee6f50d..0000000000 --- a/ding/config/DQN/gym_pongnoframeskip_v4.py +++ /dev/null @@ -1,50 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='PongNoFrameskip-v4-DQN', - seed=0, - env=dict( - env_id='PongNoFrameskip-v4', - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=30, - fram_stack=4, - env_wrapper='atari_default', - ), - policy=dict( - cuda=True, - priority=False, - discount_factor=0.99, - nstep=3, - learn=dict( - update_per_collect=10, - batch_size=32, - learning_rate=0.0001, - # Frequency of target network update. 
- target_update_freq=500, - ), - model=dict( - obs_shape=[4, 84, 84], - action_shape=6, - encoder_hidden_size_list=[128, 128, 512], - ), - collect=dict(n_sample=96, ), - other=dict( - eps=dict( - type='exp', - start=1., - end=0.05, - decay=250000, - ), replay_buffer=dict(replay_buffer_size=100000, ) - ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/DQN/gym_qbertnoframeskip_v4.py b/ding/config/DQN/gym_qbertnoframeskip_v4.py deleted file mode 100644 index 15f2b818e6..0000000000 --- a/ding/config/DQN/gym_qbertnoframeskip_v4.py +++ /dev/null @@ -1,50 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='QbertNoFrameskip-v4-DQN', - seed=0, - env=dict( - env_id='QbertNoFrameskip-v4', - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - fram_stack=4, - stop_value=30000, - env_wrapper='atari_default', - ), - policy=dict( - cuda=True, - priority=False, - discount_factor=0.99, - nstep=3, - learn=dict( - update_per_collect=10, - batch_size=32, - learning_rate=0.0001, - # Frequency of target network update. - target_update_freq=500, - ), - model=dict( - obs_shape=[4, 84, 84], - action_shape=6, - encoder_hidden_size_list=[128, 128, 512], - ), - collect=dict(n_sample=100, ), - other=dict( - eps=dict( - type='exp', - start=1., - end=0.05, - decay=1000000, - ), replay_buffer=dict(replay_buffer_size=400000, ) - ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/DQN/gym_spaceInvadersnoframeskip_v4.py b/ding/config/DQN/gym_spaceInvadersnoframeskip_v4.py deleted file mode 100644 index ea71f743be..0000000000 --- a/ding/config/DQN/gym_spaceInvadersnoframeskip_v4.py +++ /dev/null @@ -1,51 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='SpaceInvadersNoFrameskip-v4-DQN', - seed=0, - env=dict( - env_id='SpaceInvadersNoFrameskip-v4', - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - fram_stack=4, - stop_value=2000, - env_wrapper='atari_default', - ), - policy=dict( - cuda=True, - priority=False, - discount_factor=0.99, - nstep=3, - learn=dict( - update_per_collect=10, - batch_size=32, - learning_rate=0.0001, - # Frequency of target network update. - target_update_freq=500, - hook=dict(save_ckpt_after_iter=1000000, ) - ), - model=dict( - obs_shape=[4, 84, 84], - action_shape=6, - encoder_hidden_size_list=[128, 128, 512], - ), - collect=dict(n_sample=100, ), - other=dict( - eps=dict( - type='exp', - start=1., - end=0.05, - decay=1000000, - ), replay_buffer=dict(replay_buffer_size=400000, ) - ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/PG/__init__.py b/ding/config/PG/__init__.py deleted file mode 100644 index 280e8cef9f..0000000000 --- a/ding/config/PG/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -from easydict import EasyDict -from . import gym_bipedalwalker_v3 -from . import gym_halfcheetah_v3 -from . import gym_hopper_v3 -from . import gym_lunarlander_v2 -from . 
import gym_walker2d_v3 - -supported_env_cfg = { - gym_bipedalwalker_v3.cfg.env.env_id: gym_bipedalwalker_v3.cfg, - gym_halfcheetah_v3.cfg.env.env_id: gym_halfcheetah_v3.cfg, - gym_hopper_v3.cfg.env.env_id: gym_hopper_v3.cfg, - gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.cfg, - gym_walker2d_v3.cfg.env.env_id: gym_walker2d_v3.cfg, -} - -supported_env_cfg = EasyDict(supported_env_cfg) - -supported_env = { - gym_bipedalwalker_v3.cfg.env.env_id: gym_bipedalwalker_v3.env, - gym_halfcheetah_v3.cfg.env.env_id: gym_halfcheetah_v3.env, - gym_hopper_v3.cfg.env.env_id: gym_hopper_v3.env, - gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.env, - gym_walker2d_v3.cfg.env.env_id: gym_walker2d_v3.env, -} - -supported_env = EasyDict(supported_env) diff --git a/ding/config/PG/gym_bipedalwalker_v3.py b/ding/config/PG/gym_bipedalwalker_v3.py deleted file mode 100644 index 21cff070a2..0000000000 --- a/ding/config/PG/gym_bipedalwalker_v3.py +++ /dev/null @@ -1,43 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='Bipedalwalker-v3-PG', - seed=0, - env=dict( - env_id='BipedalWalker-v3', - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=300, - act_scale=True, - rew_clip=True, - ), - policy=dict( - cuda=True, - action_space='continuous', - model=dict( - action_space='continuous', - obs_shape=24, - action_shape=4, - ), - learn=dict( - batch_size=64, - learning_rate=0.001, - entropy_weight=0.001, - ), - collect=dict( - n_episode=20, - unroll_len=1, - discount_factor=0.99, - ), - eval=dict(evaluator=dict(eval_freq=1, )) - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/PG/gym_halfcheetah_v3.py b/ding/config/PG/gym_halfcheetah_v3.py deleted file mode 100644 index a2e9b00db4..0000000000 --- a/ding/config/PG/gym_halfcheetah_v3.py +++ /dev/null @@ -1,46 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='HalfCheetah-v3-PG', - seed=0, - env=dict( - env_id='HalfCheetah-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=6000, - env_wrapper='mujoco_default', - act_scale=True, - rew_clip=True, - ), - policy=dict( - cuda=True, - action_space='continuous', - model=dict( - action_space='continuous', - obs_shape=17, - action_shape=6, - ), - learn=dict( - batch_size=64, - learning_rate=0.001, - entropy_weight=0.001, - ), - collect=dict( - n_episode=20, - unroll_len=1, - discount_factor=0.99, - ), - eval=dict(evaluator=dict(eval_freq=1, )) - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/PG/gym_hopper_v3.py b/ding/config/PG/gym_hopper_v3.py deleted file mode 100644 index 7851e7f316..0000000000 --- a/ding/config/PG/gym_hopper_v3.py +++ /dev/null @@ -1,46 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='Hopper-v3-PG', - seed=0, - env=dict( - env_id='Hopper-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=6000, - env_wrapper='mujoco_default', - act_scale=True, - rew_clip=True, - ), - policy=dict( - cuda=True, - action_space='continuous', - 
model=dict( - action_space='continuous', - obs_shape=11, - action_shape=3, - ), - learn=dict( - batch_size=64, - learning_rate=0.005, - entropy_weight=0.01, - ), - collect=dict( - n_episode=34, - unroll_len=1, - discount_factor=0.99, - ), - eval=dict(evaluator=dict(eval_freq=1, )) - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/PG/gym_lunarlander_v2.py b/ding/config/PG/gym_lunarlander_v2.py deleted file mode 100644 index 414e4940c7..0000000000 --- a/ding/config/PG/gym_lunarlander_v2.py +++ /dev/null @@ -1,38 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='LunarLander-v2-PG', - env=dict( - collector_env_num=8, - evaluator_env_num=8, - env_id='LunarLander-v2', - n_evaluator_episode=8, - stop_value=260, - ), - policy=dict( - cuda=True, - model=dict( - obs_shape=8, - action_shape=4, - ), - learn=dict( - batch_size=320, - learning_rate=3e-4, - entropy_weight=0.001, - grad_norm=0.5, - ), - collect=dict( - n_episode=8, - discount_factor=0.99, - ), - eval=dict(evaluator=dict(eval_freq=1000, ), ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/PG/gym_pendulum_v1.py b/ding/config/PG/gym_pendulum_v1.py deleted file mode 100644 index 3ec9c68904..0000000000 --- a/ding/config/PG/gym_pendulum_v1.py +++ /dev/null @@ -1,42 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='Pendulum-v1-PG', - seed=0, - env=dict( - env_id='Pendulum-v1', - collector_env_num=8, - evaluator_env_num=5, - n_evaluator_episode=5, - stop_value=-200, - act_scale=True, - ), - policy=dict( - cuda=False, - action_space='continuous', - model=dict( - action_space='continuous', - obs_shape=3, - action_shape=1, - ), - learn=dict( - batch_size=4000, - learning_rate=0.001, - entropy_weight=0.001, - ), - collect=dict( - n_episode=20, - unroll_len=1, - discount_factor=0.99, - ), - eval=dict(evaluator=dict(eval_freq=1, )) - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/PG/gym_walker2d_v3.py b/ding/config/PG/gym_walker2d_v3.py deleted file mode 100644 index db516070d0..0000000000 --- a/ding/config/PG/gym_walker2d_v3.py +++ /dev/null @@ -1,46 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='Walker2d-v3-PG', - seed=0, - env=dict( - env_id='Walker2d-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=6000, - env_wrapper='mujoco_default', - act_scale=True, - rew_clip=True, - ), - policy=dict( - cuda=True, - action_space='continuous', - model=dict( - action_space='continuous', - obs_shape=17, - action_shape=6, - ), - learn=dict( - batch_size=64, - learning_rate=0.001, - entropy_weight=0.001, - ), - collect=dict( - n_episode=20, - unroll_len=1, - discount_factor=0.99, - ), - eval=dict(evaluator=dict(eval_freq=1, )) - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env 
diff --git a/ding/config/PPOF/__init__.py b/ding/config/PPOF/__init__.py deleted file mode 100644 index 2adaaf4df2..0000000000 --- a/ding/config/PPOF/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -from easydict import EasyDict -from . import gym_lunarlander_v2 -from . import gym_lunarlandercontinuous_v2 - -supported_env_cfg = { - gym_lunarlander_v2.cfg.env_id: gym_lunarlander_v2.cfg, - gym_lunarlandercontinuous_v2.cfg.env_id: gym_lunarlandercontinuous_v2.cfg, -} - -supported_env_cfg = EasyDict(supported_env_cfg) - -supported_env = { - gym_lunarlander_v2.cfg.env_id: gym_lunarlander_v2.env, - gym_lunarlandercontinuous_v2.cfg.env_id: gym_lunarlandercontinuous_v2.env, -} - -supported_env = EasyDict(supported_env) diff --git a/ding/config/PPOF/gym_lunarlander_v2.py b/ding/config/PPOF/gym_lunarlander_v2.py deleted file mode 100644 index 2844a177b6..0000000000 --- a/ding/config/PPOF/gym_lunarlander_v2.py +++ /dev/null @@ -1,13 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='LunarLander-v2-PPO', - env_id='LunarLander-v2', - n_sample=400, - value_norm='popart', -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/PPOF/gym_lunarlandercontinuous_v2.py b/ding/config/PPOF/gym_lunarlandercontinuous_v2.py deleted file mode 100644 index 67603f7997..0000000000 --- a/ding/config/PPOF/gym_lunarlandercontinuous_v2.py +++ /dev/null @@ -1,15 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='LunarLanderContinuous-V2-PPO', - env_id='LunarLanderContinuous-v2', - action_space='continuous', - n_sample=400, - act_scale=True, -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -from functools import partial -env = partial(ding.envs.gym_env.env, continuous=True) diff --git a/ding/config/PPOOffPolicy/__init__.py b/ding/config/PPOOffPolicy/__init__.py deleted file mode 100644 index e2e296bd7d..0000000000 --- a/ding/config/PPOOffPolicy/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -from easydict import EasyDict -from . import gym_lunarlander_v2 -from . import gym_lunarlandercontinuous_v2 -from . import gym_pongnoframeskip_v4 -from . import gym_qbertnoframeskip_v4 -from . 
import gym_spaceInvadersnoframeskip_v4 - -supported_env_cfg = { - gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.cfg, - gym_lunarlandercontinuous_v2.cfg.env.env_id: gym_lunarlandercontinuous_v2.cfg, - gym_pongnoframeskip_v4.cfg.env.env_id: gym_pongnoframeskip_v4.cfg, - gym_qbertnoframeskip_v4.cfg.env.env_id: gym_qbertnoframeskip_v4.cfg, - gym_spaceInvadersnoframeskip_v4.cfg.env.env_id: gym_spaceInvadersnoframeskip_v4.cfg, -} - -supported_env_cfg = EasyDict(supported_env_cfg) - -supported_env = { - gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.env, - gym_lunarlandercontinuous_v2.cfg.env.env_id: gym_lunarlandercontinuous_v2.env, - gym_pongnoframeskip_v4.cfg.env.env_id: gym_pongnoframeskip_v4.env, - gym_qbertnoframeskip_v4.cfg.env.env_id: gym_qbertnoframeskip_v4.env, - gym_spaceInvadersnoframeskip_v4.cfg.env.env_id: gym_spaceInvadersnoframeskip_v4.env, -} - -supported_env = EasyDict(supported_env) diff --git a/ding/config/PPOOffPolicy/gym_lunarlander_v2.py b/ding/config/PPOOffPolicy/gym_lunarlander_v2.py deleted file mode 100644 index e68c3dd285..0000000000 --- a/ding/config/PPOOffPolicy/gym_lunarlander_v2.py +++ /dev/null @@ -1,44 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='LunarLander-v2-PPOOffPolicy', - env=dict( - collector_env_num=8, - evaluator_env_num=8, - env_id='LunarLander-v2', - n_evaluator_episode=8, - stop_value=260, - ), - policy=dict( - cuda=True, - model=dict( - obs_shape=8, - action_shape=4, - ), - learn=dict( - update_per_collect=4, - batch_size=64, - learning_rate=0.001, - value_weight=0.5, - entropy_weight=0.01, - clip_ratio=0.2, - nstep=1, - nstep_return=False, - adv_norm=True, - ), - collect=dict( - n_sample=128, - unroll_len=1, - discount_factor=0.99, - gae_lambda=0.95, - ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/PPOOffPolicy/gym_lunarlandercontinuous_v2.py b/ding/config/PPOOffPolicy/gym_lunarlandercontinuous_v2.py deleted file mode 100644 index 7e8d89a609..0000000000 --- a/ding/config/PPOOffPolicy/gym_lunarlandercontinuous_v2.py +++ /dev/null @@ -1,109 +0,0 @@ -from easydict import EasyDict - -action_shape = 2 -obs_shape = 8 - -cfg = dict( - exp_name='LunarLanderContinuous-v2-PPOOffPolicy', - seed=0, - env=dict( - env_id='LunarLanderContinuous-v2', - collector_env_num=8, - evaluator_env_num=4, - n_evaluator_episode=4, - stop_value=260, - act_scale=True, - ), - policy=dict( - cuda=True, - action_space='general', - model=dict( - obs_shape=8, - action_shape=action_shape, - action_space='general', - customized_model=True, - actor=dict( - model_type='GaussianTanh', - model=dict( - mu_model=dict( - hidden_sizes=[obs_shape, 256, 256], - activation=['relu', 'relu', 'tanh'], - output_size=action_shape, - dropout=0, - layernorm=False, - final_activation='tanh', - scale=5.0, - shrink=0.01, - ), - cov=dict( - dim=action_shape, - functional=True, - random_init=False, - sigma_lambda=dict( - hidden_sizes=[obs_shape, 128], - activation='tanh', - output_size=action_shape, - dropout=0, - layernorm=False, - final_activation='tanh', - scale=5.0, - offset=-5.0, - ), - sigma_offdiag=dict( - hidden_sizes=[obs_shape, 128], - activation='tanh', - output_size=int(action_shape * (action_shape - 1) // 2), - dropout=0, - layernorm=False, - ), - ), - ), - ), - critic=dict( - model_num=1, - model=dict( - hidden_sizes=[obs_shape, 512, 256], - activation=['relu', 
'softplus', 'softplus'], - output_size=1, - dropout=0, - layernorm=False, - ), - ), - ), - learn=dict( - update_per_collect=1, - batch_size=512, - learning_rate=3e-4, - value_weight=0.5, - entropy_weight=0.01, - clip_ratio=0.05, - nstep=1, - nstep_return=False, - adv_norm=False, - value_norm=False, - ppo_param_init=False, - separate_optimizer=True, - weight_decay=0.0, - ), - collect=dict( - n_sample=512, - unroll_len=1, - discount_factor=0.999, - gae_lambda=1.0, - ), - eval=dict( - evaluator=dict(eval_freq=100, ), - render=True, - ), - other=dict(replay_buffer=dict(replay_buffer_size=int(512), ), ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=False, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -from functools import partial -env = partial(ding.envs.gym_env.env, continuous=True) diff --git a/ding/config/PPOOffPolicy/gym_pongnoframeskip_v4.py b/ding/config/PPOOffPolicy/gym_pongnoframeskip_v4.py deleted file mode 100644 index 93f603f323..0000000000 --- a/ding/config/PPOOffPolicy/gym_pongnoframeskip_v4.py +++ /dev/null @@ -1,54 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='PongNoFrameskip-v4-PPOOffPolicy', - env=dict( - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=30, - env_id='PongNoFrameskip-v4', - frame_stack=4, - env_wrapper='atari_default', - ), - policy=dict( - cuda=True, - recompute_adv=True, - action_space='discrete', - model=dict( - obs_shape=[4, 84, 84], - action_shape=6, - action_space='discrete', - encoder_hidden_size_list=[64, 64, 128], - actor_head_hidden_size=128, - critic_head_hidden_size=128, - ), - learn=dict( - update_per_collect=10, - batch_size=320, - learning_rate=3e-4, - value_weight=0.5, - entropy_weight=0.001, - clip_ratio=0.2, - adv_norm=True, - # value_norm=True, - ignore_done=False, - grad_clip_type='clip_norm', - grad_clip_value=0.5, - ), - collect=dict( - n_sample=3200, - unroll_len=1, - discount_factor=0.99, - gae_lambda=0.95, - ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/PPOOffPolicy/gym_qbertnoframeskip_v4.py b/ding/config/PPOOffPolicy/gym_qbertnoframeskip_v4.py deleted file mode 100644 index 6f07200b48..0000000000 --- a/ding/config/PPOOffPolicy/gym_qbertnoframeskip_v4.py +++ /dev/null @@ -1,48 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='QbertNoFrameskip-v4-PPOOffPolicy', - env=dict( - collector_env_num=16, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=10000000000, - env_id='QbertNoFrameskip-v4', - frame_stack=4, - env_wrapper='atari_default', - ), - policy=dict( - cuda=True, - model=dict( - obs_shape=[4, 84, 84], - action_shape=6, - encoder_hidden_size_list=[32, 64, 64, 128], - actor_head_hidden_size=128, - critic_head_hidden_size=128, - critic_head_layer_num=2, - ), - learn=dict( - update_per_collect=18, - batch_size=128, - learning_rate=0.0001, - value_weight=1.0, - entropy_weight=0.005, - clip_ratio=0.1, - adv_norm=False, - ), - collect=dict( - n_sample=1024, - unroll_len=1, - discount_factor=0.99, - gae_lambda=0.95, - ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git 
a/ding/config/PPOOffPolicy/gym_spaceInvadersnoframeskip_v4.py b/ding/config/PPOOffPolicy/gym_spaceInvadersnoframeskip_v4.py deleted file mode 100644 index 492ed090fe..0000000000 --- a/ding/config/PPOOffPolicy/gym_spaceInvadersnoframeskip_v4.py +++ /dev/null @@ -1,48 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='SpaceInvadersNoFrameskip-v4-PPOOffPolicy', - env=dict( - collector_env_num=16, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=10000000000, - env_id='SpaceInvadersNoFrameskip-v4', - frame_stack=4, - env_wrapper='atari_default', - ), - policy=dict( - cuda=True, - model=dict( - obs_shape=[4, 84, 84], - action_shape=6, - encoder_hidden_size_list=[32, 64, 64, 128], - actor_head_hidden_size=128, - critic_head_hidden_size=128, - critic_head_layer_num=2, - ), - learn=dict( - update_per_collect=24, - batch_size=128, - learning_rate=0.0001, - value_weight=1.0, - entropy_weight=0.03, - clip_ratio=0.1, - adv_norm=False, - ), - collect=dict( - n_sample=1024, - unroll_len=1, - discount_factor=0.99, - gae_lambda=0.95, - ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/SAC/__init__.py b/ding/config/SAC/__init__.py deleted file mode 100644 index 6e01f29d74..0000000000 --- a/ding/config/SAC/__init__.py +++ /dev/null @@ -1,29 +0,0 @@ -from easydict import EasyDict -from . import gym_bipedalwalker_v3 -from . import gym_halfcheetah_v3 -from . import gym_hopper_v3 -from . import gym_lunarlandercontinuous_v2 -from . import gym_pendulum_v1 -from . import gym_walker2d_v3 - -supported_env_cfg = { - gym_bipedalwalker_v3.cfg.env.env_id: gym_bipedalwalker_v3.cfg, - gym_halfcheetah_v3.cfg.env.env_id: gym_halfcheetah_v3.cfg, - gym_hopper_v3.cfg.env.env_id: gym_hopper_v3.cfg, - gym_lunarlandercontinuous_v2.cfg.env.env_id: gym_lunarlandercontinuous_v2.cfg, - gym_pendulum_v1.cfg.env.env_id: gym_pendulum_v1.cfg, - gym_walker2d_v3.cfg.env.env_id: gym_walker2d_v3.cfg, -} - -supported_env_cfg = EasyDict(supported_env_cfg) - -supported_env = { - gym_bipedalwalker_v3.cfg.env.env_id: gym_bipedalwalker_v3.env, - gym_halfcheetah_v3.cfg.env.env_id: gym_halfcheetah_v3.env, - gym_hopper_v3.cfg.env.env_id: gym_hopper_v3.env, - gym_lunarlandercontinuous_v2.cfg.env.env_id: gym_lunarlandercontinuous_v2.env, - gym_pendulum_v1.cfg.env.env_id: gym_pendulum_v1.env, - gym_walker2d_v3.cfg.env.env_id: gym_walker2d_v3.env, -} - -supported_env = EasyDict(supported_env) diff --git a/ding/config/SAC/gym_bipedalwalker_v3.py b/ding/config/SAC/gym_bipedalwalker_v3.py deleted file mode 100644 index 8f427083b3..0000000000 --- a/ding/config/SAC/gym_bipedalwalker_v3.py +++ /dev/null @@ -1,47 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='BipedalWalker-v3-SAC', - seed=0, - env=dict( - env_id='BipedalWalker-v3', - collector_env_num=8, - evaluator_env_num=5, - n_evaluator_episode=5, - act_scale=True, - rew_clip=True, - ), - policy=dict( - cuda=True, - random_collect_size=10000, - model=dict( - obs_shape=24, - action_shape=4, - twin_critic=True, - action_space='reparameterization', - actor_head_hidden_size=128, - critic_head_hidden_size=128, - ), - learn=dict( - update_per_collect=64, - batch_size=256, - learning_rate_q=0.0003, - learning_rate_policy=0.0003, - learning_rate_alpha=0.0003, - target_theta=0.005, - discount_factor=0.99, - auto_alpha=True, - learner=dict(hook=dict(log_show_after_iter=1000, )) - ), - 
collect=dict(n_sample=64, ), - other=dict(replay_buffer=dict(replay_buffer_size=300000, ), ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/SAC/gym_halfcheetah_v3.py b/ding/config/SAC/gym_halfcheetah_v3.py deleted file mode 100644 index add0a8c636..0000000000 --- a/ding/config/SAC/gym_halfcheetah_v3.py +++ /dev/null @@ -1,56 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='HalfCheetah-v3-SAC', - seed=0, - env=dict( - env_id='HalfCheetah-v3', - collector_env_num=1, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=12000, - env_wrapper='mujoco_default', - act_scale=True, - rew_clip=True, - ), - policy=dict( - cuda=True, - random_collect_size=10000, - model=dict( - obs_shape=17, - action_shape=6, - twin_critic=True, - action_space='reparameterization', - actor_head_hidden_size=256, - critic_head_hidden_size=256, - ), - learn=dict( - update_per_collect=1, - batch_size=256, - learning_rate_q=1e-3, - learning_rate_policy=1e-3, - learning_rate_alpha=3e-4, - ignore_done=True, - target_theta=0.005, - discount_factor=0.99, - alpha=0.2, - reparameterization=True, - auto_alpha=False, - ), - collect=dict( - n_sample=1, - unroll_len=1, - ), - command=dict(), - eval=dict(), - other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/SAC/gym_hopper_v3.py b/ding/config/SAC/gym_hopper_v3.py deleted file mode 100644 index 9ee256973d..0000000000 --- a/ding/config/SAC/gym_hopper_v3.py +++ /dev/null @@ -1,43 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='Hopper-v3-SAC', - seed=0, - env=dict( - env_id='Hopper-v3', - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=6000, - env_wrapper='mujoco_default', - act_scale=True, - rew_clip=True, - ), - policy=dict( - cuda=True, - random_collect_size=10000, - model=dict( - obs_shape=11, - action_shape=3, - action_space='reparameterization', - actor_head_hidden_size=256, - critic_head_hidden_size=256, - ), - learn=dict( - update_per_collect=1, - batch_size=256, - learning_rate_q=1e-3, - learning_rate_policy=1e-3, - reparameterization=True, - auto_alpha=False, - ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/SAC/gym_lunarlandercontinuous_v2.py b/ding/config/SAC/gym_lunarlandercontinuous_v2.py deleted file mode 100644 index 4af21e86aa..0000000000 --- a/ding/config/SAC/gym_lunarlandercontinuous_v2.py +++ /dev/null @@ -1,44 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='LunarLanderContinuous-v2-SAC', - seed=0, - env=dict( - env_id='LunarLanderContinuous-v2', - collector_env_num=4, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=260, - act_scale=True, - ), - policy=dict( - cuda=True, - random_collect_size=10000, - model=dict( - obs_shape=8, - action_shape=2, - action_space='reparameterization', - twin_critic=True, - ), - learn=dict( - update_per_collect=256, - batch_size=128, - learning_rate_q=1e-3, - learning_rate_policy=3e-4, - 
learning_rate_alpha=3e-4, - auto_alpha=True, - ), - collect=dict(n_sample=256, ), - eval=dict(evaluator=dict(eval_freq=1000, ), ), - other=dict(replay_buffer=dict(replay_buffer_size=int(1e5), ), ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -from functools import partial -env = partial(ding.envs.gym_env.env, continuous=True) diff --git a/ding/config/SAC/gym_pendulum_v1.py b/ding/config/SAC/gym_pendulum_v1.py deleted file mode 100644 index ddda04197d..0000000000 --- a/ding/config/SAC/gym_pendulum_v1.py +++ /dev/null @@ -1,49 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='Pendulum-v1-SAC', - seed=0, - env=dict( - env_id='Pendulum-v1', - collector_env_num=10, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=-250, - act_scale=True, - ), - policy=dict( - cuda=True, - priority=False, - random_collect_size=1000, - model=dict( - obs_shape=3, - action_shape=1, - twin_critic=True, - action_space='reparameterization', - actor_head_hidden_size=128, - critic_head_hidden_size=128, - ), - learn=dict( - update_per_collect=1, - batch_size=128, - learning_rate_q=0.001, - learning_rate_policy=0.001, - learning_rate_alpha=0.0003, - ignore_done=True, - target_theta=0.005, - discount_factor=0.99, - auto_alpha=True, - ), - collect=dict(n_sample=10, ), - eval=dict(evaluator=dict(eval_freq=100, )), - other=dict(replay_buffer=dict(replay_buffer_size=100000, ), ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/SAC/gym_walker2d_v3.py b/ding/config/SAC/gym_walker2d_v3.py deleted file mode 100644 index 6936603247..0000000000 --- a/ding/config/SAC/gym_walker2d_v3.py +++ /dev/null @@ -1,56 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='Walker2d-v3-SAC', - seed=0, - env=dict( - env_id='Walker2d-v3', - collector_env_num=1, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=6000, - env_wrapper='mujoco_default', - act_scale=True, - rew_clip=True, - ), - policy=dict( - cuda=True, - random_collect_size=10000, - model=dict( - obs_shape=17, - action_shape=6, - twin_critic=True, - action_space='reparameterization', - actor_head_hidden_size=256, - critic_head_hidden_size=256, - ), - learn=dict( - update_per_collect=1, - batch_size=256, - learning_rate_q=1e-3, - learning_rate_policy=1e-3, - learning_rate_alpha=3e-4, - ignore_done=False, - target_theta=0.005, - discount_factor=0.99, - alpha=0.2, - reparameterization=True, - auto_alpha=False, - ), - collect=dict( - n_sample=1, - unroll_len=1, - ), - command=dict(), - eval=dict(), - other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/SQL/__init__.py b/ding/config/SQL/__init__.py deleted file mode 100644 index 9637366fb4..0000000000 --- a/ding/config/SQL/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -from easydict import EasyDict -from . 
import gym_lunarlander_v2 - -supported_env_cfg = { - gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.cfg, -} - -supported_env_cfg = EasyDict(supported_env_cfg) - -supported_env = { - gym_lunarlander_v2.cfg.env.env_id: gym_lunarlander_v2.env, -} - -supported_env = EasyDict(supported_env) diff --git a/ding/config/SQL/gym_lunarlander_v2.py b/ding/config/SQL/gym_lunarlander_v2.py deleted file mode 100644 index 271e793c55..0000000000 --- a/ding/config/SQL/gym_lunarlander_v2.py +++ /dev/null @@ -1,43 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='LunarLander-v2-SQL', - env=dict( - collector_env_num=8, - evaluator_env_num=8, - env_id='LunarLander-v2', - n_evaluator_episode=8, - stop_value=260, - ), - policy=dict( - cuda=True, - model=dict( - obs_shape=8, - action_shape=4, - encoder_hidden_size_list=[128, 128, 64], - dueling=True, - ), - nstep=1, - discount_factor=0.97, - learn=dict(batch_size=64, learning_rate=0.001, alpha=0.08), - collect=dict(n_sample=64), - eval=dict(evaluator=dict(eval_freq=50, )), # note: this is the times after which you learns to evaluate - other=dict( - eps=dict( - type='exp', - start=0.95, - end=0.1, - decay=10000, - ), - replay_buffer=dict(replay_buffer_size=20000, ), - ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/TD3/__init__.py b/ding/config/TD3/__init__.py deleted file mode 100644 index 6e01f29d74..0000000000 --- a/ding/config/TD3/__init__.py +++ /dev/null @@ -1,29 +0,0 @@ -from easydict import EasyDict -from . import gym_bipedalwalker_v3 -from . import gym_halfcheetah_v3 -from . import gym_hopper_v3 -from . import gym_lunarlandercontinuous_v2 -from . import gym_pendulum_v1 -from . 
import gym_walker2d_v3 - -supported_env_cfg = { - gym_bipedalwalker_v3.cfg.env.env_id: gym_bipedalwalker_v3.cfg, - gym_halfcheetah_v3.cfg.env.env_id: gym_halfcheetah_v3.cfg, - gym_hopper_v3.cfg.env.env_id: gym_hopper_v3.cfg, - gym_lunarlandercontinuous_v2.cfg.env.env_id: gym_lunarlandercontinuous_v2.cfg, - gym_pendulum_v1.cfg.env.env_id: gym_pendulum_v1.cfg, - gym_walker2d_v3.cfg.env.env_id: gym_walker2d_v3.cfg, -} - -supported_env_cfg = EasyDict(supported_env_cfg) - -supported_env = { - gym_bipedalwalker_v3.cfg.env.env_id: gym_bipedalwalker_v3.env, - gym_halfcheetah_v3.cfg.env.env_id: gym_halfcheetah_v3.env, - gym_hopper_v3.cfg.env.env_id: gym_hopper_v3.env, - gym_lunarlandercontinuous_v2.cfg.env.env_id: gym_lunarlandercontinuous_v2.env, - gym_pendulum_v1.cfg.env.env_id: gym_pendulum_v1.env, - gym_walker2d_v3.cfg.env.env_id: gym_walker2d_v3.env, -} - -supported_env = EasyDict(supported_env) diff --git a/ding/config/TD3/gym_bipedalwalker_v3.py b/ding/config/TD3/gym_bipedalwalker_v3.py deleted file mode 100644 index e2949f5ff9..0000000000 --- a/ding/config/TD3/gym_bipedalwalker_v3.py +++ /dev/null @@ -1,52 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='Bipedalwalker-v3-TD3', - seed=0, - env=dict( - env_id='BipedalWalker-v3', - collector_env_num=8, - evaluator_env_num=5, - n_evaluator_episode=5, - act_scale=True, - rew_clip=True, - ), - policy=dict( - cuda=True, - random_collect_size=10000, - model=dict( - obs_shape=24, - action_shape=4, - twin_critic=True, - action_space='regression', - actor_head_hidden_size=400, - critic_head_hidden_size=400, - ), - learn=dict( - update_per_collect=64, - batch_size=256, - learning_rate_actor=0.0003, - learning_rate_critic=0.0003, - target_theta=0.005, - discount_factor=0.99, - actor_update_freq=2, - noise=True, - noise_sigma=0.2, - noise_range=dict( - min=-0.5, - max=0.5, - ), - learner=dict(hook=dict(log_show_after_iter=1000, )) - ), - collect=dict(n_sample=64, ), - other=dict(replay_buffer=dict(replay_buffer_size=300000, ), ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/TD3/gym_halfcheetah_v3.py b/ding/config/TD3/gym_halfcheetah_v3.py deleted file mode 100644 index 6aba1bcefd..0000000000 --- a/ding/config/TD3/gym_halfcheetah_v3.py +++ /dev/null @@ -1,58 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='HalfCheetah-v3-TD3', - seed=0, - env=dict( - env_id='HalfCheetah-v3', - collector_env_num=1, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=11000, - env_wrapper='mujoco_default', - act_scale=True, - rew_clip=True, - ), - policy=dict( - cuda=True, - random_collect_size=25000, - model=dict( - obs_shape=17, - action_shape=6, - twin_critic=True, - actor_head_hidden_size=256, - critic_head_hidden_size=256, - action_space='regression', - ), - learn=dict( - update_per_collect=1, - batch_size=256, - learning_rate_actor=1e-3, - learning_rate_critic=1e-3, - ignore_done=True, - target_theta=0.005, - discount_factor=0.99, - actor_update_freq=2, - noise=True, - noise_sigma=0.2, - noise_range=dict( - min=-0.5, - max=0.5, - ), - ), - collect=dict( - n_sample=1, - unroll_len=1, - noise_sigma=0.1, - ), - other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import 
ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/TD3/gym_hopper_v3.py b/ding/config/TD3/gym_hopper_v3.py deleted file mode 100644 index 02773b5a9f..0000000000 --- a/ding/config/TD3/gym_hopper_v3.py +++ /dev/null @@ -1,37 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='Hopper-v3-TD3', - seed=0, - env=dict( - env_id='Hopper-v3', - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=6000, - env_wrapper='mujoco_default', - act_scale=True, - rew_clip=True, - ), - policy=dict( - cuda=True, - random_collect_size=25000, - model=dict( - obs_shape=11, - action_shape=3, - actor_head_hidden_size=256, - critic_head_hidden_size=256, - action_space='regression', - ), - collect=dict(n_sample=1, ), - other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/TD3/gym_lunarlandercontinuous_v2.py b/ding/config/TD3/gym_lunarlandercontinuous_v2.py deleted file mode 100644 index 7586c3ffc8..0000000000 --- a/ding/config/TD3/gym_lunarlandercontinuous_v2.py +++ /dev/null @@ -1,50 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='LunarLanderContinuous-V2-TD3', - seed=0, - env=dict( - env_id='LunarLanderContinuous-v2', - collector_env_num=4, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=240, - act_scale=True, - ), - policy=dict( - cuda=True, - random_collect_size=10000, - model=dict( - obs_shape=8, - action_shape=2, - action_space='regression', - ), - learn=dict( - update_per_collect=256, - batch_size=256, - learning_rate_actor=3e-4, - learning_rate_critic=1e-3, - noise=True, - noise_sigma=0.1, - noise_range=dict( - min=-0.5, - max=0.5, - ), - ), - collect=dict( - n_sample=256, - noise_sigma=0.1, - ), - eval=dict(evaluator=dict(eval_freq=1000, ), ), - other=dict(replay_buffer=dict(replay_buffer_size=100000, ), ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -from functools import partial -env = partial(ding.envs.gym_env.env, continuous=True) diff --git a/ding/config/TD3/gym_pendulum_v1.py b/ding/config/TD3/gym_pendulum_v1.py deleted file mode 100644 index 7e1fb80965..0000000000 --- a/ding/config/TD3/gym_pendulum_v1.py +++ /dev/null @@ -1,54 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='Pendulum-v1-TD3', - seed=0, - env=dict( - env_id='Pendulum-v1', - collector_env_num=8, - evaluator_env_num=5, - n_evaluator_episode=5, - stop_value=-250, - act_scale=True, - ), - policy=dict( - cuda=False, - priority=False, - random_collect_size=800, - model=dict( - obs_shape=3, - action_shape=1, - twin_critic=True, - action_space='regression', - ), - learn=dict( - update_per_collect=2, - batch_size=128, - learning_rate_actor=0.001, - learning_rate_critic=0.001, - ignore_done=True, - actor_update_freq=2, - noise=True, - noise_sigma=0.1, - noise_range=dict( - min=-0.5, - max=0.5, - ), - ), - collect=dict( - n_sample=48, - noise_sigma=0.1, - collector=dict(collect_print_freq=1000, ), - ), - eval=dict(evaluator=dict(eval_freq=100, ), ), - other=dict(replay_buffer=dict(replay_buffer_size=20000, ), ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = 
EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/config/TD3/gym_walker2d_v3.py b/ding/config/TD3/gym_walker2d_v3.py deleted file mode 100644 index 9cdd4a6885..0000000000 --- a/ding/config/TD3/gym_walker2d_v3.py +++ /dev/null @@ -1,60 +0,0 @@ -from easydict import EasyDict - -cfg = dict( - exp_name='Walker2d-v3-TD3', - seed=0, - env=dict( - env_id='Walker2d-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=1, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=6000, - env_wrapper='mujoco_default', - act_scale=True, - rew_clip=True, - ), - policy=dict( - cuda=True, - random_collect_size=25000, - model=dict( - obs_shape=17, - action_shape=6, - twin_critic=True, - actor_head_hidden_size=256, - critic_head_hidden_size=256, - action_space='regression', - ), - learn=dict( - update_per_collect=1, - batch_size=256, - learning_rate_actor=1e-3, - learning_rate_critic=1e-3, - ignore_done=False, - target_theta=0.005, - discount_factor=0.99, - actor_update_freq=2, - noise=True, - noise_sigma=0.2, - noise_range=dict( - min=-0.5, - max=0.5, - ), - ), - collect=dict( - n_sample=1, - unroll_len=1, - noise_sigma=0.1, - ), - other=dict(replay_buffer=dict(replay_buffer_size=1000000, ), ), - ), - wandb_logger=dict( - gradient_logger=True, video_logger=True, plot_logger=True, action_logger=True, return_logger=False - ), -) - -cfg = EasyDict(cfg) - -import ding.envs.gym_env -env = ding.envs.gym_env.env diff --git a/ding/torch_utils/modules/__init__.py b/ding/torch_utils/modules/__init__.py deleted file mode 100644 index 66c6778ab7..0000000000 --- a/ding/torch_utils/modules/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from .parameter import NonegativeParameter, TanhParameter -from .perceptron import multilayer_perceptron -from .distribution import Distribution -from .gaussian import StandardGaussian, Gaussian, GaussianTanh -from .function import NonegativeFunction, TanhFunction -from .matrix import CovarianceMatrix diff --git a/ding/torch_utils/modules/distribution.py b/ding/torch_utils/modules/distribution.py deleted file mode 100644 index e17aabb211..0000000000 --- a/ding/torch_utils/modules/distribution.py +++ /dev/null @@ -1,34 +0,0 @@ -import torch -from torch import nn - - -class Distribution(nn.Module): - - def __init__(self): - super().__init__() - - def forward(self, *args, **kwargs): - raise RuntimeError("Forward method cannot be called for a Distribution object.") - - def log_prob(self, x, condition=None, **kwargs): - raise NotImplementedError - - def sample(self, num=1, condition=None, **kwargs): - with torch.no_grad(): - return self.rsample(num, condition, **kwargs) - - def rsample(self, num=1, condition=None, **kwargs): - raise NotImplementedError - - def entropy(self, *args, **kwargs): - raise NotImplementedError - - def dist(self, *args, **kwargs): - raise NotImplementedError - - def sample_and_log_prob(self, num=1, condition=None, **kwargs): - with torch.no_grad(): - return self.rsample_and_log_prob(num, condition, **kwargs) - - def rsample_and_log_prob(self, num=1, condition=None, **kwargs): - raise NotImplementedError diff --git a/ding/torch_utils/modules/function.py b/ding/torch_utils/modules/function.py deleted file mode 100644 index 9a971e3cc3..0000000000 --- a/ding/torch_utils/modules/function.py +++ /dev/null @@ -1,25 +0,0 @@ -import torch -from torch import nn -from torch.distributions.transforms import TanhTransform -from .perceptron import multilayer_perceptron - - -class 
NonegativeFunction(nn.Module): - - def __init__(self, cfg): - super().__init__() - self.model = multilayer_perceptron(cfg) - - def forward(self, x): - return torch.exp(self.model(x)) - - -class TanhFunction(nn.Module): - - def __init__(self, cfg): - super().__init__() - self.transform = TanhTransform(cache_size=1) - self.model = multilayer_perceptron(cfg) - - def forward(self, x): - return self.transform(self.model(x)) diff --git a/ding/torch_utils/modules/gaussian.py b/ding/torch_utils/modules/gaussian.py deleted file mode 100644 index 26cf9b5ca6..0000000000 --- a/ding/torch_utils/modules/gaussian.py +++ /dev/null @@ -1,150 +0,0 @@ -import torch -from torch import nn -from .perceptron import multilayer_perceptron -from .parameter import NonegativeParameter -from .matrix import CovarianceMatrix -from torch.distributions import TransformedDistribution, MultivariateNormal, Independent -from torch.distributions.transforms import TanhTransform -from .distribution import Distribution - - -class StandardGaussian(Distribution): - - def __init__(self, dim) -> None: - super().__init__() - self.dim = dim - self.dist = MultivariateNormal(torch.zeros(dim), torch.eye(dim)) - - def log_prob(self, x, condition=None, **kwargs): - return self.dist.log_prob(x) - - def rsample_and_log_prob(self, condition=None, sample_shape=torch.Size(), **kwargs): - if condition is not None: - sample_shape = condition.shape[0] - x = self.dist.rsample(sample_shape=sample_shape) - log_prob = self.dist.log_prob(x) - return x, log_prob - - def sample_and_log_prob(self, condition=None, sample_shape=torch.Size(), **kwargs): - with torch.no_grad(): - return self.rsample_and_log_prob(condition, sample_shape, **kwargs) - - def rsample(self, condition=None, sample_shape=torch.Size(), **kwargs): - if condition is not None: - sample_shape = condition.shape[0] - return self.dist.rsample(sample_shape=sample_shape) - - def sample(self, condition=None, sample_shape=torch.Size(), **kwargs): - with torch.no_grad(): - return self.rsample(condition=condition, sample_shape=sample_shape, **kwargs) - - def entropy(self): - return self.dist.entropy() - - def dist(self): - return self.dist - - -class Gaussian(Distribution): - - def __init__(self, cfg): - super().__init__() - self.cfg = cfg - self.mu_model = multilayer_perceptron(cfg.mu_model) - self.cov = CovarianceMatrix(cfg.cov) - self.functional_cov = cfg.cov.functional - - def dist(self, conditioning): - mu = self.mu_model(conditioning) - # repeat the sigma to match the shape of mu - if self.functional_cov: - scale_tril = self.cov.low_triangle_matrix(conditioning) - else: - scale_tril = self.cov.low_triangle_matrix().unsqueeze(0).repeat(mu.shape[0], 1, 1) - return MultivariateNormal(loc=mu, scale_tril=scale_tril) - - def log_prob(self, x, conditioning): - return self.dist(conditioning).log_prob(x) - - def sample(self, conditioning, sample_shape=torch.Size()): - return self.dist(conditioning).sample(sample_shape=sample_shape) - - def rsample(self, conditioning, sample_shape=torch.Size()): - return self.dist(conditioning).rsample(sample_shape=sample_shape) - - def entropy(self, conditioning): - return self.dist(conditioning).entropy() - - def rsample_and_log_prob(self, conditioning, sample_shape=torch.Size()): - dist = self.dist(conditioning) - x = dist.rsample(sample_shape=sample_shape) - log_prob = dist.log_prob(x) - return x, log_prob - - def sample_and_log_prob(self, conditioning, sample_shape=torch.Size()): - with torch.no_grad(): - return self.rsample_and_log_prob(conditioning, 
sample_shape) - - def forward(self, conditioning): - dist = self.dist(conditioning) - x = dist.rsample() - log_prob = dist.log_prob(x) - return x, log_prob - - -class GaussianTanh(Distribution): - - def __init__(self, cfg): - super().__init__() - self.cfg = cfg - self.mu_model = multilayer_perceptron(cfg.mu_model) - self.cov = CovarianceMatrix(cfg.cov) - self.functional_cov = cfg.cov.functional - - def dist(self, conditioning): - mu = self.mu_model(conditioning) - # repeat the sigma to match the shape of mu - if self.functional_cov: - scale_tril = self.cov.low_triangle_matrix(conditioning) - else: - scale_tril = self.cov.low_triangle_matrix().unsqueeze(0).repeat(mu.shape[0], 1, 1) - return TransformedDistribution( - base_distribution=MultivariateNormal(loc=mu, scale_tril=scale_tril), - transforms=[TanhTransform(cache_size=1)] - ) - - def log_prob(self, x, conditioning): - return self.dist(conditioning).log_prob(x) - - def sample(self, conditioning, sample_shape=torch.Size()): - return self.dist(conditioning).sample(sample_shape=sample_shape) - - def rsample(self, conditioning, sample_shape=torch.Size()): - return self.dist(conditioning).rsample(sample_shape=sample_shape) - - def rsample_and_log_prob(self, conditioning, sample_shape=torch.Size()): - dist = self.dist(conditioning) - x = dist.rsample(sample_shape=sample_shape) - log_prob = dist.log_prob(x) - return x, log_prob - - def sample_and_log_prob(self, conditioning, sample_shape=torch.Size()): - with torch.no_grad(): - return self.rsample_and_log_prob(conditioning, sample_shape) - - def entropy(self, conditioning): - mu = self.mu_model(conditioning) - # repeat the sigma to match the shape of mu - if self.functional_cov: - scale_tril = self.cov.low_triangle_matrix(conditioning) - else: - scale_tril = self.cov.low_triangle_matrix().unsqueeze(0).repeat(mu.shape[0], 1, 1) - base_distribution = MultivariateNormal(loc=mu, scale_tril=scale_tril) - x = base_distribution.rsample(sample_shape=torch.Size([1000])) - return base_distribution.entropy() + torch.sum(torch.log(1.0 - torch.tanh(x) ** 2), dim=(0, 2)) / 1000 - - def forward(self, conditioning): - dist = self.dist(conditioning) - x = dist.rsample() - log_prob = dist.log_prob(x) - return x, log_prob diff --git a/ding/torch_utils/modules/matrix.py b/ding/torch_utils/modules/matrix.py deleted file mode 100644 index a287735dc8..0000000000 --- a/ding/torch_utils/modules/matrix.py +++ /dev/null @@ -1,52 +0,0 @@ -import torch -from torch import nn -from .parameter import NonegativeParameter, TanhParameter -from .function import NonegativeFunction, TanhFunction - - -class CovarianceMatrix(nn.Module): - - def __init__(self, cfg=None, delta=1e-8): - super().__init__() - self.dim = cfg.dim - if cfg.functional: - self.functional = True - self.sigma_lambda = NonegativeFunction(cfg.sigma_lambda) - self.sigma_offdiag = TanhFunction(cfg.sigma_offdiag) - else: - self.functional = False - if cfg.random_init: - self.sigma_lambda = NonegativeParameter(torch.abs(nn.init.normal_(torch.Tensor(self.dim)))) - self.sigma_offdiag = TanhParameter( - torch.tanh(nn.init.normal_(torch.Tensor(self.dim * (self.dim - 1) // 2))) - ) - else: - self.sigma_lambda = NonegativeParameter(torch.ones(self.dim)) - self.sigma_offdiag = TanhParameter(torch.tanh(torch.zeros(self.dim * (self.dim - 1) // 2))) - # register eye matrix - self.eye = nn.Parameter(torch.eye(self.dim), requires_grad=False) - self.delta = delta - - def low_triangle_matrix(self, x=None): - low_t_m = self.eye.clone() - if self.functional: - low_t_m = 
low_t_m.repeat(x.shape[0], 1, 1) - low_t_m[torch.cat( - ( - torch.reshape(torch.arange(x.shape[0]).repeat(self.dim * (self.dim - 1) // 2, 1).T, - (1, -1)), torch.tril_indices(self.dim, self.dim, offset=-1).repeat(1, x.shape[0]) - ) - ).tolist()] = torch.reshape(self.sigma_offdiag(x), (-1, 1)).squeeze(-1) - low_t_m = torch.einsum( - "bj,bjk,bk->bjk", self.delta + self.sigma_lambda(x), low_t_m, self.delta + self.sigma_lambda(x) - ) - else: - low_t_m[torch.tril_indices(self.dim, self.dim, offset=-1).tolist()] = self.sigma_offdiag.data - low_t_m = torch.mul( - self.delta + self.sigma_lambda.data, - torch.mul(low_t_m, self.delta + self.sigma_lambda.data).T - ).T - return low_t_m - - def forward(self, x=None): - return torch.matmul(self.low_triangle_matrix(x), self.low_triangle_matrix(x).T) diff --git a/ding/torch_utils/modules/parameter.py b/ding/torch_utils/modules/parameter.py deleted file mode 100644 index e5879933a8..0000000000 --- a/ding/torch_utils/modules/parameter.py +++ /dev/null @@ -1,37 +0,0 @@ -import torch -from torch import nn -from torch.distributions.transforms import TanhTransform - - -class NonegativeParameter(nn.Module): - - def __init__(self, data=None, requires_grad=True, delta=1e-8): - super().__init__() - if data is None: - data = torch.zeros(1) - self.log_data = nn.Parameter(torch.log(data + delta), requires_grad=requires_grad) - - def forward(self): - return torch.exp(self.log_data) - - @property - def data(self): - return torch.exp(self.log_data) - - -class TanhParameter(nn.Module): - - def __init__(self, data=None, requires_grad=True): - super().__init__() - if data is None: - data = torch.zeros(1) - self.transform = TanhTransform(cache_size=1) - - self.data_inv = nn.Parameter(self.transform.inv(data), requires_grad=requires_grad) - - def forward(self): - return self.transform(self.data_inv) - - @property - def data(self): - return self.transform(self.data_inv) diff --git a/ding/torch_utils/modules/perceptron.py b/ding/torch_utils/modules/perceptron.py deleted file mode 100644 index c8640b2ca2..0000000000 --- a/ding/torch_utils/modules/perceptron.py +++ /dev/null @@ -1,52 +0,0 @@ -import torch -from torch import nn -from ding.torch_utils.activation import get_activation - - -class multilayer_perceptron(nn.Module): - - def __init__(self, cfg): - super(multilayer_perceptron, self).__init__() - - self.model = nn.Sequential() - - for i in range(len(cfg.hidden_sizes) - 1): - self.model.add_module('linear' + str(i), nn.Linear(cfg.hidden_sizes[i], cfg.hidden_sizes[i + 1])) - - if isinstance(cfg.activation, list): - self.model.add_module('activation' + str(i), get_activation(cfg.activation[i])) - else: - self.model.add_module('activation' + str(i), get_activation(cfg.activation)) - if hasattr(cfg, "dropout") and cfg.dropout > 0: - self.model.add_module('dropout', nn.Dropout(cfg.dropout)) - if hasattr(cfg, "layernorm") and cfg.layernorm: - self.model.add_module('layernorm', nn.LayerNorm(cfg.hidden_sizes[i])) - - self.model.add_module( - 'linear' + str(len(cfg.hidden_sizes) - 1), nn.Linear(cfg.hidden_sizes[-1], cfg.output_size) - ) - - if hasattr(cfg, 'final_activation'): - self.model.add_module('final_activation', get_activation(cfg.final_activation)) - - if hasattr(cfg, 'scale'): - self.scale = nn.Parameter(torch.tensor(cfg.scale), requires_grad=False) - else: - self.scale = 1.0 - - if hasattr(cfg, 'offset'): - self.offset = nn.Parameter(torch.tensor(cfg.offset), requires_grad=False) - else: - self.offset = 0.0 - - # shrink the weight of linear layer 
'linear'+str(len(cfg.hidden_sizes) to it's origin 0.01 - if hasattr(cfg, 'shrink'): - if hasattr(cfg, 'final_activation'): - self.model[-2].weight.data.normal_(0, cfg.shrink) - self.model[-2].bias.data.normal_(0, cfg.shrink) - else: - self.model[-1].weight.data.normal_(0, cfg.shrink) - self.model[-1].bias.data.normal_(0, cfg.shrink) - - def forward(self, x): - return self.scale * self.model(x) + self.offset diff --git a/dizoo/classic_control/pendulum/config/pendulum_pg_config.py b/dizoo/classic_control/pendulum/config/pendulum_pg_config.py deleted file mode 100644 index 0ba8e235f0..0000000000 --- a/dizoo/classic_control/pendulum/config/pendulum_pg_config.py +++ /dev/null @@ -1,54 +0,0 @@ -from easydict import EasyDict - -pendulum_pg_config = dict( - exp_name='pendulum_pg_seed0', - env=dict( - collector_env_num=8, - evaluator_env_num=5, - act_scale=True, - n_evaluator_episode=5, - stop_value=-200, - ), - policy=dict( - cuda=False, - action_space='continuous', - model=dict( - action_space='continuous', - obs_shape=3, - action_shape=1, - ), - learn=dict( -<<<<<<< HEAD - batch_size=400 -======= - batch_size=400, ->>>>>>> 11cc7de83c4e40c2a3929a46ac4fb132e730df5b - learning_rate=0.001, - entropy_weight=0.001, - ), - collect=dict( - n_episode=2, - unroll_len=1, - discount_factor=0.99, - ), - eval=dict(evaluator=dict(eval_freq=200, )) - ), -) -pendulum_pg_config = EasyDict(pendulum_pg_config) -main_config = pendulum_pg_config -pendulum_pg_create_config = dict( - env=dict( - type='pendulum', - import_names=['dizoo.classic_control.pendulum.envs.pendulum_env'], - ), - env_manager=dict(type='base'), - policy=dict(type='pg'), - collector=dict(type='episode'), -) -pendulum_pg_create_config = EasyDict(pendulum_pg_create_config) -create_config = pendulum_pg_create_config - -if __name__ == "__main__": - # or you can enter `ding -m serial_onpolicy -c pendulum_pg_config.py -s 0` - from ding.entry import serial_pipeline_onpolicy - serial_pipeline_onpolicy([main_config, create_config], seed=0) diff --git a/dizoo/mujoco/config/halfcheetah_a2c_config.py b/dizoo/mujoco/config/halfcheetah_a2c_config.py deleted file mode 100644 index 017431633d..0000000000 --- a/dizoo/mujoco/config/halfcheetah_a2c_config.py +++ /dev/null @@ -1,65 +0,0 @@ -from easydict import EasyDict - -halfcheetah_a2c_config = dict( - exp_name='halfcheetah_a2c_seed0', - env=dict( - env_id='HalfCheetah-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=12000, - ), - policy=dict( - cuda=True, - action_space='continuous', - model=dict( - action_space='continuous', - obs_shape=17, - action_shape=6, - ), - learn=dict( - # (int) the number of data for a train iteration - batch_size=256, - learning_rate=0.0003, - # (float) loss weight of the value network, the weight of policy network is set to 1 - value_weight=0.5, - # (float) loss weight of the entropy regularization, the weight of policy network is set to 1 - entropy_weight=0.001, - # (float) discount factor for future reward, defaults int [0, 1] - discount_factor=0.99, - ignore_done=True, - adv_norm=True, - ), - collect=dict( - n_sample=256, - collector=dict(collect_print_freq=100, ), - ), - command=dict(), - eval=dict(evaluator=dict(eval_freq=100, )), - ), -) - -halfcheetah_a2c_config = EasyDict(halfcheetah_a2c_config) -main_config = halfcheetah_a2c_config - -halfcheetah_a2c_create_config = dict( - env=dict( - type='mujoco', - import_names=['dizoo.mujoco.envs.mujoco_env'], - ), - 
env_manager=dict(type='subprocess'), - policy=dict( - type='a2c', - import_names=['ding.policy.a2c'], - ), - replay_buffer=dict(type='naive', ), -) -halfcheetah_a2c_create_config = EasyDict(halfcheetah_a2c_create_config) -create_config = halfcheetah_a2c_create_config - -if __name__ == "__main__": - # or you can enter `ding -m serial -c halfcheetah_a2c_config.py -s 0` - from ding.entry import serial_pipeline_onpolicy - serial_pipeline_onpolicy((main_config, create_config), seed=0) diff --git a/dizoo/mujoco/config/halfcheetah_pg_config.py b/dizoo/mujoco/config/halfcheetah_pg_config.py deleted file mode 100644 index acc8be1c7e..0000000000 --- a/dizoo/mujoco/config/halfcheetah_pg_config.py +++ /dev/null @@ -1,52 +0,0 @@ -from easydict import EasyDict - -halfcheetah_pg_config = dict( - exp_name='halfcheetah_pg_seed0', - env=dict( - env_id='HalfCheetah-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=6000, - ), - policy=dict( - cuda=True, - action_space='continuous', - model=dict( - action_space='continuous', - obs_shape=17, - action_shape=6, - ), - learn=dict( - batch_size=64, - learning_rate=0.001, - entropy_weight=0.001, - ), - collect=dict( - n_episode=8, - unroll_len=1, - discount_factor=0.99, - ), - eval=dict(evaluator=dict(eval_freq=200, )) - ), -) -halfcheetah_pg_config = EasyDict(halfcheetah_pg_config) -main_config = halfcheetah_pg_config -halfcheetah_pg_create_config = dict( - env=dict( - type='mujoco', - import_names=['dizoo.mujoco.envs.mujoco_env'], - ), - env_manager=dict(type='subprocess'), - policy=dict(type='pg'), - collector=dict(type='episode'), -) -halfcheetah_pg_create_config = EasyDict(halfcheetah_pg_create_config) -create_config = halfcheetah_pg_create_config - -if __name__ == "__main__": - # or you can enter `ding -m serial_onpolicy -c halfcheetah_pg_config.py -s 0` - from ding.entry import serial_pipeline_onpolicy - serial_pipeline_onpolicy([main_config, create_config], seed=0) diff --git a/dizoo/mujoco/config/hopper_a2c_config.py b/dizoo/mujoco/config/hopper_a2c_config.py deleted file mode 100644 index cd0a12a881..0000000000 --- a/dizoo/mujoco/config/hopper_a2c_config.py +++ /dev/null @@ -1,64 +0,0 @@ -from easydict import EasyDict - -hopper_a2c_config = dict( - exp_name='hopper_a2c_seed0', - env=dict( - env_id='Hopper-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=6000, - ), - policy=dict( - cuda=True, - action_space='continuous', - model=dict( - obs_shape=11, - action_shape=3, - action_space='continuous', - ), - learn=dict( - # (int) the number of data for a train iteration - batch_size=256, - learning_rate=0.0003, - # (float) loss weight of the value network, the weight of policy network is set to 1 - value_weight=0.5, - # (float) loss weight of the entropy regularization, the weight of policy network is set to 1 - entropy_weight=0.001, - # (float) discount factor for future reward, defaults int [0, 1] - discount_factor=0.99, - adv_norm=True, - ), - collect=dict( - n_sample=256, - collector=dict(collect_print_freq=100, ), - ), - command=dict(), - eval=dict(evaluator=dict(eval_freq=100, )), - ), -) - -hopper_a2c_config = EasyDict(hopper_a2c_config) -main_config = hopper_a2c_config - -hopper_a2c_create_config = dict( - env=dict( - type='mujoco', - import_names=['dizoo.mujoco.envs.mujoco_env'], - ), - env_manager=dict(type='subprocess'), - 
policy=dict( - type='a2c', - import_names=['ding.policy.a2c'], - ), - replay_buffer=dict(type='naive', ), -) -hopper_a2c_create_config = EasyDict(hopper_a2c_create_config) -create_config = hopper_a2c_create_config - -if __name__ == "__main__": - # or you can enter `ding -m serial -c hopper_a2c_config.py -s 0` - from ding.entry import serial_pipeline_onpolicy - serial_pipeline_onpolicy([main_config, create_config], seed=0) diff --git a/dizoo/mujoco/config/hopper_pg_config.py b/dizoo/mujoco/config/hopper_pg_config.py deleted file mode 100644 index 18427131aa..0000000000 --- a/dizoo/mujoco/config/hopper_pg_config.py +++ /dev/null @@ -1,52 +0,0 @@ -from easydict import EasyDict - -hopper_pg_config = dict( - exp_name='hopper_pg_seed0', - env=dict( - env_id='Hopper-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=6000, - ), - policy=dict( - cuda=True, - action_space='continuous', - model=dict( - action_space='continuous', - obs_shape=11, - action_shape=3, - ), - learn=dict( - batch_size=64, - learning_rate=0.005, - entropy_weight=0.01, - ), - collect=dict( - n_episode=34, - unroll_len=1, - discount_factor=0.99, - ), - eval=dict(evaluator=dict(eval_freq=1, )) - ), -) -hopper_pg_config = EasyDict(hopper_pg_config) -main_config = hopper_pg_config -hopper_pg_create_config = dict( - env=dict( - type='mujoco', - import_names=['dizoo.mujoco.envs.mujoco_env'], - ), - env_manager=dict(type='subprocess'), - policy=dict(type='pg'), - collector=dict(type='episode'), -) -hopper_pg_create_config = EasyDict(hopper_pg_create_config) -create_config = hopper_pg_create_config - -if __name__ == "__main__": - # or you can enter `ding -m serial_onpolicy -c hopper_pg_config.py -s 0` - from ding.entry import serial_pipeline_onpolicy - serial_pipeline_onpolicy([main_config, create_config], seed=0) diff --git a/dizoo/mujoco/config/walker2d_a2c_config.py b/dizoo/mujoco/config/walker2d_a2c_config.py deleted file mode 100644 index 0d822b8bbe..0000000000 --- a/dizoo/mujoco/config/walker2d_a2c_config.py +++ /dev/null @@ -1,64 +0,0 @@ -from easydict import EasyDict - -walker2d_a2c_config = dict( - exp_name='walker2d_a2c_seed0', - env=dict( - env_id='Walker2d-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=6000, - ), - policy=dict( - cuda=True, - action_space='continuous', - model=dict( - action_space='continuous', - obs_shape=17, - action_shape=6, - ), - learn=dict( - # (int) the number of data for a train iteration - batch_size=256, - learning_rate=0.0003, - # (float) loss weight of the value network, the weight of policy network is set to 1 - value_weight=0.5, - # (float) loss weight of the entropy regularization, the weight of policy network is set to 1 - entropy_weight=0.001, - # (float) discount factor for future reward, defaults int [0, 1] - discount_factor=0.99, - adv_norm=True, - ), - collect=dict( - n_sample=256, - collector=dict(collect_print_freq=100, ), - ), - command=dict(), - eval=dict(evaluator=dict(eval_freq=100, )), - ), -) - -walker2d_a2c_config = EasyDict(walker2d_a2c_config) -main_config = walker2d_a2c_config - -walker2d_a2c_create_config = dict( - env=dict( - type='mujoco', - import_names=['dizoo.mujoco.envs.mujoco_env'], - ), - env_manager=dict(type='subprocess'), - policy=dict( - type='a2c', - import_names=['ding.policy.a2c'], - ), - replay_buffer=dict(type='naive', ), -) 
-walker2d_a2c_create_config = EasyDict(walker2d_a2c_create_config) -create_config = walker2d_a2c_create_config - -if __name__ == "__main__": - # or you can enter `ding -m serial -c walker2d_a2c_config.py -s 0` - from ding.entry import serial_pipeline_onpolicy - serial_pipeline_onpolicy([main_config, create_config], seed=0) diff --git a/dizoo/mujoco/config/walker2d_pg_config.py b/dizoo/mujoco/config/walker2d_pg_config.py deleted file mode 100644 index ab031f0635..0000000000 --- a/dizoo/mujoco/config/walker2d_pg_config.py +++ /dev/null @@ -1,52 +0,0 @@ -from easydict import EasyDict - -walker2d_pg_config = dict( - exp_name='walker2d_pg_seed0', - env=dict( - env_id='Walker2d-v3', - norm_obs=dict(use_norm=False, ), - norm_reward=dict(use_norm=False, ), - collector_env_num=8, - evaluator_env_num=8, - n_evaluator_episode=8, - stop_value=6000, - ), - policy=dict( - cuda=True, - action_space='continuous', - model=dict( - action_space='continuous', - obs_shape=17, - action_shape=6, - ), - learn=dict( - batch_size=64, - learning_rate=0.001, - entropy_weight=0.001, - ), - collect=dict( - n_episode=8, - unroll_len=1, - discount_factor=0.99, - ), - eval=dict(evaluator=dict(eval_freq=200, )) - ), -) -walker2d_pg_config = EasyDict(walker2d_pg_config) -main_config = walker2d_pg_config -walker2d_pg_create_config = dict( - env=dict( - type='mujoco', - import_names=['dizoo.mujoco.envs.mujoco_env'], - ), - env_manager=dict(type='subprocess'), - policy=dict(type='pg'), - collector=dict(type='episode'), -) -walker2d_pg_create_config = EasyDict(walker2d_pg_create_config) -create_config = walker2d_pg_create_config - -if __name__ == "__main__": - # or you can enter `ding -m serial_onpolicy -c walker2d_pg_config.py -s 0` - from ding.entry import serial_pipeline_onpolicy - serial_pipeline_onpolicy([main_config, create_config], seed=0) From 7b581eb9f1e9c9b92d2edb24459a2c90d0202133 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 11 Oct 2023 13:05:09 +0800 Subject: [PATCH 205/244] merge file from main --- ding/envs/ding_env_manager.py | 23 ---- ding/model/template/__init__.py | 8 +- ding/model/template/stochastic_policy.py | 35 ----- ding/model/template/vac.py | 30 ----- ding/model/template/value_network.py | 41 ------ ding/policy/pg.py | 8 -- ding/policy/ppo.py | 164 ++--------------------- ding/policy/ppof.py | 5 - 8 files changed, 11 insertions(+), 303 deletions(-) delete mode 100644 ding/envs/ding_env_manager.py delete mode 100644 ding/model/template/stochastic_policy.py delete mode 100644 ding/model/template/value_network.py diff --git a/ding/envs/ding_env_manager.py b/ding/envs/ding_env_manager.py deleted file mode 100644 index 8dbffd4042..0000000000 --- a/ding/envs/ding_env_manager.py +++ /dev/null @@ -1,23 +0,0 @@ -from .env_manager import BaseEnvManagerV2, SubprocessEnvManagerV2 -from .env import DingEnvWrapper -from typing import Optional -from functools import partial - - -def setup_ding_env_manager( - env: DingEnvWrapper, - env_num: int, - context: Optional[str] = None, - debug: bool = False, - caller: str = 'collector' -) -> BaseEnvManagerV2: - assert caller in ['evaluator', 'collector'] - if debug: - env_cls = BaseEnvManagerV2 - manager_cfg = env_cls.default_config() - else: - env_cls = SubprocessEnvManagerV2 - manager_cfg = env_cls.default_config() - if context is not None: - manager_cfg.context = context - return env_cls([partial(env.clone, caller) for _ in range(env_num)], manager_cfg) diff --git a/ding/model/template/__init__.py b/ding/model/template/__init__.py index cd25c0d4e6..b2dd815287 100755 
--- a/ding/model/template/__init__.py +++ b/ding/model/template/__init__.py @@ -2,7 +2,7 @@ from .q_learning import DQN, RainbowDQN, QRDQN, IQN, FQF, DRQN, C51DQN, BDQ, GTrXLDQN from .qac import DiscreteQAC, ContinuousQAC from .pdqn import PDQN -from .vac import BaseVAC, VAC, DREAMERVAC +from .vac import VAC, DREAMERVAC from .bc import DiscreteBC, ContinuousBC from .language_transformer import LanguageTransformer # algorithm-specific @@ -25,10 +25,4 @@ from .decision_transformer import DecisionTransformer from .procedure_cloning import ProcedureCloningMCTS, ProcedureCloningBFS from .bcq import BCQ -<<<<<<< HEAD -from .edac import QACEnsemble -from .value_network import QModel, VModel -from .stochastic_policy import StochasticPolicy -======= from .edac import EDAC ->>>>>>> 11cc7de83c4e40c2a3929a46ac4fb132e730df5b diff --git a/ding/model/template/stochastic_policy.py b/ding/model/template/stochastic_policy.py deleted file mode 100644 index 2a2eb0411e..0000000000 --- a/ding/model/template/stochastic_policy.py +++ /dev/null @@ -1,35 +0,0 @@ -import torch -from torch import nn -from ding.torch_utils import Gaussian, GaussianTanh - - -class StochasticPolicy(nn.Module): - - def __init__(self, cfg): - super().__init__() - self.cfg = cfg - if cfg.model_type == 'Gaussian': - self.model = Gaussian(cfg.model) - elif cfg.model_type == 'GaussianTanh': - self.model = GaussianTanh(cfg.model) - else: - raise NotImplementedError - - def forward(self, obs): - action, log_prob = self.model(obs) - return action, log_prob - - def log_prob(self, action, obs): - return self.model.log_prob(action, obs) - - def sample(self, obs, sample_shape=torch.Size()): - return self.model.sample(obs, sample_shape) - - def rsample(self, obs, sample_shape=torch.Size()): - return self.model.rsample(obs, sample_shape) - - def entropy(self, obs): - return self.model.entropy(obs) - - def dist(self, obs): - return self.model.dist(obs) diff --git a/ding/model/template/vac.py b/ding/model/template/vac.py index ddf1ecccaa..0f4c7ee94f 100644 --- a/ding/model/template/vac.py +++ b/ding/model/template/vac.py @@ -103,30 +103,6 @@ def __init__( Overview: Initialize the VAC model according to corresponding input arguments. Arguments: -<<<<<<< HEAD - - obs_shape (:obj:`Union[int, SequenceType]`): Observation's space. - - action_shape (:obj:`Union[int, SequenceType]`): Action's space. - - action_space (:obj:`str`): Choose action head in ['discrete', 'continuous', 'hybrid'] - - share_encoder (:obj:`bool`): Whether share encoder. - - encoder_hidden_size_list (:obj:`SequenceType`): Collection of ``hidden_size`` to pass to ``Encoder`` - - actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to actor-nn's ``Head``. - - actor_head_layer_num (:obj:`int`): - The num of layers used in the network to compute Q value output for actor's nn. - - critic_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to critic-nn's ``Head``. - - critic_head_layer_num (:obj:`int`): - The num of layers used in the network to compute Q value output for critic's nn. 
- - activation (:obj:`Optional[Union[str, nn.Module]]`): - The type of activation function to use in ``MLP`` the after ``layer_fn``, - if ``None`` then default set to ``nn.ReLU()`` - - policy_activation (:obj:`Optional[Union[str, nn.Module]]`): - The type of activation function to use in ``MLP`` the after ``layer_fn`` in actor's nn, - if ``None`` then default set to ``activation`` - - value_activation (:obj:`Optional[Union[str, nn.Module]]`): - The type of activation function to use in ``MLP`` the after ``layer_fn`` in critic's nn, - if ``None`` then default set to ``activation`` - - norm_type (:obj:`Optional[str]`): - The type of normalization to use, see ``ding.torch_utils.fc_block`` for more details` -======= - obs_shape (:obj:`Union[int, SequenceType]`): Observation space shape, such as 8 or [4, 84, 84]. - action_shape (:obj:`Union[int, SequenceType]`): Action space shape, such as 6 or [2, 3, 3]. - action_space (:obj:`str`): The type of different action spaces, including ['discrete', 'continuous', \ @@ -154,7 +130,6 @@ def __init__( - encoder (:obj:`Optional[torch.nn.Module]`): The encoder module, defaults to ``None``, you can define \ your own encoder module and pass it into VAC to deal with different observation space. - impala_cnn_encoder (:obj:`bool`): Whether to use IMPALA CNN encoder, defaults to ``False``. ->>>>>>> 11cc7de83c4e40c2a3929a46ac4fb132e730df5b """ super(VAC, self).__init__() obs_shape: int = squeeze(obs_shape) @@ -216,13 +191,8 @@ def new_encoder(outsize, activation): else: raise ValueError("illegal encoder instance.") else: -<<<<<<< HEAD - self.actor_encoder = new_encoder(actor_head_hidden_size, policy_activation) - self.critic_encoder = new_encoder(critic_head_hidden_size, value_activation) -======= self.actor_encoder = new_encoder(actor_head_hidden_size, activation) self.critic_encoder = new_encoder(critic_head_hidden_size, activation) ->>>>>>> 11cc7de83c4e40c2a3929a46ac4fb132e730df5b # Head Type self.critic_head = RegressionHead( diff --git a/ding/model/template/value_network.py b/ding/model/template/value_network.py deleted file mode 100644 index 958f066505..0000000000 --- a/ding/model/template/value_network.py +++ /dev/null @@ -1,41 +0,0 @@ -import torch -from torch import nn -from ding.torch_utils import multilayer_perceptron - - -class QModel(nn.Module): - - def __init__(self, cfg): - super().__init__() - self.cfg = cfg - self.model_num = cfg.model_num if hasattr(cfg, 'model_num') else 1 - self.models = nn.ModuleList([multilayer_perceptron(cfg.model) for _ in range(self.model_num)]) - - def forward(self, obs, action): - if self.model_num == 1: - return self.models[0](torch.cat((obs, action), dim=1)).squeeze(dim=1) - else: - return torch.cat([model(torch.cat((obs, action), dim=1)) for model in self.models], dim=1) - - def min_q(self, obs, action): - return torch.min( - input=torch.cat([model(torch.cat((obs, action), dim=1)) for model in self.models], dim=1), dim=1 - ).values - - -class VModel(nn.Module): - - def __init__(self, cfg): - super().__init__() - self.cfg = cfg - self.model_num = cfg.model_num if hasattr(cfg, 'model_num') else 1 - self.models = nn.ModuleList([multilayer_perceptron(cfg.model) for _ in range(self.model_num)]) - - def forward(self, obs): - if self.model_num == 1: - return self.models[0](obs).squeeze(dim=1) - else: - return torch.cat([model(obs) for model in self.models], dim=1) - - def min_q(self, obs): - return torch.min(input=torch.cat([model(obs) for model in self.models], dim=1), dim=1).values diff --git a/ding/policy/pg.py 
b/ding/policy/pg.py index 8b8d720df0..e5ed4827be 100644 --- a/ding/policy/pg.py +++ b/ding/policy/pg.py @@ -169,11 +169,7 @@ def _get_train_sample(self, data: list) -> Union[None, List[Any]]: Returns: - samples (:obj:`dict`): The training samples generated """ -<<<<<<< HEAD - assert data[-1]['done'] == True, "PG needs a complete epsiode" -======= assert data[-1]['done'], "PG needs a complete epsiode" ->>>>>>> 11cc7de83c4e40c2a3929a46ac4fb132e730df5b if self._cfg.learn.ignore_done: raise NotImplementedError @@ -186,11 +182,7 @@ def _get_train_sample(self, data: list) -> Union[None, List[Any]]: return get_train_sample(data, self._unroll_len) elif isinstance(data, ttorch.Tensor): data_size = data['done'].shape[0] -<<<<<<< HEAD - data['return'] = ttorch.Tensor([0.0 for i in range(data_size)]) -======= data['return'] = ttorch.torch.zeros(data_size) ->>>>>>> 11cc7de83c4e40c2a3929a46ac4fb132e730df5b for i in reversed(range(data_size)): R = self._gamma * R + data['reward'][i] data['return'][i] = R diff --git a/ding/policy/ppo.py b/ding/policy/ppo.py index 48997f70ab..2245a47096 100644 --- a/ding/policy/ppo.py +++ b/ding/policy/ppo.py @@ -7,8 +7,7 @@ from ding.torch_utils import Adam, to_device, to_dtype, unsqueeze, ContrastiveLoss from ding.rl_utils import ppo_data, ppo_error, ppo_policy_error, ppo_policy_data, get_gae_with_default_last_value, \ v_nstep_td_data, v_nstep_td_error, get_nstep_return_data, get_train_sample, gae, gae_data, ppo_error_continuous, \ - get_gae, ppo_policy_error_continuous, ppo_error_general, ppo_policy_error_general, ppo_data_general, \ - ppo_policy_data_general + get_gae, ppo_policy_error_continuous from ding.model import model_wrap from ding.utils import POLICY_REGISTRY, split_data_generator, RunningMeanStd from ding.utils.data import default_collate, default_decollate @@ -459,9 +458,6 @@ def _monitor_vars_learn(self) -> List[str]: variables += ['mu_mean', 'sigma_mean', 'sigma_grad', 'act'] return variables - def monitor_vars(self) -> List[str]: - return self._monitor_vars_learn() - @POLICY_REGISTRY.register('ppo_pg') class PPOPGPolicy(Policy): @@ -686,11 +682,7 @@ class PPOOffPolicy(Policy): priority=False, # (bool) Whether use Importance Sampling Weight to correct biased update. If True, priority must be True. 
priority_IS_weight=False, -<<<<<<< HEAD - # (str) Which kind of action space used in PPOPolicy, ["general", "continuous", "discrete", "hybrid"] -======= # (str) Which kind of action space used in PPOPolicy, ["continuous", "discrete", "hybrid"] ->>>>>>> 11cc7de83c4e40c2a3929a46ac4fb132e730df5b action_space='discrete', # (bool) Whether to use nstep_return for value loss nstep_return=False, @@ -750,17 +742,10 @@ def _init_learn(self) -> None: self._priority_IS_weight = self._cfg.priority_IS_weight assert not self._priority and not self._priority_IS_weight, "Priority is not implemented in PPO" -<<<<<<< HEAD - assert self._cfg.action_space in ["general", "continuous", "discrete", "hybrid"] - self._action_space = self._cfg.action_space - - if self._action_space != "general" and self._cfg.learn.ppo_param_init: -======= assert self._cfg.action_space in ["continuous", "discrete", "hybrid"] self._action_space = self._cfg.action_space if self._cfg.learn.ppo_param_init: ->>>>>>> 11cc7de83c4e40c2a3929a46ac4fb132e730df5b for n, m in self._model.named_modules(): if isinstance(m, torch.nn.Linear): torch.nn.init.orthogonal_(m.weight) @@ -788,37 +773,12 @@ def _init_learn(self) -> None: m.weight.data.copy_(0.01 * m.weight.data) # Optimizer -<<<<<<< HEAD - if self._cfg.learn.separate_optimizer: - self._actor_optimizer = Adam( - self._model.actor.parameters(), - lr=self._cfg.learn.learning_rate, - grad_clip_type=self._cfg.learn.grad_clip_type, - clip_value=self._cfg.learn.grad_clip_value, - weight_decay=self._cfg.learn.weight_decay, - ) - self._critic_optimizer = Adam( - self._model.critic.parameters(), - lr=self._cfg.learn.learning_rate, - grad_clip_type=self._cfg.learn.grad_clip_type, - clip_value=self._cfg.learn.grad_clip_value, - ) - else: - self._optimizer = Adam( - self._model.parameters(), - lr=self._cfg.learn.learning_rate, - grad_clip_type=self._cfg.learn.grad_clip_type, - clip_value=self._cfg.learn.grad_clip_value, - weight_decay=self._cfg.learn.weight_decay, - ) -======= self._optimizer = Adam( self._model.parameters(), lr=self._cfg.learn.learning_rate, grad_clip_type=self._cfg.learn.grad_clip_type, clip_value=self._cfg.learn.grad_clip_value ) ->>>>>>> 11cc7de83c4e40c2a3929a46ac4fb132e730df5b self._learn_model = model_wrap(self._model, wrapper_name='base') @@ -861,11 +821,7 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: self._learn_model.train() with torch.no_grad(): -<<<<<<< HEAD - if self._value_norm: -======= if hasattr(self, "_value_norm") and self._value_norm: ->>>>>>> 11cc7de83c4e40c2a3929a46ac4fb132e730df5b unnormalized_return = data['adv'] + data['value'] * self._running_mean_std.std data['return'] = unnormalized_return / self._running_mean_std.std self._running_mean_std.update(unnormalized_return.cpu().numpy()) @@ -874,8 +830,7 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: # normal ppo if not self._nstep_return: - if self._action_space != 'general': - output = self._learn_model.forward(data['obs'], mode='compute_actor_critic') + output = self._learn_model.forward(data['obs'], mode='compute_actor_critic') adv = data['adv'] if self._adv_norm: @@ -916,31 +871,12 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: max(ppo_continuous_info.approx_kl, ppo_discrete_info.approx_kl), max(ppo_continuous_info.clipfrac, ppo_discrete_info.clipfrac) ) -<<<<<<< HEAD - elif self._action_space == 'general': - entropy = self._learn_model.actor.entropy(data['obs']) - log_prob = self._learn_model.actor.log_prob(action=data['action'], obs=data['obs']) - value = 
self._learn_model.critic(data['obs']) - ppodata = ppo_data_general( - log_prob, data['log_prob'], value, data['value'], data['adv'], data['return'], data['weight'] - ) - ppo_loss, ppo_info = ppo_error_general( - data=ppodata, entropy=entropy, clip_ratio=self._clip_ratio, use_value_clip=False - ) -======= ->>>>>>> 11cc7de83c4e40c2a3929a46ac4fb132e730df5b wv, we = self._value_weight, self._entropy_weight total_loss = ppo_loss.policy_loss + wv * ppo_loss.value_loss - we * ppo_loss.entropy_loss - if self._cfg.learn.separate_optimizer: - actor_loss = ppo_loss.policy_loss - we * ppo_loss.entropy_loss - print(f"actor_loss:[{actor_loss}]") - critic_loss = ppo_loss.value_loss - print(f"critic_loss:[{critic_loss}]") else: - if self._action_space != 'general': - output = self._learn_model.forward(data['obs'], mode='compute_actor') + output = self._learn_model.forward(data['obs'], mode='compute_actor') adv = data['adv'] if self._adv_norm: # Normalize advantage in a total train_batch @@ -977,16 +913,6 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: max(ppo_continuous_info.approx_kl, ppo_discrete_info.approx_kl), max(ppo_continuous_info.clipfrac, ppo_discrete_info.clipfrac) ) -<<<<<<< HEAD - elif self._action_space == 'general': - entropy = self._learn_model.actor.entropy(data['obs']) - log_prob = self._learn_model.actor.log_prob(action=data['action'], obs=data['obs']) - ppodata = ppo_policy_data_general(log_prob, data['log_prob'], adv, data['weight']) - ppo_policy_loss, ppo_info = ppo_policy_error_general( - data=ppodata, entropy=entropy, clip_ratio=self._clip_ratio - ) -======= ->>>>>>> 11cc7de83c4e40c2a3929a46ac4fb132e730df5b wv, we = self._value_weight, self._entropy_weight next_obs = data.get('next_obs') @@ -1007,34 +933,15 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: ppo_loss_data = namedtuple('ppo_loss', ['policy_loss', 'value_loss', 'entropy_loss']) ppo_loss = ppo_loss_data(ppo_policy_loss.policy_loss, critic_loss, ppo_policy_loss.entropy_loss) total_loss = ppo_policy_loss.policy_loss + wv * critic_loss - we * ppo_policy_loss.entropy_loss - if self._cfg.learn.separate_optimizer: - actor_loss = ppo_policy_loss.policy_loss - we * ppo_policy_loss.entropy_loss # ==================== # PPO update # ==================== -<<<<<<< HEAD - if self._cfg.learn.separate_optimizer: - self._actor_optimizer.zero_grad() - actor_loss.backward() - self._actor_optimizer.step() - self._critic_optimizer.zero_grad() - critic_loss.backward() - self._critic_optimizer.step() - else: - self._optimizer.zero_grad() - total_loss.backward() - self._optimizer.step() - return_info = { - 'cur_lr': self._optimizer.defaults['lr'] - if not self._cfg.learn.separate_optimizer else self._actor_optimizer.defaults['lr'], -======= self._optimizer.zero_grad() total_loss.backward() self._optimizer.step() return_info = { 'cur_lr': self._optimizer.defaults['lr'], ->>>>>>> 11cc7de83c4e40c2a3929a46ac4fb132e730df5b 'total_loss': total_loss.item(), 'policy_loss': ppo_loss.policy_loss.item(), 'value': data['value'].mean().item(), @@ -1055,26 +962,14 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: return return_info def _state_dict_learn(self) -> Dict[str, Any]: - if self._cfg.learn.separate_optimizer: - return { - 'model': self._learn_model.state_dict(), - 'actor_optimizer': self._actor_optimizer.state_dict(), - 'critic_optimizer': self._critic_optimizer.state_dict(), - } - else: - return { - 'model': self._learn_model.state_dict(), - 'optimizer': self._optimizer.state_dict(), - } + return { + 'model': 
self._learn_model.state_dict(), + 'optimizer': self._optimizer.state_dict(), + } def _load_state_dict_learn(self, state_dict: Dict[str, Any]) -> None: - if self._cfg.learn.separate_optimizer: - self._learn_model.load_state_dict(state_dict['model']) - self._actor_optimizer.load_state_dict(state_dict['actor_optimizer']) - self._critic_optimizer.load_state_dict(state_dict['critic_optimizer']) - else: - self._learn_model.load_state_dict(state_dict['model']) - self._optimizer.load_state_dict(state_dict['optimizer']) + self._learn_model.load_state_dict(state_dict['model']) + self._optimizer.load_state_dict(state_dict['optimizer']) def _init_collect(self) -> None: r""" @@ -1083,11 +978,7 @@ def _init_collect(self) -> None: Init traj and unroll length, collect model. """ self._unroll_len = self._cfg.collect.unroll_len -<<<<<<< HEAD - assert self._cfg.action_space in ["general", "continuous", "discrete", "hybrid"] -======= assert self._cfg.action_space in ["continuous", "discrete", "hybrid"] ->>>>>>> 11cc7de83c4e40c2a3929a46ac4fb132e730df5b self._action_space = self._cfg.action_space if self._action_space == 'continuous': self._collect_model = model_wrap(self._model, wrapper_name='reparam_sample') @@ -1095,11 +986,6 @@ def _init_collect(self) -> None: self._collect_model = model_wrap(self._model, wrapper_name='multinomial_sample') elif self._action_space == 'hybrid': self._collect_model = model_wrap(self._model, wrapper_name='hybrid_reparam_multinomial_sample') -<<<<<<< HEAD - elif self._action_space == 'general': - self._collect_model = model_wrap(self._model, wrapper_name='base') -======= ->>>>>>> 11cc7de83c4e40c2a3929a46ac4fb132e730df5b self._collect_model.reset() self._gamma = self._cfg.collect.discount_factor self._gae_lambda = self._cfg.collect.gae_lambda @@ -1146,15 +1032,12 @@ def _process_transition(self, obs: Any, model_output: dict, timestep: namedtuple transition = { 'obs': obs, 'next_obs': timestep.obs, + 'logit': model_output['logit'], 'action': model_output['action'], 'value': model_output['value'], 'reward': timestep.reward, 'done': timestep.done, } - if model_output.get('logit', None) is not None: - transition['logit'] = model_output['logit'] - if model_output.get('log_prob', None) is not None: - transition['log_prob'] = model_output['log_prob'] return transition def _get_train_sample(self, data: list) -> Union[None, List[Any]]: @@ -1183,11 +1066,7 @@ def _get_train_sample(self, data: list) -> Union[None, List[Any]]: )['value'] if len(last_value.shape) == 2: # multi_agent case: last_value = last_value.squeeze(0) -<<<<<<< HEAD - if self._value_norm: -======= if hasattr(self, "_value_norm") and self._value_norm: ->>>>>>> 11cc7de83c4e40c2a3929a46ac4fb132e730df5b last_value *= self._running_mean_std.std for i in range(len(data)): data[i]['value'] *= self._running_mean_std.std @@ -1198,13 +1077,6 @@ def _get_train_sample(self, data: list) -> Union[None, List[Any]]: gae_lambda=self._gae_lambda, cuda=False, ) -<<<<<<< HEAD - if self._value_norm: - for i in range(len(data)): - data[i]['value'] /= self._running_mean_std.std - - return get_train_sample(data, self._unroll_len) -======= if hasattr(self, "_value_norm") and self._value_norm: for i in range(len(data)): data[i]['value'] /= self._running_mean_std.std @@ -1213,7 +1085,6 @@ def _get_train_sample(self, data: list) -> Union[None, List[Any]]: return get_train_sample(data, self._unroll_len) else: return get_nstep_return_data(data, self._nstep) ->>>>>>> 11cc7de83c4e40c2a3929a46ac4fb132e730df5b def _init_eval(self) -> None: r""" @@ 
-1221,11 +1092,7 @@ def _init_eval(self) -> None: Evaluate mode init method. Called by ``self.__init__``. Init eval model with argmax strategy. """ -<<<<<<< HEAD - assert self._cfg.action_space in ["general", "continuous", "discrete", "hybrid"] -======= assert self._cfg.action_space in ["continuous", "discrete", "hybrid"] ->>>>>>> 11cc7de83c4e40c2a3929a46ac4fb132e730df5b self._action_space = self._cfg.action_space if self._action_space == 'continuous': self._eval_model = model_wrap(self._model, wrapper_name='deterministic_sample') @@ -1233,11 +1100,6 @@ def _init_eval(self) -> None: self._eval_model = model_wrap(self._model, wrapper_name='argmax_sample') elif self._action_space == 'hybrid': self._eval_model = model_wrap(self._model, wrapper_name='hybrid_deterministic_argmax_sample') -<<<<<<< HEAD - elif self._action_space == 'general': - self._eval_model = model_wrap(self._model, wrapper_name='base') -======= ->>>>>>> 11cc7de83c4e40c2a3929a46ac4fb132e730df5b self._eval_model.reset() def _forward_eval(self, data: dict) -> dict: @@ -1274,12 +1136,6 @@ def _monitor_vars_learn(self) -> List[str]: if self._action_space == 'continuous': variables += ['mu_mean', 'sigma_mean', 'sigma_grad', 'act'] return variables -<<<<<<< HEAD - - def monitor_vars(self) -> List[str]: - return self._monitor_vars_learn() -======= ->>>>>>> 11cc7de83c4e40c2a3929a46ac4fb132e730df5b @POLICY_REGISTRY.register('ppo_stdim') diff --git a/ding/policy/ppof.py b/ding/policy/ppof.py index f35dbe1b88..81e605384c 100644 --- a/ding/policy/ppof.py +++ b/ding/policy/ppof.py @@ -27,13 +27,8 @@ class PPOFPolicy: epoch_per_collect=10, batch_size=64, learning_rate=3e-4, -<<<<<<< HEAD - # learningrate scheduler - lr_scheduler=None, # (10000, 0.1) -======= # learningrate scheduler, which the format is (10000, 0.1) lr_scheduler=None, ->>>>>>> 11cc7de83c4e40c2a3929a46ac4fb132e730df5b weight_decay=0, value_weight=0.5, entropy_weight=0.01, From fe30fbe78b283d380d8847c846a6d76a3dfa68ec Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 11 Oct 2023 14:41:54 +0800 Subject: [PATCH 206/244] merge file from main --- ding/bonus/ppof.py | 4 ---- ding/envs/env_manager/envpool_env_manager.py | 8 ++++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py index a5862209bf..149b42c3c1 100644 --- a/ding/bonus/ppof.py +++ b/ding/bonus/ppof.py @@ -239,10 +239,6 @@ def deploy( return EvalReturn(eval_value=np.mean(returns), eval_value_std=np.std(returns)) - env.close() - - return return_ - def collect_data( self, env_num: int = 8, diff --git a/ding/envs/env_manager/envpool_env_manager.py b/ding/envs/env_manager/envpool_env_manager.py index 7b1451eab3..93d2fd6298 100644 --- a/ding/envs/env_manager/envpool_env_manager.py +++ b/ding/envs/env_manager/envpool_env_manager.py @@ -32,7 +32,7 @@ class EnvState(enum.IntEnum): @ENV_MANAGER_REGISTRY.register('env_pool') -class PoolEnvManager(): +class PoolEnvManager: ''' Overview: Envpool now supports Atari, Classic Control, Toy Text, ViZDoom. @@ -172,7 +172,7 @@ def action_space(self) -> 'gym.spaces.Space': # noqa @ENV_MANAGER_REGISTRY.register('env_pool_v2') -class PoolEnvManagerV2(): +class PoolEnvManagerV2: ''' Overview: Envpool now supports Atari, Classic Control, Toy Text, ViZDoom. @@ -370,7 +370,7 @@ def action_space(self) -> 'gym.spaces.Space': # noqa @ENV_MANAGER_REGISTRY.register('env_pool_v3') -class PoolEnvManagerV3(): +class PoolEnvManagerV3: ''' Overview: Envpool now supports Atari, Classic Control, Toy Text, ViZDoom. 
@@ -638,7 +638,7 @@ def action_space(self) -> 'gym.spaces.Space': # noqa @ENV_MANAGER_REGISTRY.register('env_pool_v4') -class PoolEnvManagerV4(): +class PoolEnvManagerV4: ''' Overview: Envpool now supports Atari, Classic Control, Toy Text, ViZDoom. From dc0ea3ac3e0d153d4311aa9f27aa76f88765968e Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 11 Oct 2023 14:44:07 +0800 Subject: [PATCH 207/244] merge file from main --- ding/model/template/vac.py | 80 ++++---------------------------------- 1 file changed, 7 insertions(+), 73 deletions(-) diff --git a/ding/model/template/vac.py b/ding/model/template/vac.py index 0f4c7ee94f..29363d3570 100644 --- a/ding/model/template/vac.py +++ b/ding/model/template/vac.py @@ -3,66 +3,12 @@ import torch import torch.nn as nn from copy import deepcopy -from ding.torch_utils import get_activation from ding.utils import SequenceType, squeeze, MODEL_REGISTRY from ..common import ReparameterizationHead, RegressionHead, DiscreteHead, MultiHead, \ FCEncoder, ConvEncoder, IMPALAConvEncoder from ding.torch_utils.network.dreamer import ActionHead, DenseHead -@MODEL_REGISTRY.register('base_vac') -class BaseVAC(nn.Module): - r""" - Overview: - The VAC model. - Interfaces: - ``__init__``, ``forward``, ``compute_actor``, ``compute_critic`` - """ - mode = ['compute_actor', 'compute_critic', 'compute_actor_critic'] - - def __init__( - self, - actor: nn.Module, - critic: nn.Module, - action_space: str, - ) -> None: - super(BaseVAC, self).__init__() - self.actor = actor - self.critic = critic - self.action_space = action_space - - def forward(self, inputs: Union[torch.Tensor, Dict], mode: str): - assert mode in self.mode, "not support forward mode: {}/{}".format(mode, self.mode) - return getattr(self, mode)(inputs) - - def compute_actor(self, x: torch.Tensor): - if self.action_space == 'discrete': - raise NotImplementedError - elif self.action_space == 'continuous': - raise NotImplementedError - elif self.action_space == 'general': - action, log_prob = self.actor(x) - return {'action': action, 'log_prob': log_prob} - else: - raise NotImplementedError - - def compute_critic(self, x: torch.Tensor): - value = self.critic(x) - return {'value': value} - - def compute_actor_critic(self, x: torch.Tensor): - if self.action_space == 'discrete': - raise NotImplementedError - elif self.action_space == 'continuous': - raise NotImplementedError - elif self.action_space == 'general': - action, log_prob = self.actor(x) - value = self.critic(x) - return {'action': action, 'log_prob': log_prob, 'value': value} - else: - raise NotImplementedError - - @MODEL_REGISTRY.register('vac') class VAC(nn.Module): """ @@ -89,9 +35,7 @@ def __init__( actor_head_layer_num: int = 1, critic_head_hidden_size: int = 64, critic_head_layer_num: int = 1, - activation: Optional[Union[str, nn.Module]] = nn.ReLU(), - policy_activation: Optional[Union[str, nn.Module]] = None, - value_activation: Optional[Union[str, nn.Module]] = None, + activation: Optional[nn.Module] = nn.ReLU(), norm_type: Optional[str] = None, sigma_type: Optional[str] = 'independent', fixed_sigma_value: Optional[int] = 0.3, @@ -137,16 +81,6 @@ def __init__( self.obs_shape, self.action_shape = obs_shape, action_shape self.impala_cnn_encoder = impala_cnn_encoder self.share_encoder = share_encoder - if isinstance(activation, str): - activation = get_activation(activation) - if policy_activation is not None and isinstance(policy_activation, str): - policy_activation = get_activation(policy_activation) - else: - policy_activation = activation - if 
value_activation is not None and isinstance(value_activation, str): - value_activation = get_activation(value_activation) - else: - value_activation = activation # Encoder Type def new_encoder(outsize, activation): @@ -196,7 +130,7 @@ def new_encoder(outsize, activation): # Head Type self.critic_head = RegressionHead( - critic_head_hidden_size, 1, critic_head_layer_num, activation=value_activation, norm_type=norm_type + critic_head_hidden_size, 1, critic_head_layer_num, activation=activation, norm_type=norm_type ) self.action_space = action_space assert self.action_space in ['discrete', 'continuous', 'hybrid'], self.action_space @@ -207,7 +141,7 @@ def new_encoder(outsize, activation): action_shape, actor_head_layer_num, sigma_type=sigma_type, - activation=policy_activation, + activation=activation, norm_type=norm_type, bound_type=bound_type ) @@ -221,7 +155,7 @@ def new_encoder(outsize, activation): actor_head_hidden_size, action_shape, layer_num=actor_head_layer_num, - activation=policy_activation, + activation=activation, norm_type=norm_type ) else: @@ -229,7 +163,7 @@ def new_encoder(outsize, activation): actor_head_hidden_size, action_shape, actor_head_layer_num, - activation=policy_activation, + activation=activation, norm_type=norm_type ) elif self.action_space == 'hybrid': # HPPO @@ -243,7 +177,7 @@ def new_encoder(outsize, activation): actor_head_layer_num, sigma_type=sigma_type, fixed_sigma_value=fixed_sigma_value, - activation=policy_activation, + activation=activation, norm_type=norm_type, bound_type=bound_type, ) @@ -251,7 +185,7 @@ def new_encoder(outsize, activation): actor_head_hidden_size, action_shape.action_type_shape, actor_head_layer_num, - activation=policy_activation, + activation=activation, norm_type=norm_type, ) self.actor_head = nn.ModuleList([actor_action_type, actor_action_args]) From c7b76452bfee127007fa12614fdee3896d96b222 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 11 Oct 2023 14:49:59 +0800 Subject: [PATCH 208/244] merge file from main --- ding/policy/ddpg.py | 3 -- ding/policy/impala.py | 3 -- ding/policy/pg.py | 3 -- ding/policy/sac.py | 3 -- ding/policy/sql.py | 3 -- ding/torch_utils/__init__.py | 1 - ding/torch_utils/activation.py | 29 ----------- .../pendulum/config/pendulum_pg_config.py | 50 +++++++++++++++++++ 8 files changed, 50 insertions(+), 45 deletions(-) delete mode 100644 ding/torch_utils/activation.py create mode 100644 dizoo/classic_control/pendulum/config/pendulum_pg_config.py diff --git a/ding/policy/ddpg.py b/ding/policy/ddpg.py index 6aa16cbe40..8629cca4af 100644 --- a/ding/policy/ddpg.py +++ b/ding/policy/ddpg.py @@ -450,6 +450,3 @@ def _monitor_vars_learn(self) -> List[str]: if self._twin_critic: ret += ['critic_twin_loss'] return ret - - def monitor_vars(self) -> List[str]: - return self._monitor_vars_learn() diff --git a/ding/policy/impala.py b/ding/policy/impala.py index 1db548fb00..958275fc31 100644 --- a/ding/policy/impala.py +++ b/ding/policy/impala.py @@ -436,6 +436,3 @@ def _monitor_vars_learn(self) -> List[str]: by import_names path. 
For IMPALA, ``ding.model.interface.IMPALA`` """ return super()._monitor_vars_learn() + ['policy_loss', 'value_loss', 'entropy_loss'] - - def monitor_vars(self): - return self._monitor_vars_learn() diff --git a/ding/policy/pg.py b/ding/policy/pg.py index e5ed4827be..667439d07b 100644 --- a/ding/policy/pg.py +++ b/ding/policy/pg.py @@ -217,6 +217,3 @@ def _forward_eval(self, data: dict) -> dict: def _monitor_vars_learn(self) -> List[str]: return super()._monitor_vars_learn() + ['policy_loss', 'entropy_loss', 'return_abs_max', 'grad_norm'] - - def monitor_vars(self) -> List[str]: - return self._monitor_vars_learn() diff --git a/ding/policy/sac.py b/ding/policy/sac.py index 81ed4e8737..ebf2845e51 100644 --- a/ding/policy/sac.py +++ b/ding/policy/sac.py @@ -827,9 +827,6 @@ def _monitor_vars_learn(self) -> List[str]: 'transformed_log_prob', ] + twin_critic + alpha_loss - def monitor_vars(self) -> List[str]: - return self._monitor_vars_learn() - @POLICY_REGISTRY.register('sqil_sac') class SQILSACPolicy(SACPolicy): diff --git a/ding/policy/sql.py b/ding/policy/sql.py index ff7343b067..dc6170dfb7 100644 --- a/ding/policy/sql.py +++ b/ding/policy/sql.py @@ -294,6 +294,3 @@ def _forward_eval(self, data: dict) -> dict: def _monitor_vars_learn(self) -> List[str]: return super()._monitor_vars_learn() + ['record_value_function'] - - def monitor_vars(self) -> List[str]: - return self._monitor_vars_learn() diff --git a/ding/torch_utils/__init__.py b/ding/torch_utils/__init__.py index ea82d47d74..151b4da7e1 100755 --- a/ding/torch_utils/__init__.py +++ b/ding/torch_utils/__init__.py @@ -1,4 +1,3 @@ -from .activation import get_activation from .checkpoint_helper import build_checkpoint_helper, CountVar, auto_checkpoint from .data_helper import to_device, to_tensor, to_ndarray, to_list, to_dtype, same_shape, tensor_to_list, \ build_log_buffer, CudaFetcher, get_tensor_data, unsqueeze, squeeze, get_null_data, get_shape0, to_item, \ diff --git a/ding/torch_utils/activation.py b/ding/torch_utils/activation.py deleted file mode 100644 index c0434e27af..0000000000 --- a/ding/torch_utils/activation.py +++ /dev/null @@ -1,29 +0,0 @@ -import torch -import torch.nn as nn - - -class Lambda(nn.Module): - - def __init__(self, f): - super(Lambda, self).__init__() - self.f = f - - def forward(self, x): - return self.f(x) - - -NONLINEARITIES = { - "tanh": nn.Tanh(), - "relu": nn.ReLU(), - "softplus": nn.Softplus(), - "elu": nn.ELU(), - "square": Lambda(lambda x: x ** 2), - "identity": Lambda(lambda x: x), -} - - -def get_activation(name: str): - name = name.lower() - if name not in NONLINEARITIES: - raise ValueError("Unknown activation function {}".format(name)) - return NONLINEARITIES[name] diff --git a/dizoo/classic_control/pendulum/config/pendulum_pg_config.py b/dizoo/classic_control/pendulum/config/pendulum_pg_config.py new file mode 100644 index 0000000000..b512548398 --- /dev/null +++ b/dizoo/classic_control/pendulum/config/pendulum_pg_config.py @@ -0,0 +1,50 @@ +from easydict import EasyDict + +pendulum_pg_config = dict( + exp_name='pendulum_pg_seed0', + env=dict( + collector_env_num=8, + evaluator_env_num=5, + act_scale=True, + n_evaluator_episode=5, + stop_value=-200, + ), + policy=dict( + cuda=False, + action_space='continuous', + model=dict( + action_space='continuous', + obs_shape=3, + action_shape=1, + ), + learn=dict( + batch_size=400, + learning_rate=0.001, + entropy_weight=0.001, + ), + collect=dict( + n_episode=2, + unroll_len=1, + discount_factor=0.99, + ), + eval=dict(evaluator=dict(eval_freq=200, )) + ), 
+) +pendulum_pg_config = EasyDict(pendulum_pg_config) +main_config = pendulum_pg_config +pendulum_pg_create_config = dict( + env=dict( + type='pendulum', + import_names=['dizoo.classic_control.pendulum.envs.pendulum_env'], + ), + env_manager=dict(type='base'), + policy=dict(type='pg'), + collector=dict(type='episode'), +) +pendulum_pg_create_config = EasyDict(pendulum_pg_create_config) +create_config = pendulum_pg_create_config + +if __name__ == "__main__": + # or you can enter `ding -m serial_onpolicy -c pendulum_pg_config.py -s 0` + from ding.entry import serial_pipeline_onpolicy + serial_pipeline_onpolicy([main_config, create_config], seed=0) From 2f9a41f9b5027159ff46bb17db7e4a3441a0d096 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 11 Oct 2023 14:53:17 +0800 Subject: [PATCH 209/244] merge file from main --- ding/policy/common_utils.py | 4 ---- ding/policy/dqn.py | 3 --- 2 files changed, 7 deletions(-) diff --git a/ding/policy/common_utils.py b/ding/policy/common_utils.py index 29bce98bef..2af6bcb8d2 100644 --- a/ding/policy/common_utils.py +++ b/ding/policy/common_utils.py @@ -16,12 +16,8 @@ def default_preprocess_learn( ignore_done: bool = False, ) -> dict: # data preprocess -<<<<<<< HEAD - if data[0]['action'].dtype in [torch.int8, torch.int16, torch.int32, torch.int64]: -======= if data[0]['action'].dtype in [np.int8, np.int16, np.int32, np.int64, torch.int8, torch.int16, torch.int32, torch.int64]: ->>>>>>> 11cc7de83c4e40c2a3929a46ac4fb132e730df5b data = default_collate(data, cat_1dim=True) # for discrete action else: data = default_collate(data, cat_1dim=False) # for continuous action diff --git a/ding/policy/dqn.py b/ding/policy/dqn.py index 08fad57243..1d7d4bc3a6 100644 --- a/ding/policy/dqn.py +++ b/ding/policy/dqn.py @@ -909,9 +909,6 @@ def _forward_eval(self, data: Dict[int, Any]) -> Dict[int, Any]: output = default_decollate(output) return {i: d for i, d in zip(data_id, output)} - def monitor_vars(self) -> List[str]: - return ['cur_lr', 'total_loss', 'q_value'] - def calculate_priority(self, data: Dict[int, Any], update_target_model: bool = False) -> Dict[str, Any]: """ Overview: From 871fdc0a571ec48cf30689950b6fd9e95459d386 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 11 Oct 2023 15:02:42 +0800 Subject: [PATCH 210/244] merge file from main --- ding/envs/env_manager/envpool_env_manager.py | 9 +- ding/example/dqn_envpool_wandb.py | 2 +- ding/example/dqn_envpool_wandb_main.py | 2 +- ding/example/dqn_envpool_wandb_new.py | 18 +-- ding/example/dqn_envpool_wandb_new_nstep.py | 4 +- ...n_envpool_wandb_new_nstep_spaceinvaders.py | 6 +- ding/example/dqn_envpool_wandb_origin.py | 2 +- ding/example/dqn_envpool_wandb_sweep_pong.py | 2 +- ding/example/dqn_envpool_wandb_test.py | 2 +- ding/framework/middleware/collector.py | 130 ++++++++++-------- .../middleware/functional/data_processor.py | 4 - .../middleware/functional/evaluator.py | 18 +-- ding/framework/middleware/learner.py | 41 +++--- ding/policy/common_utils.py | 3 +- ding/policy/dqn.py | 13 +- 15 files changed, 136 insertions(+), 120 deletions(-) diff --git a/ding/envs/env_manager/envpool_env_manager.py b/ding/envs/env_manager/envpool_env_manager.py index 93d2fd6298..e6ad96af3e 100644 --- a/ding/envs/env_manager/envpool_env_manager.py +++ b/ding/envs/env_manager/envpool_env_manager.py @@ -508,8 +508,6 @@ def collect_data(self, num, policy=None, policy_forward_kwargs=None): env_id = np.array(env_id_to_send) self._envs.send(action, env_id) - - next_obs, rew, done, info = self._envs.recv() next_obs = 
next_obs.astype(np.float32) next_obs /= 255.0 @@ -538,7 +536,6 @@ def collect_data_nstep(self, num, n_step=1, policy=None, policy_forward_kwargs=N self.launch() new_data = [] - while len(new_data) < num: @@ -705,7 +702,7 @@ def launch(self) -> None: def reset(self) -> None: self._envs.async_reset() - ready_obs={} + ready_obs = {} while True: obs, _, _, info = self._envs.recv() env_id = info['env_id'] @@ -726,9 +723,9 @@ def receive_data(self): next_obs = next_obs.astype(np.float32) next_obs /= 255.0 rew = rew.astype(np.float32) - + return next_obs, rew, done, info - + def close(self) -> None: if self._closed: return diff --git a/ding/example/dqn_envpool_wandb.py b/ding/example/dqn_envpool_wandb.py index c479eff3c5..1e63982039 100644 --- a/ding/example/dqn_envpool_wandb.py +++ b/ding/example/dqn_envpool_wandb.py @@ -100,7 +100,7 @@ def main(cfg): task.use(online_logger(train_show_freq=10)) task.use( wandb_online_logger( - metric_list=policy.monitor_vars(), + metric_list=policy._monitor_vars_learn(), model=policy._model, anonymous=True, project_name=cfg.exp_name, diff --git a/ding/example/dqn_envpool_wandb_main.py b/ding/example/dqn_envpool_wandb_main.py index 54ee1387bd..8d5db7ad04 100644 --- a/ding/example/dqn_envpool_wandb_main.py +++ b/ding/example/dqn_envpool_wandb_main.py @@ -101,7 +101,7 @@ def main(cfg): task.use(online_logger(train_show_freq=10)) task.use( wandb_online_logger( - metric_list=policy.monitor_vars(), + metric_list=policy._monitor_vars_learn(), model=policy._model, anonymous=True, project_name=cfg.exp_name, diff --git a/ding/example/dqn_envpool_wandb_new.py b/ding/example/dqn_envpool_wandb_new.py index 379a95a4e5..b5131b9e8c 100644 --- a/ding/example/dqn_envpool_wandb_new.py +++ b/ding/example/dqn_envpool_wandb_new.py @@ -100,14 +100,14 @@ def main(cfg): task.use(OffPolicyLearnerV2(cfg, policy, buffer_)) task.use(online_logger(train_show_freq=10)) # task.use( - # wandb_online_logger( - # metric_list=policy.monitor_vars(), - # model=policy._model, - # exp_config=cfg, - # anonymous=True, - # project_name=cfg.exp_name, - # wandb_sweep=False, - # ) + # wandb_online_logger( + # metric_list=policy._monitor_vars_learn(), + # model=policy._model, + # exp_config=cfg, + # anonymous=True, + # project_name=cfg.exp_name, + # wandb_sweep=False, + # ) # ) #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) @@ -117,7 +117,7 @@ def main(cfg): if __name__ == "__main__": - + import argparse parser = argparse.ArgumentParser() parser.add_argument("--seed", type=int, default=0, help="random seed") diff --git a/ding/example/dqn_envpool_wandb_new_nstep.py b/ding/example/dqn_envpool_wandb_new_nstep.py index 4a20334cd2..5b3b1fa1fb 100644 --- a/ding/example/dqn_envpool_wandb_new_nstep.py +++ b/ding/example/dqn_envpool_wandb_new_nstep.py @@ -99,7 +99,7 @@ def main(cfg): task.use(online_logger(train_show_freq=10)) task.use( wandb_online_logger( - metric_list=policy.monitor_vars(), + metric_list=policy._monitor_vars_learn(), model=policy._model, exp_config=cfg, anonymous=True, @@ -115,7 +115,7 @@ def main(cfg): if __name__ == "__main__": - + import argparse parser = argparse.ArgumentParser() parser.add_argument("--seed", type=int, default=0, help="random seed") diff --git a/ding/example/dqn_envpool_wandb_new_nstep_spaceinvaders.py b/ding/example/dqn_envpool_wandb_new_nstep_spaceinvaders.py index 439c03aaa1..816921bd2f 100644 --- a/ding/example/dqn_envpool_wandb_new_nstep_spaceinvaders.py +++ b/ding/example/dqn_envpool_wandb_new_nstep_spaceinvaders.py @@ -23,7 +23,7 @@ def main(cfg): 
logging.getLogger().setLevel(logging.INFO) - cfg.exp_name = 'Spaceinvaders-v5-DQN-envpool-new-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + cfg.exp_name = 'Test-Spaceinvaders-v5-DQN-envpool-new-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") collector_env_cfg = EasyDict( { @@ -99,7 +99,7 @@ def main(cfg): task.use(online_logger(train_show_freq=10)) task.use( wandb_online_logger( - metric_list=policy.monitor_vars(), + metric_list=policy._monitor_vars_learn(), model=policy._model, exp_config=cfg, anonymous=True, @@ -115,7 +115,7 @@ def main(cfg): if __name__ == "__main__": - + import argparse parser = argparse.ArgumentParser() parser.add_argument("--seed", type=int, default=0, help="random seed") diff --git a/ding/example/dqn_envpool_wandb_origin.py b/ding/example/dqn_envpool_wandb_origin.py index bdc49e714c..07c1f8fcff 100644 --- a/ding/example/dqn_envpool_wandb_origin.py +++ b/ding/example/dqn_envpool_wandb_origin.py @@ -101,7 +101,7 @@ def main(cfg): task.use(online_logger(train_show_freq=10)) task.use( wandb_online_logger( - metric_list=policy.monitor_vars(), + metric_list=policy._monitor_vars_learn(), model=policy._model, exp_config=cfg, anonymous=True, diff --git a/ding/example/dqn_envpool_wandb_sweep_pong.py b/ding/example/dqn_envpool_wandb_sweep_pong.py index 168c6e7656..de642adf75 100644 --- a/ding/example/dqn_envpool_wandb_sweep_pong.py +++ b/ding/example/dqn_envpool_wandb_sweep_pong.py @@ -99,7 +99,7 @@ def main(cfg, seed=0, max_env_step=int(1e7)): task.use(online_logger(train_show_freq=10)) task.use( wandb_online_logger( - metric_list=policy.monitor_vars(), + metric_list=policy._monitor_vars_learn(), model=policy._model, anonymous=True, project_name=cfg.exp_name, diff --git a/ding/example/dqn_envpool_wandb_test.py b/ding/example/dqn_envpool_wandb_test.py index 7a10828384..4b6e7c15b7 100644 --- a/ding/example/dqn_envpool_wandb_test.py +++ b/ding/example/dqn_envpool_wandb_test.py @@ -105,7 +105,7 @@ def main(cfg): task.use(online_logger(train_show_freq=10)) task.use( wandb_online_logger( - metric_list=policy.monitor_vars(), + metric_list=policy._monitor_vars_learn(), model=policy._model, anonymous=True, project_name=cfg.exp_name, diff --git a/ding/framework/middleware/collector.py b/ding/framework/middleware/collector.py index 09f7ec5c6b..6c6f8e3567 100644 --- a/ding/framework/middleware/collector.py +++ b/ding/framework/middleware/collector.py @@ -14,6 +14,7 @@ import numpy as np + class StepCollector: """ Overview: @@ -141,10 +142,10 @@ def __init__(self, cfg: EasyDict, policy, env: BaseEnvManager, random_collect_si self._ready_obs_receive = {} self._ready_obs_send = {} self._ready_action_send = {} - self._trajectory = {i:[] for i in range(env.env_num)} - self._nsteps=self.cfg.policy.nstep if hasattr(self.cfg.policy, 'nstep') else 1 - self._discount_ratio_list=[self.cfg.policy.discount_factor**(i+1) for i in range(self._nsteps)] - self._nsteps_range=list(range(1,self._nsteps)) + self._trajectory = {i: [] for i in range(env.env_num)} + self._nsteps = self.cfg.policy.nstep if hasattr(self.cfg.policy, 'nstep') else 1 + self._discount_ratio_list = [self.cfg.policy.discount_factor ** (i + 1) for i in range(self._nsteps)] + self._nsteps_range = list(range(1, self._nsteps)) self.policy = policy self.random_collect_size = random_collect_size @@ -161,28 +162,33 @@ def __call__(self, ctx: "OnlineRLContext") -> None: if self.random_collect_size > 0 and old < self.random_collect_size: target_size = self.random_collect_size - old - random=True + random = True else: target_size 
= self.cfg.policy.collect.n_sample * self.cfg.policy.collect.unroll_len - random=False + random = False if self.env.closed: self._ready_obs_receive = self.env.launch() - counter=0 + counter = 0 - time_send=0.0 - time_receive=0.0 - time_process=0.0 + time_send = 0.0 + time_receive = 0.0 + time_process = 0.0 while True: - start_send=time.time() + start_send = time.time() if len(self._ready_obs_receive.keys()) > 0: if random: - action_to_send = {i: {"action": np.array([self.env.action_space.sample()])} for i in self._ready_obs_receive.keys()} + action_to_send = { + i: { + "action": np.array([self.env.action_space.sample()]) + } + for i in self._ready_obs_receive.keys() + } else: action_to_send = self.policy.forward(self._ready_obs_receive, **ctx.collect_kwargs) - + self._ready_obs_send.update(self._ready_obs_receive) self._ready_obs_receive = {} self._ready_action_send.update(action_to_send) @@ -192,20 +198,20 @@ def __call__(self, ctx: "OnlineRLContext") -> None: action_send = action_send.squeeze(1) env_id_send = np.array(list(action_to_send.keys())) self.env.send_action(action_send, env_id_send) - time_send+=time.time()-start_send + time_send += time.time() - start_send - start_receive=time.time() + start_receive = time.time() next_obs, rew, done, info = self.env.receive_data() env_id_receive = info['env_id'] - counter+=len(env_id_receive) + counter += len(env_id_receive) self._ready_obs_receive.update({i: next_obs[i] for i in range(len(next_obs))}) - time_receive+=time.time()-start_receive + time_receive += time.time() - start_receive - start_process=time.time() - #todo + start_process = time.time() + #todo for i in range(len(env_id_receive)): - current_reward=rew[i] - if self._nsteps>1: + current_reward = rew[i] + if self._nsteps > 1: self._trajectory[env_id_receive[i]].append( { 'obs': self._ready_obs_send[env_id_receive[i]], @@ -228,50 +234,64 @@ def __call__(self, ctx: "OnlineRLContext") -> None: } ) - if self._nsteps>1: - if done[i]==False and counter < target_size: - reverse_record_position=min(self._nsteps,len(self._trajectory[env_id_receive[i]])) - real_reverse_record_position=reverse_record_position + if self._nsteps > 1: + if done[i] == False and counter < target_size: + reverse_record_position = min(self._nsteps, len(self._trajectory[env_id_receive[i]])) + real_reverse_record_position = reverse_record_position - for j in range(1,reverse_record_position+1): - if j==1: + for j in range(1, reverse_record_position + 1): + if j == 1: pass else: - if self._trajectory[env_id_receive[i]][-j]['done']==True: - real_reverse_record_position=j-1 + if self._trajectory[env_id_receive[i]][-j]['done'] == True: + real_reverse_record_position = j - 1 break else: self._trajectory[env_id_receive[i]][-j]['reward'].append(current_reward) - - if real_reverse_record_position==self._nsteps: - self._trajectory[env_id_receive[i]][-real_reverse_record_position]['next_n_obs']=next_obs[i] - self._trajectory[env_id_receive[i]][-real_reverse_record_position]['value_gamma']=self._discount_ratio_list[real_reverse_record_position-1] - - else: # done[i] == True or counter >= target_size - reverse_record_position=min(self._nsteps,len(self._trajectory[env_id_receive[i]])) - real_reverse_record_position=reverse_record_position - - for j in range(1,reverse_record_position+1): - if j==1: - self._trajectory[env_id_receive[i]][-j]['reward'].extend([np.zeros_like(current_reward) for _ in range(self._nsteps-len(self._trajectory[env_id_receive[i]][-j]['reward']))]) - 
self._trajectory[env_id_receive[i]][-j]['next_n_obs']=next_obs[i] - self._trajectory[env_id_receive[i]][-j]['value_gamma']=self._discount_ratio_list[j-1] + if real_reverse_record_position == self._nsteps: + self._trajectory[env_id_receive[i] + ][-real_reverse_record_position]['next_n_obs'] = next_obs[i] + self._trajectory[env_id_receive[i]][-real_reverse_record_position][ + 'value_gamma'] = self._discount_ratio_list[real_reverse_record_position - 1] + + else: # done[i] == True or counter >= target_size + + reverse_record_position = min(self._nsteps, len(self._trajectory[env_id_receive[i]])) + real_reverse_record_position = reverse_record_position + + for j in range(1, reverse_record_position + 1): + if j == 1: + self._trajectory[env_id_receive[i]][-j]['reward'].extend( + [ + np.zeros_like(current_reward) for _ in + range(self._nsteps - len(self._trajectory[env_id_receive[i]][-j]['reward'])) + ] + ) + self._trajectory[env_id_receive[i]][-j]['next_n_obs'] = next_obs[i] + self._trajectory[env_id_receive[i]][-j]['value_gamma'] = self._discount_ratio_list[j - + 1] else: - if self._trajectory[env_id_receive[i]][-j]['done']==True: - real_reverse_record_position=j + if self._trajectory[env_id_receive[i]][-j]['done'] == True: + real_reverse_record_position = j break else: self._trajectory[env_id_receive[i]][-j]['reward'].append(current_reward) - self._trajectory[env_id_receive[i]][-j]['reward'].extend([np.zeros_like(current_reward) for _ in range(self._nsteps-len(self._trajectory[env_id_receive[i]][-j]['reward']))]) - self._trajectory[env_id_receive[i]][-j]['next_n_obs']=next_obs[i] - self._trajectory[env_id_receive[i]][-j]['value_gamma']=self._discount_ratio_list[j-1] - + self._trajectory[env_id_receive[i]][-j]['reward'].extend( + [ + np.zeros_like(current_reward) for _ in range( + self._nsteps - len(self._trajectory[env_id_receive[i]][-j]['reward']) + ) + ] + ) + self._trajectory[env_id_receive[i]][-j]['next_n_obs'] = next_obs[i] + self._trajectory[env_id_receive[i]][-j]['value_gamma'] = self._discount_ratio_list[ + j - 1] else: - self._trajectory[env_id_receive[i]][-1]['value_gamma']=self._discount_ratio_list[0] + self._trajectory[env_id_receive[i]][-1]['value_gamma'] = self._discount_ratio_list[0] - time_process+=time.time()-start_process + time_process += time.time() - start_process if counter >= target_size: # if self._nsteps>1: # # transform reward to ttorch.tensor @@ -279,12 +299,11 @@ def __call__(self, ctx: "OnlineRLContext") -> None: # for j in range(len(self._trajectory[i])): # self._trajectory[i][j]['reward']=np.concatenate(self._trajectory[env_id_receive[i]][j]['reward']) break - - - ctx.trajectories=[] + + ctx.trajectories = [] for i in range(self.env.env_num): ctx.trajectories.extend(self._trajectory[i]) - self._trajectory[i]=[] + self._trajectory[i] = [] ctx.env_step += len(ctx.trajectories) ctx.collector_time += time.time() - start @@ -292,6 +311,7 @@ def __call__(self, ctx: "OnlineRLContext") -> None: print(f'time_receive:[{time_receive}]') print(f'time_process:[{time_process}]') + class PPOFStepCollector: """ Overview: diff --git a/ding/framework/middleware/functional/data_processor.py b/ding/framework/middleware/functional/data_processor.py index f629366d9f..3c887f0bb1 100644 --- a/ding/framework/middleware/functional/data_processor.py +++ b/ding/framework/middleware/functional/data_processor.py @@ -188,7 +188,6 @@ def _fetch(ctx: "OnlineRLContext"): return _fetch - def offpolicy_data_fetcher_v2( cfg: EasyDict, buffer_: Union[Buffer, List[Tuple[Buffer, float]], Dict[str, 
Buffer]], @@ -212,8 +211,6 @@ def offpolicy_data_fetcher_v2( - data_shortage_warning (:obj:`bool`): Whether to output warning when data shortage occurs in fetching. """ - - def _fetch(ctx: "OnlineRLContext"): """ Input of ctx: @@ -268,7 +265,6 @@ def _fetch(ctx: "OnlineRLContext"): ctx.train_data_sample = None return - yield if isinstance(buffer_, Buffer): diff --git a/ding/framework/middleware/functional/evaluator.py b/ding/framework/middleware/functional/evaluator.py index 27a73f248a..b39e16adfb 100644 --- a/ding/framework/middleware/functional/evaluator.py +++ b/ding/framework/middleware/functional/evaluator.py @@ -346,12 +346,12 @@ def _evaluate(ctx: Union["OnlineRLContext", "OfflineRLContext"]): ready_obs_receive = {} ready_obs_send = {} ready_action_send = {} - trajectory = {i:[] for i in range(env.env_num)} + trajectory = {i: [] for i in range(env.env_num)} if env.closed: - ready_obs_receive=env.launch() + ready_obs_receive = env.launch() else: - ready_obs_receive=env.reset() + ready_obs_receive = env.reset() policy.reset() eval_monitor = VectorEvalMonitor(env.env_num, cfg.env.n_evaluator_episode) @@ -360,7 +360,7 @@ def _evaluate(ctx: Union["OnlineRLContext", "OfflineRLContext"]): if len(ready_obs_receive.keys()) > 0: action_to_send = policy.forward(ready_obs_receive) output = [v for v in action_to_send.values()] - + ready_obs_send.update(ready_obs_receive) ready_obs_receive = {} ready_action_send.update(action_to_send) @@ -375,9 +375,9 @@ def _evaluate(ctx: Union["OnlineRLContext", "OfflineRLContext"]): env_id_receive = info['env_id'] ready_obs_receive.update({i: next_obs[i] for i in range(len(next_obs))}) - #todo + #todo for i in range(len(env_id_receive)): - current_reward=ttorch.tensor(np.array([rew[i]])) + current_reward = ttorch.tensor(np.array([rew[i]])) trajectory[env_id_receive[i]].append( { 'obs': ttorch.tensor(ready_obs_send[env_id_receive[i]]), @@ -389,13 +389,13 @@ def _evaluate(ctx: Union["OnlineRLContext", "OfflineRLContext"]): } ) - if done[i]==True: + if done[i] == True: episode_return_i = 0.0 for item in trajectory[env_id_receive[i]]: - episode_return_i+=item['reward'][0] + episode_return_i += item['reward'][0] eval_monitor.update_reward(env_id_receive[i], episode_return_i) policy.reset([env_id_receive[i]]) - trajectory[env_id_receive[i]]=[] + trajectory[env_id_receive[i]] = [] episode_return = eval_monitor.get_episode_return() episode_return_min = np.min(episode_return) diff --git a/ding/framework/middleware/learner.py b/ding/framework/middleware/learner.py index 6cb46742bb..1bddb80bdb 100644 --- a/ding/framework/middleware/learner.py +++ b/ding/framework/middleware/learner.py @@ -14,9 +14,10 @@ from queue import Queue import time import torch.multiprocessing as mp -from threading import Thread +from threading import Thread from ding.policy.common_utils import default_preprocess_learn, fast_preprocess_learn + def data_process_func(data_queue_input, data_queue_output): while True: data = data_queue_input.get() @@ -24,16 +25,17 @@ def data_process_func(data_queue_input, data_queue_output): break else: #print("get one data") - output_data=fast_preprocess_learn( + output_data = fast_preprocess_learn( data, - use_priority=False, #policy._cfg.priority, - use_priority_IS_weight=False, #policy._cfg.priority_IS_weight, - cuda=True, #policy._cuda, - device="cuda:0", #policy._device, + use_priority=False, #policy._cfg.priority, + use_priority_IS_weight=False, #policy._cfg.priority_IS_weight, + cuda=True, #policy._cuda, + device="cuda:0", #policy._device, ) 
data_queue_output.put(output_data) #print("put one data, queue size:{}".format(data_queue_output.qsize())) + def data_process_func_v2(data_queue_input, data_queue_output): while True: if data_queue_input.empty(): @@ -44,16 +46,17 @@ def data_process_func_v2(data_queue_input, data_queue_output): break else: #print("get one data") - output_data=fast_preprocess_learn( + output_data = fast_preprocess_learn( data, - use_priority=False, #policy._cfg.priority, - use_priority_IS_weight=False, #policy._cfg.priority_IS_weight, - cuda=True, #policy._cuda, - device="cuda:0", #policy._device, + use_priority=False, #policy._cfg.priority, + use_priority_IS_weight=False, #policy._cfg.priority_IS_weight, + cuda=True, #policy._cuda, + device="cuda:0", #policy._device, ) data_queue_output.put(output_data) #print("put one data, queue size:{}".format(data_queue_output.qsize())) + class OffPolicyLearner: """ Overview: @@ -118,7 +121,6 @@ def __call__(self, ctx: "OnlineRLContext") -> None: print("time_trainer:time_fetcher={}:{}={}".format(time_trainer, time_fetcher, time_trainer / time_fetcher)) - class OffPolicyLearnerV2: """ Overview: @@ -157,7 +159,7 @@ def __init__( self._data_queue_input = Queue() self._data_queue_output = Queue() - self.thread_worker=Thread(target=data_process_func_v2, args=(self._data_queue_input, self._data_queue_output)) + self.thread_worker = Thread(target=data_process_func_v2, args=(self._data_queue_input, self._data_queue_output)) self.thread_worker.start() #self._fetcher_worker_process = mp.Process(target=data_process_func, args=(self._data_queue_input, self._data_queue_output)) @@ -177,12 +179,11 @@ def __call__(self, ctx: "OnlineRLContext") -> None: start = time.time() time_fetcher = 0.0 time_trainer = 0.0 - time_fetch_data=0.0 - time_get_data=0.0 - - train_output_queue = [] - data_counter=0 + time_fetch_data = 0.0 + time_get_data = 0.0 + train_output_queue = [] + data_counter = 0 start_fetcher = time.time() for _ in range(self.cfg.policy.learn.update_per_collect): @@ -192,7 +193,7 @@ def __call__(self, ctx: "OnlineRLContext") -> None: if ctx.train_data_sample is None: break self._data_queue_input.put(ctx.train_data_sample) - data_counter+=1 + data_counter += 1 time_fetcher += time.time() - start_fetcher start_trainer = time.time() @@ -203,7 +204,7 @@ def __call__(self, ctx: "OnlineRLContext") -> None: time.sleep(0.001) continue else: - ctx.train_data=self._data_queue_output.get() + ctx.train_data = self._data_queue_output.get() break time_get_data += time.time() - start_get_data if self._reward_estimator: diff --git a/ding/policy/common_utils.py b/ding/policy/common_utils.py index 2af6bcb8d2..7e0e1599fb 100644 --- a/ding/policy/common_utils.py +++ b/ding/policy/common_utils.py @@ -67,7 +67,7 @@ def fast_preprocess_learn( # data preprocess processes_data = {} - action=torch.tensor(np.array([data[i]['action'] for i in range(len(data))])) + action = torch.tensor(np.array([data[i]['action'] for i in range(len(data))])) if cuda: action = to_device(action, device=device) if action.ndim == 2 and action.shape[1] == 1: @@ -124,6 +124,7 @@ def fast_preprocess_learn( return processes_data + def fast_preprocess_learn_v2( data: List[Any], use_priority_IS_weight: bool = False, diff --git a/ding/policy/dqn.py b/ding/policy/dqn.py index 1d7d4bc3a6..3073d68508 100644 --- a/ding/policy/dqn.py +++ b/ding/policy/dqn.py @@ -722,15 +722,16 @@ def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: target_next_n_action = self._learn_model.forward(data['next_n_obs'])['action'] data_n = 
q_nstep_td_data( - q_value, target_next_n_q_value, data['action'], target_next_n_action, data['reward'], data['done'], data['weight'] + q_value, target_next_n_q_value, data['action'], target_next_n_action, data['reward'], data['done'], + data['weight'] ) - if self._cfg.nstep==1: - value_gamma=None + if self._cfg.nstep == 1: + value_gamma = None else: - value_gamma = data.get('value_gamma') if 'value_gamma' in data else self._cfg.discount_factor * torch.ones_like( - data['done'] - ) + value_gamma = data.get( + 'value_gamma' + ) if 'value_gamma' in data else self._cfg.discount_factor * torch.ones_like(data['done']) loss, td_error_per_sample = q_nstep_td_error(data_n, self._gamma, nstep=self._nstep, value_gamma=value_gamma) # ==================== From e6e6828d569d8d3a64560d22c8a1eddebda54d8d Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 11 Oct 2023 20:14:52 +0800 Subject: [PATCH 211/244] change offline learner --- ding/framework/middleware/learner.py | 89 ++++++++++++++++++++++++++-- 1 file changed, 84 insertions(+), 5 deletions(-) diff --git a/ding/framework/middleware/learner.py b/ding/framework/middleware/learner.py index 1bddb80bdb..459f3d8322 100644 --- a/ding/framework/middleware/learner.py +++ b/ding/framework/middleware/learner.py @@ -128,6 +128,90 @@ class OffPolicyLearnerV2: the `__call__` method to execute the whole learning process. """ + def __new__(cls, *args, **kwargs): + if task.router.is_active and not task.has_role(task.role.LEARNER): + return task.void() + return super(OffPolicyLearnerV2, cls).__new__(cls) + + def __init__( + self, + cfg: EasyDict, + policy: 'Policy', + buffer_: Union[Buffer, List[Tuple[Buffer, float]], Dict[str, Buffer]], + reward_model: Optional['BaseRewardModel'] = None, + log_freq: int = 100, + ) -> None: + """ + Arguments: + - cfg (:obj:`EasyDict`): Config. + - policy (:obj:`Policy`): The policy to be trained. + - buffer (:obj:`Buffer`): The replay buffer to store the data for training. + - reward_model (:obj:`BaseRewardModel`): Additional reward estimator likes RND, ICM, etc. \ + default to None. + - log_freq (:obj:`int`): The frequency (iteration) of showing log. + """ + self.cfg = cfg + self._fetcher = task.wrap(offpolicy_data_fetcher_v2(cfg, buffer_)) + self._trainer = task.wrap(trainer(cfg, policy.learn_mode, log_freq=log_freq)) + if reward_model is not None: + self._reward_estimator = task.wrap(reward_estimator(cfg, reward_model)) + else: + self._reward_estimator = None + + def __call__(self, ctx: "OnlineRLContext") -> None: + """ + Output of ctx: + - train_output (:obj:`Deque`): The training output in deque. 
+ """ + start = time.time() + time_fetcher = 0.0 + time_process_data = 0.0 + time_trainer = 0.0 + + train_output_queue = [] + + for _ in range(self.cfg.policy.learn.update_per_collect): + start_fetch_data = time.time() + self._fetcher(ctx) + time_fetcher += time.time() - start_fetch_data + + start_process_data = time.time() + ctx.train_data = fast_preprocess_learn( + ctx.train_data_sample, + use_priority=False, #policy._cfg.priority, + use_priority_IS_weight=False, #policy._cfg.priority_IS_weight, + cuda=True, #policy._cuda, + device="cuda:0", #policy._device, + ) + time_process_data += time.time() - start_process_data + + if self._reward_estimator: + self._reward_estimator(ctx) + + start_trainer = time.time() + self._trainer(ctx) + time_trainer += time.time() - start_trainer + + train_output_queue.append(ctx.train_output) + ctx.train_output_for_post_process = ctx.train_output + + ctx.train_output = train_output_queue + ctx.learner_time += time.time() - start + print("time_fetcher:time_trainer={}:{}={}".format(time_fetcher, time_trainer, time_fetcher / time_trainer)) + print( + "time_process_data:time_trainer={}:{}={}".format( + time_process_data, time_trainer, time_process_data / time_trainer + ) + ) + + +class OffPolicyLearnerV3: + """ + Overview: + The class of the off-policy learner, including data fetching and model training. Use \ + the `__call__` method to execute the whole learning process. + """ + def __new__(cls, *args, **kwargs): if task.router.is_active and not task.has_role(task.role.LEARNER): return task.void() @@ -153,8 +237,6 @@ def __init__( self.cfg = cfg self._fetcher = task.wrap(offpolicy_data_fetcher_v2(cfg, buffer_)) - #self._data_queue_input = mp.Queue() - #self._data_queue_output = mp.Queue() self._data_queue_input = Queue() self._data_queue_output = Queue() @@ -162,9 +244,6 @@ def __init__( self.thread_worker = Thread(target=data_process_func_v2, args=(self._data_queue_input, self._data_queue_output)) self.thread_worker.start() - #self._fetcher_worker_process = mp.Process(target=data_process_func, args=(self._data_queue_input, self._data_queue_output)) - #self._fetcher_worker_process.start() - self._trainer = task.wrap(trainer(cfg, policy.learn_mode, log_freq=log_freq)) if reward_model is not None: self._reward_estimator = task.wrap(reward_estimator(cfg, reward_model)) From 8d79f668ccf3818ae45741a1cb4dbe25c907ad31 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 12 Oct 2023 01:07:59 +0800 Subject: [PATCH 212/244] add dqn policy timer --- ding/policy/dqn.py | 84 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 77 insertions(+), 7 deletions(-) diff --git a/ding/policy/dqn.py b/ding/policy/dqn.py index 3073d68508..4de2cbd0d5 100644 --- a/ding/policy/dqn.py +++ b/ding/policy/dqn.py @@ -671,6 +671,19 @@ def _init_learn(self) -> None: self._learn_model = model_wrap(self._model, wrapper_name='argmax_sample') self._learn_model.reset() self._target_model.reset() + self.time_counter=dict( + set_model_train_time=0, + forward_q_value_time=0, + forward_target_next_time=0, + q_nstep_td_data_time=0, + get_value_gamma_time=0, + loss_time=0, + backward_time=0, + gradient_step_time=0, + target_update_time=0, + time_learn_total=0, + counter_learn=0, + ) def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: """ @@ -690,8 +703,6 @@ def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: - optional: ``action_distribution`` """ - start = time.time() - # data = fast_preprocess_learn( # data, # use_priority=self._priority, @@ -700,56 +711,115 @@ def 
_forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: # device=self._device, # ) - time_data_process = time.time() - start - start = time.time() - # if self._cuda: # for key in data.keys(): # if isinstance(data[key], torch.Tensor): # data[key] = to_device(data[key], self._device) + start_total = time.time() # ==================== # Q-learning forward # ==================== + torch.cuda.synchronize() + start = time.time() self._learn_model.train() self._target_model.train() + torch.cuda.synchronize() + set_model_train_time = time.time() - start # Current q value (main model) + start = time.time() q_value = self._learn_model.forward(data['obs'])['logit'] + torch.cuda.synchronize() + forward_q_value_time = time.time() - start + + start = time.time() # Target q value with torch.no_grad(): target_next_n_q_value = self._target_model.forward(data['next_n_obs'])['logit'] # Max q value action (main model), i.e. Double DQN target_next_n_action = self._learn_model.forward(data['next_n_obs'])['action'] + torch.cuda.synchronize() + forward_target_next_time = time.time() - start + start = time.time() data_n = q_nstep_td_data( q_value, target_next_n_q_value, data['action'], target_next_n_action, data['reward'], data['done'], data['weight'] ) + torch.cuda.synchronize() + q_nstep_td_data_time = time.time() - start + start = time.time() if self._cfg.nstep == 1: value_gamma = None else: value_gamma = data.get( 'value_gamma' ) if 'value_gamma' in data else self._cfg.discount_factor * torch.ones_like(data['done']) + torch.cuda.synchronize() + get_value_gamma_time = time.time() - start + + start = time.time() loss, td_error_per_sample = q_nstep_td_error(data_n, self._gamma, nstep=self._nstep, value_gamma=value_gamma) + torch.cuda.synchronize() + loss_time = time.time() - start # ==================== # Q-learning update # ==================== + start = time.time() self._optimizer.zero_grad() loss.backward() + torch.cuda.synchronize() + backward_time = time.time() - start if self._cfg.multi_gpu: self.sync_gradients(self._learn_model) + start = time.time() self._optimizer.step() + torch.cuda.synchronize() + gradient_step_time = time.time() - start # ============= # after update # ============= + start = time.time() self._target_model.update(self._learn_model.state_dict()) + torch.cuda.synchronize() + target_update_time = time.time() - start + + + time_learn_total = time.time() - start_total + + # print(f"set_model_train_time:time_learn={set_model_train_time}:{time_learn_total}={set_model_train_time/time_learn_total}") + # print(f"forward_q_value_time:time_learn={forward_q_value_time}:{time_learn_total}={forward_q_value_time/time_learn_total}") + # print(f"forward_target_next_time:time_learn={forward_target_next_time}:{time_learn_total}={forward_target_next_time/time_learn_total}") + # print(f"q_nstep_td_data_time:time_learn={q_nstep_td_data_time}:{time_learn_total}={q_nstep_td_data_time/time_learn_total}") + # print(f"get_value_gamma_time:time_learn={get_value_gamma_time}:{time_learn_total}={get_value_gamma_time/time_learn_total}") + # print(f"loss_time:time_learn={loss_time}:{time_learn_total}={loss_time/time_learn_total}") + # print(f"backward_time:time_learn={backward_time}:{time_learn_total}={backward_time/time_learn_total}") + # print(f"gradient_step_time:time_learn={gradient_step_time}:{time_learn_total}={gradient_step_time/time_learn_total}") + # print(f"target_update_time:time_learn={target_update_time}:{time_learn_total}={target_update_time/time_learn_total}") + 
self.time_counter['set_model_train_time'] += set_model_train_time + self.time_counter['forward_q_value_time'] += forward_q_value_time + self.time_counter['forward_target_next_time'] += forward_target_next_time + self.time_counter['q_nstep_td_data_time'] += q_nstep_td_data_time + self.time_counter['get_value_gamma_time'] += get_value_gamma_time + self.time_counter['loss_time'] += loss_time + self.time_counter['backward_time'] += backward_time + self.time_counter['gradient_step_time'] += gradient_step_time + self.time_counter['target_update_time'] += target_update_time + self.time_counter['time_learn_total'] += time_learn_total + self.time_counter['counter_learn'] += 1 + print(f"set_model_train_time:time_learn={self.time_counter['set_model_train_time']}:{self.time_counter['time_learn_total']}={self.time_counter['set_model_train_time']/self.time_counter['time_learn_total']}") + print(f"forward_q_value_time:time_learn={self.time_counter['forward_q_value_time']}:{self.time_counter['time_learn_total']}={self.time_counter['forward_q_value_time']/self.time_counter['time_learn_total']}") + print(f"forward_target_next_time:time_learn={self.time_counter['forward_target_next_time']}:{self.time_counter['time_learn_total']}={self.time_counter['forward_target_next_time']/self.time_counter['time_learn_total']}") + print(f"q_nstep_td_data_time:time_learn={self.time_counter['q_nstep_td_data_time']}:{self.time_counter['time_learn_total']}={self.time_counter['q_nstep_td_data_time']/self.time_counter['time_learn_total']}") + print(f"get_value_gamma_time:time_learn={self.time_counter['get_value_gamma_time']}:{self.time_counter['time_learn_total']}={self.time_counter['get_value_gamma_time']/self.time_counter['time_learn_total']}") + print(f"loss_time:time_learn={self.time_counter['loss_time']}:{self.time_counter['time_learn_total']}={self.time_counter['loss_time']/self.time_counter['time_learn_total']}") + print(f"backward_time:time_learn={self.time_counter['backward_time']}:{self.time_counter['time_learn_total']}={self.time_counter['backward_time']/self.time_counter['time_learn_total']}") + print(f"gradient_step_time:time_learn={self.time_counter['gradient_step_time']}:{self.time_counter['time_learn_total']}={self.time_counter['gradient_step_time']/self.time_counter['time_learn_total']}") + print(f"target_update_time:time_learn={self.time_counter['target_update_time']}:{self.time_counter['time_learn_total']}={self.time_counter['target_update_time']/self.time_counter['time_learn_total']}") - time_learn = time.time() - start - # print(f"time_data_process:time_learn={time_data_process}:{time_learn}={time_data_process/time_learn}") return { 'cur_lr': self._optimizer.defaults['lr'], From e1c137a9546531827abc7552f7c6024bfde9eeab Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 12 Oct 2023 03:21:41 +0800 Subject: [PATCH 213/244] polish code --- ding/example/dqn_envpool_wandb_new_nstep_2.py | 134 ++++++++++++++++++ ding/framework/middleware/learner.py | 12 +- ding/policy/dqn.py | 60 ++++---- 3 files changed, 170 insertions(+), 36 deletions(-) create mode 100644 ding/example/dqn_envpool_wandb_new_nstep_2.py diff --git a/ding/example/dqn_envpool_wandb_new_nstep_2.py b/ding/example/dqn_envpool_wandb_new_nstep_2.py new file mode 100644 index 0000000000..7645703e1b --- /dev/null +++ b/ding/example/dqn_envpool_wandb_new_nstep_2.py @@ -0,0 +1,134 @@ +import datetime +import torch +try: + torch.multiprocessing.set_start_method('spawn') +except RuntimeError: + pass +from easydict import EasyDict +from ditk import logging 
+from ding.model import DQN +from ding.policy import DQNFastPolicy +from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ + termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV3, OffPolicyLearnerV2 +from ding.utils import set_pkg_seed + +from dizoo.atari.config.serial import pong_dqn_envpool_config + + +def main(cfg): + logging.getLogger().setLevel(logging.INFO) + cfg.exp_name = 'Pong-v5-DQN-envpool-new-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + + collector_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.collector_env_num, + 'batch_size': cfg.env.collector_batch_size, + # env wrappers + 'episodic_life': True, # collector: True + 'reward_clip': False, # collector: True + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["collector_env_cfg"] = collector_env_cfg + evaluator_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.evaluator_env_num, + 'batch_size': cfg.env.evaluator_batch_size, + # env wrappers + 'episodic_life': False, # evaluator: False + 'reward_clip': False, # evaluator: False + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["evaluator_env_cfg"] = evaluator_env_cfg + cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) + evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) + collector_env.seed(cfg.seed) + evaluator_env.seed(cfg.seed) + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNFastPolicy(cfg.policy, model=model) + + # Consider the case with multiple processes + if task.router.is_active: + # You can use labels to distinguish between workers with different roles, + # here we use node_id to distinguish. + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + # Sync their context and model between each worker. + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + task.use(epoch_timer()) + + # Here is the part of single process pipeline. 
+ task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use( + EnvpoolStepCollectorV3( + cfg, + policy.collect_mode, + collector_env, + random_collect_size=cfg.policy.random_collect_size \ + if hasattr(cfg.policy, 'random_collect_size') else 0, + ) + ) + task.use(data_pusher(cfg, buffer_)) + #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(OffPolicyLearnerV2(cfg, policy, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use( + wandb_online_logger( + metric_list=policy._monitor_vars_learn(), + model=policy._model, + exp_config=cfg, + anonymous=True, + project_name=cfg.exp_name, + wandb_sweep=False, + ) + ) + + #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) + task.use(termination_checker(max_env_step=10000000)) + + task.run() + + +if __name__ == "__main__": + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=0, help="random seed") + parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") + parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") + arg = parser.parse_args() + + pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num + pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size + pong_dqn_envpool_config.seed = arg.seed + pong_dqn_envpool_config.env.stop_value = 2000 + pong_dqn_envpool_config.nstep = 3 + pong_dqn_envpool_config.policy.nstep = 3 + pong_dqn_envpool_config.seed = arg.seed + + main(pong_dqn_envpool_config) diff --git a/ding/framework/middleware/learner.py b/ding/framework/middleware/learner.py index 459f3d8322..1935eb33eb 100644 --- a/ding/framework/middleware/learner.py +++ b/ding/framework/middleware/learner.py @@ -197,12 +197,12 @@ def __call__(self, ctx: "OnlineRLContext") -> None: ctx.train_output = train_output_queue ctx.learner_time += time.time() - start - print("time_fetcher:time_trainer={}:{}={}".format(time_fetcher, time_trainer, time_fetcher / time_trainer)) - print( - "time_process_data:time_trainer={}:{}={}".format( - time_process_data, time_trainer, time_process_data / time_trainer - ) - ) + #print("time_fetcher:time_trainer={}:{}={}".format(time_fetcher, time_trainer, time_fetcher / time_trainer)) + #print( + # "time_process_data:time_trainer={}:{}={}".format( + # time_process_data, time_trainer, time_process_data / time_trainer + # ) + #) class OffPolicyLearnerV3: diff --git a/ding/policy/dqn.py b/ding/policy/dqn.py index 4de2cbd0d5..08dd18406e 100644 --- a/ding/policy/dqn.py +++ b/ding/policy/dqn.py @@ -720,16 +720,16 @@ def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: # ==================== # Q-learning forward # ==================== - torch.cuda.synchronize() + #torch.cuda.synchronize() start = time.time() self._learn_model.train() self._target_model.train() - torch.cuda.synchronize() + #torch.cuda.synchronize() set_model_train_time = time.time() - start # Current q value (main model) start = time.time() q_value = self._learn_model.forward(data['obs'])['logit'] - torch.cuda.synchronize() + #torch.cuda.synchronize() forward_q_value_time = time.time() - start start = time.time() @@ -738,7 +738,7 @@ def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: target_next_n_q_value = self._target_model.forward(data['next_n_obs'])['logit'] # Max q value action (main model), i.e. 
Double DQN target_next_n_action = self._learn_model.forward(data['next_n_obs'])['action'] - torch.cuda.synchronize() + #torch.cuda.synchronize() forward_target_next_time = time.time() - start start = time.time() @@ -746,7 +746,7 @@ def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: q_value, target_next_n_q_value, data['action'], target_next_n_action, data['reward'], data['done'], data['weight'] ) - torch.cuda.synchronize() + #torch.cuda.synchronize() q_nstep_td_data_time = time.time() - start start = time.time() @@ -756,12 +756,12 @@ def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: value_gamma = data.get( 'value_gamma' ) if 'value_gamma' in data else self._cfg.discount_factor * torch.ones_like(data['done']) - torch.cuda.synchronize() + #torch.cuda.synchronize() get_value_gamma_time = time.time() - start start = time.time() loss, td_error_per_sample = q_nstep_td_error(data_n, self._gamma, nstep=self._nstep, value_gamma=value_gamma) - torch.cuda.synchronize() + #torch.cuda.synchronize() loss_time = time.time() - start # ==================== @@ -770,13 +770,13 @@ def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: start = time.time() self._optimizer.zero_grad() loss.backward() - torch.cuda.synchronize() + #torch.cuda.synchronize() backward_time = time.time() - start if self._cfg.multi_gpu: self.sync_gradients(self._learn_model) start = time.time() self._optimizer.step() - torch.cuda.synchronize() + #torch.cuda.synchronize() gradient_step_time = time.time() - start # ============= @@ -784,7 +784,7 @@ def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: # ============= start = time.time() self._target_model.update(self._learn_model.state_dict()) - torch.cuda.synchronize() + #torch.cuda.synchronize() target_update_time = time.time() - start @@ -799,26 +799,26 @@ def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: # print(f"backward_time:time_learn={backward_time}:{time_learn_total}={backward_time/time_learn_total}") # print(f"gradient_step_time:time_learn={gradient_step_time}:{time_learn_total}={gradient_step_time/time_learn_total}") # print(f"target_update_time:time_learn={target_update_time}:{time_learn_total}={target_update_time/time_learn_total}") - self.time_counter['set_model_train_time'] += set_model_train_time - self.time_counter['forward_q_value_time'] += forward_q_value_time - self.time_counter['forward_target_next_time'] += forward_target_next_time - self.time_counter['q_nstep_td_data_time'] += q_nstep_td_data_time - self.time_counter['get_value_gamma_time'] += get_value_gamma_time - self.time_counter['loss_time'] += loss_time - self.time_counter['backward_time'] += backward_time - self.time_counter['gradient_step_time'] += gradient_step_time - self.time_counter['target_update_time'] += target_update_time - self.time_counter['time_learn_total'] += time_learn_total - self.time_counter['counter_learn'] += 1 - print(f"set_model_train_time:time_learn={self.time_counter['set_model_train_time']}:{self.time_counter['time_learn_total']}={self.time_counter['set_model_train_time']/self.time_counter['time_learn_total']}") - print(f"forward_q_value_time:time_learn={self.time_counter['forward_q_value_time']}:{self.time_counter['time_learn_total']}={self.time_counter['forward_q_value_time']/self.time_counter['time_learn_total']}") - 
print(f"forward_target_next_time:time_learn={self.time_counter['forward_target_next_time']}:{self.time_counter['time_learn_total']}={self.time_counter['forward_target_next_time']/self.time_counter['time_learn_total']}") - print(f"q_nstep_td_data_time:time_learn={self.time_counter['q_nstep_td_data_time']}:{self.time_counter['time_learn_total']}={self.time_counter['q_nstep_td_data_time']/self.time_counter['time_learn_total']}") - print(f"get_value_gamma_time:time_learn={self.time_counter['get_value_gamma_time']}:{self.time_counter['time_learn_total']}={self.time_counter['get_value_gamma_time']/self.time_counter['time_learn_total']}") - print(f"loss_time:time_learn={self.time_counter['loss_time']}:{self.time_counter['time_learn_total']}={self.time_counter['loss_time']/self.time_counter['time_learn_total']}") - print(f"backward_time:time_learn={self.time_counter['backward_time']}:{self.time_counter['time_learn_total']}={self.time_counter['backward_time']/self.time_counter['time_learn_total']}") - print(f"gradient_step_time:time_learn={self.time_counter['gradient_step_time']}:{self.time_counter['time_learn_total']}={self.time_counter['gradient_step_time']/self.time_counter['time_learn_total']}") - print(f"target_update_time:time_learn={self.time_counter['target_update_time']}:{self.time_counter['time_learn_total']}={self.time_counter['target_update_time']/self.time_counter['time_learn_total']}") + # self.time_counter['set_model_train_time'] += set_model_train_time + # self.time_counter['forward_q_value_time'] += forward_q_value_time + # self.time_counter['forward_target_next_time'] += forward_target_next_time + # self.time_counter['q_nstep_td_data_time'] += q_nstep_td_data_time + # self.time_counter['get_value_gamma_time'] += get_value_gamma_time + # self.time_counter['loss_time'] += loss_time + # self.time_counter['backward_time'] += backward_time + # self.time_counter['gradient_step_time'] += gradient_step_time + # self.time_counter['target_update_time'] += target_update_time + # self.time_counter['time_learn_total'] += time_learn_total + # self.time_counter['counter_learn'] += 1 + # print(f"set_model_train_time:time_learn={self.time_counter['set_model_train_time']}:{self.time_counter['time_learn_total']}={self.time_counter['set_model_train_time']/self.time_counter['time_learn_total']}") + # print(f"forward_q_value_time:time_learn={self.time_counter['forward_q_value_time']}:{self.time_counter['time_learn_total']}={self.time_counter['forward_q_value_time']/self.time_counter['time_learn_total']}") + # print(f"forward_target_next_time:time_learn={self.time_counter['forward_target_next_time']}:{self.time_counter['time_learn_total']}={self.time_counter['forward_target_next_time']/self.time_counter['time_learn_total']}") + # print(f"q_nstep_td_data_time:time_learn={self.time_counter['q_nstep_td_data_time']}:{self.time_counter['time_learn_total']}={self.time_counter['q_nstep_td_data_time']/self.time_counter['time_learn_total']}") + # print(f"get_value_gamma_time:time_learn={self.time_counter['get_value_gamma_time']}:{self.time_counter['time_learn_total']}={self.time_counter['get_value_gamma_time']/self.time_counter['time_learn_total']}") + # print(f"loss_time:time_learn={self.time_counter['loss_time']}:{self.time_counter['time_learn_total']}={self.time_counter['loss_time']/self.time_counter['time_learn_total']}") + # 
print(f"backward_time:time_learn={self.time_counter['backward_time']}:{self.time_counter['time_learn_total']}={self.time_counter['backward_time']/self.time_counter['time_learn_total']}") + # print(f"gradient_step_time:time_learn={self.time_counter['gradient_step_time']}:{self.time_counter['time_learn_total']}={self.time_counter['gradient_step_time']/self.time_counter['time_learn_total']}") + # print(f"target_update_time:time_learn={self.time_counter['target_update_time']}:{self.time_counter['time_learn_total']}={self.time_counter['target_update_time']/self.time_counter['time_learn_total']}") return { From ab330011c36fa15230c54e8efdae54f47b3cfb42 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 12 Oct 2023 11:59:08 +0800 Subject: [PATCH 214/244] polish code --- ding/example/dqn_envpool_wandb_new_nstep_2.py | 6 +++--- ding/framework/middleware/__init__.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_2.py b/ding/example/dqn_envpool_wandb_new_nstep_2.py index 7645703e1b..8e35e9a960 100644 --- a/ding/example/dqn_envpool_wandb_new_nstep_2.py +++ b/ding/example/dqn_envpool_wandb_new_nstep_2.py @@ -15,7 +15,7 @@ from ding.framework.context import OnlineRLContext from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV3, OffPolicyLearnerV2 + termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV2, OffPolicyLearnerV3 from ding.utils import set_pkg_seed from dizoo.atari.config.serial import pong_dqn_envpool_config @@ -85,7 +85,7 @@ def main(cfg): task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) task.use(eps_greedy_handler(cfg)) task.use( - EnvpoolStepCollectorV3( + EnvpoolStepCollectorV2( cfg, policy.collect_mode, collector_env, @@ -95,7 +95,7 @@ def main(cfg): ) task.use(data_pusher(cfg, buffer_)) #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(OffPolicyLearnerV2(cfg, policy, buffer_)) + task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) task.use(online_logger(train_show_freq=10)) task.use( wandb_online_logger( diff --git a/ding/framework/middleware/__init__.py b/ding/framework/middleware/__init__.py index 7936e0477b..87f6129fa6 100644 --- a/ding/framework/middleware/__init__.py +++ b/ding/framework/middleware/__init__.py @@ -1,6 +1,6 @@ from .functional import * from .collector import StepCollector, EpisodeCollector, PPOFStepCollector, EnvpoolStepCollector, EnvpoolStepCollectorV2 -from .learner import OffPolicyLearner, HERLearner, OffPolicyLearnerV2 +from .learner import OffPolicyLearner, HERLearner, OffPolicyLearnerV2, OffPolicyLearnerV3 from .ckpt_handler import CkptSaver from .distributer import ContextExchanger, ModelExchanger, PeriodicalModelExchanger from .barrier import Barrier, BarrierRuntime From 340d50ee4e52e2d4b24cf79866a32f7a05c7f4f0 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 12 Oct 2023 12:09:24 +0800 Subject: [PATCH 215/244] polish code --- ding/framework/middleware/__init__.py | 2 +- ding/framework/middleware/learner.py | 92 ++++++++++++++++++++++++++- 2 files changed, 92 insertions(+), 2 deletions(-) diff --git a/ding/framework/middleware/__init__.py b/ding/framework/middleware/__init__.py index 87f6129fa6..bae9446ac9 100644 --- a/ding/framework/middleware/__init__.py +++ b/ding/framework/middleware/__init__.py @@ 
-1,6 +1,6 @@ from .functional import * from .collector import StepCollector, EpisodeCollector, PPOFStepCollector, EnvpoolStepCollector, EnvpoolStepCollectorV2 -from .learner import OffPolicyLearner, HERLearner, OffPolicyLearnerV2, OffPolicyLearnerV3 +from .learner import OffPolicyLearner, HERLearner, OffPolicyLearnerV2, OffPolicyLearnerV3, OffPolicyLearnerV4 from .ckpt_handler import CkptSaver from .distributer import ContextExchanger, ModelExchanger, PeriodicalModelExchanger from .barrier import Barrier, BarrierRuntime diff --git a/ding/framework/middleware/learner.py b/ding/framework/middleware/learner.py index 1935eb33eb..768a9458eb 100644 --- a/ding/framework/middleware/learner.py +++ b/ding/framework/middleware/learner.py @@ -215,7 +215,7 @@ class OffPolicyLearnerV3: def __new__(cls, *args, **kwargs): if task.router.is_active and not task.has_role(task.role.LEARNER): return task.void() - return super(OffPolicyLearnerV2, cls).__new__(cls) + return super(OffPolicyLearnerV3, cls).__new__(cls) def __init__( self, @@ -301,6 +301,96 @@ def __call__(self, ctx: "OnlineRLContext") -> None: #print("time_trainer:time_fetcher={}:{}={}".format(time_trainer, time_fetcher, time_trainer / time_fetcher)) +class OffPolicyLearnerV4: + """ + Overview: + The class of the off-policy learner, including data fetching and model training. Use \ + the `__call__` method to execute the whole learning process. + """ + + def __new__(cls, *args, **kwargs): + if task.router.is_active and not task.has_role(task.role.LEARNER): + return task.void() + return super(OffPolicyLearnerV4, cls).__new__(cls) + + def __init__( + self, + cfg: EasyDict, + policy: 'Policy', + buffer_: Union[Buffer, List[Tuple[Buffer, float]], Dict[str, Buffer]], + reward_model: Optional['BaseRewardModel'] = None, + log_freq: int = 100, + ) -> None: + """ + Arguments: + - cfg (:obj:`EasyDict`): Config. + - policy (:obj:`Policy`): The policy to be trained. + - buffer (:obj:`Buffer`): The replay buffer to store the data for training. + - reward_model (:obj:`BaseRewardModel`): Additional reward estimator likes RND, ICM, etc. \ + default to None. + - log_freq (:obj:`int`): The frequency (iteration) of showing log. + """ + self.cfg = cfg + self._fetcher = task.wrap(offpolicy_data_fetcher_v2(cfg, buffer_)) + self._trainer = task.wrap(trainer(cfg, policy.learn_mode, log_freq=log_freq)) + if reward_model is not None: + self._reward_estimator = task.wrap(reward_estimator(cfg, reward_model)) + else: + self._reward_estimator = None + + def __call__(self, ctx: "OnlineRLContext") -> None: + """ + Output of ctx: + - train_output (:obj:`Deque`): The training output in deque. 
+ """ + start = time.time() + time_fetcher = 0.0 + time_process_data = 0.0 + time_trainer = 0.0 + + train_output_queue = [] + train_data_processed = [] + + for _ in range(self.cfg.policy.learn.update_per_collect): + start_fetch_data = time.time() + self._fetcher(ctx) + time_fetcher += time.time() - start_fetch_data + + start_process_data = time.time() + train_data = fast_preprocess_learn( + ctx.train_data_sample, + use_priority=False, #policy._cfg.priority, + use_priority_IS_weight=False, #policy._cfg.priority_IS_weight, + cuda=True, #policy._cuda, + device="cuda:0", #policy._device, + ) + time_process_data += time.time() - start_process_data + + train_data_processed.put(train_data) + + if self._reward_estimator: + self._reward_estimator(ctx) + + for _ in range(self.cfg.policy.learn.update_per_collect): + + start_trainer = time.time() + ctx.train_data = train_data_processed.get() + self._trainer(ctx) + time_trainer += time.time() - start_trainer + + train_output_queue.append(ctx.train_output) + ctx.train_output_for_post_process = ctx.train_output + + ctx.train_output = train_output_queue + ctx.learner_time += time.time() - start + #print("time_fetcher:time_trainer={}:{}={}".format(time_fetcher, time_trainer, time_fetcher / time_trainer)) + #print( + # "time_process_data:time_trainer={}:{}={}".format( + # time_process_data, time_trainer, time_process_data / time_trainer + # ) + #) + + class HERLearner: """ Overview: From e5af07863e780c031456e0e1fa9ea292933b49e1 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 12 Oct 2023 12:17:23 +0800 Subject: [PATCH 216/244] polish code --- ding/example/dqn_envpool_wandb_new_nstep_3.py | 134 ++++++++++++++++++ ding/framework/middleware/learner.py | 2 +- 2 files changed, 135 insertions(+), 1 deletion(-) create mode 100644 ding/example/dqn_envpool_wandb_new_nstep_3.py diff --git a/ding/example/dqn_envpool_wandb_new_nstep_3.py b/ding/example/dqn_envpool_wandb_new_nstep_3.py new file mode 100644 index 0000000000..660e5d8a80 --- /dev/null +++ b/ding/example/dqn_envpool_wandb_new_nstep_3.py @@ -0,0 +1,134 @@ +import datetime +import torch +try: + torch.multiprocessing.set_start_method('spawn') +except RuntimeError: + pass +from easydict import EasyDict +from ditk import logging +from ding.model import DQN +from ding.policy import DQNFastPolicy +from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ + termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV2, OffPolicyLearnerV3, OffPolicyLearnerV4 +from ding.utils import set_pkg_seed + +from dizoo.atari.config.serial import pong_dqn_envpool_config + + +def main(cfg): + logging.getLogger().setLevel(logging.INFO) + cfg.exp_name = 'Test-Pong-v5-DQN-envpool-new-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + + collector_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.collector_env_num, + 'batch_size': cfg.env.collector_batch_size, + # env wrappers + 'episodic_life': True, # collector: True + 'reward_clip': False, # collector: True + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["collector_env_cfg"] = 
collector_env_cfg + evaluator_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.evaluator_env_num, + 'batch_size': cfg.env.evaluator_batch_size, + # env wrappers + 'episodic_life': False, # evaluator: False + 'reward_clip': False, # evaluator: False + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["evaluator_env_cfg"] = evaluator_env_cfg + cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) + evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) + collector_env.seed(cfg.seed) + evaluator_env.seed(cfg.seed) + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNFastPolicy(cfg.policy, model=model) + + # Consider the case with multiple processes + if task.router.is_active: + # You can use labels to distinguish between workers with different roles, + # here we use node_id to distinguish. + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + # Sync their context and model between each worker. + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + task.use(epoch_timer()) + + # Here is the part of single process pipeline. + task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use( + EnvpoolStepCollectorV2( + cfg, + policy.collect_mode, + collector_env, + random_collect_size=cfg.policy.random_collect_size \ + if hasattr(cfg.policy, 'random_collect_size') else 0, + ) + ) + task.use(data_pusher(cfg, buffer_)) + #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(OffPolicyLearnerV4(cfg, policy, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use( + wandb_online_logger( + metric_list=policy._monitor_vars_learn(), + model=policy._model, + exp_config=cfg, + anonymous=True, + project_name=cfg.exp_name, + wandb_sweep=False, + ) + ) + + #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) + task.use(termination_checker(max_env_step=10000000)) + + task.run() + + +if __name__ == "__main__": + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=0, help="random seed") + parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") + parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") + arg = parser.parse_args() + + pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num + pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size + pong_dqn_envpool_config.seed = arg.seed + pong_dqn_envpool_config.env.stop_value = 2000 + pong_dqn_envpool_config.nstep = 3 + pong_dqn_envpool_config.policy.nstep = 3 + pong_dqn_envpool_config.seed = arg.seed + + main(pong_dqn_envpool_config) diff --git a/ding/framework/middleware/learner.py b/ding/framework/middleware/learner.py index 768a9458eb..4aeb7eced6 100644 --- a/ding/framework/middleware/learner.py +++ b/ding/framework/middleware/learner.py @@ -349,7 +349,7 @@ def __call__(self, ctx: "OnlineRLContext") -> None: time_trainer = 0.0 train_output_queue = [] - train_data_processed 
= [] + train_data_processed = Queue() for _ in range(self.cfg.policy.learn.update_per_collect): start_fetch_data = time.time() From 20cbef12db8684666f62838659f6abd97180da85 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 12 Oct 2023 14:17:49 +0800 Subject: [PATCH 217/244] add shrink model --- ...dqn_envpool_wandb_new_nstep_shrink_size.py | 139 ++++++++++++++++++ ding/model/common/head.py | 58 ++++---- dizoo/atari/config/serial/pong/__init__.py | 1 + .../pong_dqn_envpool_shink_model_config.py | 64 ++++++++ 4 files changed, 236 insertions(+), 26 deletions(-) create mode 100644 ding/example/dqn_envpool_wandb_new_nstep_shrink_size.py create mode 100644 dizoo/atari/config/serial/pong/pong_dqn_envpool_shink_model_config.py diff --git a/ding/example/dqn_envpool_wandb_new_nstep_shrink_size.py b/ding/example/dqn_envpool_wandb_new_nstep_shrink_size.py new file mode 100644 index 0000000000..64ed0e2c86 --- /dev/null +++ b/ding/example/dqn_envpool_wandb_new_nstep_shrink_size.py @@ -0,0 +1,139 @@ +import datetime +import torch +try: + torch.multiprocessing.set_start_method('spawn') +except RuntimeError: + pass +from easydict import EasyDict +import torch +from torch import nn +from ditk import logging +from ding.model import DQN +from ding.policy import DQNFastPolicy +from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ + termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV2, OffPolicyLearnerV3 +from ding.utils import set_pkg_seed + +from dizoo.atari.config.serial import pong_dqn_envpool_shink_model_config + + +def main(cfg): + logging.getLogger().setLevel(logging.INFO) + cfg.exp_name = 'Pong-v5-DQN-envpool-shrink-model-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + + collector_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.collector_env_num, + 'batch_size': cfg.env.collector_batch_size, + # env wrappers + 'episodic_life': True, # collector: True + 'reward_clip': False, # collector: True + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["collector_env_cfg"] = collector_env_cfg + evaluator_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.evaluator_env_num, + 'batch_size': cfg.env.evaluator_batch_size, + # env wrappers + 'episodic_life': False, # evaluator: False + 'reward_clip': False, # evaluator: False + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["evaluator_env_cfg"] = evaluator_env_cfg + cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) + evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) + collector_env.seed(cfg.seed) + evaluator_env.seed(cfg.seed) + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + cfg.policy.model['activation'] = nn.ReLU(inplace=True) + cfg.policy.model['head_layer_num'] = 0 + + model = DQN(**cfg.policy.model) + buffer_ = 
DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNFastPolicy(cfg.policy, model=model) + + # Consider the case with multiple processes + if task.router.is_active: + # You can use labels to distinguish between workers with different roles, + # here we use node_id to distinguish. + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + # Sync their context and model between each worker. + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + task.use(epoch_timer()) + + # Here is the part of single process pipeline. + task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use( + EnvpoolStepCollectorV2( + cfg, + policy.collect_mode, + collector_env, + random_collect_size=cfg.policy.random_collect_size \ + if hasattr(cfg.policy, 'random_collect_size') else 0, + ) + ) + task.use(data_pusher(cfg, buffer_)) + #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use( + wandb_online_logger( + metric_list=policy._monitor_vars_learn(), + model=policy._model, + exp_config=cfg, + anonymous=True, + project_name=cfg.exp_name, + wandb_sweep=False, + ) + ) + + #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) + task.use(termination_checker(max_env_step=10000000)) + + task.run() + + +if __name__ == "__main__": + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=0, help="random seed") + parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") + parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") + arg = parser.parse_args() + + pong_dqn_envpool_shink_model_config.env.collector_env_num = arg.collector_env_num + pong_dqn_envpool_shink_model_config.env.collector_batch_size = arg.collector_batch_size + pong_dqn_envpool_shink_model_config.seed = arg.seed + pong_dqn_envpool_shink_model_config.env.stop_value = 2000 + pong_dqn_envpool_shink_model_config.nstep = 3 + pong_dqn_envpool_shink_model_config.policy.nstep = 3 + pong_dqn_envpool_shink_model_config.seed = arg.seed + + main(pong_dqn_envpool_shink_model_config) diff --git a/ding/model/common/head.py b/ding/model/common/head.py index c1d27fba89..7affd8d402 100755 --- a/ding/model/common/head.py +++ b/ding/model/common/head.py @@ -819,32 +819,38 @@ def __init__( v_layer_num = layer_num layer = NoiseLinearLayer if noise else nn.Linear block = noise_block if noise else fc_block - self.A = nn.Sequential( - MLP( - hidden_size, - hidden_size, - hidden_size, - a_layer_num, - layer_fn=layer, - activation=activation, - use_dropout=dropout is not None, - dropout_probability=dropout, - norm_type=norm_type - ), block(hidden_size, output_size) - ) - self.V = nn.Sequential( - MLP( - hidden_size, - hidden_size, - hidden_size, - v_layer_num, - layer_fn=layer, - activation=activation, - use_dropout=dropout is not None, - dropout_probability=dropout, - norm_type=norm_type - ), block(hidden_size, 1) - ) + if a_layer_num>0: + self.A = nn.Sequential( + MLP( + hidden_size, + hidden_size, + hidden_size, + a_layer_num, + layer_fn=layer, + activation=activation, + use_dropout=dropout is not None, + dropout_probability=dropout, + norm_type=norm_type + ), block(hidden_size, output_size) + ) + else: + 
self.A = block(hidden_size, output_size) + if v_layer_num>0: + self.V = nn.Sequential( + MLP( + hidden_size, + hidden_size, + hidden_size, + v_layer_num, + layer_fn=layer, + activation=activation, + use_dropout=dropout is not None, + dropout_probability=dropout, + norm_type=norm_type + ), block(hidden_size, 1) + ) + else: + self.V = block(hidden_size, 1) def forward(self, x: torch.Tensor) -> Dict: """ diff --git a/dizoo/atari/config/serial/pong/__init__.py b/dizoo/atari/config/serial/pong/__init__.py index 5ce3db9a5b..ab7e8af037 100644 --- a/dizoo/atari/config/serial/pong/__init__.py +++ b/dizoo/atari/config/serial/pong/__init__.py @@ -1,3 +1,4 @@ from .pong_dqn_config import pong_dqn_config, pong_dqn_create_config from .pong_dqn_envpool_config import pong_dqn_envpool_config, pong_dqn_envpool_create_config +from .pong_dqn_envpool_shink_model_config import pong_dqn_envpool_shink_model_config from .pong_dqfd_config import pong_dqfd_config, pong_dqfd_create_config \ No newline at end of file diff --git a/dizoo/atari/config/serial/pong/pong_dqn_envpool_shink_model_config.py b/dizoo/atari/config/serial/pong/pong_dqn_envpool_shink_model_config.py new file mode 100644 index 0000000000..b9980ca298 --- /dev/null +++ b/dizoo/atari/config/serial/pong/pong_dqn_envpool_shink_model_config.py @@ -0,0 +1,64 @@ +from easydict import EasyDict + +pong_dqn_envpool_shink_model_config = dict( + exp_name='pong_dqn_envpool_shink_model_seed0', + env=dict( + collector_env_num=8, + collector_batch_size=8, + evaluator_env_num=8, + evaluator_batch_size=8, + n_evaluator_episode=8, + stop_value=20, + env_id='Pong-v5', + #'ALE/Pong-v5' is available. But special setting is needed after gym make. + frame_stack=4, + ), + policy=dict( + cuda=True, + priority=False, + random_collect_size=50000, + model=dict( + obs_shape=[4, 84, 84], + action_shape=6, + encoder_hidden_size_list=[32, 32, 64], + # encoder_hidden_size_list=[128, 128, 512], + ), + nstep=3, + discount_factor=0.99, + learn=dict( + update_per_collect=10, + batch_size=32, + learning_rate=0.0001, + target_update_freq=500, + ), + collect=dict(n_sample=96, ), + eval=dict(evaluator=dict(eval_freq=4000, )), + other=dict( + eps=dict( + type='exp', + start=1., + end=0.05, + decay=250000, + ), + replay_buffer=dict(replay_buffer_size=100000, ), + ), + ), +) +pong_dqn_envpool_shink_model_config = EasyDict(pong_dqn_envpool_shink_model_config) +main_config = pong_dqn_envpool_shink_model_config +pong_dqn_envpool_create_config = dict( + env=dict( + type='atari', + import_names=['dizoo.atari.envs.atari_env'], + ), + env_manager=dict(type='env_pool'), + policy=dict(type='dqn'), + replay_buffer=dict(type='deque'), +) +pong_dqn_envpool_create_config = EasyDict(pong_dqn_envpool_create_config) +create_config = pong_dqn_envpool_create_config + +if __name__ == '__main__': + # or you can enter `ding -m serial -c pong_dqn_envpool_config.py -s 0` + from ding.entry import serial_pipeline + serial_pipeline((main_config, create_config), seed=0) From 532b5b8af925138c4539f16d7e942680deab2684 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 12 Oct 2023 20:12:25 +0800 Subject: [PATCH 218/244] add large batch --- ...dqn_envpool_wandb_new_nstep_large_batch.py | 138 ++++++++++++++++++ dizoo/atari/config/serial/pong/__init__.py | 1 + .../pong/pong_dqn_envpool_large_batch_config | 63 ++++++++ 3 files changed, 202 insertions(+) create mode 100644 ding/example/dqn_envpool_wandb_new_nstep_large_batch.py create mode 100644 dizoo/atari/config/serial/pong/pong_dqn_envpool_large_batch_config diff --git 
a/ding/example/dqn_envpool_wandb_new_nstep_large_batch.py b/ding/example/dqn_envpool_wandb_new_nstep_large_batch.py new file mode 100644 index 0000000000..6eef6fcb5b --- /dev/null +++ b/ding/example/dqn_envpool_wandb_new_nstep_large_batch.py @@ -0,0 +1,138 @@ +import datetime +import torch +try: + torch.multiprocessing.set_start_method('spawn') +except RuntimeError: + pass +from easydict import EasyDict +import torch +from torch import nn +from ditk import logging +from ding.model import DQN +from ding.policy import DQNFastPolicy +from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ + termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV2, OffPolicyLearnerV3 +from ding.utils import set_pkg_seed + +from dizoo.atari.config.serial import pong_dqn_envpool_large_batch_config + + +def main(cfg): + logging.getLogger().setLevel(logging.INFO) + cfg.exp_name = 'Pong-v5-DQN-envpool-large_batch-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + + collector_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.collector_env_num, + 'batch_size': cfg.env.collector_batch_size, + # env wrappers + 'episodic_life': True, # collector: True + 'reward_clip': False, # collector: True + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["collector_env_cfg"] = collector_env_cfg + evaluator_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.evaluator_env_num, + 'batch_size': cfg.env.evaluator_batch_size, + # env wrappers + 'episodic_life': False, # evaluator: False + 'reward_clip': False, # evaluator: False + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["evaluator_env_cfg"] = evaluator_env_cfg + cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) + evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) + collector_env.seed(cfg.seed) + evaluator_env.seed(cfg.seed) + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + cfg.policy.model['activation'] = nn.ReLU(inplace=True) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNFastPolicy(cfg.policy, model=model) + + # Consider the case with multiple processes + if task.router.is_active: + # You can use labels to distinguish between workers with different roles, + # here we use node_id to distinguish. + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + # Sync their context and model between each worker. + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + task.use(epoch_timer()) + + # Here is the part of single process pipeline. 
+ task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use( + EnvpoolStepCollectorV2( + cfg, + policy.collect_mode, + collector_env, + random_collect_size=cfg.policy.random_collect_size \ + if hasattr(cfg.policy, 'random_collect_size') else 0, + ) + ) + task.use(data_pusher(cfg, buffer_)) + #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use( + wandb_online_logger( + metric_list=policy._monitor_vars_learn(), + model=policy._model, + exp_config=cfg, + anonymous=True, + project_name=cfg.exp_name, + wandb_sweep=False, + ) + ) + + #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) + task.use(termination_checker(max_env_step=10000000)) + + task.run() + + +if __name__ == "__main__": + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=0, help="random seed") + parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") + parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") + arg = parser.parse_args() + + pong_dqn_envpool_large_batch_config.env.collector_env_num = arg.collector_env_num + pong_dqn_envpool_large_batch_config.env.collector_batch_size = arg.collector_batch_size + pong_dqn_envpool_large_batch_config.seed = arg.seed + pong_dqn_envpool_large_batch_config.env.stop_value = 2000 + pong_dqn_envpool_large_batch_config.nstep = 3 + pong_dqn_envpool_large_batch_config.policy.nstep = 3 + pong_dqn_envpool_large_batch_config.seed = arg.seed + + main(pong_dqn_envpool_large_batch_config) diff --git a/dizoo/atari/config/serial/pong/__init__.py b/dizoo/atari/config/serial/pong/__init__.py index ab7e8af037..c237b8b762 100644 --- a/dizoo/atari/config/serial/pong/__init__.py +++ b/dizoo/atari/config/serial/pong/__init__.py @@ -1,4 +1,5 @@ from .pong_dqn_config import pong_dqn_config, pong_dqn_create_config from .pong_dqn_envpool_config import pong_dqn_envpool_config, pong_dqn_envpool_create_config from .pong_dqn_envpool_shink_model_config import pong_dqn_envpool_shink_model_config +from .pong_dqn_envpool_large_batch_config import pong_dqn_envpool_large_batch_config from .pong_dqfd_config import pong_dqfd_config, pong_dqfd_create_config \ No newline at end of file diff --git a/dizoo/atari/config/serial/pong/pong_dqn_envpool_large_batch_config b/dizoo/atari/config/serial/pong/pong_dqn_envpool_large_batch_config new file mode 100644 index 0000000000..d64ae19819 --- /dev/null +++ b/dizoo/atari/config/serial/pong/pong_dqn_envpool_large_batch_config @@ -0,0 +1,63 @@ +from easydict import EasyDict + +pong_dqn_envpool_large_batch_config = dict( + exp_name='pong_dqn_envpool_large_batch_seed0', + env=dict( + collector_env_num=8, + collector_batch_size=8, + evaluator_env_num=8, + evaluator_batch_size=8, + n_evaluator_episode=8, + stop_value=20, + env_id='Pong-v5', + #'ALE/Pong-v5' is available. But special setting is needed after gym make. 
+ frame_stack=4, + ), + policy=dict( + cuda=True, + priority=False, + random_collect_size=50000, + model=dict( + obs_shape=[4, 84, 84], + action_shape=6, + encoder_hidden_size_list=[128, 128, 512], + ), + nstep=3, + discount_factor=0.99, + learn=dict( + update_per_collect=3, + batch_size=64, + learning_rate=0.0003, + target_update_freq=500, + ), + collect=dict(n_sample=96, ), + eval=dict(evaluator=dict(eval_freq=4000, )), + other=dict( + eps=dict( + type='exp', + start=1., + end=0.05, + decay=250000, + ), + replay_buffer=dict(replay_buffer_size=100000, ), + ), + ), +) +pong_dqn_envpool_large_batch_config = EasyDict(pong_dqn_envpool_large_batch_config) +main_config = pong_dqn_envpool_large_batch_config +pong_dqn_envpool_create_config = dict( + env=dict( + type='atari', + import_names=['dizoo.atari.envs.atari_env'], + ), + env_manager=dict(type='env_pool'), + policy=dict(type='dqn'), + replay_buffer=dict(type='deque'), +) +pong_dqn_envpool_create_config = EasyDict(pong_dqn_envpool_create_config) +create_config = pong_dqn_envpool_create_config + +if __name__ == '__main__': + # or you can enter `ding -m serial -c pong_dqn_envpool_config.py -s 0` + from ding.entry import serial_pipeline + serial_pipeline((main_config, create_config), seed=0) From b31a7ca881b704ee51fdfad17c11443d102f7080 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 12 Oct 2023 23:29:14 +0800 Subject: [PATCH 219/244] add large batch --- ..._large_batch_config => pong_dqn_envpool_large_batch_config.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename dizoo/atari/config/serial/pong/{pong_dqn_envpool_large_batch_config => pong_dqn_envpool_large_batch_config.py} (100%) diff --git a/dizoo/atari/config/serial/pong/pong_dqn_envpool_large_batch_config b/dizoo/atari/config/serial/pong/pong_dqn_envpool_large_batch_config.py similarity index 100% rename from dizoo/atari/config/serial/pong/pong_dqn_envpool_large_batch_config rename to dizoo/atari/config/serial/pong/pong_dqn_envpool_large_batch_config.py From 35069aebb58920bdf86fae513b44388030987b6c Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 13 Oct 2023 12:28:31 +0800 Subject: [PATCH 220/244] add large learning rate; add priority --- ..._envpool_wandb_new_nstep_large_learning.py | 138 ++++++++++++++++++ .../dqn_envpool_wandb_new_nstep_priority.py | 138 ++++++++++++++++++ dizoo/atari/config/serial/pong/__init__.py | 2 + .../pong_dqn_envpool_large_learning_config.py | 63 ++++++++ .../pong/pong_dqn_envpool_priority_config.py | 63 ++++++++ 5 files changed, 404 insertions(+) create mode 100644 ding/example/dqn_envpool_wandb_new_nstep_large_learning.py create mode 100644 ding/example/dqn_envpool_wandb_new_nstep_priority.py create mode 100644 dizoo/atari/config/serial/pong/pong_dqn_envpool_large_learning_config.py create mode 100644 dizoo/atari/config/serial/pong/pong_dqn_envpool_priority_config.py diff --git a/ding/example/dqn_envpool_wandb_new_nstep_large_learning.py b/ding/example/dqn_envpool_wandb_new_nstep_large_learning.py new file mode 100644 index 0000000000..988829b6d5 --- /dev/null +++ b/ding/example/dqn_envpool_wandb_new_nstep_large_learning.py @@ -0,0 +1,138 @@ +import datetime +import torch +try: + torch.multiprocessing.set_start_method('spawn') +except RuntimeError: + pass +from easydict import EasyDict +import torch +from torch import nn +from ditk import logging +from ding.model import DQN +from ding.policy import DQNFastPolicy +from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 +from ding.data import DequeBuffer +from ding.config import 
compile_config +from ding.framework import task, ding_init +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ + termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV2, OffPolicyLearnerV3 +from ding.utils import set_pkg_seed + +from dizoo.atari.config.serial import pong_dqn_envpool_large_learning_config + + +def main(cfg): + logging.getLogger().setLevel(logging.INFO) + cfg.exp_name = 'Pong-v5-DQN-envpool-large-learning-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + + collector_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.collector_env_num, + 'batch_size': cfg.env.collector_batch_size, + # env wrappers + 'episodic_life': True, # collector: True + 'reward_clip': False, # collector: True + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["collector_env_cfg"] = collector_env_cfg + evaluator_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.evaluator_env_num, + 'batch_size': cfg.env.evaluator_batch_size, + # env wrappers + 'episodic_life': False, # evaluator: False + 'reward_clip': False, # evaluator: False + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["evaluator_env_cfg"] = evaluator_env_cfg + cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) + evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) + collector_env.seed(cfg.seed) + evaluator_env.seed(cfg.seed) + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + cfg.policy.model['activation'] = nn.ReLU(inplace=True) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNFastPolicy(cfg.policy, model=model) + + # Consider the case with multiple processes + if task.router.is_active: + # You can use labels to distinguish between workers with different roles, + # here we use node_id to distinguish. + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + # Sync their context and model between each worker. + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + task.use(epoch_timer()) + + # Here is the part of single process pipeline. 
+ task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use( + EnvpoolStepCollectorV2( + cfg, + policy.collect_mode, + collector_env, + random_collect_size=cfg.policy.random_collect_size \ + if hasattr(cfg.policy, 'random_collect_size') else 0, + ) + ) + task.use(data_pusher(cfg, buffer_)) + #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use( + wandb_online_logger( + metric_list=policy._monitor_vars_learn(), + model=policy._model, + exp_config=cfg, + anonymous=True, + project_name=cfg.exp_name, + wandb_sweep=False, + ) + ) + + #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) + task.use(termination_checker(max_env_step=10000000)) + + task.run() + + +if __name__ == "__main__": + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=0, help="random seed") + parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") + parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") + arg = parser.parse_args() + + pong_dqn_envpool_large_learning_config.env.collector_env_num = arg.collector_env_num + pong_dqn_envpool_large_learning_config.env.collector_batch_size = arg.collector_batch_size + pong_dqn_envpool_large_learning_config.seed = arg.seed + pong_dqn_envpool_large_learning_config.env.stop_value = 2000 + pong_dqn_envpool_large_learning_config.nstep = 3 + pong_dqn_envpool_large_learning_config.policy.nstep = 3 + pong_dqn_envpool_large_learning_config.seed = arg.seed + + main(pong_dqn_envpool_large_learning_config) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_priority.py b/ding/example/dqn_envpool_wandb_new_nstep_priority.py new file mode 100644 index 0000000000..02260f91b3 --- /dev/null +++ b/ding/example/dqn_envpool_wandb_new_nstep_priority.py @@ -0,0 +1,138 @@ +import datetime +import torch +try: + torch.multiprocessing.set_start_method('spawn') +except RuntimeError: + pass +from easydict import EasyDict +import torch +from torch import nn +from ditk import logging +from ding.model import DQN +from ding.policy import DQNFastPolicy +from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ + termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV2, OffPolicyLearnerV3 +from ding.utils import set_pkg_seed + +from dizoo.atari.config.serial import pong_dqn_envpool_priority_config + + +def main(cfg): + logging.getLogger().setLevel(logging.INFO) + cfg.exp_name = 'Pong-v5-DQN-envpool-priority-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + + collector_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.collector_env_num, + 'batch_size': cfg.env.collector_batch_size, + # env wrappers + 'episodic_life': True, # collector: True + 'reward_clip': False, # collector: True + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["collector_env_cfg"] = collector_env_cfg + evaluator_env_cfg = EasyDict( + 
{ + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.evaluator_env_num, + 'batch_size': cfg.env.evaluator_batch_size, + # env wrappers + 'episodic_life': False, # evaluator: False + 'reward_clip': False, # evaluator: False + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["evaluator_env_cfg"] = evaluator_env_cfg + cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) + evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) + collector_env.seed(cfg.seed) + evaluator_env.seed(cfg.seed) + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + cfg.policy.model['activation'] = nn.ReLU(inplace=True) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNFastPolicy(cfg.policy, model=model) + + # Consider the case with multiple processes + if task.router.is_active: + # You can use labels to distinguish between workers with different roles, + # here we use node_id to distinguish. + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + # Sync their context and model between each worker. + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + task.use(epoch_timer()) + + # Here is the part of single process pipeline. + task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use( + EnvpoolStepCollectorV2( + cfg, + policy.collect_mode, + collector_env, + random_collect_size=cfg.policy.random_collect_size \ + if hasattr(cfg.policy, 'random_collect_size') else 0, + ) + ) + task.use(data_pusher(cfg, buffer_)) + #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use( + wandb_online_logger( + metric_list=policy._monitor_vars_learn(), + model=policy._model, + exp_config=cfg, + anonymous=True, + project_name=cfg.exp_name, + wandb_sweep=False, + ) + ) + + #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) + task.use(termination_checker(max_env_step=10000000)) + + task.run() + + +if __name__ == "__main__": + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=0, help="random seed") + parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") + parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") + arg = parser.parse_args() + + pong_dqn_envpool_priority_config.env.collector_env_num = arg.collector_env_num + pong_dqn_envpool_priority_config.env.collector_batch_size = arg.collector_batch_size + pong_dqn_envpool_priority_config.seed = arg.seed + pong_dqn_envpool_priority_config.env.stop_value = 2000 + pong_dqn_envpool_priority_config.nstep = 3 + pong_dqn_envpool_priority_config.policy.nstep = 3 + pong_dqn_envpool_priority_config.seed = arg.seed + + main(pong_dqn_envpool_priority_config) diff --git a/dizoo/atari/config/serial/pong/__init__.py b/dizoo/atari/config/serial/pong/__init__.py index c237b8b762..f3c6288910 100644 --- a/dizoo/atari/config/serial/pong/__init__.py +++ b/dizoo/atari/config/serial/pong/__init__.py @@ -2,4 +2,6 @@ from 
.pong_dqn_envpool_config import pong_dqn_envpool_config, pong_dqn_envpool_create_config from .pong_dqn_envpool_shink_model_config import pong_dqn_envpool_shink_model_config from .pong_dqn_envpool_large_batch_config import pong_dqn_envpool_large_batch_config +from .pong_dqn_envpool_large_learning_config import pong_dqn_envpool_large_learning_config +from .pong_dqn_envpool_priority_config import pong_dqn_envpool_priority_config from .pong_dqfd_config import pong_dqfd_config, pong_dqfd_create_config \ No newline at end of file diff --git a/dizoo/atari/config/serial/pong/pong_dqn_envpool_large_learning_config.py b/dizoo/atari/config/serial/pong/pong_dqn_envpool_large_learning_config.py new file mode 100644 index 0000000000..d74b24ca6d --- /dev/null +++ b/dizoo/atari/config/serial/pong/pong_dqn_envpool_large_learning_config.py @@ -0,0 +1,63 @@ +from easydict import EasyDict + +pong_dqn_envpool_large_learning_config = dict( + exp_name='pong_dqn_envpool_large_batch_seed0', + env=dict( + collector_env_num=8, + collector_batch_size=8, + evaluator_env_num=8, + evaluator_batch_size=8, + n_evaluator_episode=8, + stop_value=20, + env_id='Pong-v5', + #'ALE/Pong-v5' is available. But special setting is needed after gym make. + frame_stack=4, + ), + policy=dict( + cuda=True, + priority=False, + random_collect_size=50000, + model=dict( + obs_shape=[4, 84, 84], + action_shape=6, + encoder_hidden_size_list=[128, 128, 512], + ), + nstep=3, + discount_factor=0.99, + learn=dict( + update_per_collect=3, + batch_size=64, + learning_rate=0.001, + target_update_freq=500, + ), + collect=dict(n_sample=96, ), + eval=dict(evaluator=dict(eval_freq=4000, )), + other=dict( + eps=dict( + type='exp', + start=1., + end=0.05, + decay=250000, + ), + replay_buffer=dict(replay_buffer_size=100000, ), + ), + ), +) +pong_dqn_envpool_large_learning_config = EasyDict(pong_dqn_envpool_large_learning_config) +main_config = pong_dqn_envpool_large_learning_config +pong_dqn_envpool_create_config = dict( + env=dict( + type='atari', + import_names=['dizoo.atari.envs.atari_env'], + ), + env_manager=dict(type='env_pool'), + policy=dict(type='dqn'), + replay_buffer=dict(type='deque'), +) +pong_dqn_envpool_create_config = EasyDict(pong_dqn_envpool_create_config) +create_config = pong_dqn_envpool_create_config + +if __name__ == '__main__': + # or you can enter `ding -m serial -c pong_dqn_envpool_config.py -s 0` + from ding.entry import serial_pipeline + serial_pipeline((main_config, create_config), seed=0) diff --git a/dizoo/atari/config/serial/pong/pong_dqn_envpool_priority_config.py b/dizoo/atari/config/serial/pong/pong_dqn_envpool_priority_config.py new file mode 100644 index 0000000000..9cb58445a7 --- /dev/null +++ b/dizoo/atari/config/serial/pong/pong_dqn_envpool_priority_config.py @@ -0,0 +1,63 @@ +from easydict import EasyDict + +pong_dqn_envpool_priority_config = dict( + exp_name='pong_dqn_envpool_large_batch_seed0', + env=dict( + collector_env_num=8, + collector_batch_size=8, + evaluator_env_num=8, + evaluator_batch_size=8, + n_evaluator_episode=8, + stop_value=20, + env_id='Pong-v5', + #'ALE/Pong-v5' is available. But special setting is needed after gym make. 
+ frame_stack=4, + ), + policy=dict( + cuda=True, + priority=True, + random_collect_size=50000, + model=dict( + obs_shape=[4, 84, 84], + action_shape=6, + encoder_hidden_size_list=[128, 128, 512], + ), + nstep=3, + discount_factor=0.99, + learn=dict( + update_per_collect=3, + batch_size=64, + learning_rate=0.0003, + target_update_freq=500, + ), + collect=dict(n_sample=96, ), + eval=dict(evaluator=dict(eval_freq=4000, )), + other=dict( + eps=dict( + type='exp', + start=1., + end=0.05, + decay=250000, + ), + replay_buffer=dict(replay_buffer_size=100000, ), + ), + ), +) +pong_dqn_envpool_priority_config = EasyDict(pong_dqn_envpool_priority_config) +main_config = pong_dqn_envpool_priority_config +pong_dqn_envpool_create_config = dict( + env=dict( + type='atari', + import_names=['dizoo.atari.envs.atari_env'], + ), + env_manager=dict(type='env_pool'), + policy=dict(type='dqn'), + replay_buffer=dict(type='deque'), +) +pong_dqn_envpool_create_config = EasyDict(pong_dqn_envpool_create_config) +create_config = pong_dqn_envpool_create_config + +if __name__ == '__main__': + # or you can enter `ding -m serial -c pong_dqn_envpool_config.py -s 0` + from ding.entry import serial_pipeline + serial_pipeline((main_config, create_config), seed=0) From a06bd3f373d1a8ad486966f56800468aa8ae92a0 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Mon, 16 Oct 2023 16:47:42 +0800 Subject: [PATCH 221/244] Add update per collect 5 and target update 100 --- ...vpool_wandb_new_nstep_target_update_100.py | 138 ++++++++++++++++++ ...ol_wandb_new_nstep_update_per_collect_5.py | 138 ++++++++++++++++++ dizoo/atari/config/serial/pong/__init__.py | 2 + ...ng_dqn_envpool_target_update_100_config.py | 63 ++++++++ ...dqn_envpool_update_per_collect_5_config.py | 63 ++++++++ 5 files changed, 404 insertions(+) create mode 100644 ding/example/dqn_envpool_wandb_new_nstep_target_update_100.py create mode 100644 ding/example/dqn_envpool_wandb_new_nstep_update_per_collect_5.py create mode 100644 dizoo/atari/config/serial/pong/pong_dqn_envpool_target_update_100_config.py create mode 100644 dizoo/atari/config/serial/pong/pong_dqn_envpool_update_per_collect_5_config.py diff --git a/ding/example/dqn_envpool_wandb_new_nstep_target_update_100.py b/ding/example/dqn_envpool_wandb_new_nstep_target_update_100.py new file mode 100644 index 0000000000..391897cd83 --- /dev/null +++ b/ding/example/dqn_envpool_wandb_new_nstep_target_update_100.py @@ -0,0 +1,138 @@ +import datetime +import torch +try: + torch.multiprocessing.set_start_method('spawn') +except RuntimeError: + pass +from easydict import EasyDict +import torch +from torch import nn +from ditk import logging +from ding.model import DQN +from ding.policy import DQNFastPolicy +from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ + termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV2, OffPolicyLearnerV3 +from ding.utils import set_pkg_seed + +from dizoo.atari.config.serial import pong_dqn_envpool_target_update_100_config + + +def main(cfg): + logging.getLogger().setLevel(logging.INFO) + cfg.exp_name = 'Pong-v5-DQN-envpool-target-update-100-' + 
datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + + collector_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.collector_env_num, + 'batch_size': cfg.env.collector_batch_size, + # env wrappers + 'episodic_life': True, # collector: True + 'reward_clip': False, # collector: True + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["collector_env_cfg"] = collector_env_cfg + evaluator_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.evaluator_env_num, + 'batch_size': cfg.env.evaluator_batch_size, + # env wrappers + 'episodic_life': False, # evaluator: False + 'reward_clip': False, # evaluator: False + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["evaluator_env_cfg"] = evaluator_env_cfg + cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) + evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) + collector_env.seed(cfg.seed) + evaluator_env.seed(cfg.seed) + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + cfg.policy.model['activation'] = nn.ReLU(inplace=True) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNFastPolicy(cfg.policy, model=model) + + # Consider the case with multiple processes + if task.router.is_active: + # You can use labels to distinguish between workers with different roles, + # here we use node_id to distinguish. + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + # Sync their context and model between each worker. + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + task.use(epoch_timer()) + + # Here is the part of single process pipeline. 
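+        # Each task.use(...) below registers one middleware: a plain callable that reads
+        # and writes the shared OnlineRLContext once per training iteration, in the order
+        # the middlewares are registered. As a minimal sketch of that contract (a
+        # hypothetical helper for illustration only, not part of DI-engine or this patch):
+        #
+        #     def step_logger(cfg):
+        #         def _step_logger(ctx: "OnlineRLContext"):
+        #             # ctx.env_step and ctx.train_iter are maintained by the framework
+        #             logging.info(f"env_step={ctx.env_step}, train_iter={ctx.train_iter}")
+        #         return _step_logger
+        #
+        #     task.use(step_logger(cfg))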
+ task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use( + EnvpoolStepCollectorV2( + cfg, + policy.collect_mode, + collector_env, + random_collect_size=cfg.policy.random_collect_size \ + if hasattr(cfg.policy, 'random_collect_size') else 0, + ) + ) + task.use(data_pusher(cfg, buffer_)) + #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use( + wandb_online_logger( + metric_list=policy._monitor_vars_learn(), + model=policy._model, + exp_config=cfg, + anonymous=True, + project_name=cfg.exp_name, + wandb_sweep=False, + ) + ) + + #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) + task.use(termination_checker(max_env_step=10000000)) + + task.run() + + +if __name__ == "__main__": + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=0, help="random seed") + parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") + parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") + arg = parser.parse_args() + + pong_dqn_envpool_target_update_100_config.env.collector_env_num = arg.collector_env_num + pong_dqn_envpool_target_update_100_config.env.collector_batch_size = arg.collector_batch_size + pong_dqn_envpool_target_update_100_config.seed = arg.seed + pong_dqn_envpool_target_update_100_config.env.stop_value = 2000 + pong_dqn_envpool_target_update_100_config.nstep = 3 + pong_dqn_envpool_target_update_100_config.policy.nstep = 3 + pong_dqn_envpool_target_update_100_config.seed = arg.seed + + main(pong_dqn_envpool_target_update_100_config) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_update_per_collect_5.py b/ding/example/dqn_envpool_wandb_new_nstep_update_per_collect_5.py new file mode 100644 index 0000000000..68b89ed8fc --- /dev/null +++ b/ding/example/dqn_envpool_wandb_new_nstep_update_per_collect_5.py @@ -0,0 +1,138 @@ +import datetime +import torch +try: + torch.multiprocessing.set_start_method('spawn') +except RuntimeError: + pass +from easydict import EasyDict +import torch +from torch import nn +from ditk import logging +from ding.model import DQN +from ding.policy import DQNFastPolicy +from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ + termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV2, OffPolicyLearnerV3 +from ding.utils import set_pkg_seed + +from dizoo.atari.config.serial import pong_dqn_envpool_update_per_collect_5_config + + +def main(cfg): + logging.getLogger().setLevel(logging.INFO) + cfg.exp_name = 'Pong-v5-DQN-envpool-update-per-collect-5-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + + collector_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.collector_env_num, + 'batch_size': cfg.env.collector_batch_size, + # env wrappers + 'episodic_life': True, # collector: True + 'reward_clip': False, # collector: True + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + 
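+    # The dict above only gathers the Atari wrapper switches; PoolEnvManagerV4 forwards
+    # them to envpool when the vectorized env is created. Roughly (a sketch assuming
+    # envpool's gym-style Atari interface; the exact keyword handling lives inside the
+    # env manager, so treat the call below as illustrative only):
+    #
+    #     import envpool
+    #     envs = envpool.make(
+    #         cfg.env.env_id, env_type="gym",
+    #         num_envs=cfg.env.collector_env_num, batch_size=cfg.env.collector_batch_size,
+    #         episodic_life=True, reward_clip=False, gray_scale=True, stack_num=4,
+    #     )
+    #
+    # The collector keeps episodic_life=True for denser terminal signals, while the
+    # evaluator config below disables it so reported returns cover full game episodes.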
cfg.env["collector_env_cfg"] = collector_env_cfg + evaluator_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.evaluator_env_num, + 'batch_size': cfg.env.evaluator_batch_size, + # env wrappers + 'episodic_life': False, # evaluator: False + 'reward_clip': False, # evaluator: False + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["evaluator_env_cfg"] = evaluator_env_cfg + cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) + evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) + collector_env.seed(cfg.seed) + evaluator_env.seed(cfg.seed) + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + cfg.policy.model['activation'] = nn.ReLU(inplace=True) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNFastPolicy(cfg.policy, model=model) + + # Consider the case with multiple processes + if task.router.is_active: + # You can use labels to distinguish between workers with different roles, + # here we use node_id to distinguish. + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + # Sync their context and model between each worker. + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + task.use(epoch_timer()) + + # Here is the part of single process pipeline. + task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use( + EnvpoolStepCollectorV2( + cfg, + policy.collect_mode, + collector_env, + random_collect_size=cfg.policy.random_collect_size \ + if hasattr(cfg.policy, 'random_collect_size') else 0, + ) + ) + task.use(data_pusher(cfg, buffer_)) + #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use( + wandb_online_logger( + metric_list=policy._monitor_vars_learn(), + model=policy._model, + exp_config=cfg, + anonymous=True, + project_name=cfg.exp_name, + wandb_sweep=False, + ) + ) + + #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) + task.use(termination_checker(max_env_step=10000000)) + + task.run() + + +if __name__ == "__main__": + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=0, help="random seed") + parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") + parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") + arg = parser.parse_args() + + pong_dqn_envpool_update_per_collect_5_config.env.collector_env_num = arg.collector_env_num + pong_dqn_envpool_update_per_collect_5_config.env.collector_batch_size = arg.collector_batch_size + pong_dqn_envpool_update_per_collect_5_config.seed = arg.seed + pong_dqn_envpool_update_per_collect_5_config.env.stop_value = 2000 + pong_dqn_envpool_update_per_collect_5_config.nstep = 3 + pong_dqn_envpool_update_per_collect_5_config.policy.nstep = 3 + pong_dqn_envpool_update_per_collect_5_config.seed = arg.seed + + main(pong_dqn_envpool_update_per_collect_5_config) diff --git a/dizoo/atari/config/serial/pong/__init__.py 
b/dizoo/atari/config/serial/pong/__init__.py index f3c6288910..ec63a0ce04 100644 --- a/dizoo/atari/config/serial/pong/__init__.py +++ b/dizoo/atari/config/serial/pong/__init__.py @@ -4,4 +4,6 @@ from .pong_dqn_envpool_large_batch_config import pong_dqn_envpool_large_batch_config from .pong_dqn_envpool_large_learning_config import pong_dqn_envpool_large_learning_config from .pong_dqn_envpool_priority_config import pong_dqn_envpool_priority_config +from .pong_dqn_envpool_target_update_100_config import pong_dqn_envpool_target_update_100_config +from .pong_dqn_envpool_update_per_collect_5_config import pong_dqn_envpool_update_per_collect_5_config from .pong_dqfd_config import pong_dqfd_config, pong_dqfd_create_config \ No newline at end of file diff --git a/dizoo/atari/config/serial/pong/pong_dqn_envpool_target_update_100_config.py b/dizoo/atari/config/serial/pong/pong_dqn_envpool_target_update_100_config.py new file mode 100644 index 0000000000..54f89e32b3 --- /dev/null +++ b/dizoo/atari/config/serial/pong/pong_dqn_envpool_target_update_100_config.py @@ -0,0 +1,63 @@ +from easydict import EasyDict + +pong_dqn_envpool_target_update_100_config = dict( + exp_name='pong_dqn_envpool_large_batch_seed0', + env=dict( + collector_env_num=8, + collector_batch_size=8, + evaluator_env_num=8, + evaluator_batch_size=8, + n_evaluator_episode=8, + stop_value=20, + env_id='Pong-v5', + #'ALE/Pong-v5' is available. But special setting is needed after gym make. + frame_stack=4, + ), + policy=dict( + cuda=True, + priority=False, + random_collect_size=50000, + model=dict( + obs_shape=[4, 84, 84], + action_shape=6, + encoder_hidden_size_list=[128, 128, 512], + ), + nstep=3, + discount_factor=0.99, + learn=dict( + update_per_collect=3, + batch_size=64, + learning_rate=0.0003, + target_update_freq=100, + ), + collect=dict(n_sample=96, ), + eval=dict(evaluator=dict(eval_freq=4000, )), + other=dict( + eps=dict( + type='exp', + start=1., + end=0.05, + decay=250000, + ), + replay_buffer=dict(replay_buffer_size=100000, ), + ), + ), +) +pong_dqn_envpool_target_update_100_config = EasyDict(pong_dqn_envpool_target_update_100_config) +main_config = pong_dqn_envpool_target_update_100_config +pong_dqn_envpool_create_config = dict( + env=dict( + type='atari', + import_names=['dizoo.atari.envs.atari_env'], + ), + env_manager=dict(type='env_pool'), + policy=dict(type='dqn'), + replay_buffer=dict(type='deque'), +) +pong_dqn_envpool_create_config = EasyDict(pong_dqn_envpool_create_config) +create_config = pong_dqn_envpool_create_config + +if __name__ == '__main__': + # or you can enter `ding -m serial -c pong_dqn_envpool_config.py -s 0` + from ding.entry import serial_pipeline + serial_pipeline((main_config, create_config), seed=0) diff --git a/dizoo/atari/config/serial/pong/pong_dqn_envpool_update_per_collect_5_config.py b/dizoo/atari/config/serial/pong/pong_dqn_envpool_update_per_collect_5_config.py new file mode 100644 index 0000000000..8523b54334 --- /dev/null +++ b/dizoo/atari/config/serial/pong/pong_dqn_envpool_update_per_collect_5_config.py @@ -0,0 +1,63 @@ +from easydict import EasyDict + +pong_dqn_envpool_update_per_collect_5_config = dict( + exp_name='pong_dqn_envpool_large_batch_seed0', + env=dict( + collector_env_num=8, + collector_batch_size=8, + evaluator_env_num=8, + evaluator_batch_size=8, + n_evaluator_episode=8, + stop_value=20, + env_id='Pong-v5', + #'ALE/Pong-v5' is available. But special setting is needed after gym make. 
+ frame_stack=4, + ), + policy=dict( + cuda=True, + priority=False, + random_collect_size=50000, + model=dict( + obs_shape=[4, 84, 84], + action_shape=6, + encoder_hidden_size_list=[128, 128, 512], + ), + nstep=3, + discount_factor=0.99, + learn=dict( + update_per_collect=5, + batch_size=64, + learning_rate=0.0003, + target_update_freq=500, + ), + collect=dict(n_sample=96, ), + eval=dict(evaluator=dict(eval_freq=4000, )), + other=dict( + eps=dict( + type='exp', + start=1., + end=0.05, + decay=250000, + ), + replay_buffer=dict(replay_buffer_size=100000, ), + ), + ), +) +pong_dqn_envpool_update_per_collect_5_config = EasyDict(pong_dqn_envpool_update_per_collect_5_config) +main_config = pong_dqn_envpool_update_per_collect_5_config +pong_dqn_envpool_create_config = dict( + env=dict( + type='atari', + import_names=['dizoo.atari.envs.atari_env'], + ), + env_manager=dict(type='env_pool'), + policy=dict(type='dqn'), + replay_buffer=dict(type='deque'), +) +pong_dqn_envpool_create_config = EasyDict(pong_dqn_envpool_create_config) +create_config = pong_dqn_envpool_create_config + +if __name__ == '__main__': + # or you can enter `ding -m serial -c pong_dqn_envpool_config.py -s 0` + from ding.entry import serial_pipeline + serial_pipeline((main_config, create_config), seed=0) From e5ea2fd0afa9dc180907776e8473a03c3a7c3291 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 25 Oct 2023 12:43:12 +0800 Subject: [PATCH 222/244] Add qbert test 6 7 --- ding/example/dqn_envpool_wandb_new_nstep_6.py | 139 ++++++++++++++++++ ding/example/dqn_envpool_wandb_new_nstep_7.py | 139 ++++++++++++++++++ 2 files changed, 278 insertions(+) create mode 100644 ding/example/dqn_envpool_wandb_new_nstep_6.py create mode 100644 ding/example/dqn_envpool_wandb_new_nstep_7.py diff --git a/ding/example/dqn_envpool_wandb_new_nstep_6.py b/ding/example/dqn_envpool_wandb_new_nstep_6.py new file mode 100644 index 0000000000..303c2eced4 --- /dev/null +++ b/ding/example/dqn_envpool_wandb_new_nstep_6.py @@ -0,0 +1,139 @@ +import datetime +import torch +try: + torch.multiprocessing.set_start_method('spawn') +except RuntimeError: + pass +from easydict import EasyDict +from ditk import logging +from ding.model import DQN +from ding.policy import DQNFastPolicy +from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ + termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV2 +from ding.utils import set_pkg_seed + +from dizoo.atari.config.serial import pong_dqn_envpool_config + + +def main(cfg): + logging.getLogger().setLevel(logging.INFO) + cfg.exp_name = 'Pong-v5-DQN-envpool-new-6-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + + collector_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.collector_env_num, + 'batch_size': cfg.env.collector_batch_size, + # env wrappers + 'episodic_life': True, # collector: True + 'reward_clip': False, # collector: True + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["collector_env_cfg"] = collector_env_cfg + evaluator_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': 
cfg.env.evaluator_env_num, + 'batch_size': cfg.env.evaluator_batch_size, + # env wrappers + 'episodic_life': False, # evaluator: False + 'reward_clip': False, # evaluator: False + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["evaluator_env_cfg"] = evaluator_env_cfg + cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) + evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) + collector_env.seed(cfg.seed) + evaluator_env.seed(cfg.seed) + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNFastPolicy(cfg.policy, model=model) + + # Consider the case with multiple processes + if task.router.is_active: + # You can use labels to distinguish between workers with different roles, + # here we use node_id to distinguish. + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + # Sync their context and model between each worker. + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + task.use(epoch_timer()) + + # Here is the part of single process pipeline. + task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use( + EnvpoolStepCollectorV2( + cfg, + policy.collect_mode, + collector_env, + random_collect_size=cfg.policy.random_collect_size \ + if hasattr(cfg.policy, 'random_collect_size') else 0, + ) + ) + task.use(data_pusher(cfg, buffer_)) + #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(OffPolicyLearnerV2(cfg, policy, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use( + wandb_online_logger( + metric_list=policy._monitor_vars_learn(), + model=policy._model, + exp_config=cfg, + anonymous=True, + project_name=cfg.exp_name, + wandb_sweep=False, + ) + ) + + #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) + task.use(termination_checker(max_env_step=10000000)) + + task.run() + + +if __name__ == "__main__": + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=0, help="random seed") + parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") + parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") + arg = parser.parse_args() + + pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num + pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size + pong_dqn_envpool_config.seed = arg.seed + pong_dqn_envpool_config.env.stop_value = 2000 + pong_dqn_envpool_config.nstep = 3 + pong_dqn_envpool_config.policy.nstep = 3 + pong_dqn_envpool_config.seed = arg.seed + + pong_dqn_envpool_config.policy.learn.update_per_collect = 5 + pong_dqn_envpool_config.policy.learn.batch_size = 32 + pong_dqn_envpool_config.policy.learn.learning_rate = 0.0001 + pong_dqn_envpool_config.policy.learn.target_update_freq = 500 + + main(pong_dqn_envpool_config) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_7.py b/ding/example/dqn_envpool_wandb_new_nstep_7.py new file mode 100644 index 0000000000..eddad4d1ed --- /dev/null +++ 
b/ding/example/dqn_envpool_wandb_new_nstep_7.py @@ -0,0 +1,139 @@ +import datetime +import torch +try: + torch.multiprocessing.set_start_method('spawn') +except RuntimeError: + pass +from easydict import EasyDict +from ditk import logging +from ding.model import DQN +from ding.policy import DQNFastPolicy +from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ + termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV2 +from ding.utils import set_pkg_seed + +from dizoo.atari.config.serial import pong_dqn_envpool_config + + +def main(cfg): + logging.getLogger().setLevel(logging.INFO) + cfg.exp_name = 'Pong-v5-DQN-envpool-new-7-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + + collector_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.collector_env_num, + 'batch_size': cfg.env.collector_batch_size, + # env wrappers + 'episodic_life': True, # collector: True + 'reward_clip': False, # collector: True + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["collector_env_cfg"] = collector_env_cfg + evaluator_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.evaluator_env_num, + 'batch_size': cfg.env.evaluator_batch_size, + # env wrappers + 'episodic_life': False, # evaluator: False + 'reward_clip': False, # evaluator: False + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["evaluator_env_cfg"] = evaluator_env_cfg + cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) + evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) + collector_env.seed(cfg.seed) + evaluator_env.seed(cfg.seed) + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNFastPolicy(cfg.policy, model=model) + + # Consider the case with multiple processes + if task.router.is_active: + # You can use labels to distinguish between workers with different roles, + # here we use node_id to distinguish. + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + # Sync their context and model between each worker. + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + task.use(epoch_timer()) + + # Here is the part of single process pipeline. 
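+        # eps_greedy_handler(cfg) below anneals the exploration rate from
+        # cfg.policy.other.eps.start down to .end as ctx.env_step grows. For the 'exp'
+        # schedule used by these pong configs the value is approximately (up to
+        # implementation details inside DI-engine):
+        #
+        #     eps(step) = end + (start - end) * exp(-step / decay)
+        #
+        # so with start=1.0, end=0.05, decay=250000 the rate falls to about 0.40 after
+        # 250k env steps and to about 0.07 after 1M env steps.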
+ task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use( + EnvpoolStepCollectorV2( + cfg, + policy.collect_mode, + collector_env, + random_collect_size=cfg.policy.random_collect_size \ + if hasattr(cfg.policy, 'random_collect_size') else 0, + ) + ) + task.use(data_pusher(cfg, buffer_)) + #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(OffPolicyLearnerV2(cfg, policy, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use( + wandb_online_logger( + metric_list=policy._monitor_vars_learn(), + model=policy._model, + exp_config=cfg, + anonymous=True, + project_name=cfg.exp_name, + wandb_sweep=False, + ) + ) + + #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) + task.use(termination_checker(max_env_step=10000000)) + + task.run() + + +if __name__ == "__main__": + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=0, help="random seed") + parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") + parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") + arg = parser.parse_args() + + pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num + pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size + pong_dqn_envpool_config.seed = arg.seed + pong_dqn_envpool_config.env.stop_value = 2000 + pong_dqn_envpool_config.nstep = 3 + pong_dqn_envpool_config.policy.nstep = 3 + pong_dqn_envpool_config.seed = arg.seed + + pong_dqn_envpool_config.policy.learn.update_per_collect = 5 + pong_dqn_envpool_config.policy.learn.batch_size = 32 + pong_dqn_envpool_config.policy.learn.learning_rate = 0.0001 + pong_dqn_envpool_config.policy.learn.target_update_freq = 200 + + main(pong_dqn_envpool_config) From d7c4983429af423bfde41d84dd62f6ac770dce47 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 25 Oct 2023 16:35:04 +0800 Subject: [PATCH 223/244] polish qbert test 6 7 --- ding/example/dqn_envpool_wandb_new_nstep_6.py | 4 ++-- ding/example/dqn_envpool_wandb_new_nstep_7.py | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_6.py b/ding/example/dqn_envpool_wandb_new_nstep_6.py index 303c2eced4..aee300d6c4 100644 --- a/ding/example/dqn_envpool_wandb_new_nstep_6.py +++ b/ding/example/dqn_envpool_wandb_new_nstep_6.py @@ -15,7 +15,7 @@ from ding.framework.context import OnlineRLContext from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV2 + termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV3 from ding.utils import set_pkg_seed from dizoo.atari.config.serial import pong_dqn_envpool_config @@ -95,7 +95,7 @@ def main(cfg): ) task.use(data_pusher(cfg, buffer_)) #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(OffPolicyLearnerV2(cfg, policy, buffer_)) + task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) task.use(online_logger(train_show_freq=10)) task.use( wandb_online_logger( diff --git a/ding/example/dqn_envpool_wandb_new_nstep_7.py b/ding/example/dqn_envpool_wandb_new_nstep_7.py index eddad4d1ed..77d30b5589 100644 --- a/ding/example/dqn_envpool_wandb_new_nstep_7.py +++ 
b/ding/example/dqn_envpool_wandb_new_nstep_7.py @@ -15,7 +15,7 @@ from ding.framework.context import OnlineRLContext from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV2 + termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV3 from ding.utils import set_pkg_seed from dizoo.atari.config.serial import pong_dqn_envpool_config @@ -60,6 +60,8 @@ def main(cfg): evaluator_env.seed(cfg.seed) set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + cfg.policy.model['activation'] = nn.ReLU(inplace=True) + model = DQN(**cfg.policy.model) buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) policy = DQNFastPolicy(cfg.policy, model=model) @@ -95,7 +97,7 @@ def main(cfg): ) task.use(data_pusher(cfg, buffer_)) #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(OffPolicyLearnerV2(cfg, policy, buffer_)) + task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) task.use(online_logger(train_show_freq=10)) task.use( wandb_online_logger( From e22df12a3aad82fde315c6d231143188b253e6e4 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 25 Oct 2023 20:01:42 +0800 Subject: [PATCH 224/244] polish qbert test 6 7 --- ding/example/dqn_envpool_wandb_new_nstep_7.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_7.py b/ding/example/dqn_envpool_wandb_new_nstep_7.py index 77d30b5589..7020f67766 100644 --- a/ding/example/dqn_envpool_wandb_new_nstep_7.py +++ b/ding/example/dqn_envpool_wandb_new_nstep_7.py @@ -60,8 +60,6 @@ def main(cfg): evaluator_env.seed(cfg.seed) set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - cfg.policy.model['activation'] = nn.ReLU(inplace=True) - model = DQN(**cfg.policy.model) buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) policy = DQNFastPolicy(cfg.policy, model=model) From c03a17ba960a17b37b7f810e2fac682c4b63fccd Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 26 Oct 2023 12:39:38 +0800 Subject: [PATCH 225/244] polish qbert test 8 9 --- ding/example/dqn_envpool_wandb_new_nstep_8.py | 139 ++++++++++++++++++ ding/example/dqn_envpool_wandb_new_nstep_9.py | 139 ++++++++++++++++++ 2 files changed, 278 insertions(+) create mode 100644 ding/example/dqn_envpool_wandb_new_nstep_8.py create mode 100644 ding/example/dqn_envpool_wandb_new_nstep_9.py diff --git a/ding/example/dqn_envpool_wandb_new_nstep_8.py b/ding/example/dqn_envpool_wandb_new_nstep_8.py new file mode 100644 index 0000000000..cf7dcb388e --- /dev/null +++ b/ding/example/dqn_envpool_wandb_new_nstep_8.py @@ -0,0 +1,139 @@ +import datetime +import torch +try: + torch.multiprocessing.set_start_method('spawn') +except RuntimeError: + pass +from easydict import EasyDict +from ditk import logging +from ding.model import DQN +from ding.policy import DQNFastPolicy +from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ + termination_checker, wandb_online_logger, epoch_timer, 
EnvpoolStepCollectorV2, OffPolicyLearnerV3 +from ding.utils import set_pkg_seed + +from dizoo.atari.config.serial import pong_dqn_envpool_config + + +def main(cfg): + logging.getLogger().setLevel(logging.INFO) + cfg.exp_name = 'Pong-v5-DQN-envpool-new-8-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + + collector_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.collector_env_num, + 'batch_size': cfg.env.collector_batch_size, + # env wrappers + 'episodic_life': True, # collector: True + 'reward_clip': False, # collector: True + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["collector_env_cfg"] = collector_env_cfg + evaluator_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.evaluator_env_num, + 'batch_size': cfg.env.evaluator_batch_size, + # env wrappers + 'episodic_life': False, # evaluator: False + 'reward_clip': False, # evaluator: False + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["evaluator_env_cfg"] = evaluator_env_cfg + cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) + evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) + collector_env.seed(cfg.seed) + evaluator_env.seed(cfg.seed) + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNFastPolicy(cfg.policy, model=model) + + # Consider the case with multiple processes + if task.router.is_active: + # You can use labels to distinguish between workers with different roles, + # here we use node_id to distinguish. + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + # Sync their context and model between each worker. + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + task.use(epoch_timer()) + + # Here is the part of single process pipeline. 
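+        # These runs keep cfg.policy.nstep = 3, so the learner regresses Q(s_t, a_t)
+        # towards a 3-step bootstrapped target instead of the 1-step DQN target. In the
+        # usual notation (a sketch of the standard n-step target, not the exact
+        # DQNFastPolicy code path):
+        #
+        #     y_t = r_t + gamma * r_{t+1} + gamma^2 * r_{t+2}
+        #           + gamma^3 * max_a Q_target(s_{t+3}, a)
+        #
+        # with gamma = cfg.policy.discount_factor = 0.99, and the bootstrap term dropped
+        # when the episode terminates inside the 3-step window.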
+ task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use( + EnvpoolStepCollectorV2( + cfg, + policy.collect_mode, + collector_env, + random_collect_size=cfg.policy.random_collect_size \ + if hasattr(cfg.policy, 'random_collect_size') else 0, + ) + ) + task.use(data_pusher(cfg, buffer_)) + #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use( + wandb_online_logger( + metric_list=policy._monitor_vars_learn(), + model=policy._model, + exp_config=cfg, + anonymous=True, + project_name=cfg.exp_name, + wandb_sweep=False, + ) + ) + + #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) + task.use(termination_checker(max_env_step=10000000)) + + task.run() + + +if __name__ == "__main__": + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=0, help="random seed") + parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") + parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") + arg = parser.parse_args() + + pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num + pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size + pong_dqn_envpool_config.seed = arg.seed + pong_dqn_envpool_config.env.stop_value = 2000 + pong_dqn_envpool_config.nstep = 3 + pong_dqn_envpool_config.policy.nstep = 3 + pong_dqn_envpool_config.seed = arg.seed + + pong_dqn_envpool_config.policy.learn.update_per_collect = 4 + pong_dqn_envpool_config.policy.learn.batch_size = 32 + pong_dqn_envpool_config.policy.learn.learning_rate = 0.0001 + pong_dqn_envpool_config.policy.learn.target_update_freq = 200 + + main(pong_dqn_envpool_config) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_9.py b/ding/example/dqn_envpool_wandb_new_nstep_9.py new file mode 100644 index 0000000000..33fc67f187 --- /dev/null +++ b/ding/example/dqn_envpool_wandb_new_nstep_9.py @@ -0,0 +1,139 @@ +import datetime +import torch +try: + torch.multiprocessing.set_start_method('spawn') +except RuntimeError: + pass +from easydict import EasyDict +from ditk import logging +from ding.model import DQN +from ding.policy import DQNFastPolicy +from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ + termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV3 +from ding.utils import set_pkg_seed + +from dizoo.atari.config.serial import pong_dqn_envpool_config + + +def main(cfg): + logging.getLogger().setLevel(logging.INFO) + cfg.exp_name = 'Pong-v5-DQN-envpool-new-9-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + + collector_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.collector_env_num, + 'batch_size': cfg.env.collector_batch_size, + # env wrappers + 'episodic_life': True, # collector: True + 'reward_clip': False, # collector: True + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["collector_env_cfg"] = 
collector_env_cfg + evaluator_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.evaluator_env_num, + 'batch_size': cfg.env.evaluator_batch_size, + # env wrappers + 'episodic_life': False, # evaluator: False + 'reward_clip': False, # evaluator: False + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["evaluator_env_cfg"] = evaluator_env_cfg + cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) + evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) + collector_env.seed(cfg.seed) + evaluator_env.seed(cfg.seed) + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNFastPolicy(cfg.policy, model=model) + + # Consider the case with multiple processes + if task.router.is_active: + # You can use labels to distinguish between workers with different roles, + # here we use node_id to distinguish. + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + # Sync their context and model between each worker. + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + task.use(epoch_timer()) + + # Here is the part of single process pipeline. + task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use( + EnvpoolStepCollectorV2( + cfg, + policy.collect_mode, + collector_env, + random_collect_size=cfg.policy.random_collect_size \ + if hasattr(cfg.policy, 'random_collect_size') else 0, + ) + ) + task.use(data_pusher(cfg, buffer_)) + #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use( + wandb_online_logger( + metric_list=policy._monitor_vars_learn(), + model=policy._model, + exp_config=cfg, + anonymous=True, + project_name=cfg.exp_name, + wandb_sweep=False, + ) + ) + + #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) + task.use(termination_checker(max_env_step=10000000)) + + task.run() + + +if __name__ == "__main__": + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=0, help="random seed") + parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") + parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") + arg = parser.parse_args() + + pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num + pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size + pong_dqn_envpool_config.seed = arg.seed + pong_dqn_envpool_config.env.stop_value = 2000 + pong_dqn_envpool_config.nstep = 3 + pong_dqn_envpool_config.policy.nstep = 3 + pong_dqn_envpool_config.seed = arg.seed + + pong_dqn_envpool_config.policy.learn.update_per_collect = 3 + pong_dqn_envpool_config.policy.learn.batch_size = 32 + pong_dqn_envpool_config.policy.learn.learning_rate = 0.0001 + pong_dqn_envpool_config.policy.learn.target_update_freq = 200 + + main(pong_dqn_envpool_config) From 48c13331b5e98a08f804239ef18c805d18c556fd Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 26 Oct 
2023 19:59:47 +0800 Subject: [PATCH 226/244] polish qbert test 10~12 --- .../example/dqn_envpool_wandb_new_nstep_10.py | 140 ++++++++++++++++++ .../example/dqn_envpool_wandb_new_nstep_11.py | 140 ++++++++++++++++++ .../example/dqn_envpool_wandb_new_nstep_12.py | 140 ++++++++++++++++++ ding/policy/dqn.py | 4 +- 4 files changed, 422 insertions(+), 2 deletions(-) create mode 100644 ding/example/dqn_envpool_wandb_new_nstep_10.py create mode 100644 ding/example/dqn_envpool_wandb_new_nstep_11.py create mode 100644 ding/example/dqn_envpool_wandb_new_nstep_12.py diff --git a/ding/example/dqn_envpool_wandb_new_nstep_10.py b/ding/example/dqn_envpool_wandb_new_nstep_10.py new file mode 100644 index 0000000000..fdf6dcaeb4 --- /dev/null +++ b/ding/example/dqn_envpool_wandb_new_nstep_10.py @@ -0,0 +1,140 @@ +import datetime +import torch +try: + torch.multiprocessing.set_start_method('spawn') +except RuntimeError: + pass +from easydict import EasyDict +from ditk import logging +from ding.model import DQN +from ding.policy import DQNFastPolicy +from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ + termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV3 +from ding.utils import set_pkg_seed + +from dizoo.atari.config.serial import pong_dqn_envpool_config + + +def main(cfg): + logging.getLogger().setLevel(logging.INFO) + cfg.exp_name = 'Pong-v5-DQN-envpool-new-10-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + + collector_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.collector_env_num, + 'batch_size': cfg.env.collector_batch_size, + # env wrappers + 'episodic_life': True, # collector: True + 'reward_clip': False, # collector: True + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["collector_env_cfg"] = collector_env_cfg + evaluator_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.evaluator_env_num, + 'batch_size': cfg.env.evaluator_batch_size, + # env wrappers + 'episodic_life': False, # evaluator: False + 'reward_clip': False, # evaluator: False + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["evaluator_env_cfg"] = evaluator_env_cfg + cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) + evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) + collector_env.seed(cfg.seed) + evaluator_env.seed(cfg.seed) + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNFastPolicy(cfg.policy, model=model) + + # Consider the case with multiple processes + if task.router.is_active: + # You can use labels to distinguish between workers with different roles, + # here we use node_id to distinguish. 
+ if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + # Sync their context and model between each worker. + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + task.use(epoch_timer()) + + # Here is the part of single process pipeline. + task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use( + EnvpoolStepCollectorV2( + cfg, + policy.collect_mode, + collector_env, + random_collect_size=cfg.policy.random_collect_size \ + if hasattr(cfg.policy, 'random_collect_size') else 0, + ) + ) + task.use(data_pusher(cfg, buffer_)) + #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use( + wandb_online_logger( + metric_list=policy._monitor_vars_learn(), + model=policy._model, + exp_config=cfg, + anonymous=True, + project_name=cfg.exp_name, + wandb_sweep=False, + ) + ) + + #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) + task.use(termination_checker(max_env_step=10000000)) + + task.run() + + +if __name__ == "__main__": + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=0, help="random seed") + parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") + parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") + arg = parser.parse_args() + + pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num + pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size + pong_dqn_envpool_config.seed = arg.seed + pong_dqn_envpool_config.env.stop_value = 2000 + pong_dqn_envpool_config.nstep = 3 + pong_dqn_envpool_config.policy.nstep = 3 + pong_dqn_envpool_config.seed = arg.seed + + pong_dqn_envpool_config.policy.learn.update_per_collect = 3 + pong_dqn_envpool_config.policy.learn.batch_size = 32 + pong_dqn_envpool_config.policy.learn.learning_rate = 0.0001 + pong_dqn_envpool_config.policy.learn.target_update_freq = 0 + pong_dqn_envpool_config.policy.learn.target_update = 0.01 + + main(pong_dqn_envpool_config) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_11.py b/ding/example/dqn_envpool_wandb_new_nstep_11.py new file mode 100644 index 0000000000..b462208622 --- /dev/null +++ b/ding/example/dqn_envpool_wandb_new_nstep_11.py @@ -0,0 +1,140 @@ +import datetime +import torch +try: + torch.multiprocessing.set_start_method('spawn') +except RuntimeError: + pass +from easydict import EasyDict +from ditk import logging +from ding.model import DQN +from ding.policy import DQNFastPolicy +from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ + termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV3 +from ding.utils import set_pkg_seed + +from dizoo.atari.config.serial import pong_dqn_envpool_config + + +def main(cfg): + logging.getLogger().setLevel(logging.INFO) + cfg.exp_name = 'Pong-v5-DQN-envpool-new-11-' 
+ datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + + collector_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.collector_env_num, + 'batch_size': cfg.env.collector_batch_size, + # env wrappers + 'episodic_life': True, # collector: True + 'reward_clip': False, # collector: True + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["collector_env_cfg"] = collector_env_cfg + evaluator_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.evaluator_env_num, + 'batch_size': cfg.env.evaluator_batch_size, + # env wrappers + 'episodic_life': False, # evaluator: False + 'reward_clip': False, # evaluator: False + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["evaluator_env_cfg"] = evaluator_env_cfg + cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) + evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) + collector_env.seed(cfg.seed) + evaluator_env.seed(cfg.seed) + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNFastPolicy(cfg.policy, model=model) + + # Consider the case with multiple processes + if task.router.is_active: + # You can use labels to distinguish between workers with different roles, + # here we use node_id to distinguish. + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + # Sync their context and model between each worker. + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + task.use(epoch_timer()) + + # Here is the part of single process pipeline. 
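+        # This script and dqn_envpool_wandb_new_nstep_10.py set learn.target_update_freq = 0
+        # and a learn.target_update coefficient instead (0.02 here, 0.01 in nstep_10), which,
+        # together with the small ding/policy/dqn.py change in this commit, points at a
+        # soft / Polyak-averaged target network. A generic sketch of such an update (an
+        # assumption about the intent, not the exact DQNFastPolicy implementation;
+        # target_model and model are placeholder names):
+        #
+        #     tau = cfg.policy.learn.target_update  # e.g. 0.02
+        #     for p_t, p in zip(target_model.parameters(), model.parameters()):
+        #         p_t.data.mul_(1.0 - tau).add_(p.data, alpha=tau)
+        #
+        # applied after every optimizer step, versus copying the full weights every
+        # target_update_freq training iterations in the hard-update runs above.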
+ task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use( + EnvpoolStepCollectorV2( + cfg, + policy.collect_mode, + collector_env, + random_collect_size=cfg.policy.random_collect_size \ + if hasattr(cfg.policy, 'random_collect_size') else 0, + ) + ) + task.use(data_pusher(cfg, buffer_)) + #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use( + wandb_online_logger( + metric_list=policy._monitor_vars_learn(), + model=policy._model, + exp_config=cfg, + anonymous=True, + project_name=cfg.exp_name, + wandb_sweep=False, + ) + ) + + #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) + task.use(termination_checker(max_env_step=10000000)) + + task.run() + + +if __name__ == "__main__": + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=0, help="random seed") + parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") + parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") + arg = parser.parse_args() + + pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num + pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size + pong_dqn_envpool_config.seed = arg.seed + pong_dqn_envpool_config.env.stop_value = 2000 + pong_dqn_envpool_config.nstep = 3 + pong_dqn_envpool_config.policy.nstep = 3 + pong_dqn_envpool_config.seed = arg.seed + + pong_dqn_envpool_config.policy.learn.update_per_collect = 3 + pong_dqn_envpool_config.policy.learn.batch_size = 32 + pong_dqn_envpool_config.policy.learn.learning_rate = 0.0001 + pong_dqn_envpool_config.policy.learn.target_update_freq = 0 + pong_dqn_envpool_config.policy.learn.target_update = 0.02 + + main(pong_dqn_envpool_config) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_12.py b/ding/example/dqn_envpool_wandb_new_nstep_12.py new file mode 100644 index 0000000000..b78367fca0 --- /dev/null +++ b/ding/example/dqn_envpool_wandb_new_nstep_12.py @@ -0,0 +1,140 @@ +import datetime +import torch +try: + torch.multiprocessing.set_start_method('spawn') +except RuntimeError: + pass +from easydict import EasyDict +from ditk import logging +from ding.model import DQN +from ding.policy import DQNFastPolicy +from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ + termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV3 +from ding.utils import set_pkg_seed + +from dizoo.atari.config.serial import pong_dqn_envpool_config + + +def main(cfg): + logging.getLogger().setLevel(logging.INFO) + cfg.exp_name = 'Pong-v5-DQN-envpool-new-12-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + + collector_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.collector_env_num, + 'batch_size': cfg.env.collector_batch_size, + # env wrappers + 'episodic_life': True, # collector: True + 'reward_clip': False, # collector: True + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 
4), + } + ) + cfg.env["collector_env_cfg"] = collector_env_cfg + evaluator_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.evaluator_env_num, + 'batch_size': cfg.env.evaluator_batch_size, + # env wrappers + 'episodic_life': False, # evaluator: False + 'reward_clip': False, # evaluator: False + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["evaluator_env_cfg"] = evaluator_env_cfg + cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) + evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) + collector_env.seed(cfg.seed) + evaluator_env.seed(cfg.seed) + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNFastPolicy(cfg.policy, model=model) + + # Consider the case with multiple processes + if task.router.is_active: + # You can use labels to distinguish between workers with different roles, + # here we use node_id to distinguish. + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + # Sync their context and model between each worker. + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + task.use(epoch_timer()) + + # Here is the part of single process pipeline. + task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use( + EnvpoolStepCollectorV2( + cfg, + policy.collect_mode, + collector_env, + random_collect_size=cfg.policy.random_collect_size \ + if hasattr(cfg.policy, 'random_collect_size') else 0, + ) + ) + task.use(data_pusher(cfg, buffer_)) + #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use( + wandb_online_logger( + metric_list=policy._monitor_vars_learn(), + model=policy._model, + exp_config=cfg, + anonymous=True, + project_name=cfg.exp_name, + wandb_sweep=False, + ) + ) + + #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) + task.use(termination_checker(max_env_step=10000000)) + + task.run() + + +if __name__ == "__main__": + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=0, help="random seed") + parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") + parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") + arg = parser.parse_args() + + pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num + pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size + pong_dqn_envpool_config.seed = arg.seed + pong_dqn_envpool_config.env.stop_value = 2000 + pong_dqn_envpool_config.nstep = 3 + pong_dqn_envpool_config.policy.nstep = 3 + pong_dqn_envpool_config.seed = arg.seed + + pong_dqn_envpool_config.policy.learn.update_per_collect = 3 + pong_dqn_envpool_config.policy.learn.batch_size = 32 + pong_dqn_envpool_config.policy.learn.learning_rate = 0.0001 + pong_dqn_envpool_config.policy.learn.target_update_freq = 0 + pong_dqn_envpool_config.policy.learn.target_update = 0.04 + + main(pong_dqn_envpool_config) 
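Note: the runs above all set target_update_freq = 0 and instead tune the soft-update coefficient target_update (0.01, 0.02 and 0.04 across these scripts), and the dqn.py change right below adds explicit "> 0" guards so that a zero update frequency no longer selects the hard-assign target wrapper. As a rough, standalone illustration of what a soft (Polyak / exponential moving average) target update does, here is a minimal sketch, not the DI-engine model wrapper itself; soft_update and theta are placeholder names:

    import copy
    import torch
    import torch.nn as nn

    def soft_update(target: nn.Module, online: nn.Module, theta: float = 0.01) -> None:
        # target <- (1 - theta) * target + theta * online, applied parameter-wise
        with torch.no_grad():
            for t_param, o_param in zip(target.parameters(), online.parameters()):
                t_param.mul_(1.0 - theta).add_(o_param, alpha=theta)

    # Usage sketch: call once per learner step instead of copying weights every target_update_freq steps.
    online_net = nn.Linear(4, 2)
    target_net = copy.deepcopy(online_net)
    soft_update(target_net, online_net, theta=0.01)

A small coefficient keeps the target network close to a slowly moving average of the online network, which is why only small values (0.01 to 0.04 in the runs above) are tried.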
diff --git a/ding/policy/dqn.py b/ding/policy/dqn.py index 08dd18406e..4e5d177091 100644 --- a/ding/policy/dqn.py +++ b/ding/policy/dqn.py @@ -652,14 +652,14 @@ def _init_learn(self) -> None: # use model_wrapper for specialized demands of different modes self._target_model = copy.deepcopy(self._model) - if 'target_update_freq' in self._cfg.learn: + if 'target_update_freq' in self._cfg.learn and self._cfg.learn.target_update_freq > 0: self._target_model = model_wrap( self._target_model, wrapper_name='target', update_type='assign', update_kwargs={'freq': self._cfg.learn.target_update_freq} ) - elif 'target_theta' in self._cfg.learn: + elif 'target_theta' in self._cfg.learn and self._cfg.learn.target_theta > 0: self._target_model = model_wrap( self._target_model, wrapper_name='target', From dda0ffc040d85ba10ee6d8fa2c740f3b6b6bb9de Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 26 Oct 2023 21:57:57 +0800 Subject: [PATCH 227/244] polish qbert test 13 --- .../example/dqn_envpool_wandb_new_nstep_13.py | 140 ++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 ding/example/dqn_envpool_wandb_new_nstep_13.py diff --git a/ding/example/dqn_envpool_wandb_new_nstep_13.py b/ding/example/dqn_envpool_wandb_new_nstep_13.py new file mode 100644 index 0000000000..0b3ebeb4c5 --- /dev/null +++ b/ding/example/dqn_envpool_wandb_new_nstep_13.py @@ -0,0 +1,140 @@ +import datetime +import torch +try: + torch.multiprocessing.set_start_method('spawn') +except RuntimeError: + pass +from easydict import EasyDict +from ditk import logging +from ding.model import DQN +from ding.policy import DQNFastPolicy +from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ + termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV3 +from ding.utils import set_pkg_seed + +from dizoo.atari.config.serial import pong_dqn_envpool_config + + +def main(cfg): + logging.getLogger().setLevel(logging.INFO) + cfg.exp_name = 'Pong-v5-DQN-envpool-new-13-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + + collector_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.collector_env_num, + 'batch_size': cfg.env.collector_batch_size, + # env wrappers + 'episodic_life': True, # collector: True + 'reward_clip': False, # collector: True + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["collector_env_cfg"] = collector_env_cfg + evaluator_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.evaluator_env_num, + 'batch_size': cfg.env.evaluator_batch_size, + # env wrappers + 'episodic_life': False, # evaluator: False + 'reward_clip': False, # evaluator: False + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["evaluator_env_cfg"] = evaluator_env_cfg + cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) + evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) + 
collector_env.seed(cfg.seed) + evaluator_env.seed(cfg.seed) + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNFastPolicy(cfg.policy, model=model) + + # Consider the case with multiple processes + if task.router.is_active: + # You can use labels to distinguish between workers with different roles, + # here we use node_id to distinguish. + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + # Sync their context and model between each worker. + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + task.use(epoch_timer()) + + # Here is the part of single process pipeline. + task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use( + EnvpoolStepCollectorV2( + cfg, + policy.collect_mode, + collector_env, + random_collect_size=cfg.policy.random_collect_size \ + if hasattr(cfg.policy, 'random_collect_size') else 0, + ) + ) + task.use(data_pusher(cfg, buffer_)) + #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use( + wandb_online_logger( + metric_list=policy._monitor_vars_learn(), + model=policy._model, + exp_config=cfg, + anonymous=True, + project_name=cfg.exp_name, + wandb_sweep=False, + ) + ) + + #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) + task.use(termination_checker(max_env_step=10000000)) + + task.run() + + +if __name__ == "__main__": + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=0, help="random seed") + parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") + parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") + arg = parser.parse_args() + + pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num + pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size + pong_dqn_envpool_config.seed = arg.seed + pong_dqn_envpool_config.env.stop_value = 2000 + pong_dqn_envpool_config.nstep = 3 + pong_dqn_envpool_config.policy.nstep = 3 + pong_dqn_envpool_config.seed = arg.seed + + pong_dqn_envpool_config.policy.learn.update_per_collect = 2 + pong_dqn_envpool_config.policy.learn.batch_size = 32 + pong_dqn_envpool_config.policy.learn.learning_rate = 0.0001 + pong_dqn_envpool_config.policy.learn.target_update_freq = 0 + pong_dqn_envpool_config.policy.learn.target_update = 0.01 + + main(pong_dqn_envpool_config) From 878bbb3909f5a5e902196597ce9a4765db20b7af Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 27 Oct 2023 12:29:08 +0800 Subject: [PATCH 228/244] polish qbert test 14 15 --- .../example/dqn_envpool_wandb_new_nstep_14.py | 140 ++++++++++++++++++ .../example/dqn_envpool_wandb_new_nstep_15.py | 140 ++++++++++++++++++ 2 files changed, 280 insertions(+) create mode 100644 ding/example/dqn_envpool_wandb_new_nstep_14.py create mode 100644 ding/example/dqn_envpool_wandb_new_nstep_15.py diff --git a/ding/example/dqn_envpool_wandb_new_nstep_14.py b/ding/example/dqn_envpool_wandb_new_nstep_14.py new file mode 100644 index 0000000000..cf339945db --- /dev/null +++ b/ding/example/dqn_envpool_wandb_new_nstep_14.py @@ -0,0 +1,140 @@ +import datetime +import torch +try: + 
torch.multiprocessing.set_start_method('spawn') +except RuntimeError: + pass +from easydict import EasyDict +from ditk import logging +from ding.model import DQN +from ding.policy import DQNFastPolicy +from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ + termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV3 +from ding.utils import set_pkg_seed + +from dizoo.atari.config.serial import pong_dqn_envpool_config + + +def main(cfg): + logging.getLogger().setLevel(logging.INFO) + cfg.exp_name = 'Pong-v5-DQN-envpool-new-14-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + + collector_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.collector_env_num, + 'batch_size': cfg.env.collector_batch_size, + # env wrappers + 'episodic_life': True, # collector: True + 'reward_clip': False, # collector: True + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["collector_env_cfg"] = collector_env_cfg + evaluator_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.evaluator_env_num, + 'batch_size': cfg.env.evaluator_batch_size, + # env wrappers + 'episodic_life': False, # evaluator: False + 'reward_clip': False, # evaluator: False + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["evaluator_env_cfg"] = evaluator_env_cfg + cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) + evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) + collector_env.seed(cfg.seed) + evaluator_env.seed(cfg.seed) + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNFastPolicy(cfg.policy, model=model) + + # Consider the case with multiple processes + if task.router.is_active: + # You can use labels to distinguish between workers with different roles, + # here we use node_id to distinguish. + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + # Sync their context and model between each worker. + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + task.use(epoch_timer()) + + # Here is the part of single process pipeline. 
+ task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use( + EnvpoolStepCollectorV2( + cfg, + policy.collect_mode, + collector_env, + random_collect_size=cfg.policy.random_collect_size \ + if hasattr(cfg.policy, 'random_collect_size') else 0, + ) + ) + task.use(data_pusher(cfg, buffer_)) + #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use( + wandb_online_logger( + metric_list=policy._monitor_vars_learn(), + model=policy._model, + exp_config=cfg, + anonymous=True, + project_name=cfg.exp_name, + wandb_sweep=False, + ) + ) + + #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) + task.use(termination_checker(max_env_step=10000000)) + + task.run() + + +if __name__ == "__main__": + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=0, help="random seed") + parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") + parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") + arg = parser.parse_args() + + pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num + pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size + pong_dqn_envpool_config.seed = arg.seed + pong_dqn_envpool_config.env.stop_value = 2000 + pong_dqn_envpool_config.nstep = 3 + pong_dqn_envpool_config.policy.nstep = 3 + pong_dqn_envpool_config.seed = arg.seed + + pong_dqn_envpool_config.policy.learn.update_per_collect = 2 + pong_dqn_envpool_config.policy.learn.batch_size = 32 + pong_dqn_envpool_config.policy.learn.learning_rate = 0.0001 + pong_dqn_envpool_config.policy.learn.target_update_freq = 0 + pong_dqn_envpool_config.policy.learn.target_update = 0.04 + + main(pong_dqn_envpool_config) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_15.py b/ding/example/dqn_envpool_wandb_new_nstep_15.py new file mode 100644 index 0000000000..582e4346cf --- /dev/null +++ b/ding/example/dqn_envpool_wandb_new_nstep_15.py @@ -0,0 +1,140 @@ +import datetime +import torch +try: + torch.multiprocessing.set_start_method('spawn') +except RuntimeError: + pass +from easydict import EasyDict +from ditk import logging +from ding.model import DQN +from ding.policy import DQNFastPolicy +from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ + termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV3 +from ding.utils import set_pkg_seed + +from dizoo.atari.config.serial import pong_dqn_envpool_config + + +def main(cfg): + logging.getLogger().setLevel(logging.INFO) + cfg.exp_name = 'Pong-v5-DQN-envpool-new-15-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + + collector_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.collector_env_num, + 'batch_size': cfg.env.collector_batch_size, + # env wrappers + 'episodic_life': True, # collector: True + 'reward_clip': False, # collector: True + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 
4), + } + ) + cfg.env["collector_env_cfg"] = collector_env_cfg + evaluator_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.evaluator_env_num, + 'batch_size': cfg.env.evaluator_batch_size, + # env wrappers + 'episodic_life': False, # evaluator: False + 'reward_clip': False, # evaluator: False + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["evaluator_env_cfg"] = evaluator_env_cfg + cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) + evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) + collector_env.seed(cfg.seed) + evaluator_env.seed(cfg.seed) + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNFastPolicy(cfg.policy, model=model) + + # Consider the case with multiple processes + if task.router.is_active: + # You can use labels to distinguish between workers with different roles, + # here we use node_id to distinguish. + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + # Sync their context and model between each worker. + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + task.use(epoch_timer()) + + # Here is the part of single process pipeline. + task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use( + EnvpoolStepCollectorV2( + cfg, + policy.collect_mode, + collector_env, + random_collect_size=cfg.policy.random_collect_size \ + if hasattr(cfg.policy, 'random_collect_size') else 0, + ) + ) + task.use(data_pusher(cfg, buffer_)) + #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use( + wandb_online_logger( + metric_list=policy._monitor_vars_learn(), + model=policy._model, + exp_config=cfg, + anonymous=True, + project_name=cfg.exp_name, + wandb_sweep=False, + ) + ) + + #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) + task.use(termination_checker(max_env_step=10000000)) + + task.run() + + +if __name__ == "__main__": + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=0, help="random seed") + parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") + parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") + arg = parser.parse_args() + + pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num + pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size + pong_dqn_envpool_config.seed = arg.seed + pong_dqn_envpool_config.env.stop_value = 2000 + pong_dqn_envpool_config.nstep = 3 + pong_dqn_envpool_config.policy.nstep = 3 + pong_dqn_envpool_config.seed = arg.seed + + pong_dqn_envpool_config.policy.learn.update_per_collect = 1 + pong_dqn_envpool_config.policy.learn.batch_size = 32 + pong_dqn_envpool_config.policy.learn.learning_rate = 0.0001 + pong_dqn_envpool_config.policy.learn.target_update_freq = 0 + pong_dqn_envpool_config.policy.learn.target_update = 0.04 + + main(pong_dqn_envpool_config) 
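Note: every variant in this sweep keeps nstep = 3 on both the config and the policy, so the one-step TD target is replaced by a three-step discounted return bootstrapped from the target network. A minimal sketch of how such an n-step return is assembled from a short reward sequence (plain PyTorch, independent of DI-engine's n-step middleware; the tensor shapes and the zeroing of post-terminal rewards are assumptions):

    import torch

    def nstep_return(rewards: torch.Tensor, next_value: torch.Tensor, done: torch.Tensor,
                     gamma: float = 0.99) -> torch.Tensor:
        # rewards: (n, batch); next_value: (batch,) = V(s_{t+n}); done: (batch,) in {0, 1}
        # Assumes rewards after a terminal step are already zeroed by the collector.
        n = rewards.shape[0]
        ret = torch.zeros_like(next_value)
        for i in reversed(range(n)):
            ret = rewards[i] + gamma * ret          # accumulates sum_i gamma^i * r_{t+i}
        return ret + (gamma ** n) * next_value * (1.0 - done)

    # n = 3, batch of 2 transitions
    r = torch.tensor([[1.0, 0.0], [0.0, 1.0], [1.0, 0.0]])
    v = torch.tensor([0.5, 0.5])
    d = torch.tensor([0.0, 1.0])
    print(nstep_return(r, v, d))   # approx. [2.4652, 0.9900]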
From 73b73dc02e5c47fed3c4de70a522746f64c194b9 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Fri, 27 Oct 2023 16:57:09 +0800 Subject: [PATCH 229/244] polish qbert test 16~18 --- .../example/dqn_envpool_wandb_new_nstep_16.py | 140 ++++++++++++++++++ .../example/dqn_envpool_wandb_new_nstep_17.py | 140 ++++++++++++++++++ .../example/dqn_envpool_wandb_new_nstep_18.py | 140 ++++++++++++++++++ 3 files changed, 420 insertions(+) create mode 100644 ding/example/dqn_envpool_wandb_new_nstep_16.py create mode 100644 ding/example/dqn_envpool_wandb_new_nstep_17.py create mode 100644 ding/example/dqn_envpool_wandb_new_nstep_18.py diff --git a/ding/example/dqn_envpool_wandb_new_nstep_16.py b/ding/example/dqn_envpool_wandb_new_nstep_16.py new file mode 100644 index 0000000000..6e73be8bf4 --- /dev/null +++ b/ding/example/dqn_envpool_wandb_new_nstep_16.py @@ -0,0 +1,140 @@ +import datetime +import torch +try: + torch.multiprocessing.set_start_method('spawn') +except RuntimeError: + pass +from easydict import EasyDict +from ditk import logging +from ding.model import DQN +from ding.policy import DQNFastPolicy +from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ + termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV3 +from ding.utils import set_pkg_seed + +from dizoo.atari.config.serial import pong_dqn_envpool_config + + +def main(cfg): + logging.getLogger().setLevel(logging.INFO) + cfg.exp_name = 'Pong-v5-DQN-envpool-new-16-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + + collector_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.collector_env_num, + 'batch_size': cfg.env.collector_batch_size, + # env wrappers + 'episodic_life': True, # collector: True + 'reward_clip': False, # collector: True + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["collector_env_cfg"] = collector_env_cfg + evaluator_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.evaluator_env_num, + 'batch_size': cfg.env.evaluator_batch_size, + # env wrappers + 'episodic_life': False, # evaluator: False + 'reward_clip': False, # evaluator: False + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["evaluator_env_cfg"] = evaluator_env_cfg + cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) + evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) + collector_env.seed(cfg.seed) + evaluator_env.seed(cfg.seed) + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNFastPolicy(cfg.policy, model=model) + + # Consider the case with multiple processes + if task.router.is_active: + # You can use labels to distinguish between workers with different roles, + # here we use node_id to distinguish. 
+ if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + # Sync their context and model between each worker. + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + task.use(epoch_timer()) + + # Here is the part of single process pipeline. + task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use( + EnvpoolStepCollectorV2( + cfg, + policy.collect_mode, + collector_env, + random_collect_size=cfg.policy.random_collect_size \ + if hasattr(cfg.policy, 'random_collect_size') else 0, + ) + ) + task.use(data_pusher(cfg, buffer_)) + #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use( + wandb_online_logger( + metric_list=policy._monitor_vars_learn(), + model=policy._model, + exp_config=cfg, + anonymous=True, + project_name=cfg.exp_name, + wandb_sweep=False, + ) + ) + + #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) + task.use(termination_checker(max_env_step=10000000)) + + task.run() + + +if __name__ == "__main__": + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=0, help="random seed") + parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") + parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") + arg = parser.parse_args() + + pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num + pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size + pong_dqn_envpool_config.seed = arg.seed + pong_dqn_envpool_config.env.stop_value = 2000 + pong_dqn_envpool_config.nstep = 3 + pong_dqn_envpool_config.policy.nstep = 3 + pong_dqn_envpool_config.seed = arg.seed + + pong_dqn_envpool_config.policy.learn.update_per_collect = 1 + pong_dqn_envpool_config.policy.learn.batch_size = 32 + pong_dqn_envpool_config.policy.learn.learning_rate = 0.0001 + pong_dqn_envpool_config.policy.learn.target_update_freq = 0 + pong_dqn_envpool_config.policy.learn.target_update = 0.08 + + main(pong_dqn_envpool_config) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_17.py b/ding/example/dqn_envpool_wandb_new_nstep_17.py new file mode 100644 index 0000000000..9a2d227ad9 --- /dev/null +++ b/ding/example/dqn_envpool_wandb_new_nstep_17.py @@ -0,0 +1,140 @@ +import datetime +import torch +try: + torch.multiprocessing.set_start_method('spawn') +except RuntimeError: + pass +from easydict import EasyDict +from ditk import logging +from ding.model import DQN +from ding.policy import DQNFastPolicy +from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ + termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV3 +from ding.utils import set_pkg_seed + +from dizoo.atari.config.serial import pong_dqn_envpool_config + + +def main(cfg): + logging.getLogger().setLevel(logging.INFO) + cfg.exp_name = 'Pong-v5-DQN-envpool-new-17-' 
+ datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + + collector_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.collector_env_num, + 'batch_size': cfg.env.collector_batch_size, + # env wrappers + 'episodic_life': True, # collector: True + 'reward_clip': False, # collector: True + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["collector_env_cfg"] = collector_env_cfg + evaluator_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.evaluator_env_num, + 'batch_size': cfg.env.evaluator_batch_size, + # env wrappers + 'episodic_life': False, # evaluator: False + 'reward_clip': False, # evaluator: False + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["evaluator_env_cfg"] = evaluator_env_cfg + cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) + evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) + collector_env.seed(cfg.seed) + evaluator_env.seed(cfg.seed) + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNFastPolicy(cfg.policy, model=model) + + # Consider the case with multiple processes + if task.router.is_active: + # You can use labels to distinguish between workers with different roles, + # here we use node_id to distinguish. + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + # Sync their context and model between each worker. + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + task.use(epoch_timer()) + + # Here is the part of single process pipeline. 
+ task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use( + EnvpoolStepCollectorV2( + cfg, + policy.collect_mode, + collector_env, + random_collect_size=cfg.policy.random_collect_size \ + if hasattr(cfg.policy, 'random_collect_size') else 0, + ) + ) + task.use(data_pusher(cfg, buffer_)) + #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use( + wandb_online_logger( + metric_list=policy._monitor_vars_learn(), + model=policy._model, + exp_config=cfg, + anonymous=True, + project_name=cfg.exp_name, + wandb_sweep=False, + ) + ) + + #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) + task.use(termination_checker(max_env_step=10000000)) + + task.run() + + +if __name__ == "__main__": + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=0, help="random seed") + parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") + parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") + arg = parser.parse_args() + + pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num + pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size + pong_dqn_envpool_config.seed = arg.seed + pong_dqn_envpool_config.env.stop_value = 2000 + pong_dqn_envpool_config.nstep = 3 + pong_dqn_envpool_config.policy.nstep = 3 + pong_dqn_envpool_config.seed = arg.seed + + pong_dqn_envpool_config.policy.learn.update_per_collect = 1 + pong_dqn_envpool_config.policy.learn.batch_size = 32 + pong_dqn_envpool_config.policy.learn.learning_rate = 0.0002 + pong_dqn_envpool_config.policy.learn.target_update_freq = 0 + pong_dqn_envpool_config.policy.learn.target_update = 0.04 + + main(pong_dqn_envpool_config) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_18.py b/ding/example/dqn_envpool_wandb_new_nstep_18.py new file mode 100644 index 0000000000..52bed921a1 --- /dev/null +++ b/ding/example/dqn_envpool_wandb_new_nstep_18.py @@ -0,0 +1,140 @@ +import datetime +import torch +try: + torch.multiprocessing.set_start_method('spawn') +except RuntimeError: + pass +from easydict import EasyDict +from ditk import logging +from ding.model import DQN +from ding.policy import DQNFastPolicy +from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 +from ding.data import DequeBuffer +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OnlineRLContext +from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ + termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV3 +from ding.utils import set_pkg_seed + +from dizoo.atari.config.serial import pong_dqn_envpool_config + + +def main(cfg): + logging.getLogger().setLevel(logging.INFO) + cfg.exp_name = 'Pong-v5-DQN-envpool-new-18-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + + collector_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.collector_env_num, + 'batch_size': cfg.env.collector_batch_size, + # env wrappers + 'episodic_life': True, # collector: True + 'reward_clip': False, # collector: True + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 
4), + } + ) + cfg.env["collector_env_cfg"] = collector_env_cfg + evaluator_env_cfg = EasyDict( + { + 'env_id': cfg.env.env_id, + 'env_num': cfg.env.evaluator_env_num, + 'batch_size': cfg.env.evaluator_batch_size, + # env wrappers + 'episodic_life': False, # evaluator: False + 'reward_clip': False, # evaluator: False + 'gray_scale': cfg.env.get('gray_scale', True), + 'stack_num': cfg.env.get('stack_num', 4), + } + ) + cfg.env["evaluator_env_cfg"] = evaluator_env_cfg + cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) + ding_init(cfg) + with task.start(async_mode=False, ctx=OnlineRLContext()): + collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) + evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) + collector_env.seed(cfg.seed) + evaluator_env.seed(cfg.seed) + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + model = DQN(**cfg.policy.model) + buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) + policy = DQNFastPolicy(cfg.policy, model=model) + + # Consider the case with multiple processes + if task.router.is_active: + # You can use labels to distinguish between workers with different roles, + # here we use node_id to distinguish. + if task.router.node_id == 0: + task.add_role(task.role.LEARNER) + elif task.router.node_id == 1: + task.add_role(task.role.EVALUATOR) + else: + task.add_role(task.role.COLLECTOR) + + # Sync their context and model between each worker. + task.use(ContextExchanger(skip_n_iter=1)) + task.use(ModelExchanger(model)) + + task.use(epoch_timer()) + + # Here is the part of single process pipeline. + task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(eps_greedy_handler(cfg)) + task.use( + EnvpoolStepCollectorV2( + cfg, + policy.collect_mode, + collector_env, + random_collect_size=cfg.policy.random_collect_size \ + if hasattr(cfg.policy, 'random_collect_size') else 0, + ) + ) + task.use(data_pusher(cfg, buffer_)) + #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) + task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) + task.use(online_logger(train_show_freq=10)) + task.use( + wandb_online_logger( + metric_list=policy._monitor_vars_learn(), + model=policy._model, + exp_config=cfg, + anonymous=True, + project_name=cfg.exp_name, + wandb_sweep=False, + ) + ) + + #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) + task.use(termination_checker(max_env_step=10000000)) + + task.run() + + +if __name__ == "__main__": + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=0, help="random seed") + parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") + parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") + arg = parser.parse_args() + + pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num + pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size + pong_dqn_envpool_config.seed = arg.seed + pong_dqn_envpool_config.env.stop_value = 2000 + pong_dqn_envpool_config.nstep = 3 + pong_dqn_envpool_config.policy.nstep = 3 + pong_dqn_envpool_config.seed = arg.seed + + pong_dqn_envpool_config.policy.learn.update_per_collect = 1 + pong_dqn_envpool_config.policy.learn.batch_size = 32 + pong_dqn_envpool_config.policy.learn.learning_rate = 0.0004 + pong_dqn_envpool_config.policy.learn.target_update_freq = 0 + pong_dqn_envpool_config.policy.learn.target_update = 0.04 + + main(pong_dqn_envpool_config) 
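Note: the preceding test patches sweep update_per_collect, target_update and learning_rate by duplicating the same 140-line script, and the "polish code" patch that follows consolidates them into a single dqn_envpool_nstep.py (renamed from the nstep_14 variant) while deleting the other copies. The same sweep could be driven from one parameterized entry point; a sketch under that assumption follows (the extra command-line flags and the apply_overrides helper are hypothetical, not part of the committed script):

    import argparse

    from dizoo.atari.config.serial import pong_dqn_envpool_config

    def apply_overrides(cfg, args):
        # Write the swept hyperparameters into the shared config instead of per-file copies.
        cfg.seed = args.seed
        cfg.policy.learn.update_per_collect = args.update_per_collect
        cfg.policy.learn.learning_rate = args.learning_rate
        cfg.policy.learn.target_update_freq = 0   # keep the soft target-update path
        cfg.policy.learn.target_update = args.target_update
        return cfg

    if __name__ == "__main__":
        parser = argparse.ArgumentParser()
        parser.add_argument("--seed", type=int, default=0)
        parser.add_argument("--update_per_collect", type=int, default=1)
        parser.add_argument("--learning_rate", type=float, default=1e-4)
        parser.add_argument("--target_update", type=float, default=0.04)
        args = parser.parse_args()
        cfg = apply_overrides(pong_dqn_envpool_config, args)
        # main(cfg)  # reuse the training pipeline defined in dqn_envpool_nstep.py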
From 7daf239661bb9ceee33ea819fa30a4a3b0830943 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 1 Nov 2023 15:34:54 +0800 Subject: [PATCH 230/244] polish code --- ...b_new_nstep_14.py => dqn_envpool_nstep.py} | 13 +- ding/example/dqn_envpool_wandb_new.py | 137 ------------- ding/example/dqn_envpool_wandb_new_nstep.py | 134 ------------- .../example/dqn_envpool_wandb_new_nstep_10.py | 140 -------------- .../example/dqn_envpool_wandb_new_nstep_11.py | 140 -------------- .../example/dqn_envpool_wandb_new_nstep_12.py | 140 -------------- .../example/dqn_envpool_wandb_new_nstep_13.py | 140 -------------- .../example/dqn_envpool_wandb_new_nstep_15.py | 140 -------------- .../example/dqn_envpool_wandb_new_nstep_16.py | 140 -------------- .../example/dqn_envpool_wandb_new_nstep_17.py | 140 -------------- .../example/dqn_envpool_wandb_new_nstep_18.py | 140 -------------- ding/example/dqn_envpool_wandb_new_nstep_2.py | 134 ------------- ding/example/dqn_envpool_wandb_new_nstep_3.py | 134 ------------- ding/example/dqn_envpool_wandb_new_nstep_6.py | 139 -------------- ding/example/dqn_envpool_wandb_new_nstep_7.py | 139 -------------- ding/example/dqn_envpool_wandb_new_nstep_8.py | 139 -------------- ding/example/dqn_envpool_wandb_new_nstep_9.py | 139 -------------- ...dqn_envpool_wandb_new_nstep_large_batch.py | 138 ------------- ..._envpool_wandb_new_nstep_large_learning.py | 138 ------------- .../dqn_envpool_wandb_new_nstep_priority.py | 138 ------------- ...dqn_envpool_wandb_new_nstep_shrink_size.py | 139 -------------- ...n_envpool_wandb_new_nstep_spaceinvaders.py | 134 ------------- ...vpool_wandb_new_nstep_target_update_100.py | 138 ------------- ...ol_wandb_new_nstep_update_per_collect_5.py | 138 ------------- ding/example/dqn_envpool_wandb_origin.py | 134 ------------- ding/example/dqn_envpool_wandb_sweep_pong.py | 157 --------------- ding/example/dqn_envpool_wandb_test.py | 123 ------------ ding/framework/middleware/__init__.py | 4 +- ding/framework/middleware/collector.py | 44 ----- ding/framework/middleware/learner.py | 181 +----------------- ding/model/common/head.py | 4 +- ding/policy/dqn.py | 4 +- dizoo/atari/config/serial/pong/__init__.py | 6 - .../pong_dqn_envpool_large_batch_config.py | 63 ------ .../pong_dqn_envpool_large_learning_config.py | 63 ------ .../pong/pong_dqn_envpool_priority_config.py | 63 ------ .../pong_dqn_envpool_shink_model_config.py | 64 ------- ...ng_dqn_envpool_target_update_100_config.py | 63 ------ ...dqn_envpool_update_per_collect_5_config.py | 63 ------ 39 files changed, 13 insertions(+), 4214 deletions(-) rename ding/example/{dqn_envpool_wandb_new_nstep_14.py => dqn_envpool_nstep.py} (92%) delete mode 100644 ding/example/dqn_envpool_wandb_new.py delete mode 100644 ding/example/dqn_envpool_wandb_new_nstep.py delete mode 100644 ding/example/dqn_envpool_wandb_new_nstep_10.py delete mode 100644 ding/example/dqn_envpool_wandb_new_nstep_11.py delete mode 100644 ding/example/dqn_envpool_wandb_new_nstep_12.py delete mode 100644 ding/example/dqn_envpool_wandb_new_nstep_13.py delete mode 100644 ding/example/dqn_envpool_wandb_new_nstep_15.py delete mode 100644 ding/example/dqn_envpool_wandb_new_nstep_16.py delete mode 100644 ding/example/dqn_envpool_wandb_new_nstep_17.py delete mode 100644 ding/example/dqn_envpool_wandb_new_nstep_18.py delete mode 100644 ding/example/dqn_envpool_wandb_new_nstep_2.py delete mode 100644 ding/example/dqn_envpool_wandb_new_nstep_3.py delete mode 100644 ding/example/dqn_envpool_wandb_new_nstep_6.py delete mode 100644 
ding/example/dqn_envpool_wandb_new_nstep_7.py delete mode 100644 ding/example/dqn_envpool_wandb_new_nstep_8.py delete mode 100644 ding/example/dqn_envpool_wandb_new_nstep_9.py delete mode 100644 ding/example/dqn_envpool_wandb_new_nstep_large_batch.py delete mode 100644 ding/example/dqn_envpool_wandb_new_nstep_large_learning.py delete mode 100644 ding/example/dqn_envpool_wandb_new_nstep_priority.py delete mode 100644 ding/example/dqn_envpool_wandb_new_nstep_shrink_size.py delete mode 100644 ding/example/dqn_envpool_wandb_new_nstep_spaceinvaders.py delete mode 100644 ding/example/dqn_envpool_wandb_new_nstep_target_update_100.py delete mode 100644 ding/example/dqn_envpool_wandb_new_nstep_update_per_collect_5.py delete mode 100644 ding/example/dqn_envpool_wandb_origin.py delete mode 100644 ding/example/dqn_envpool_wandb_sweep_pong.py delete mode 100644 ding/example/dqn_envpool_wandb_test.py delete mode 100644 dizoo/atari/config/serial/pong/pong_dqn_envpool_large_batch_config.py delete mode 100644 dizoo/atari/config/serial/pong/pong_dqn_envpool_large_learning_config.py delete mode 100644 dizoo/atari/config/serial/pong/pong_dqn_envpool_priority_config.py delete mode 100644 dizoo/atari/config/serial/pong/pong_dqn_envpool_shink_model_config.py delete mode 100644 dizoo/atari/config/serial/pong/pong_dqn_envpool_target_update_100_config.py delete mode 100644 dizoo/atari/config/serial/pong/pong_dqn_envpool_update_per_collect_5_config.py diff --git a/ding/example/dqn_envpool_wandb_new_nstep_14.py b/ding/example/dqn_envpool_nstep.py similarity index 92% rename from ding/example/dqn_envpool_wandb_new_nstep_14.py rename to ding/example/dqn_envpool_nstep.py index cf339945db..1580291aae 100644 --- a/ding/example/dqn_envpool_wandb_new_nstep_14.py +++ b/ding/example/dqn_envpool_nstep.py @@ -13,9 +13,9 @@ from ding.config import compile_config from ding.framework import task, ding_init from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV3 +from ding.framework.middleware import envpool_evaluator, data_pusher, \ + eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, \ + termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollector, EnvpoolOffPolicyLearner from ding.utils import set_pkg_seed from dizoo.atari.config.serial import pong_dqn_envpool_config @@ -23,7 +23,7 @@ def main(cfg): logging.getLogger().setLevel(logging.INFO) - cfg.exp_name = 'Pong-v5-DQN-envpool-new-14-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + cfg.exp_name = 'Pong-v5-DQN-envpool-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") collector_env_cfg = EasyDict( { @@ -85,7 +85,7 @@ def main(cfg): task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) task.use(eps_greedy_handler(cfg)) task.use( - EnvpoolStepCollectorV2( + EnvpoolStepCollector( cfg, policy.collect_mode, collector_env, @@ -94,8 +94,7 @@ def main(cfg): ) ) task.use(data_pusher(cfg, buffer_)) - #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) + task.use(EnvpoolOffPolicyLearner(cfg, policy, buffer_)) task.use(online_logger(train_show_freq=10)) task.use( wandb_online_logger( diff --git a/ding/example/dqn_envpool_wandb_new.py b/ding/example/dqn_envpool_wandb_new.py 
deleted file mode 100644 index b5131b9e8c..0000000000 --- a/ding/example/dqn_envpool_wandb_new.py +++ /dev/null @@ -1,137 +0,0 @@ -import datetime -import torch -try: - torch.multiprocessing.set_start_method('spawn') -except RuntimeError: - pass -from easydict import EasyDict -from ditk import logging -from ding.model import DQN -from ding.policy import DQNFastPolicy -from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV3 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, interaction_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollector, OffPolicyLearnerV2 -from ding.utils import set_pkg_seed - -from dizoo.atari.config.serial import pong_dqn_envpool_config - - -def main(cfg): - logging.getLogger().setLevel(logging.INFO) - cfg.exp_name = 'Pong-v5-DQN-envpool-new-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - - collector_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.collector_env_num, - 'batch_size': cfg.env.collector_batch_size, - # env wrappers - 'episodic_life': True, # collector: True - 'reward_clip': False, # collector: True - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["collector_env_cfg"] = collector_env_cfg - evaluator_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.evaluator_env_num, - 'batch_size': cfg.env.evaluator_batch_size, - # env wrappers - 'episodic_life': False, # evaluator: False - 'reward_clip': False, # evaluator: False - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["evaluator_env_cfg"] = evaluator_env_cfg - cfg = compile_config(cfg, PoolEnvManagerV3, DQNFastPolicy, save_cfg=task.router.node_id == 0) - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = PoolEnvManagerV3(cfg.env.collector_env_cfg) - evaluator_env = PoolEnvManagerV3(cfg.env.evaluator_env_cfg) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = DQNFastPolicy(cfg.policy, model=model) - - # Consider the case with multiple processes - if task.router.is_active: - # You can use labels to distinguish between workers with different roles, - # here we use node_id to distinguish. - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - # Sync their context and model between each worker. - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - task.use(epoch_timer()) - - # Here is the part of single process pipeline. 
- task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use( - EnvpoolStepCollector( - cfg, - policy.collect_mode, - collector_env, - random_collect_size=cfg.policy.random_collect_size \ - if hasattr(cfg.policy, 'random_collect_size') else 0, - ) - ) - if "nstep" in cfg.policy and cfg.policy.nstep > 1: - task.use(nstep_reward_enhancer(cfg)) - task.use(data_pusher(cfg, buffer_)) - #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(OffPolicyLearnerV2(cfg, policy, buffer_)) - task.use(online_logger(train_show_freq=10)) - # task.use( - # wandb_online_logger( - # metric_list=policy._monitor_vars_learn(), - # model=policy._model, - # exp_config=cfg, - # anonymous=True, - # project_name=cfg.exp_name, - # wandb_sweep=False, - # ) - # ) - - #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) - task.use(termination_checker(max_env_step=10000000)) - - task.run() - - -if __name__ == "__main__": - - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--seed", type=int, default=0, help="random seed") - parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") - parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") - arg = parser.parse_args() - - pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num - pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size - pong_dqn_envpool_config.seed = arg.seed - pong_dqn_envpool_config.env.stop_value = 2000 - pong_dqn_envpool_config.policy.nstep = 1 - pong_dqn_envpool_config.nstep = 1 - - pong_dqn_envpool_config.seed = arg.seed - - main(pong_dqn_envpool_config) diff --git a/ding/example/dqn_envpool_wandb_new_nstep.py b/ding/example/dqn_envpool_wandb_new_nstep.py deleted file mode 100644 index 5b3b1fa1fb..0000000000 --- a/ding/example/dqn_envpool_wandb_new_nstep.py +++ /dev/null @@ -1,134 +0,0 @@ -import datetime -import torch -try: - torch.multiprocessing.set_start_method('spawn') -except RuntimeError: - pass -from easydict import EasyDict -from ditk import logging -from ding.model import DQN -from ding.policy import DQNFastPolicy -from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV2 -from ding.utils import set_pkg_seed - -from dizoo.atari.config.serial import pong_dqn_envpool_config - - -def main(cfg): - logging.getLogger().setLevel(logging.INFO) - cfg.exp_name = 'Pong-v5-DQN-envpool-new-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - - collector_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.collector_env_num, - 'batch_size': cfg.env.collector_batch_size, - # env wrappers - 'episodic_life': True, # collector: True - 'reward_clip': False, # collector: True - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["collector_env_cfg"] = collector_env_cfg - evaluator_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.evaluator_env_num, - 'batch_size': 
cfg.env.evaluator_batch_size, - # env wrappers - 'episodic_life': False, # evaluator: False - 'reward_clip': False, # evaluator: False - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["evaluator_env_cfg"] = evaluator_env_cfg - cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) - evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = DQNFastPolicy(cfg.policy, model=model) - - # Consider the case with multiple processes - if task.router.is_active: - # You can use labels to distinguish between workers with different roles, - # here we use node_id to distinguish. - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - # Sync their context and model between each worker. - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - task.use(epoch_timer()) - - # Here is the part of single process pipeline. - task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use( - EnvpoolStepCollectorV2( - cfg, - policy.collect_mode, - collector_env, - random_collect_size=cfg.policy.random_collect_size \ - if hasattr(cfg.policy, 'random_collect_size') else 0, - ) - ) - task.use(data_pusher(cfg, buffer_)) - #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(OffPolicyLearnerV2(cfg, policy, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use( - wandb_online_logger( - metric_list=policy._monitor_vars_learn(), - model=policy._model, - exp_config=cfg, - anonymous=True, - project_name=cfg.exp_name, - wandb_sweep=False, - ) - ) - - #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) - task.use(termination_checker(max_env_step=10000000)) - - task.run() - - -if __name__ == "__main__": - - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--seed", type=int, default=0, help="random seed") - parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") - parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") - arg = parser.parse_args() - - pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num - pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size - pong_dqn_envpool_config.seed = arg.seed - pong_dqn_envpool_config.env.stop_value = 2000 - pong_dqn_envpool_config.nstep = 3 - pong_dqn_envpool_config.policy.nstep = 3 - pong_dqn_envpool_config.seed = arg.seed - - main(pong_dqn_envpool_config) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_10.py b/ding/example/dqn_envpool_wandb_new_nstep_10.py deleted file mode 100644 index fdf6dcaeb4..0000000000 --- a/ding/example/dqn_envpool_wandb_new_nstep_10.py +++ /dev/null @@ -1,140 +0,0 @@ -import datetime -import torch -try: - torch.multiprocessing.set_start_method('spawn') -except RuntimeError: - pass -from easydict import EasyDict -from ditk import logging -from ding.model import DQN -from ding.policy import DQNFastPolicy 
-from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV3 -from ding.utils import set_pkg_seed - -from dizoo.atari.config.serial import pong_dqn_envpool_config - - -def main(cfg): - logging.getLogger().setLevel(logging.INFO) - cfg.exp_name = 'Pong-v5-DQN-envpool-new-10-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - - collector_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.collector_env_num, - 'batch_size': cfg.env.collector_batch_size, - # env wrappers - 'episodic_life': True, # collector: True - 'reward_clip': False, # collector: True - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["collector_env_cfg"] = collector_env_cfg - evaluator_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.evaluator_env_num, - 'batch_size': cfg.env.evaluator_batch_size, - # env wrappers - 'episodic_life': False, # evaluator: False - 'reward_clip': False, # evaluator: False - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["evaluator_env_cfg"] = evaluator_env_cfg - cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) - evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = DQNFastPolicy(cfg.policy, model=model) - - # Consider the case with multiple processes - if task.router.is_active: - # You can use labels to distinguish between workers with different roles, - # here we use node_id to distinguish. - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - # Sync their context and model between each worker. - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - task.use(epoch_timer()) - - # Here is the part of single process pipeline. 
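The variants deleted in this block differ mainly in the value of policy.learn.target_update (swept from 0.01 up to 0.08) while target_update_freq is pinned to 0, which suggests a soft (EMA) target-network sync rather than a periodic copy. Below is a minimal sketch of that update rule, assuming target_update plays the role of the interpolation coefficient tau; the function and module names are illustrative only, not DI-engine API.

import copy
import torch
import torch.nn as nn

def soft_update(target: nn.Module, online: nn.Module, tau: float) -> None:
    # theta_target <- tau * theta_online + (1 - tau) * theta_target
    with torch.no_grad():
        for t_param, o_param in zip(target.parameters(), online.parameters()):
            t_param.mul_(1.0 - tau).add_(o_param, alpha=tau)

online_net = nn.Linear(4, 2)                    # stand-in for the DQN model
target_net = copy.deepcopy(online_net)
soft_update(target_net, online_net, tau=0.01)   # tau matching target_update=0.01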
- task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use( - EnvpoolStepCollectorV2( - cfg, - policy.collect_mode, - collector_env, - random_collect_size=cfg.policy.random_collect_size \ - if hasattr(cfg.policy, 'random_collect_size') else 0, - ) - ) - task.use(data_pusher(cfg, buffer_)) - #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use( - wandb_online_logger( - metric_list=policy._monitor_vars_learn(), - model=policy._model, - exp_config=cfg, - anonymous=True, - project_name=cfg.exp_name, - wandb_sweep=False, - ) - ) - - #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) - task.use(termination_checker(max_env_step=10000000)) - - task.run() - - -if __name__ == "__main__": - - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--seed", type=int, default=0, help="random seed") - parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") - parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") - arg = parser.parse_args() - - pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num - pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size - pong_dqn_envpool_config.seed = arg.seed - pong_dqn_envpool_config.env.stop_value = 2000 - pong_dqn_envpool_config.nstep = 3 - pong_dqn_envpool_config.policy.nstep = 3 - pong_dqn_envpool_config.seed = arg.seed - - pong_dqn_envpool_config.policy.learn.update_per_collect = 3 - pong_dqn_envpool_config.policy.learn.batch_size = 32 - pong_dqn_envpool_config.policy.learn.learning_rate = 0.0001 - pong_dqn_envpool_config.policy.learn.target_update_freq = 0 - pong_dqn_envpool_config.policy.learn.target_update = 0.01 - - main(pong_dqn_envpool_config) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_11.py b/ding/example/dqn_envpool_wandb_new_nstep_11.py deleted file mode 100644 index b462208622..0000000000 --- a/ding/example/dqn_envpool_wandb_new_nstep_11.py +++ /dev/null @@ -1,140 +0,0 @@ -import datetime -import torch -try: - torch.multiprocessing.set_start_method('spawn') -except RuntimeError: - pass -from easydict import EasyDict -from ditk import logging -from ding.model import DQN -from ding.policy import DQNFastPolicy -from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV3 -from ding.utils import set_pkg_seed - -from dizoo.atari.config.serial import pong_dqn_envpool_config - - -def main(cfg): - logging.getLogger().setLevel(logging.INFO) - cfg.exp_name = 'Pong-v5-DQN-envpool-new-11-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - - collector_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.collector_env_num, - 'batch_size': cfg.env.collector_batch_size, - # env wrappers - 'episodic_life': True, # collector: True - 'reward_clip': False, # collector: True - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': 
cfg.env.get('stack_num', 4), - } - ) - cfg.env["collector_env_cfg"] = collector_env_cfg - evaluator_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.evaluator_env_num, - 'batch_size': cfg.env.evaluator_batch_size, - # env wrappers - 'episodic_life': False, # evaluator: False - 'reward_clip': False, # evaluator: False - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["evaluator_env_cfg"] = evaluator_env_cfg - cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) - evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = DQNFastPolicy(cfg.policy, model=model) - - # Consider the case with multiple processes - if task.router.is_active: - # You can use labels to distinguish between workers with different roles, - # here we use node_id to distinguish. - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - # Sync their context and model between each worker. - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - task.use(epoch_timer()) - - # Here is the part of single process pipeline. - task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use( - EnvpoolStepCollectorV2( - cfg, - policy.collect_mode, - collector_env, - random_collect_size=cfg.policy.random_collect_size \ - if hasattr(cfg.policy, 'random_collect_size') else 0, - ) - ) - task.use(data_pusher(cfg, buffer_)) - #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use( - wandb_online_logger( - metric_list=policy._monitor_vars_learn(), - model=policy._model, - exp_config=cfg, - anonymous=True, - project_name=cfg.exp_name, - wandb_sweep=False, - ) - ) - - #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) - task.use(termination_checker(max_env_step=10000000)) - - task.run() - - -if __name__ == "__main__": - - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--seed", type=int, default=0, help="random seed") - parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") - parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") - arg = parser.parse_args() - - pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num - pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size - pong_dqn_envpool_config.seed = arg.seed - pong_dqn_envpool_config.env.stop_value = 2000 - pong_dqn_envpool_config.nstep = 3 - pong_dqn_envpool_config.policy.nstep = 3 - pong_dqn_envpool_config.seed = arg.seed - - pong_dqn_envpool_config.policy.learn.update_per_collect = 3 - pong_dqn_envpool_config.policy.learn.batch_size = 32 - pong_dqn_envpool_config.policy.learn.learning_rate = 0.0001 - pong_dqn_envpool_config.policy.learn.target_update_freq = 0 - pong_dqn_envpool_config.policy.learn.target_update = 0.02 - - 
main(pong_dqn_envpool_config) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_12.py b/ding/example/dqn_envpool_wandb_new_nstep_12.py deleted file mode 100644 index b78367fca0..0000000000 --- a/ding/example/dqn_envpool_wandb_new_nstep_12.py +++ /dev/null @@ -1,140 +0,0 @@ -import datetime -import torch -try: - torch.multiprocessing.set_start_method('spawn') -except RuntimeError: - pass -from easydict import EasyDict -from ditk import logging -from ding.model import DQN -from ding.policy import DQNFastPolicy -from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV3 -from ding.utils import set_pkg_seed - -from dizoo.atari.config.serial import pong_dqn_envpool_config - - -def main(cfg): - logging.getLogger().setLevel(logging.INFO) - cfg.exp_name = 'Pong-v5-DQN-envpool-new-12-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - - collector_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.collector_env_num, - 'batch_size': cfg.env.collector_batch_size, - # env wrappers - 'episodic_life': True, # collector: True - 'reward_clip': False, # collector: True - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["collector_env_cfg"] = collector_env_cfg - evaluator_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.evaluator_env_num, - 'batch_size': cfg.env.evaluator_batch_size, - # env wrappers - 'episodic_life': False, # evaluator: False - 'reward_clip': False, # evaluator: False - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["evaluator_env_cfg"] = evaluator_env_cfg - cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) - evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = DQNFastPolicy(cfg.policy, model=model) - - # Consider the case with multiple processes - if task.router.is_active: - # You can use labels to distinguish between workers with different roles, - # here we use node_id to distinguish. - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - # Sync their context and model between each worker. - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - task.use(epoch_timer()) - - # Here is the part of single process pipeline. 
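All of these scripts set pong_dqn_envpool_config.policy.nstep = 3 and import nstep_reward_enhancer, so the TD target presumably bootstraps after three environment steps. The following is a generic sketch of the 3-step return being configured, not the middleware's actual implementation.

from typing import List

def n_step_return(rewards: List[float], bootstrap_value: float, gamma: float = 0.99, n: int = 3) -> float:
    # G_t = r_t + gamma*r_{t+1} + ... + gamma^(n-1)*r_{t+n-1} + gamma^n * V(s_{t+n})
    g = sum((gamma ** i) * r for i, r in enumerate(rewards[:n]))
    return g + (gamma ** n) * bootstrap_value

print(n_step_return([1.0, 0.0, 1.0], bootstrap_value=0.5))  # 1.0 + 0.99^2*1.0 + 0.99^3*0.5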
- task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use( - EnvpoolStepCollectorV2( - cfg, - policy.collect_mode, - collector_env, - random_collect_size=cfg.policy.random_collect_size \ - if hasattr(cfg.policy, 'random_collect_size') else 0, - ) - ) - task.use(data_pusher(cfg, buffer_)) - #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use( - wandb_online_logger( - metric_list=policy._monitor_vars_learn(), - model=policy._model, - exp_config=cfg, - anonymous=True, - project_name=cfg.exp_name, - wandb_sweep=False, - ) - ) - - #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) - task.use(termination_checker(max_env_step=10000000)) - - task.run() - - -if __name__ == "__main__": - - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--seed", type=int, default=0, help="random seed") - parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") - parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") - arg = parser.parse_args() - - pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num - pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size - pong_dqn_envpool_config.seed = arg.seed - pong_dqn_envpool_config.env.stop_value = 2000 - pong_dqn_envpool_config.nstep = 3 - pong_dqn_envpool_config.policy.nstep = 3 - pong_dqn_envpool_config.seed = arg.seed - - pong_dqn_envpool_config.policy.learn.update_per_collect = 3 - pong_dqn_envpool_config.policy.learn.batch_size = 32 - pong_dqn_envpool_config.policy.learn.learning_rate = 0.0001 - pong_dqn_envpool_config.policy.learn.target_update_freq = 0 - pong_dqn_envpool_config.policy.learn.target_update = 0.04 - - main(pong_dqn_envpool_config) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_13.py b/ding/example/dqn_envpool_wandb_new_nstep_13.py deleted file mode 100644 index 0b3ebeb4c5..0000000000 --- a/ding/example/dqn_envpool_wandb_new_nstep_13.py +++ /dev/null @@ -1,140 +0,0 @@ -import datetime -import torch -try: - torch.multiprocessing.set_start_method('spawn') -except RuntimeError: - pass -from easydict import EasyDict -from ditk import logging -from ding.model import DQN -from ding.policy import DQNFastPolicy -from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV3 -from ding.utils import set_pkg_seed - -from dizoo.atari.config.serial import pong_dqn_envpool_config - - -def main(cfg): - logging.getLogger().setLevel(logging.INFO) - cfg.exp_name = 'Pong-v5-DQN-envpool-new-13-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - - collector_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.collector_env_num, - 'batch_size': cfg.env.collector_batch_size, - # env wrappers - 'episodic_life': True, # collector: True - 'reward_clip': False, # collector: True - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': 
cfg.env.get('stack_num', 4), - } - ) - cfg.env["collector_env_cfg"] = collector_env_cfg - evaluator_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.evaluator_env_num, - 'batch_size': cfg.env.evaluator_batch_size, - # env wrappers - 'episodic_life': False, # evaluator: False - 'reward_clip': False, # evaluator: False - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["evaluator_env_cfg"] = evaluator_env_cfg - cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) - evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = DQNFastPolicy(cfg.policy, model=model) - - # Consider the case with multiple processes - if task.router.is_active: - # You can use labels to distinguish between workers with different roles, - # here we use node_id to distinguish. - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - # Sync their context and model between each worker. - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - task.use(epoch_timer()) - - # Here is the part of single process pipeline. - task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use( - EnvpoolStepCollectorV2( - cfg, - policy.collect_mode, - collector_env, - random_collect_size=cfg.policy.random_collect_size \ - if hasattr(cfg.policy, 'random_collect_size') else 0, - ) - ) - task.use(data_pusher(cfg, buffer_)) - #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use( - wandb_online_logger( - metric_list=policy._monitor_vars_learn(), - model=policy._model, - exp_config=cfg, - anonymous=True, - project_name=cfg.exp_name, - wandb_sweep=False, - ) - ) - - #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) - task.use(termination_checker(max_env_step=10000000)) - - task.run() - - -if __name__ == "__main__": - - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--seed", type=int, default=0, help="random seed") - parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") - parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") - arg = parser.parse_args() - - pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num - pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size - pong_dqn_envpool_config.seed = arg.seed - pong_dqn_envpool_config.env.stop_value = 2000 - pong_dqn_envpool_config.nstep = 3 - pong_dqn_envpool_config.policy.nstep = 3 - pong_dqn_envpool_config.seed = arg.seed - - pong_dqn_envpool_config.policy.learn.update_per_collect = 2 - pong_dqn_envpool_config.policy.learn.batch_size = 32 - pong_dqn_envpool_config.policy.learn.learning_rate = 0.0001 - pong_dqn_envpool_config.policy.learn.target_update_freq = 0 - pong_dqn_envpool_config.policy.learn.target_update = 0.01 - - 
main(pong_dqn_envpool_config) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_15.py b/ding/example/dqn_envpool_wandb_new_nstep_15.py deleted file mode 100644 index 582e4346cf..0000000000 --- a/ding/example/dqn_envpool_wandb_new_nstep_15.py +++ /dev/null @@ -1,140 +0,0 @@ -import datetime -import torch -try: - torch.multiprocessing.set_start_method('spawn') -except RuntimeError: - pass -from easydict import EasyDict -from ditk import logging -from ding.model import DQN -from ding.policy import DQNFastPolicy -from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV3 -from ding.utils import set_pkg_seed - -from dizoo.atari.config.serial import pong_dqn_envpool_config - - -def main(cfg): - logging.getLogger().setLevel(logging.INFO) - cfg.exp_name = 'Pong-v5-DQN-envpool-new-15-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - - collector_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.collector_env_num, - 'batch_size': cfg.env.collector_batch_size, - # env wrappers - 'episodic_life': True, # collector: True - 'reward_clip': False, # collector: True - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["collector_env_cfg"] = collector_env_cfg - evaluator_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.evaluator_env_num, - 'batch_size': cfg.env.evaluator_batch_size, - # env wrappers - 'episodic_life': False, # evaluator: False - 'reward_clip': False, # evaluator: False - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["evaluator_env_cfg"] = evaluator_env_cfg - cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) - evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = DQNFastPolicy(cfg.policy, model=model) - - # Consider the case with multiple processes - if task.router.is_active: - # You can use labels to distinguish between workers with different roles, - # here we use node_id to distinguish. - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - # Sync their context and model between each worker. - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - task.use(epoch_timer()) - - # Here is the part of single process pipeline. 
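Each pipeline wires eps_greedy_handler(cfg) in front of the collector, i.e. exploration is epsilon-greedy over the Q-values with an annealed epsilon. A generic illustration of that action-selection rule follows; it is not DI-engine code.

import random

def epsilon_greedy(q_values, epsilon: float) -> int:
    if random.random() < epsilon:
        return random.randrange(len(q_values))                      # explore: uniform random action
    return max(range(len(q_values)), key=q_values.__getitem__)      # exploit: argmax over Q-values

action = epsilon_greedy([0.1, 0.7, 0.2], epsilon=0.05)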
- task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use( - EnvpoolStepCollectorV2( - cfg, - policy.collect_mode, - collector_env, - random_collect_size=cfg.policy.random_collect_size \ - if hasattr(cfg.policy, 'random_collect_size') else 0, - ) - ) - task.use(data_pusher(cfg, buffer_)) - #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use( - wandb_online_logger( - metric_list=policy._monitor_vars_learn(), - model=policy._model, - exp_config=cfg, - anonymous=True, - project_name=cfg.exp_name, - wandb_sweep=False, - ) - ) - - #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) - task.use(termination_checker(max_env_step=10000000)) - - task.run() - - -if __name__ == "__main__": - - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--seed", type=int, default=0, help="random seed") - parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") - parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") - arg = parser.parse_args() - - pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num - pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size - pong_dqn_envpool_config.seed = arg.seed - pong_dqn_envpool_config.env.stop_value = 2000 - pong_dqn_envpool_config.nstep = 3 - pong_dqn_envpool_config.policy.nstep = 3 - pong_dqn_envpool_config.seed = arg.seed - - pong_dqn_envpool_config.policy.learn.update_per_collect = 1 - pong_dqn_envpool_config.policy.learn.batch_size = 32 - pong_dqn_envpool_config.policy.learn.learning_rate = 0.0001 - pong_dqn_envpool_config.policy.learn.target_update_freq = 0 - pong_dqn_envpool_config.policy.learn.target_update = 0.04 - - main(pong_dqn_envpool_config) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_16.py b/ding/example/dqn_envpool_wandb_new_nstep_16.py deleted file mode 100644 index 6e73be8bf4..0000000000 --- a/ding/example/dqn_envpool_wandb_new_nstep_16.py +++ /dev/null @@ -1,140 +0,0 @@ -import datetime -import torch -try: - torch.multiprocessing.set_start_method('spawn') -except RuntimeError: - pass -from easydict import EasyDict -from ditk import logging -from ding.model import DQN -from ding.policy import DQNFastPolicy -from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV3 -from ding.utils import set_pkg_seed - -from dizoo.atari.config.serial import pong_dqn_envpool_config - - -def main(cfg): - logging.getLogger().setLevel(logging.INFO) - cfg.exp_name = 'Pong-v5-DQN-envpool-new-16-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - - collector_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.collector_env_num, - 'batch_size': cfg.env.collector_batch_size, - # env wrappers - 'episodic_life': True, # collector: True - 'reward_clip': False, # collector: True - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': 
cfg.env.get('stack_num', 4), - } - ) - cfg.env["collector_env_cfg"] = collector_env_cfg - evaluator_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.evaluator_env_num, - 'batch_size': cfg.env.evaluator_batch_size, - # env wrappers - 'episodic_life': False, # evaluator: False - 'reward_clip': False, # evaluator: False - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["evaluator_env_cfg"] = evaluator_env_cfg - cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) - evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = DQNFastPolicy(cfg.policy, model=model) - - # Consider the case with multiple processes - if task.router.is_active: - # You can use labels to distinguish between workers with different roles, - # here we use node_id to distinguish. - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - # Sync their context and model between each worker. - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - task.use(epoch_timer()) - - # Here is the part of single process pipeline. - task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use( - EnvpoolStepCollectorV2( - cfg, - policy.collect_mode, - collector_env, - random_collect_size=cfg.policy.random_collect_size \ - if hasattr(cfg.policy, 'random_collect_size') else 0, - ) - ) - task.use(data_pusher(cfg, buffer_)) - #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use( - wandb_online_logger( - metric_list=policy._monitor_vars_learn(), - model=policy._model, - exp_config=cfg, - anonymous=True, - project_name=cfg.exp_name, - wandb_sweep=False, - ) - ) - - #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) - task.use(termination_checker(max_env_step=10000000)) - - task.run() - - -if __name__ == "__main__": - - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--seed", type=int, default=0, help="random seed") - parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") - parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") - arg = parser.parse_args() - - pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num - pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size - pong_dqn_envpool_config.seed = arg.seed - pong_dqn_envpool_config.env.stop_value = 2000 - pong_dqn_envpool_config.nstep = 3 - pong_dqn_envpool_config.policy.nstep = 3 - pong_dqn_envpool_config.seed = arg.seed - - pong_dqn_envpool_config.policy.learn.update_per_collect = 1 - pong_dqn_envpool_config.policy.learn.batch_size = 32 - pong_dqn_envpool_config.policy.learn.learning_rate = 0.0001 - pong_dqn_envpool_config.policy.learn.target_update_freq = 0 - pong_dqn_envpool_config.policy.learn.target_update = 0.08 - - 
main(pong_dqn_envpool_config) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_17.py b/ding/example/dqn_envpool_wandb_new_nstep_17.py deleted file mode 100644 index 9a2d227ad9..0000000000 --- a/ding/example/dqn_envpool_wandb_new_nstep_17.py +++ /dev/null @@ -1,140 +0,0 @@ -import datetime -import torch -try: - torch.multiprocessing.set_start_method('spawn') -except RuntimeError: - pass -from easydict import EasyDict -from ditk import logging -from ding.model import DQN -from ding.policy import DQNFastPolicy -from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV3 -from ding.utils import set_pkg_seed - -from dizoo.atari.config.serial import pong_dqn_envpool_config - - -def main(cfg): - logging.getLogger().setLevel(logging.INFO) - cfg.exp_name = 'Pong-v5-DQN-envpool-new-17-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - - collector_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.collector_env_num, - 'batch_size': cfg.env.collector_batch_size, - # env wrappers - 'episodic_life': True, # collector: True - 'reward_clip': False, # collector: True - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["collector_env_cfg"] = collector_env_cfg - evaluator_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.evaluator_env_num, - 'batch_size': cfg.env.evaluator_batch_size, - # env wrappers - 'episodic_life': False, # evaluator: False - 'reward_clip': False, # evaluator: False - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["evaluator_env_cfg"] = evaluator_env_cfg - cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) - evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = DQNFastPolicy(cfg.policy, model=model) - - # Consider the case with multiple processes - if task.router.is_active: - # You can use labels to distinguish between workers with different roles, - # here we use node_id to distinguish. - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - # Sync their context and model between each worker. - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - task.use(epoch_timer()) - - # Here is the part of single process pipeline. 
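The collector and evaluator EasyDict blocks repeated in every one of these files differ only in the env/batch counts and in episodic_life (True for collection, False for evaluation). The helper below is a sketch of building both from one place; it assumes the env config is itself an EasyDict and is offered only as a possible refactor, not as existing code.

from easydict import EasyDict

def make_envpool_cfg(env_cfg: EasyDict, role: str) -> EasyDict:
    is_collector = role == 'collector'
    return EasyDict({
        'env_id': env_cfg.env_id,
        'env_num': env_cfg.collector_env_num if is_collector else env_cfg.evaluator_env_num,
        'batch_size': env_cfg.collector_batch_size if is_collector else env_cfg.evaluator_batch_size,
        'episodic_life': is_collector,   # True for the collector, False for the evaluator
        'reward_clip': False,
        'gray_scale': env_cfg.get('gray_scale', True),
        'stack_num': env_cfg.get('stack_num', 4),
    })

env_cfg = EasyDict(dict(env_id='Pong-v5', collector_env_num=8, collector_batch_size=8,
                        evaluator_env_num=8, evaluator_batch_size=8))  # hypothetical values
print(make_envpool_cfg(env_cfg, 'collector').episodic_life)  # True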
- task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use( - EnvpoolStepCollectorV2( - cfg, - policy.collect_mode, - collector_env, - random_collect_size=cfg.policy.random_collect_size \ - if hasattr(cfg.policy, 'random_collect_size') else 0, - ) - ) - task.use(data_pusher(cfg, buffer_)) - #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use( - wandb_online_logger( - metric_list=policy._monitor_vars_learn(), - model=policy._model, - exp_config=cfg, - anonymous=True, - project_name=cfg.exp_name, - wandb_sweep=False, - ) - ) - - #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) - task.use(termination_checker(max_env_step=10000000)) - - task.run() - - -if __name__ == "__main__": - - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--seed", type=int, default=0, help="random seed") - parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") - parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") - arg = parser.parse_args() - - pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num - pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size - pong_dqn_envpool_config.seed = arg.seed - pong_dqn_envpool_config.env.stop_value = 2000 - pong_dqn_envpool_config.nstep = 3 - pong_dqn_envpool_config.policy.nstep = 3 - pong_dqn_envpool_config.seed = arg.seed - - pong_dqn_envpool_config.policy.learn.update_per_collect = 1 - pong_dqn_envpool_config.policy.learn.batch_size = 32 - pong_dqn_envpool_config.policy.learn.learning_rate = 0.0002 - pong_dqn_envpool_config.policy.learn.target_update_freq = 0 - pong_dqn_envpool_config.policy.learn.target_update = 0.04 - - main(pong_dqn_envpool_config) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_18.py b/ding/example/dqn_envpool_wandb_new_nstep_18.py deleted file mode 100644 index 52bed921a1..0000000000 --- a/ding/example/dqn_envpool_wandb_new_nstep_18.py +++ /dev/null @@ -1,140 +0,0 @@ -import datetime -import torch -try: - torch.multiprocessing.set_start_method('spawn') -except RuntimeError: - pass -from easydict import EasyDict -from ditk import logging -from ding.model import DQN -from ding.policy import DQNFastPolicy -from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV3 -from ding.utils import set_pkg_seed - -from dizoo.atari.config.serial import pong_dqn_envpool_config - - -def main(cfg): - logging.getLogger().setLevel(logging.INFO) - cfg.exp_name = 'Pong-v5-DQN-envpool-new-18-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - - collector_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.collector_env_num, - 'batch_size': cfg.env.collector_batch_size, - # env wrappers - 'episodic_life': True, # collector: True - 'reward_clip': False, # collector: True - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': 
cfg.env.get('stack_num', 4), - } - ) - cfg.env["collector_env_cfg"] = collector_env_cfg - evaluator_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.evaluator_env_num, - 'batch_size': cfg.env.evaluator_batch_size, - # env wrappers - 'episodic_life': False, # evaluator: False - 'reward_clip': False, # evaluator: False - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["evaluator_env_cfg"] = evaluator_env_cfg - cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) - evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = DQNFastPolicy(cfg.policy, model=model) - - # Consider the case with multiple processes - if task.router.is_active: - # You can use labels to distinguish between workers with different roles, - # here we use node_id to distinguish. - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - # Sync their context and model between each worker. - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - task.use(epoch_timer()) - - # Here is the part of single process pipeline. - task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use( - EnvpoolStepCollectorV2( - cfg, - policy.collect_mode, - collector_env, - random_collect_size=cfg.policy.random_collect_size \ - if hasattr(cfg.policy, 'random_collect_size') else 0, - ) - ) - task.use(data_pusher(cfg, buffer_)) - #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use( - wandb_online_logger( - metric_list=policy._monitor_vars_learn(), - model=policy._model, - exp_config=cfg, - anonymous=True, - project_name=cfg.exp_name, - wandb_sweep=False, - ) - ) - - #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) - task.use(termination_checker(max_env_step=10000000)) - - task.run() - - -if __name__ == "__main__": - - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--seed", type=int, default=0, help="random seed") - parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") - parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") - arg = parser.parse_args() - - pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num - pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size - pong_dqn_envpool_config.seed = arg.seed - pong_dqn_envpool_config.env.stop_value = 2000 - pong_dqn_envpool_config.nstep = 3 - pong_dqn_envpool_config.policy.nstep = 3 - pong_dqn_envpool_config.seed = arg.seed - - pong_dqn_envpool_config.policy.learn.update_per_collect = 1 - pong_dqn_envpool_config.policy.learn.batch_size = 32 - pong_dqn_envpool_config.policy.learn.learning_rate = 0.0004 - pong_dqn_envpool_config.policy.learn.target_update_freq = 0 - pong_dqn_envpool_config.policy.learn.target_update = 0.04 - - 
main(pong_dqn_envpool_config) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_2.py b/ding/example/dqn_envpool_wandb_new_nstep_2.py deleted file mode 100644 index 8e35e9a960..0000000000 --- a/ding/example/dqn_envpool_wandb_new_nstep_2.py +++ /dev/null @@ -1,134 +0,0 @@ -import datetime -import torch -try: - torch.multiprocessing.set_start_method('spawn') -except RuntimeError: - pass -from easydict import EasyDict -from ditk import logging -from ding.model import DQN -from ding.policy import DQNFastPolicy -from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV2, OffPolicyLearnerV3 -from ding.utils import set_pkg_seed - -from dizoo.atari.config.serial import pong_dqn_envpool_config - - -def main(cfg): - logging.getLogger().setLevel(logging.INFO) - cfg.exp_name = 'Pong-v5-DQN-envpool-new-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - - collector_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.collector_env_num, - 'batch_size': cfg.env.collector_batch_size, - # env wrappers - 'episodic_life': True, # collector: True - 'reward_clip': False, # collector: True - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["collector_env_cfg"] = collector_env_cfg - evaluator_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.evaluator_env_num, - 'batch_size': cfg.env.evaluator_batch_size, - # env wrappers - 'episodic_life': False, # evaluator: False - 'reward_clip': False, # evaluator: False - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["evaluator_env_cfg"] = evaluator_env_cfg - cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) - evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = DQNFastPolicy(cfg.policy, model=model) - - # Consider the case with multiple processes - if task.router.is_active: - # You can use labels to distinguish between workers with different roles, - # here we use node_id to distinguish. - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - # Sync their context and model between each worker. - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - task.use(epoch_timer()) - - # Here is the part of single process pipeline. 
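The collector guard used throughout these pipelines, cfg.policy.random_collect_size if hasattr(cfg.policy, 'random_collect_size') else 0, can be written more compactly with getattr. A small sketch with a hypothetical config object:

from easydict import EasyDict

cfg = EasyDict(dict(policy=dict(cuda=False)))                          # hypothetical cfg without random_collect_size
random_collect_size = getattr(cfg.policy, 'random_collect_size', 0)    # falls back to 0 when the field is absent
print(random_collect_size)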
- task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use( - EnvpoolStepCollectorV2( - cfg, - policy.collect_mode, - collector_env, - random_collect_size=cfg.policy.random_collect_size \ - if hasattr(cfg.policy, 'random_collect_size') else 0, - ) - ) - task.use(data_pusher(cfg, buffer_)) - #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use( - wandb_online_logger( - metric_list=policy._monitor_vars_learn(), - model=policy._model, - exp_config=cfg, - anonymous=True, - project_name=cfg.exp_name, - wandb_sweep=False, - ) - ) - - #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) - task.use(termination_checker(max_env_step=10000000)) - - task.run() - - -if __name__ == "__main__": - - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--seed", type=int, default=0, help="random seed") - parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") - parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") - arg = parser.parse_args() - - pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num - pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size - pong_dqn_envpool_config.seed = arg.seed - pong_dqn_envpool_config.env.stop_value = 2000 - pong_dqn_envpool_config.nstep = 3 - pong_dqn_envpool_config.policy.nstep = 3 - pong_dqn_envpool_config.seed = arg.seed - - main(pong_dqn_envpool_config) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_3.py b/ding/example/dqn_envpool_wandb_new_nstep_3.py deleted file mode 100644 index 660e5d8a80..0000000000 --- a/ding/example/dqn_envpool_wandb_new_nstep_3.py +++ /dev/null @@ -1,134 +0,0 @@ -import datetime -import torch -try: - torch.multiprocessing.set_start_method('spawn') -except RuntimeError: - pass -from easydict import EasyDict -from ditk import logging -from ding.model import DQN -from ding.policy import DQNFastPolicy -from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV2, OffPolicyLearnerV3, OffPolicyLearnerV4 -from ding.utils import set_pkg_seed - -from dizoo.atari.config.serial import pong_dqn_envpool_config - - -def main(cfg): - logging.getLogger().setLevel(logging.INFO) - cfg.exp_name = 'Test-Pong-v5-DQN-envpool-new-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - - collector_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.collector_env_num, - 'batch_size': cfg.env.collector_batch_size, - # env wrappers - 'episodic_life': True, # collector: True - 'reward_clip': False, # collector: True - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["collector_env_cfg"] = collector_env_cfg - evaluator_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.evaluator_env_num, - 'batch_size': cfg.env.evaluator_batch_size, - # env wrappers - 'episodic_life': False, # 
evaluator: False - 'reward_clip': False, # evaluator: False - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["evaluator_env_cfg"] = evaluator_env_cfg - cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) - evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = DQNFastPolicy(cfg.policy, model=model) - - # Consider the case with multiple processes - if task.router.is_active: - # You can use labels to distinguish between workers with different roles, - # here we use node_id to distinguish. - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - # Sync their context and model between each worker. - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - task.use(epoch_timer()) - - # Here is the part of single process pipeline. - task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use( - EnvpoolStepCollectorV2( - cfg, - policy.collect_mode, - collector_env, - random_collect_size=cfg.policy.random_collect_size \ - if hasattr(cfg.policy, 'random_collect_size') else 0, - ) - ) - task.use(data_pusher(cfg, buffer_)) - #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(OffPolicyLearnerV4(cfg, policy, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use( - wandb_online_logger( - metric_list=policy._monitor_vars_learn(), - model=policy._model, - exp_config=cfg, - anonymous=True, - project_name=cfg.exp_name, - wandb_sweep=False, - ) - ) - - #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) - task.use(termination_checker(max_env_step=10000000)) - - task.run() - - -if __name__ == "__main__": - - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--seed", type=int, default=0, help="random seed") - parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") - parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") - arg = parser.parse_args() - - pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num - pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size - pong_dqn_envpool_config.seed = arg.seed - pong_dqn_envpool_config.env.stop_value = 2000 - pong_dqn_envpool_config.nstep = 3 - pong_dqn_envpool_config.policy.nstep = 3 - pong_dqn_envpool_config.seed = arg.seed - - main(pong_dqn_envpool_config) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_6.py b/ding/example/dqn_envpool_wandb_new_nstep_6.py deleted file mode 100644 index aee300d6c4..0000000000 --- a/ding/example/dqn_envpool_wandb_new_nstep_6.py +++ /dev/null @@ -1,139 +0,0 @@ -import datetime -import torch -try: - torch.multiprocessing.set_start_method('spawn') -except RuntimeError: - pass -from easydict import EasyDict -from ditk import logging -from ding.model import DQN -from ding.policy import DQNFastPolicy -from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 -from 
ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV3 -from ding.utils import set_pkg_seed - -from dizoo.atari.config.serial import pong_dqn_envpool_config - - -def main(cfg): - logging.getLogger().setLevel(logging.INFO) - cfg.exp_name = 'Pong-v5-DQN-envpool-new-6-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - - collector_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.collector_env_num, - 'batch_size': cfg.env.collector_batch_size, - # env wrappers - 'episodic_life': True, # collector: True - 'reward_clip': False, # collector: True - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["collector_env_cfg"] = collector_env_cfg - evaluator_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.evaluator_env_num, - 'batch_size': cfg.env.evaluator_batch_size, - # env wrappers - 'episodic_life': False, # evaluator: False - 'reward_clip': False, # evaluator: False - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["evaluator_env_cfg"] = evaluator_env_cfg - cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) - evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = DQNFastPolicy(cfg.policy, model=model) - - # Consider the case with multiple processes - if task.router.is_active: - # You can use labels to distinguish between workers with different roles, - # here we use node_id to distinguish. - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - # Sync their context and model between each worker. - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - task.use(epoch_timer()) - - # Here is the part of single process pipeline. 
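Unlike the soft-update variants above, the nstep_6 and nstep_7 scripts deleted just below set target_update_freq to 500 and 200, i.e. a periodic hard copy of the online weights into the target network. A generic sketch of that pattern, with illustrative names rather than DI-engine internals:

import copy
import torch.nn as nn

def maybe_hard_update(target: nn.Module, online: nn.Module, train_iter: int, freq: int) -> None:
    # Copy the online weights into the target every `freq` learner iterations.
    if freq > 0 and train_iter % freq == 0:
        target.load_state_dict(online.state_dict())

online_net = nn.Linear(4, 2)              # stand-in for the DQN model
target_net = copy.deepcopy(online_net)
maybe_hard_update(target_net, online_net, train_iter=500, freq=500)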
- task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use( - EnvpoolStepCollectorV2( - cfg, - policy.collect_mode, - collector_env, - random_collect_size=cfg.policy.random_collect_size \ - if hasattr(cfg.policy, 'random_collect_size') else 0, - ) - ) - task.use(data_pusher(cfg, buffer_)) - #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use( - wandb_online_logger( - metric_list=policy._monitor_vars_learn(), - model=policy._model, - exp_config=cfg, - anonymous=True, - project_name=cfg.exp_name, - wandb_sweep=False, - ) - ) - - #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) - task.use(termination_checker(max_env_step=10000000)) - - task.run() - - -if __name__ == "__main__": - - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--seed", type=int, default=0, help="random seed") - parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") - parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") - arg = parser.parse_args() - - pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num - pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size - pong_dqn_envpool_config.seed = arg.seed - pong_dqn_envpool_config.env.stop_value = 2000 - pong_dqn_envpool_config.nstep = 3 - pong_dqn_envpool_config.policy.nstep = 3 - pong_dqn_envpool_config.seed = arg.seed - - pong_dqn_envpool_config.policy.learn.update_per_collect = 5 - pong_dqn_envpool_config.policy.learn.batch_size = 32 - pong_dqn_envpool_config.policy.learn.learning_rate = 0.0001 - pong_dqn_envpool_config.policy.learn.target_update_freq = 500 - - main(pong_dqn_envpool_config) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_7.py b/ding/example/dqn_envpool_wandb_new_nstep_7.py deleted file mode 100644 index 7020f67766..0000000000 --- a/ding/example/dqn_envpool_wandb_new_nstep_7.py +++ /dev/null @@ -1,139 +0,0 @@ -import datetime -import torch -try: - torch.multiprocessing.set_start_method('spawn') -except RuntimeError: - pass -from easydict import EasyDict -from ditk import logging -from ding.model import DQN -from ding.policy import DQNFastPolicy -from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV3 -from ding.utils import set_pkg_seed - -from dizoo.atari.config.serial import pong_dqn_envpool_config - - -def main(cfg): - logging.getLogger().setLevel(logging.INFO) - cfg.exp_name = 'Pong-v5-DQN-envpool-new-7-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - - collector_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.collector_env_num, - 'batch_size': cfg.env.collector_batch_size, - # env wrappers - 'episodic_life': True, # collector: True - 'reward_clip': False, # collector: True - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["collector_env_cfg"] = 
collector_env_cfg - evaluator_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.evaluator_env_num, - 'batch_size': cfg.env.evaluator_batch_size, - # env wrappers - 'episodic_life': False, # evaluator: False - 'reward_clip': False, # evaluator: False - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["evaluator_env_cfg"] = evaluator_env_cfg - cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) - evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = DQNFastPolicy(cfg.policy, model=model) - - # Consider the case with multiple processes - if task.router.is_active: - # You can use labels to distinguish between workers with different roles, - # here we use node_id to distinguish. - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - # Sync their context and model between each worker. - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - task.use(epoch_timer()) - - # Here is the part of single process pipeline. - task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use( - EnvpoolStepCollectorV2( - cfg, - policy.collect_mode, - collector_env, - random_collect_size=cfg.policy.random_collect_size \ - if hasattr(cfg.policy, 'random_collect_size') else 0, - ) - ) - task.use(data_pusher(cfg, buffer_)) - #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use( - wandb_online_logger( - metric_list=policy._monitor_vars_learn(), - model=policy._model, - exp_config=cfg, - anonymous=True, - project_name=cfg.exp_name, - wandb_sweep=False, - ) - ) - - #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) - task.use(termination_checker(max_env_step=10000000)) - - task.run() - - -if __name__ == "__main__": - - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--seed", type=int, default=0, help="random seed") - parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") - parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") - arg = parser.parse_args() - - pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num - pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size - pong_dqn_envpool_config.seed = arg.seed - pong_dqn_envpool_config.env.stop_value = 2000 - pong_dqn_envpool_config.nstep = 3 - pong_dqn_envpool_config.policy.nstep = 3 - pong_dqn_envpool_config.seed = arg.seed - - pong_dqn_envpool_config.policy.learn.update_per_collect = 5 - pong_dqn_envpool_config.policy.learn.batch_size = 32 - pong_dqn_envpool_config.policy.learn.learning_rate = 0.0001 - pong_dqn_envpool_config.policy.learn.target_update_freq = 200 - - main(pong_dqn_envpool_config) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_8.py 
b/ding/example/dqn_envpool_wandb_new_nstep_8.py deleted file mode 100644 index cf7dcb388e..0000000000 --- a/ding/example/dqn_envpool_wandb_new_nstep_8.py +++ /dev/null @@ -1,139 +0,0 @@ -import datetime -import torch -try: - torch.multiprocessing.set_start_method('spawn') -except RuntimeError: - pass -from easydict import EasyDict -from ditk import logging -from ding.model import DQN -from ding.policy import DQNFastPolicy -from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV3 -from ding.utils import set_pkg_seed - -from dizoo.atari.config.serial import pong_dqn_envpool_config - - -def main(cfg): - logging.getLogger().setLevel(logging.INFO) - cfg.exp_name = 'Pong-v5-DQN-envpool-new-8-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - - collector_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.collector_env_num, - 'batch_size': cfg.env.collector_batch_size, - # env wrappers - 'episodic_life': True, # collector: True - 'reward_clip': False, # collector: True - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["collector_env_cfg"] = collector_env_cfg - evaluator_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.evaluator_env_num, - 'batch_size': cfg.env.evaluator_batch_size, - # env wrappers - 'episodic_life': False, # evaluator: False - 'reward_clip': False, # evaluator: False - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["evaluator_env_cfg"] = evaluator_env_cfg - cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) - evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = DQNFastPolicy(cfg.policy, model=model) - - # Consider the case with multiple processes - if task.router.is_active: - # You can use labels to distinguish between workers with different roles, - # here we use node_id to distinguish. - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - # Sync their context and model between each worker. - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - task.use(epoch_timer()) - - # Here is the part of single process pipeline. 
- task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use( - EnvpoolStepCollectorV2( - cfg, - policy.collect_mode, - collector_env, - random_collect_size=cfg.policy.random_collect_size \ - if hasattr(cfg.policy, 'random_collect_size') else 0, - ) - ) - task.use(data_pusher(cfg, buffer_)) - #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use( - wandb_online_logger( - metric_list=policy._monitor_vars_learn(), - model=policy._model, - exp_config=cfg, - anonymous=True, - project_name=cfg.exp_name, - wandb_sweep=False, - ) - ) - - #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) - task.use(termination_checker(max_env_step=10000000)) - - task.run() - - -if __name__ == "__main__": - - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--seed", type=int, default=0, help="random seed") - parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") - parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") - arg = parser.parse_args() - - pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num - pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size - pong_dqn_envpool_config.seed = arg.seed - pong_dqn_envpool_config.env.stop_value = 2000 - pong_dqn_envpool_config.nstep = 3 - pong_dqn_envpool_config.policy.nstep = 3 - pong_dqn_envpool_config.seed = arg.seed - - pong_dqn_envpool_config.policy.learn.update_per_collect = 4 - pong_dqn_envpool_config.policy.learn.batch_size = 32 - pong_dqn_envpool_config.policy.learn.learning_rate = 0.0001 - pong_dqn_envpool_config.policy.learn.target_update_freq = 200 - - main(pong_dqn_envpool_config) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_9.py b/ding/example/dqn_envpool_wandb_new_nstep_9.py deleted file mode 100644 index 33fc67f187..0000000000 --- a/ding/example/dqn_envpool_wandb_new_nstep_9.py +++ /dev/null @@ -1,139 +0,0 @@ -import datetime -import torch -try: - torch.multiprocessing.set_start_method('spawn') -except RuntimeError: - pass -from easydict import EasyDict -from ditk import logging -from ding.model import DQN -from ding.policy import DQNFastPolicy -from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV3 -from ding.utils import set_pkg_seed - -from dizoo.atari.config.serial import pong_dqn_envpool_config - - -def main(cfg): - logging.getLogger().setLevel(logging.INFO) - cfg.exp_name = 'Pong-v5-DQN-envpool-new-9-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - - collector_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.collector_env_num, - 'batch_size': cfg.env.collector_batch_size, - # env wrappers - 'episodic_life': True, # collector: True - 'reward_clip': False, # collector: True - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["collector_env_cfg"] = 
collector_env_cfg - evaluator_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.evaluator_env_num, - 'batch_size': cfg.env.evaluator_batch_size, - # env wrappers - 'episodic_life': False, # evaluator: False - 'reward_clip': False, # evaluator: False - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["evaluator_env_cfg"] = evaluator_env_cfg - cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) - evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = DQNFastPolicy(cfg.policy, model=model) - - # Consider the case with multiple processes - if task.router.is_active: - # You can use labels to distinguish between workers with different roles, - # here we use node_id to distinguish. - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - # Sync their context and model between each worker. - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - task.use(epoch_timer()) - - # Here is the part of single process pipeline. - task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use( - EnvpoolStepCollectorV2( - cfg, - policy.collect_mode, - collector_env, - random_collect_size=cfg.policy.random_collect_size \ - if hasattr(cfg.policy, 'random_collect_size') else 0, - ) - ) - task.use(data_pusher(cfg, buffer_)) - #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use( - wandb_online_logger( - metric_list=policy._monitor_vars_learn(), - model=policy._model, - exp_config=cfg, - anonymous=True, - project_name=cfg.exp_name, - wandb_sweep=False, - ) - ) - - #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) - task.use(termination_checker(max_env_step=10000000)) - - task.run() - - -if __name__ == "__main__": - - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--seed", type=int, default=0, help="random seed") - parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") - parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") - arg = parser.parse_args() - - pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num - pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size - pong_dqn_envpool_config.seed = arg.seed - pong_dqn_envpool_config.env.stop_value = 2000 - pong_dqn_envpool_config.nstep = 3 - pong_dqn_envpool_config.policy.nstep = 3 - pong_dqn_envpool_config.seed = arg.seed - - pong_dqn_envpool_config.policy.learn.update_per_collect = 3 - pong_dqn_envpool_config.policy.learn.batch_size = 32 - pong_dqn_envpool_config.policy.learn.learning_rate = 0.0001 - pong_dqn_envpool_config.policy.learn.target_update_freq = 200 - - main(pong_dqn_envpool_config) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_large_batch.py 
b/ding/example/dqn_envpool_wandb_new_nstep_large_batch.py deleted file mode 100644 index 6eef6fcb5b..0000000000 --- a/ding/example/dqn_envpool_wandb_new_nstep_large_batch.py +++ /dev/null @@ -1,138 +0,0 @@ -import datetime -import torch -try: - torch.multiprocessing.set_start_method('spawn') -except RuntimeError: - pass -from easydict import EasyDict -import torch -from torch import nn -from ditk import logging -from ding.model import DQN -from ding.policy import DQNFastPolicy -from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV2, OffPolicyLearnerV3 -from ding.utils import set_pkg_seed - -from dizoo.atari.config.serial import pong_dqn_envpool_large_batch_config - - -def main(cfg): - logging.getLogger().setLevel(logging.INFO) - cfg.exp_name = 'Pong-v5-DQN-envpool-large_batch-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - - collector_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.collector_env_num, - 'batch_size': cfg.env.collector_batch_size, - # env wrappers - 'episodic_life': True, # collector: True - 'reward_clip': False, # collector: True - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["collector_env_cfg"] = collector_env_cfg - evaluator_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.evaluator_env_num, - 'batch_size': cfg.env.evaluator_batch_size, - # env wrappers - 'episodic_life': False, # evaluator: False - 'reward_clip': False, # evaluator: False - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["evaluator_env_cfg"] = evaluator_env_cfg - cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) - evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - cfg.policy.model['activation'] = nn.ReLU(inplace=True) - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = DQNFastPolicy(cfg.policy, model=model) - - # Consider the case with multiple processes - if task.router.is_active: - # You can use labels to distinguish between workers with different roles, - # here we use node_id to distinguish. - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - # Sync their context and model between each worker. - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - task.use(epoch_timer()) - - # Here is the part of single process pipeline. 
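Every one of these deleted scripts repeats the same pair of EasyDicts, differing only in the env_num/batch_size fields and in episodic_life (True for the collector, False for the evaluator). A hypothetical helper along the following lines would capture the pattern; it is not part of the patch, just a sketch assembled from the fields shown above:

from easydict import EasyDict

def build_envpool_cfg(cfg, role):
    # role: 'collector' or 'evaluator'; only the collector uses episodic life.
    assert role in ('collector', 'evaluator')
    return EasyDict(
        {
            'env_id': cfg.env.env_id,
            'env_num': cfg.env[role + '_env_num'],
            'batch_size': cfg.env[role + '_batch_size'],
            'episodic_life': role == 'collector',
            'reward_clip': False,
            'gray_scale': cfg.env.get('gray_scale', True),
            'stack_num': cfg.env.get('stack_num', 4),
        }
    )

# Usage would then collapse to:
#   cfg.env['collector_env_cfg'] = build_envpool_cfg(cfg, 'collector')
#   cfg.env['evaluator_env_cfg'] = build_envpool_cfg(cfg, 'evaluator')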
- task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use( - EnvpoolStepCollectorV2( - cfg, - policy.collect_mode, - collector_env, - random_collect_size=cfg.policy.random_collect_size \ - if hasattr(cfg.policy, 'random_collect_size') else 0, - ) - ) - task.use(data_pusher(cfg, buffer_)) - #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use( - wandb_online_logger( - metric_list=policy._monitor_vars_learn(), - model=policy._model, - exp_config=cfg, - anonymous=True, - project_name=cfg.exp_name, - wandb_sweep=False, - ) - ) - - #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) - task.use(termination_checker(max_env_step=10000000)) - - task.run() - - -if __name__ == "__main__": - - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--seed", type=int, default=0, help="random seed") - parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") - parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") - arg = parser.parse_args() - - pong_dqn_envpool_large_batch_config.env.collector_env_num = arg.collector_env_num - pong_dqn_envpool_large_batch_config.env.collector_batch_size = arg.collector_batch_size - pong_dqn_envpool_large_batch_config.seed = arg.seed - pong_dqn_envpool_large_batch_config.env.stop_value = 2000 - pong_dqn_envpool_large_batch_config.nstep = 3 - pong_dqn_envpool_large_batch_config.policy.nstep = 3 - pong_dqn_envpool_large_batch_config.seed = arg.seed - - main(pong_dqn_envpool_large_batch_config) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_large_learning.py b/ding/example/dqn_envpool_wandb_new_nstep_large_learning.py deleted file mode 100644 index 988829b6d5..0000000000 --- a/ding/example/dqn_envpool_wandb_new_nstep_large_learning.py +++ /dev/null @@ -1,138 +0,0 @@ -import datetime -import torch -try: - torch.multiprocessing.set_start_method('spawn') -except RuntimeError: - pass -from easydict import EasyDict -import torch -from torch import nn -from ditk import logging -from ding.model import DQN -from ding.policy import DQNFastPolicy -from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV2, OffPolicyLearnerV3 -from ding.utils import set_pkg_seed - -from dizoo.atari.config.serial import pong_dqn_envpool_large_learning_config - - -def main(cfg): - logging.getLogger().setLevel(logging.INFO) - cfg.exp_name = 'Pong-v5-DQN-envpool-large-learning-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - - collector_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.collector_env_num, - 'batch_size': cfg.env.collector_batch_size, - # env wrappers - 'episodic_life': True, # collector: True - 'reward_clip': False, # collector: True - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["collector_env_cfg"] = collector_env_cfg - evaluator_env_cfg = 
EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.evaluator_env_num, - 'batch_size': cfg.env.evaluator_batch_size, - # env wrappers - 'episodic_life': False, # evaluator: False - 'reward_clip': False, # evaluator: False - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["evaluator_env_cfg"] = evaluator_env_cfg - cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) - evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - cfg.policy.model['activation'] = nn.ReLU(inplace=True) - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = DQNFastPolicy(cfg.policy, model=model) - - # Consider the case with multiple processes - if task.router.is_active: - # You can use labels to distinguish between workers with different roles, - # here we use node_id to distinguish. - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - # Sync their context and model between each worker. - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - task.use(epoch_timer()) - - # Here is the part of single process pipeline. - task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use( - EnvpoolStepCollectorV2( - cfg, - policy.collect_mode, - collector_env, - random_collect_size=cfg.policy.random_collect_size \ - if hasattr(cfg.policy, 'random_collect_size') else 0, - ) - ) - task.use(data_pusher(cfg, buffer_)) - #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use( - wandb_online_logger( - metric_list=policy._monitor_vars_learn(), - model=policy._model, - exp_config=cfg, - anonymous=True, - project_name=cfg.exp_name, - wandb_sweep=False, - ) - ) - - #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) - task.use(termination_checker(max_env_step=10000000)) - - task.run() - - -if __name__ == "__main__": - - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--seed", type=int, default=0, help="random seed") - parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") - parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") - arg = parser.parse_args() - - pong_dqn_envpool_large_learning_config.env.collector_env_num = arg.collector_env_num - pong_dqn_envpool_large_learning_config.env.collector_batch_size = arg.collector_batch_size - pong_dqn_envpool_large_learning_config.seed = arg.seed - pong_dqn_envpool_large_learning_config.env.stop_value = 2000 - pong_dqn_envpool_large_learning_config.nstep = 3 - pong_dqn_envpool_large_learning_config.policy.nstep = 3 - pong_dqn_envpool_large_learning_config.seed = arg.seed - - main(pong_dqn_envpool_large_learning_config) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_priority.py b/ding/example/dqn_envpool_wandb_new_nstep_priority.py deleted file mode 100644 index 02260f91b3..0000000000 --- 
a/ding/example/dqn_envpool_wandb_new_nstep_priority.py +++ /dev/null @@ -1,138 +0,0 @@ -import datetime -import torch -try: - torch.multiprocessing.set_start_method('spawn') -except RuntimeError: - pass -from easydict import EasyDict -import torch -from torch import nn -from ditk import logging -from ding.model import DQN -from ding.policy import DQNFastPolicy -from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV2, OffPolicyLearnerV3 -from ding.utils import set_pkg_seed - -from dizoo.atari.config.serial import pong_dqn_envpool_priority_config - - -def main(cfg): - logging.getLogger().setLevel(logging.INFO) - cfg.exp_name = 'Pong-v5-DQN-envpool-priority-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - - collector_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.collector_env_num, - 'batch_size': cfg.env.collector_batch_size, - # env wrappers - 'episodic_life': True, # collector: True - 'reward_clip': False, # collector: True - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["collector_env_cfg"] = collector_env_cfg - evaluator_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.evaluator_env_num, - 'batch_size': cfg.env.evaluator_batch_size, - # env wrappers - 'episodic_life': False, # evaluator: False - 'reward_clip': False, # evaluator: False - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["evaluator_env_cfg"] = evaluator_env_cfg - cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) - evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - cfg.policy.model['activation'] = nn.ReLU(inplace=True) - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = DQNFastPolicy(cfg.policy, model=model) - - # Consider the case with multiple processes - if task.router.is_active: - # You can use labels to distinguish between workers with different roles, - # here we use node_id to distinguish. - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - # Sync their context and model between each worker. - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - task.use(epoch_timer()) - - # Here is the part of single process pipeline. 
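The eps_greedy_handler middleware used in each of these pipelines anneals the exploration rate that turns Q-values into actions. Purely as an illustration of the mechanism (the real handler reads its schedule from the policy config, which this patch does not show; the start/end/decay values below are made-up):

import random

def eps_greedy_action(q_values, eps):
    # With probability eps act randomly, otherwise take the greedy action.
    if random.random() < eps:
        return random.randrange(len(q_values))
    return max(range(len(q_values)), key=lambda a: q_values[a])

def linear_eps(step, start=1.0, end=0.05, decay_steps=100000):
    # Hypothetical linear schedule; only for illustration.
    frac = min(step / decay_steps, 1.0)
    return start + frac * (end - start)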
- task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use( - EnvpoolStepCollectorV2( - cfg, - policy.collect_mode, - collector_env, - random_collect_size=cfg.policy.random_collect_size \ - if hasattr(cfg.policy, 'random_collect_size') else 0, - ) - ) - task.use(data_pusher(cfg, buffer_)) - #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use( - wandb_online_logger( - metric_list=policy._monitor_vars_learn(), - model=policy._model, - exp_config=cfg, - anonymous=True, - project_name=cfg.exp_name, - wandb_sweep=False, - ) - ) - - #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) - task.use(termination_checker(max_env_step=10000000)) - - task.run() - - -if __name__ == "__main__": - - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--seed", type=int, default=0, help="random seed") - parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") - parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") - arg = parser.parse_args() - - pong_dqn_envpool_priority_config.env.collector_env_num = arg.collector_env_num - pong_dqn_envpool_priority_config.env.collector_batch_size = arg.collector_batch_size - pong_dqn_envpool_priority_config.seed = arg.seed - pong_dqn_envpool_priority_config.env.stop_value = 2000 - pong_dqn_envpool_priority_config.nstep = 3 - pong_dqn_envpool_priority_config.policy.nstep = 3 - pong_dqn_envpool_priority_config.seed = arg.seed - - main(pong_dqn_envpool_priority_config) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_shrink_size.py b/ding/example/dqn_envpool_wandb_new_nstep_shrink_size.py deleted file mode 100644 index 64ed0e2c86..0000000000 --- a/ding/example/dqn_envpool_wandb_new_nstep_shrink_size.py +++ /dev/null @@ -1,139 +0,0 @@ -import datetime -import torch -try: - torch.multiprocessing.set_start_method('spawn') -except RuntimeError: - pass -from easydict import EasyDict -import torch -from torch import nn -from ditk import logging -from ding.model import DQN -from ding.policy import DQNFastPolicy -from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV2, OffPolicyLearnerV3 -from ding.utils import set_pkg_seed - -from dizoo.atari.config.serial import pong_dqn_envpool_shink_model_config - - -def main(cfg): - logging.getLogger().setLevel(logging.INFO) - cfg.exp_name = 'Pong-v5-DQN-envpool-shrink-model-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - - collector_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.collector_env_num, - 'batch_size': cfg.env.collector_batch_size, - # env wrappers - 'episodic_life': True, # collector: True - 'reward_clip': False, # collector: True - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["collector_env_cfg"] = collector_env_cfg - evaluator_env_cfg = EasyDict( - { - 'env_id': 
cfg.env.env_id, - 'env_num': cfg.env.evaluator_env_num, - 'batch_size': cfg.env.evaluator_batch_size, - # env wrappers - 'episodic_life': False, # evaluator: False - 'reward_clip': False, # evaluator: False - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["evaluator_env_cfg"] = evaluator_env_cfg - cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) - evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - cfg.policy.model['activation'] = nn.ReLU(inplace=True) - cfg.policy.model['head_layer_num'] = 0 - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = DQNFastPolicy(cfg.policy, model=model) - - # Consider the case with multiple processes - if task.router.is_active: - # You can use labels to distinguish between workers with different roles, - # here we use node_id to distinguish. - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - # Sync their context and model between each worker. - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - task.use(epoch_timer()) - - # Here is the part of single process pipeline. - task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use( - EnvpoolStepCollectorV2( - cfg, - policy.collect_mode, - collector_env, - random_collect_size=cfg.policy.random_collect_size \ - if hasattr(cfg.policy, 'random_collect_size') else 0, - ) - ) - task.use(data_pusher(cfg, buffer_)) - #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use( - wandb_online_logger( - metric_list=policy._monitor_vars_learn(), - model=policy._model, - exp_config=cfg, - anonymous=True, - project_name=cfg.exp_name, - wandb_sweep=False, - ) - ) - - #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) - task.use(termination_checker(max_env_step=10000000)) - - task.run() - - -if __name__ == "__main__": - - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--seed", type=int, default=0, help="random seed") - parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") - parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") - arg = parser.parse_args() - - pong_dqn_envpool_shink_model_config.env.collector_env_num = arg.collector_env_num - pong_dqn_envpool_shink_model_config.env.collector_batch_size = arg.collector_batch_size - pong_dqn_envpool_shink_model_config.seed = arg.seed - pong_dqn_envpool_shink_model_config.env.stop_value = 2000 - pong_dqn_envpool_shink_model_config.nstep = 3 - pong_dqn_envpool_shink_model_config.policy.nstep = 3 - pong_dqn_envpool_shink_model_config.seed = arg.seed - - main(pong_dqn_envpool_shink_model_config) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_spaceinvaders.py b/ding/example/dqn_envpool_wandb_new_nstep_spaceinvaders.py deleted file mode 100644 index 816921bd2f..0000000000 --- 
a/ding/example/dqn_envpool_wandb_new_nstep_spaceinvaders.py +++ /dev/null @@ -1,134 +0,0 @@ -import datetime -import torch -try: - torch.multiprocessing.set_start_method('spawn') -except RuntimeError: - pass -from easydict import EasyDict -from ditk import logging -from ding.model import DQN -from ding.policy import DQNFastPolicy -from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV2 -from ding.utils import set_pkg_seed - -from dizoo.atari.config.serial.spaceinvaders.spaceinvaders_dqn_envpool_config import spaceinvaders_dqn_envpool_config - - -def main(cfg): - logging.getLogger().setLevel(logging.INFO) - cfg.exp_name = 'Test-Spaceinvaders-v5-DQN-envpool-new-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - - collector_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.collector_env_num, - 'batch_size': cfg.env.collector_batch_size, - # env wrappers - 'episodic_life': True, # collector: True - 'reward_clip': False, # collector: True - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["collector_env_cfg"] = collector_env_cfg - evaluator_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.evaluator_env_num, - 'batch_size': cfg.env.evaluator_batch_size, - # env wrappers - 'episodic_life': False, # evaluator: False - 'reward_clip': False, # evaluator: False - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["evaluator_env_cfg"] = evaluator_env_cfg - cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) - evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = DQNFastPolicy(cfg.policy, model=model) - - # Consider the case with multiple processes - if task.router.is_active: - # You can use labels to distinguish between workers with different roles, - # here we use node_id to distinguish. - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - # Sync their context and model between each worker. - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - task.use(epoch_timer()) - - # Here is the part of single process pipeline. 
- task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use( - EnvpoolStepCollectorV2( - cfg, - policy.collect_mode, - collector_env, - random_collect_size=cfg.policy.random_collect_size \ - if hasattr(cfg.policy, 'random_collect_size') else 0, - ) - ) - task.use(data_pusher(cfg, buffer_)) - #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(OffPolicyLearnerV2(cfg, policy, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use( - wandb_online_logger( - metric_list=policy._monitor_vars_learn(), - model=policy._model, - exp_config=cfg, - anonymous=True, - project_name=cfg.exp_name, - wandb_sweep=False, - ) - ) - - #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) - task.use(termination_checker(max_env_step=10000000)) - - task.run() - - -if __name__ == "__main__": - - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--seed", type=int, default=0, help="random seed") - parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") - parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") - arg = parser.parse_args() - - spaceinvaders_dqn_envpool_config.env.collector_env_num = arg.collector_env_num - spaceinvaders_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size - spaceinvaders_dqn_envpool_config.seed = arg.seed - spaceinvaders_dqn_envpool_config.env.stop_value = 1000000000 - spaceinvaders_dqn_envpool_config.nstep = 3 - spaceinvaders_dqn_envpool_config.policy.nstep = 3 - spaceinvaders_dqn_envpool_config.seed = arg.seed - - main(spaceinvaders_dqn_envpool_config) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_target_update_100.py b/ding/example/dqn_envpool_wandb_new_nstep_target_update_100.py deleted file mode 100644 index 391897cd83..0000000000 --- a/ding/example/dqn_envpool_wandb_new_nstep_target_update_100.py +++ /dev/null @@ -1,138 +0,0 @@ -import datetime -import torch -try: - torch.multiprocessing.set_start_method('spawn') -except RuntimeError: - pass -from easydict import EasyDict -import torch -from torch import nn -from ditk import logging -from ding.model import DQN -from ding.policy import DQNFastPolicy -from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV2, OffPolicyLearnerV3 -from ding.utils import set_pkg_seed - -from dizoo.atari.config.serial import pong_dqn_envpool_target_update_100_config - - -def main(cfg): - logging.getLogger().setLevel(logging.INFO) - cfg.exp_name = 'Pong-v5-DQN-envpool-target-update-100-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - - collector_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.collector_env_num, - 'batch_size': cfg.env.collector_batch_size, - # env wrappers - 'episodic_life': True, # collector: True - 'reward_clip': False, # collector: True - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["collector_env_cfg"] = collector_env_cfg - evaluator_env_cfg = 
EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.evaluator_env_num, - 'batch_size': cfg.env.evaluator_batch_size, - # env wrappers - 'episodic_life': False, # evaluator: False - 'reward_clip': False, # evaluator: False - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["evaluator_env_cfg"] = evaluator_env_cfg - cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) - evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - cfg.policy.model['activation'] = nn.ReLU(inplace=True) - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = DQNFastPolicy(cfg.policy, model=model) - - # Consider the case with multiple processes - if task.router.is_active: - # You can use labels to distinguish between workers with different roles, - # here we use node_id to distinguish. - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - # Sync their context and model between each worker. - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - task.use(epoch_timer()) - - # Here is the part of single process pipeline. - task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use( - EnvpoolStepCollectorV2( - cfg, - policy.collect_mode, - collector_env, - random_collect_size=cfg.policy.random_collect_size \ - if hasattr(cfg.policy, 'random_collect_size') else 0, - ) - ) - task.use(data_pusher(cfg, buffer_)) - #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use( - wandb_online_logger( - metric_list=policy._monitor_vars_learn(), - model=policy._model, - exp_config=cfg, - anonymous=True, - project_name=cfg.exp_name, - wandb_sweep=False, - ) - ) - - #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) - task.use(termination_checker(max_env_step=10000000)) - - task.run() - - -if __name__ == "__main__": - - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--seed", type=int, default=0, help="random seed") - parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") - parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") - arg = parser.parse_args() - - pong_dqn_envpool_target_update_100_config.env.collector_env_num = arg.collector_env_num - pong_dqn_envpool_target_update_100_config.env.collector_batch_size = arg.collector_batch_size - pong_dqn_envpool_target_update_100_config.seed = arg.seed - pong_dqn_envpool_target_update_100_config.env.stop_value = 2000 - pong_dqn_envpool_target_update_100_config.nstep = 3 - pong_dqn_envpool_target_update_100_config.policy.nstep = 3 - pong_dqn_envpool_target_update_100_config.seed = arg.seed - - main(pong_dqn_envpool_target_update_100_config) diff --git a/ding/example/dqn_envpool_wandb_new_nstep_update_per_collect_5.py b/ding/example/dqn_envpool_wandb_new_nstep_update_per_collect_5.py deleted file mode 100644 index 
68b89ed8fc..0000000000 --- a/ding/example/dqn_envpool_wandb_new_nstep_update_per_collect_5.py +++ /dev/null @@ -1,138 +0,0 @@ -import datetime -import torch -try: - torch.multiprocessing.set_start_method('spawn') -except RuntimeError: - pass -from easydict import EasyDict -import torch -from torch import nn -from ditk import logging -from ding.model import DQN -from ding.policy import DQNFastPolicy -from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, envpool_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollectorV2, OffPolicyLearnerV2, OffPolicyLearnerV3 -from ding.utils import set_pkg_seed - -from dizoo.atari.config.serial import pong_dqn_envpool_update_per_collect_5_config - - -def main(cfg): - logging.getLogger().setLevel(logging.INFO) - cfg.exp_name = 'Pong-v5-DQN-envpool-update-per-collect-5-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - - collector_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.collector_env_num, - 'batch_size': cfg.env.collector_batch_size, - # env wrappers - 'episodic_life': True, # collector: True - 'reward_clip': False, # collector: True - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["collector_env_cfg"] = collector_env_cfg - evaluator_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.evaluator_env_num, - 'batch_size': cfg.env.evaluator_batch_size, - # env wrappers - 'episodic_life': False, # evaluator: False - 'reward_clip': False, # evaluator: False - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["evaluator_env_cfg"] = evaluator_env_cfg - cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) - evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - cfg.policy.model['activation'] = nn.ReLU(inplace=True) - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = DQNFastPolicy(cfg.policy, model=model) - - # Consider the case with multiple processes - if task.router.is_active: - # You can use labels to distinguish between workers with different roles, - # here we use node_id to distinguish. - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - # Sync their context and model between each worker. - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - task.use(epoch_timer()) - - # Here is the part of single process pipeline. 
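The __main__ blocks of these deleted variants sweep learner-side hyperparameters such as update_per_collect, batch_size and target_update_freq. Conceptually, the off-policy learner they feed runs the loop below after every collection step; this is a rough sketch with placeholder names (buffer.sample and loss_fn are assumptions, not DI-engine's actual learner internals):

def train_after_collect(policy_net, target_net, optimizer, buffer, loss_fn,
                        train_iter, update_per_collect=5, batch_size=32,
                        target_update_freq=200):
    # Run several gradient updates per collected batch of environment steps,
    # and periodically hard-copy the online network into the target network.
    for _ in range(update_per_collect):
        batch = buffer.sample(batch_size)  # assumed replay-buffer API
        loss = loss_fn(policy_net, target_net, batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_iter += 1
        if train_iter % target_update_freq == 0:
            target_net.load_state_dict(policy_net.state_dict())
    return train_iter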
- task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use( - EnvpoolStepCollectorV2( - cfg, - policy.collect_mode, - collector_env, - random_collect_size=cfg.policy.random_collect_size \ - if hasattr(cfg.policy, 'random_collect_size') else 0, - ) - ) - task.use(data_pusher(cfg, buffer_)) - #task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(OffPolicyLearnerV3(cfg, policy, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use( - wandb_online_logger( - metric_list=policy._monitor_vars_learn(), - model=policy._model, - exp_config=cfg, - anonymous=True, - project_name=cfg.exp_name, - wandb_sweep=False, - ) - ) - - #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) - task.use(termination_checker(max_env_step=10000000)) - - task.run() - - -if __name__ == "__main__": - - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--seed", type=int, default=0, help="random seed") - parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") - parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") - arg = parser.parse_args() - - pong_dqn_envpool_update_per_collect_5_config.env.collector_env_num = arg.collector_env_num - pong_dqn_envpool_update_per_collect_5_config.env.collector_batch_size = arg.collector_batch_size - pong_dqn_envpool_update_per_collect_5_config.seed = arg.seed - pong_dqn_envpool_update_per_collect_5_config.env.stop_value = 2000 - pong_dqn_envpool_update_per_collect_5_config.nstep = 3 - pong_dqn_envpool_update_per_collect_5_config.policy.nstep = 3 - pong_dqn_envpool_update_per_collect_5_config.seed = arg.seed - - main(pong_dqn_envpool_update_per_collect_5_config) diff --git a/ding/example/dqn_envpool_wandb_origin.py b/ding/example/dqn_envpool_wandb_origin.py deleted file mode 100644 index 07c1f8fcff..0000000000 --- a/ding/example/dqn_envpool_wandb_origin.py +++ /dev/null @@ -1,134 +0,0 @@ -import gym -import datetime -import wandb -import numpy as np -from easydict import EasyDict -from ditk import logging -from ding.data.model_loader import FileModelLoader -from ding.data.storage_loader import FileStorageLoader -from ding.model import DQN -from ding.policy import DQNPolicy -from ding.envs import DingEnvWrapper, BaseEnvManagerV2 -from ding.envs.env_manager.envpool_env_manager import PoolEnvManager, PoolEnvManagerV2 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger, epoch_timer -from ding.utils import set_pkg_seed - -from dizoo.atari.config.serial import pong_dqn_envpool_config - - -def main(cfg): - logging.getLogger().setLevel(logging.INFO) - cfg.exp_name = 'Pong-v5-DQN-envpool-origin-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - - collector_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.collector_env_num, - 'batch_size': cfg.env.collector_batch_size, - # env wrappers - 'episodic_life': True, # collector: True - 'reward_clip': False, # collector: True - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["collector_env_cfg"] = 
collector_env_cfg - evaluator_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.evaluator_env_num, - 'batch_size': cfg.env.evaluator_batch_size, - # env wrappers - 'episodic_life': False, # evaluator: False - 'reward_clip': False, # evaluator: False - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["evaluator_env_cfg"] = evaluator_env_cfg - cfg = compile_config(cfg, PoolEnvManagerV2, DQNPolicy, save_cfg=task.router.node_id == 0) - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = PoolEnvManagerV2(cfg.env.collector_env_cfg) - evaluator_env = PoolEnvManagerV2(cfg.env.evaluator_env_cfg) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = DQNPolicy(cfg.policy, model=model) - - # Consider the case with multiple processes - if task.router.is_active: - # You can use labels to distinguish between workers with different roles, - # here we use node_id to distinguish. - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - # Sync their context and model between each worker. - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - task.use(epoch_timer()) - - # Here is the part of single process pipeline. - task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use( - StepCollector( - cfg, - policy.collect_mode, - collector_env, - random_collect_size=cfg.policy.random_collect_size \ - if hasattr(cfg.policy, 'random_collect_size') else 0, - ) - ) - if "nstep" in cfg.policy and cfg.policy.nstep > 1: - task.use(nstep_reward_enhancer(cfg)) - task.use(data_pusher(cfg, buffer_)) - task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use( - wandb_online_logger( - metric_list=policy._monitor_vars_learn(), - model=policy._model, - exp_config=cfg, - anonymous=True, - project_name=cfg.exp_name, - wandb_sweep=False, - ) - ) - - #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) - task.use(termination_checker(max_env_step=10000000)) - - task.run() - - -if __name__ == "__main__": - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--seed", type=int, default=0, help="random seed") - parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") - parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") - arg = parser.parse_args() - - pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num - pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size - pong_dqn_envpool_config.seed = arg.seed - pong_dqn_envpool_config.env.stop_value = 2000 - pong_dqn_envpool_config.policy.nstep = 1 - pong_dqn_envpool_config.nstep = 1 - - main(pong_dqn_envpool_config) diff --git a/ding/example/dqn_envpool_wandb_sweep_pong.py b/ding/example/dqn_envpool_wandb_sweep_pong.py deleted file mode 100644 index de642adf75..0000000000 --- a/ding/example/dqn_envpool_wandb_sweep_pong.py +++ /dev/null @@ -1,157 +0,0 @@ -import shutil -import datetime -import wandb -import numpy as np -from easydict import EasyDict -from ditk import logging 
-from ding.data.model_loader import FileModelLoader -from ding.data.storage_loader import FileStorageLoader -from ding.model import DQN -from ding.policy import DQNPolicy -from ding.envs import DingEnvWrapper, BaseEnvManagerV2 -from ding.envs.env_manager.envpool_env_manager import PoolEnvManager, PoolEnvManagerV2 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger -from ding.utils import set_pkg_seed - -from dizoo.atari.config.serial import pong_dqn_envpool_config - - -def main(cfg, seed=0, max_env_step=int(1e7)): - logging.getLogger().setLevel(logging.INFO) - cfg.exp_name = 'Pong-v5-DQN-envpool-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - cfg.seed = seed - collector_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.collector_env_num, - 'batch_size': cfg.env.collector_batch_size, - # env wrappers - 'episodic_life': True, # collector: True - 'reward_clip': False, # collector: True - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["collector_env_cfg"] = collector_env_cfg - evaluator_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.evaluator_env_num, - 'batch_size': cfg.env.evaluator_batch_size, - # env wrappers - 'episodic_life': False, # evaluator: False - 'reward_clip': False, # evaluator: False - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["evaluator_env_cfg"] = evaluator_env_cfg - cfg = compile_config(cfg, PoolEnvManagerV2, DQNPolicy, save_cfg=task.router.node_id == 0) - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = PoolEnvManagerV2(cfg.env.collector_env_cfg) - evaluator_env = PoolEnvManagerV2(cfg.env.evaluator_env_cfg) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = DQNPolicy(cfg.policy, model=model) - - # Consider the case with multiple processes - if task.router.is_active: - # You can use labels to distinguish between workers with different roles, - # here we use node_id to distinguish. - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - # Sync their context and model between each worker. - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - # Here is the part of single process pipeline. 
- task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use( - StepCollector( - cfg, - policy.collect_mode, - collector_env, - random_collect_size=cfg.policy.random_collect_size \ - if hasattr(cfg.policy, 'random_collect_size') else 0, - ) - ) - if "nstep" in cfg.policy and cfg.policy.nstep > 1: - task.use(nstep_reward_enhancer(cfg)) - task.use(data_pusher(cfg, buffer_)) - task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use( - wandb_online_logger( - metric_list=policy._monitor_vars_learn(), - model=policy._model, - anonymous=True, - project_name=cfg.exp_name, - wandb_sweep=False, - ) - ) - - #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) - task.use(termination_checker(max_env_step=max_env_step)) - - task.run() - - -def sweep_main(): - wandb.init() - - good_pair = (wandb.config.collector_env_num % wandb.config.collector_batch_size == 0) - - if not good_pair: - wandb.log({"time": 0.0}) - else: - import time - start_time = time.time() - pong_dqn_envpool_config.env.stop_value = 2000 - pong_dqn_envpool_config.exp_name = f'Pong-v5-envpool-new-pipeline-speed-test-{wandb.config.collector_env_num}-{wandb.config.collector_batch_size}' - pong_dqn_envpool_config.env.collector_env_num = wandb.config.collector_env_num - pong_dqn_envpool_config.env.collector_batch_size = wandb.config.collector_batch_size - main(EasyDict(pong_dqn_envpool_config), max_env_step=10000000) - print(time.time() - start_time) - wandb.log({"time_cost": time.time() - start_time}) - #remove the directory named as exp_name - shutil.rmtree(pong_dqn_envpool_config.exp_name) - - -if __name__ == "__main__": - - sweep_configuration = { - 'method': 'grid', - 'metric': { - 'goal': 'maximize', - 'name': 'time_cost' - }, - 'parameters': { - 'collector_env_num': { - 'values': [64] - }, - 'collector_batch_size': { - 'values': [64, 32, 16, 8] - }, - } - } - - sweep_id = wandb.sweep(sweep=sweep_configuration, project='Pong-v5-envpool-new-pipeline-speed-test') - - wandb.agent(sweep_id, function=sweep_main) diff --git a/ding/example/dqn_envpool_wandb_test.py b/ding/example/dqn_envpool_wandb_test.py deleted file mode 100644 index 4b6e7c15b7..0000000000 --- a/ding/example/dqn_envpool_wandb_test.py +++ /dev/null @@ -1,123 +0,0 @@ -import gym -import datetime -import wandb -import numpy as np -from easydict import EasyDict -from ditk import logging -from ding.data.model_loader import FileModelLoader -from ding.data.storage_loader import FileStorageLoader -from ding.model import DQN -from ding.policy import DQNPolicy -from ding.envs import DingEnvWrapper, BaseEnvManagerV2 -from ding.envs.env_manager.envpool_env_manager import PoolEnvManager, PoolEnvManagerV2 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger, epoch_timer -from ding.utils import set_pkg_seed - -from dizoo.atari.config.serial import pong_dqn_envpool_config - - -def main(cfg): - logging.getLogger().setLevel(logging.INFO) - cfg.exp_name = 'Pong-v5-DQN-envpool-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - cfg.env.collector_env_num = 64 - cfg.env.collector_batch_size = 64 - 
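Note on the sweep script removed above (dqn_envpool_wandb_sweep_pong.py): sweep_main drives a wandb grid sweep over collector_batch_size and reports wall-clock time_cost, skipping pairs where collector_env_num is not divisible by the batch size, and finally removes the experiment directory with shutil.rmtree. For readers unfamiliar with that pattern, here is a minimal, self-contained sketch of the same sweep skeleton; the objective body is a placeholder and the project name is illustrative, not a DI-engine API.

    import time
    import wandb

    def objective():
        # wandb.agent calls this once per grid point; hyperparameters arrive via wandb.config.
        wandb.init()
        env_num = wandb.config.collector_env_num
        batch_size = wandb.config.collector_batch_size
        if env_num % batch_size != 0:
            # Invalid pairing: mirror the deleted script and log a dummy value.
            wandb.log({"time_cost": 0.0})
            return
        start = time.time()
        # ... run the actual training pipeline here (omitted in this sketch) ...
        wandb.log({"time_cost": time.time() - start})

    sweep_configuration = {
        'method': 'grid',
        'metric': {'goal': 'maximize', 'name': 'time_cost'},
        'parameters': {
            'collector_env_num': {'values': [64]},
            'collector_batch_size': {'values': [64, 32, 16, 8]},
        },
    }

    if __name__ == "__main__":
        sweep_id = wandb.sweep(sweep=sweep_configuration, project='envpool-speed-test')
        wandb.agent(sweep_id, function=objective)

The deleted script follows the same structure; its objective simply runs the full envpool DQN pipeline and logs the elapsed time.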
cfg.env['test'] = True - collector_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.collector_env_num, - 'batch_size': cfg.env.collector_batch_size, - # env wrappers - 'episodic_life': True, # collector: True - 'reward_clip': False, # collector: True - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - 'test': cfg.env.test, - } - ) - cfg.env["collector_env_cfg"] = collector_env_cfg - evaluator_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.evaluator_env_num, - 'batch_size': cfg.env.evaluator_batch_size, - # env wrappers - 'episodic_life': False, # evaluator: False - 'reward_clip': False, # evaluator: False - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - 'test': cfg.env.test, - } - ) - cfg.env["evaluator_env_cfg"] = evaluator_env_cfg - cfg = compile_config(cfg, PoolEnvManagerV2, DQNPolicy, save_cfg=task.router.node_id == 0) - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = PoolEnvManagerV2(cfg.env.collector_env_cfg) - evaluator_env = PoolEnvManagerV2(cfg.env.evaluator_env_cfg) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = DQNPolicy(cfg.policy, model=model) - - # Consider the case with multiple processes - if task.router.is_active: - # You can use labels to distinguish between workers with different roles, - # here we use node_id to distinguish. - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - # Sync their context and model between each worker. - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - task.use(epoch_timer()) - - # Here is the part of single process pipeline. 
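Note on the test flag threaded into both env wrapper configs above: in the code visible in this patch, the env manager only uses it to enable an assertion after recv() that the returned info['env_id'] matches the ids the actions were just sent to, a sanity check on result ordering in the synchronous (batch_size == env_num) setting. A minimal sketch of that check in isolation, kept only as an illustration (the function name is made up):

    import numpy as np

    def check_recv_order(sent_env_id: np.ndarray, info: dict) -> None:
        # Raise if envpool returned results for a different set or order of envs
        # than the ones we just stepped.
        assert all(info['env_id'] == sent_env_id), (sent_env_id, info['env_id'])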
- task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use( - StepCollector( - cfg, - policy.collect_mode, - collector_env, - random_collect_size=cfg.policy.random_collect_size \ - if hasattr(cfg.policy, 'random_collect_size') else 0, - ) - ) - if "nstep" in cfg.policy and cfg.policy.nstep > 1: - task.use(nstep_reward_enhancer(cfg)) - task.use(data_pusher(cfg, buffer_)) - task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use( - wandb_online_logger( - metric_list=policy._monitor_vars_learn(), - model=policy._model, - anonymous=True, - project_name=cfg.exp_name, - wandb_sweep=False, - ) - ) - - #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) - task.use(termination_checker(max_env_step=10000000)) - - task.run() - - -if __name__ == "__main__": - main(pong_dqn_envpool_config) diff --git a/ding/framework/middleware/__init__.py b/ding/framework/middleware/__init__.py index bae9446ac9..aff23d79fc 100644 --- a/ding/framework/middleware/__init__.py +++ b/ding/framework/middleware/__init__.py @@ -1,6 +1,6 @@ from .functional import * -from .collector import StepCollector, EpisodeCollector, PPOFStepCollector, EnvpoolStepCollector, EnvpoolStepCollectorV2 -from .learner import OffPolicyLearner, HERLearner, OffPolicyLearnerV2, OffPolicyLearnerV3, OffPolicyLearnerV4 +from .collector import StepCollector, EpisodeCollector, PPOFStepCollector, EnvpoolStepCollector +from .learner import OffPolicyLearner, HERLearner, EnvpoolOffPolicyLearner from .ckpt_handler import CkptSaver from .distributer import ContextExchanger, ModelExchanger, PeriodicalModelExchanger from .barrier import Barrier, BarrierRuntime diff --git a/ding/framework/middleware/collector.py b/ding/framework/middleware/collector.py index 6c6f8e3567..3ed0773688 100644 --- a/ding/framework/middleware/collector.py +++ b/ding/framework/middleware/collector.py @@ -82,50 +82,6 @@ def __new__(cls, *args, **kwargs): return task.void() return super(EnvpoolStepCollector, cls).__new__(cls) - def __init__(self, cfg: EasyDict, policy, env: BaseEnvManager, random_collect_size: int = 0) -> None: - """ - Arguments: - - cfg (:obj:`EasyDict`): Config. - - policy (:obj:`Policy`): The policy to be collected. - - env (:obj:`BaseEnvManager`): The env for the collection, the BaseEnvManager object or \ - its derivatives are supported. - - random_collect_size (:obj:`int`): The count of samples that will be collected randomly, \ - typically used in initial runs. - """ - self.cfg = cfg - self.env = env - self.policy = policy - self.random_collect_size = random_collect_size - - def __call__(self, ctx: "OnlineRLContext") -> None: - """ - Overview: - An encapsulation of inference and rollout middleware. Stop when completing \ - the target number of steps. - Input of ctx: - - env_step (:obj:`int`): The env steps which will increase during collection. 
- """ - start = time.time() - old = ctx.env_step - if self.random_collect_size > 0 and old < self.random_collect_size: - target_size = self.random_collect_size - old - trajectories = self.env.collect_data(target_size) - else: - # compatible with old config, a train sample = unroll_len step - target_size = self.cfg.policy.collect.n_sample * self.cfg.policy.collect.unroll_len - trajectories = self.env.collect_data(target_size, self.policy, policy_forward_kwargs=ctx.collect_kwargs) - ctx.trajectories = trajectories - ctx.env_step += len(ctx.trajectories) - ctx.collector_time += time.time() - start - - -class EnvpoolStepCollectorV2: - - def __new__(cls, *args, **kwargs): - if task.router.is_active and not task.has_role(task.role.COLLECTOR): - return task.void() - return super(EnvpoolStepCollectorV2, cls).__new__(cls) - def __init__(self, cfg: EasyDict, policy, env: BaseEnvManager, random_collect_size: int = 0) -> None: """ Arguments: diff --git a/ding/framework/middleware/learner.py b/ding/framework/middleware/learner.py index 4aeb7eced6..1a3692ee84 100644 --- a/ding/framework/middleware/learner.py +++ b/ding/framework/middleware/learner.py @@ -121,7 +121,7 @@ def __call__(self, ctx: "OnlineRLContext") -> None: print("time_trainer:time_fetcher={}:{}={}".format(time_trainer, time_fetcher, time_trainer / time_fetcher)) -class OffPolicyLearnerV2: +class EnvpoolOffPolicyLearner: """ Overview: The class of the off-policy learner, including data fetching and model training. Use \ @@ -131,91 +131,7 @@ class OffPolicyLearnerV2: def __new__(cls, *args, **kwargs): if task.router.is_active and not task.has_role(task.role.LEARNER): return task.void() - return super(OffPolicyLearnerV2, cls).__new__(cls) - - def __init__( - self, - cfg: EasyDict, - policy: 'Policy', - buffer_: Union[Buffer, List[Tuple[Buffer, float]], Dict[str, Buffer]], - reward_model: Optional['BaseRewardModel'] = None, - log_freq: int = 100, - ) -> None: - """ - Arguments: - - cfg (:obj:`EasyDict`): Config. - - policy (:obj:`Policy`): The policy to be trained. - - buffer (:obj:`Buffer`): The replay buffer to store the data for training. - - reward_model (:obj:`BaseRewardModel`): Additional reward estimator likes RND, ICM, etc. \ - default to None. - - log_freq (:obj:`int`): The frequency (iteration) of showing log. - """ - self.cfg = cfg - self._fetcher = task.wrap(offpolicy_data_fetcher_v2(cfg, buffer_)) - self._trainer = task.wrap(trainer(cfg, policy.learn_mode, log_freq=log_freq)) - if reward_model is not None: - self._reward_estimator = task.wrap(reward_estimator(cfg, reward_model)) - else: - self._reward_estimator = None - - def __call__(self, ctx: "OnlineRLContext") -> None: - """ - Output of ctx: - - train_output (:obj:`Deque`): The training output in deque. 
- """ - start = time.time() - time_fetcher = 0.0 - time_process_data = 0.0 - time_trainer = 0.0 - - train_output_queue = [] - - for _ in range(self.cfg.policy.learn.update_per_collect): - start_fetch_data = time.time() - self._fetcher(ctx) - time_fetcher += time.time() - start_fetch_data - - start_process_data = time.time() - ctx.train_data = fast_preprocess_learn( - ctx.train_data_sample, - use_priority=False, #policy._cfg.priority, - use_priority_IS_weight=False, #policy._cfg.priority_IS_weight, - cuda=True, #policy._cuda, - device="cuda:0", #policy._device, - ) - time_process_data += time.time() - start_process_data - - if self._reward_estimator: - self._reward_estimator(ctx) - - start_trainer = time.time() - self._trainer(ctx) - time_trainer += time.time() - start_trainer - - train_output_queue.append(ctx.train_output) - ctx.train_output_for_post_process = ctx.train_output - - ctx.train_output = train_output_queue - ctx.learner_time += time.time() - start - #print("time_fetcher:time_trainer={}:{}={}".format(time_fetcher, time_trainer, time_fetcher / time_trainer)) - #print( - # "time_process_data:time_trainer={}:{}={}".format( - # time_process_data, time_trainer, time_process_data / time_trainer - # ) - #) - - -class OffPolicyLearnerV3: - """ - Overview: - The class of the off-policy learner, including data fetching and model training. Use \ - the `__call__` method to execute the whole learning process. - """ - - def __new__(cls, *args, **kwargs): - if task.router.is_active and not task.has_role(task.role.LEARNER): - return task.void() - return super(OffPolicyLearnerV3, cls).__new__(cls) + return super(EnvpoolOffPolicyLearner, cls).__new__(cls) def __init__( self, @@ -296,99 +212,6 @@ def __call__(self, ctx: "OnlineRLContext") -> None: ctx.train_output = train_output_queue ctx.learner_time += time.time() - start - #print("time_fetcher:time_fetch_data={}:{}={}".format(time_fetcher, time_fetch_data, time_fetcher / time_fetch_data)) - #print("time_trainer:time_get_data={}:{}={}".format(time_trainer, time_get_data, time_trainer / time_get_data)) - #print("time_trainer:time_fetcher={}:{}={}".format(time_trainer, time_fetcher, time_trainer / time_fetcher)) - - -class OffPolicyLearnerV4: - """ - Overview: - The class of the off-policy learner, including data fetching and model training. Use \ - the `__call__` method to execute the whole learning process. - """ - - def __new__(cls, *args, **kwargs): - if task.router.is_active and not task.has_role(task.role.LEARNER): - return task.void() - return super(OffPolicyLearnerV4, cls).__new__(cls) - - def __init__( - self, - cfg: EasyDict, - policy: 'Policy', - buffer_: Union[Buffer, List[Tuple[Buffer, float]], Dict[str, Buffer]], - reward_model: Optional['BaseRewardModel'] = None, - log_freq: int = 100, - ) -> None: - """ - Arguments: - - cfg (:obj:`EasyDict`): Config. - - policy (:obj:`Policy`): The policy to be trained. - - buffer (:obj:`Buffer`): The replay buffer to store the data for training. - - reward_model (:obj:`BaseRewardModel`): Additional reward estimator likes RND, ICM, etc. \ - default to None. - - log_freq (:obj:`int`): The frequency (iteration) of showing log. 
- """ - self.cfg = cfg - self._fetcher = task.wrap(offpolicy_data_fetcher_v2(cfg, buffer_)) - self._trainer = task.wrap(trainer(cfg, policy.learn_mode, log_freq=log_freq)) - if reward_model is not None: - self._reward_estimator = task.wrap(reward_estimator(cfg, reward_model)) - else: - self._reward_estimator = None - - def __call__(self, ctx: "OnlineRLContext") -> None: - """ - Output of ctx: - - train_output (:obj:`Deque`): The training output in deque. - """ - start = time.time() - time_fetcher = 0.0 - time_process_data = 0.0 - time_trainer = 0.0 - - train_output_queue = [] - train_data_processed = Queue() - - for _ in range(self.cfg.policy.learn.update_per_collect): - start_fetch_data = time.time() - self._fetcher(ctx) - time_fetcher += time.time() - start_fetch_data - - start_process_data = time.time() - train_data = fast_preprocess_learn( - ctx.train_data_sample, - use_priority=False, #policy._cfg.priority, - use_priority_IS_weight=False, #policy._cfg.priority_IS_weight, - cuda=True, #policy._cuda, - device="cuda:0", #policy._device, - ) - time_process_data += time.time() - start_process_data - - train_data_processed.put(train_data) - - if self._reward_estimator: - self._reward_estimator(ctx) - - for _ in range(self.cfg.policy.learn.update_per_collect): - - start_trainer = time.time() - ctx.train_data = train_data_processed.get() - self._trainer(ctx) - time_trainer += time.time() - start_trainer - - train_output_queue.append(ctx.train_output) - ctx.train_output_for_post_process = ctx.train_output - - ctx.train_output = train_output_queue - ctx.learner_time += time.time() - start - #print("time_fetcher:time_trainer={}:{}={}".format(time_fetcher, time_trainer, time_fetcher / time_trainer)) - #print( - # "time_process_data:time_trainer={}:{}={}".format( - # time_process_data, time_trainer, time_process_data / time_trainer - # ) - #) class HERLearner: diff --git a/ding/model/common/head.py b/ding/model/common/head.py index ddb8014c2e..a666cf0c0a 100755 --- a/ding/model/common/head.py +++ b/ding/model/common/head.py @@ -832,7 +832,7 @@ def __init__( v_layer_num = layer_num layer = NoiseLinearLayer if noise else nn.Linear block = noise_block if noise else fc_block - if a_layer_num>0: + if a_layer_num > 0: self.A = nn.Sequential( MLP( hidden_size, @@ -848,7 +848,7 @@ def __init__( ) else: self.A = block(hidden_size, output_size) - if v_layer_num>0: + if v_layer_num > 0: self.V = nn.Sequential( MLP( hidden_size, diff --git a/ding/policy/dqn.py b/ding/policy/dqn.py index 146e259044..f5e26e230d 100644 --- a/ding/policy/dqn.py +++ b/ding/policy/dqn.py @@ -734,7 +734,7 @@ def _init_learn(self) -> None: self._learn_model = model_wrap(self._model, wrapper_name='argmax_sample') self._learn_model.reset() self._target_model.reset() - self.time_counter=dict( + self.time_counter = dict( set_model_train_time=0, forward_q_value_time=0, forward_target_next_time=0, @@ -850,7 +850,6 @@ def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: #torch.cuda.synchronize() target_update_time = time.time() - start - time_learn_total = time.time() - start_total # print(f"set_model_train_time:time_learn={set_model_train_time}:{time_learn_total}={set_model_train_time/time_learn_total}") @@ -883,7 +882,6 @@ def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: # print(f"gradient_step_time:time_learn={self.time_counter['gradient_step_time']}:{self.time_counter['time_learn_total']}={self.time_counter['gradient_step_time']/self.time_counter['time_learn_total']}") # 
print(f"target_update_time:time_learn={self.time_counter['target_update_time']}:{self.time_counter['time_learn_total']}={self.time_counter['target_update_time']/self.time_counter['time_learn_total']}") - return { 'cur_lr': self._optimizer.defaults['lr'], 'total_loss': loss.item(), diff --git a/dizoo/atari/config/serial/pong/__init__.py b/dizoo/atari/config/serial/pong/__init__.py index ec63a0ce04..5ce3db9a5b 100644 --- a/dizoo/atari/config/serial/pong/__init__.py +++ b/dizoo/atari/config/serial/pong/__init__.py @@ -1,9 +1,3 @@ from .pong_dqn_config import pong_dqn_config, pong_dqn_create_config from .pong_dqn_envpool_config import pong_dqn_envpool_config, pong_dqn_envpool_create_config -from .pong_dqn_envpool_shink_model_config import pong_dqn_envpool_shink_model_config -from .pong_dqn_envpool_large_batch_config import pong_dqn_envpool_large_batch_config -from .pong_dqn_envpool_large_learning_config import pong_dqn_envpool_large_learning_config -from .pong_dqn_envpool_priority_config import pong_dqn_envpool_priority_config -from .pong_dqn_envpool_target_update_100_config import pong_dqn_envpool_target_update_100_config -from .pong_dqn_envpool_update_per_collect_5_config import pong_dqn_envpool_update_per_collect_5_config from .pong_dqfd_config import pong_dqfd_config, pong_dqfd_create_config \ No newline at end of file diff --git a/dizoo/atari/config/serial/pong/pong_dqn_envpool_large_batch_config.py b/dizoo/atari/config/serial/pong/pong_dqn_envpool_large_batch_config.py deleted file mode 100644 index d64ae19819..0000000000 --- a/dizoo/atari/config/serial/pong/pong_dqn_envpool_large_batch_config.py +++ /dev/null @@ -1,63 +0,0 @@ -from easydict import EasyDict - -pong_dqn_envpool_large_batch_config = dict( - exp_name='pong_dqn_envpool_large_batch_seed0', - env=dict( - collector_env_num=8, - collector_batch_size=8, - evaluator_env_num=8, - evaluator_batch_size=8, - n_evaluator_episode=8, - stop_value=20, - env_id='Pong-v5', - #'ALE/Pong-v5' is available. But special setting is needed after gym make. 
- frame_stack=4, - ), - policy=dict( - cuda=True, - priority=False, - random_collect_size=50000, - model=dict( - obs_shape=[4, 84, 84], - action_shape=6, - encoder_hidden_size_list=[128, 128, 512], - ), - nstep=3, - discount_factor=0.99, - learn=dict( - update_per_collect=3, - batch_size=64, - learning_rate=0.0003, - target_update_freq=500, - ), - collect=dict(n_sample=96, ), - eval=dict(evaluator=dict(eval_freq=4000, )), - other=dict( - eps=dict( - type='exp', - start=1., - end=0.05, - decay=250000, - ), - replay_buffer=dict(replay_buffer_size=100000, ), - ), - ), -) -pong_dqn_envpool_large_batch_config = EasyDict(pong_dqn_envpool_large_batch_config) -main_config = pong_dqn_envpool_large_batch_config -pong_dqn_envpool_create_config = dict( - env=dict( - type='atari', - import_names=['dizoo.atari.envs.atari_env'], - ), - env_manager=dict(type='env_pool'), - policy=dict(type='dqn'), - replay_buffer=dict(type='deque'), -) -pong_dqn_envpool_create_config = EasyDict(pong_dqn_envpool_create_config) -create_config = pong_dqn_envpool_create_config - -if __name__ == '__main__': - # or you can enter `ding -m serial -c pong_dqn_envpool_config.py -s 0` - from ding.entry import serial_pipeline - serial_pipeline((main_config, create_config), seed=0) diff --git a/dizoo/atari/config/serial/pong/pong_dqn_envpool_large_learning_config.py b/dizoo/atari/config/serial/pong/pong_dqn_envpool_large_learning_config.py deleted file mode 100644 index d74b24ca6d..0000000000 --- a/dizoo/atari/config/serial/pong/pong_dqn_envpool_large_learning_config.py +++ /dev/null @@ -1,63 +0,0 @@ -from easydict import EasyDict - -pong_dqn_envpool_large_learning_config = dict( - exp_name='pong_dqn_envpool_large_batch_seed0', - env=dict( - collector_env_num=8, - collector_batch_size=8, - evaluator_env_num=8, - evaluator_batch_size=8, - n_evaluator_episode=8, - stop_value=20, - env_id='Pong-v5', - #'ALE/Pong-v5' is available. But special setting is needed after gym make. 
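Note: all of the envpool DQN configs deleted in this series share the same exploration and bootstrap settings: exponential epsilon decay from 1.0 to 0.05 with decay=250000, nstep=3 and discount_factor=0.99. As a rough numerical illustration, one common form of exponential epsilon decay (an assumption here, not necessarily DI-engine's exact schedule) and the corresponding 3-step return weighting look like this:

    import math

    def eps_exp(step, start=1.0, end=0.05, decay=250000):
        # A common exponential schedule; treat the formula as an assumption, not the library's exact one.
        return end + (start - end) * math.exp(-step / decay)

    gamma = 0.99
    rewards = [1.0, 0.0, 1.0]                    # three consecutive rewards (made-up numbers)
    nstep_reward = sum(gamma ** k * r for k, r in enumerate(rewards))   # r0 + g*r1 + g^2*r2
    value_gamma = gamma ** len(rewards)          # factor multiplying the bootstrapped target value

    print(eps_exp(0), eps_exp(250000))           # 1.0, roughly 0.40
    print(nstep_reward, value_gamma)             # 1.9801, 0.970299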
- frame_stack=4, - ), - policy=dict( - cuda=True, - priority=False, - random_collect_size=50000, - model=dict( - obs_shape=[4, 84, 84], - action_shape=6, - encoder_hidden_size_list=[128, 128, 512], - ), - nstep=3, - discount_factor=0.99, - learn=dict( - update_per_collect=3, - batch_size=64, - learning_rate=0.001, - target_update_freq=500, - ), - collect=dict(n_sample=96, ), - eval=dict(evaluator=dict(eval_freq=4000, )), - other=dict( - eps=dict( - type='exp', - start=1., - end=0.05, - decay=250000, - ), - replay_buffer=dict(replay_buffer_size=100000, ), - ), - ), -) -pong_dqn_envpool_large_learning_config = EasyDict(pong_dqn_envpool_large_learning_config) -main_config = pong_dqn_envpool_large_learning_config -pong_dqn_envpool_create_config = dict( - env=dict( - type='atari', - import_names=['dizoo.atari.envs.atari_env'], - ), - env_manager=dict(type='env_pool'), - policy=dict(type='dqn'), - replay_buffer=dict(type='deque'), -) -pong_dqn_envpool_create_config = EasyDict(pong_dqn_envpool_create_config) -create_config = pong_dqn_envpool_create_config - -if __name__ == '__main__': - # or you can enter `ding -m serial -c pong_dqn_envpool_config.py -s 0` - from ding.entry import serial_pipeline - serial_pipeline((main_config, create_config), seed=0) diff --git a/dizoo/atari/config/serial/pong/pong_dqn_envpool_priority_config.py b/dizoo/atari/config/serial/pong/pong_dqn_envpool_priority_config.py deleted file mode 100644 index 9cb58445a7..0000000000 --- a/dizoo/atari/config/serial/pong/pong_dqn_envpool_priority_config.py +++ /dev/null @@ -1,63 +0,0 @@ -from easydict import EasyDict - -pong_dqn_envpool_priority_config = dict( - exp_name='pong_dqn_envpool_large_batch_seed0', - env=dict( - collector_env_num=8, - collector_batch_size=8, - evaluator_env_num=8, - evaluator_batch_size=8, - n_evaluator_episode=8, - stop_value=20, - env_id='Pong-v5', - #'ALE/Pong-v5' is available. But special setting is needed after gym make. 
- frame_stack=4, - ), - policy=dict( - cuda=True, - priority=True, - random_collect_size=50000, - model=dict( - obs_shape=[4, 84, 84], - action_shape=6, - encoder_hidden_size_list=[128, 128, 512], - ), - nstep=3, - discount_factor=0.99, - learn=dict( - update_per_collect=3, - batch_size=64, - learning_rate=0.0003, - target_update_freq=500, - ), - collect=dict(n_sample=96, ), - eval=dict(evaluator=dict(eval_freq=4000, )), - other=dict( - eps=dict( - type='exp', - start=1., - end=0.05, - decay=250000, - ), - replay_buffer=dict(replay_buffer_size=100000, ), - ), - ), -) -pong_dqn_envpool_priority_config = EasyDict(pong_dqn_envpool_priority_config) -main_config = pong_dqn_envpool_priority_config -pong_dqn_envpool_create_config = dict( - env=dict( - type='atari', - import_names=['dizoo.atari.envs.atari_env'], - ), - env_manager=dict(type='env_pool'), - policy=dict(type='dqn'), - replay_buffer=dict(type='deque'), -) -pong_dqn_envpool_create_config = EasyDict(pong_dqn_envpool_create_config) -create_config = pong_dqn_envpool_create_config - -if __name__ == '__main__': - # or you can enter `ding -m serial -c pong_dqn_envpool_config.py -s 0` - from ding.entry import serial_pipeline - serial_pipeline((main_config, create_config), seed=0) diff --git a/dizoo/atari/config/serial/pong/pong_dqn_envpool_shink_model_config.py b/dizoo/atari/config/serial/pong/pong_dqn_envpool_shink_model_config.py deleted file mode 100644 index b9980ca298..0000000000 --- a/dizoo/atari/config/serial/pong/pong_dqn_envpool_shink_model_config.py +++ /dev/null @@ -1,64 +0,0 @@ -from easydict import EasyDict - -pong_dqn_envpool_shink_model_config = dict( - exp_name='pong_dqn_envpool_shink_model_seed0', - env=dict( - collector_env_num=8, - collector_batch_size=8, - evaluator_env_num=8, - evaluator_batch_size=8, - n_evaluator_episode=8, - stop_value=20, - env_id='Pong-v5', - #'ALE/Pong-v5' is available. But special setting is needed after gym make. 
- frame_stack=4, - ), - policy=dict( - cuda=True, - priority=False, - random_collect_size=50000, - model=dict( - obs_shape=[4, 84, 84], - action_shape=6, - encoder_hidden_size_list=[32, 32, 64], - # encoder_hidden_size_list=[128, 128, 512], - ), - nstep=3, - discount_factor=0.99, - learn=dict( - update_per_collect=10, - batch_size=32, - learning_rate=0.0001, - target_update_freq=500, - ), - collect=dict(n_sample=96, ), - eval=dict(evaluator=dict(eval_freq=4000, )), - other=dict( - eps=dict( - type='exp', - start=1., - end=0.05, - decay=250000, - ), - replay_buffer=dict(replay_buffer_size=100000, ), - ), - ), -) -pong_dqn_envpool_shink_model_config = EasyDict(pong_dqn_envpool_shink_model_config) -main_config = pong_dqn_envpool_shink_model_config -pong_dqn_envpool_create_config = dict( - env=dict( - type='atari', - import_names=['dizoo.atari.envs.atari_env'], - ), - env_manager=dict(type='env_pool'), - policy=dict(type='dqn'), - replay_buffer=dict(type='deque'), -) -pong_dqn_envpool_create_config = EasyDict(pong_dqn_envpool_create_config) -create_config = pong_dqn_envpool_create_config - -if __name__ == '__main__': - # or you can enter `ding -m serial -c pong_dqn_envpool_config.py -s 0` - from ding.entry import serial_pipeline - serial_pipeline((main_config, create_config), seed=0) diff --git a/dizoo/atari/config/serial/pong/pong_dqn_envpool_target_update_100_config.py b/dizoo/atari/config/serial/pong/pong_dqn_envpool_target_update_100_config.py deleted file mode 100644 index 54f89e32b3..0000000000 --- a/dizoo/atari/config/serial/pong/pong_dqn_envpool_target_update_100_config.py +++ /dev/null @@ -1,63 +0,0 @@ -from easydict import EasyDict - -pong_dqn_envpool_target_update_100_config = dict( - exp_name='pong_dqn_envpool_large_batch_seed0', - env=dict( - collector_env_num=8, - collector_batch_size=8, - evaluator_env_num=8, - evaluator_batch_size=8, - n_evaluator_episode=8, - stop_value=20, - env_id='Pong-v5', - #'ALE/Pong-v5' is available. But special setting is needed after gym make. 
- frame_stack=4, - ), - policy=dict( - cuda=True, - priority=False, - random_collect_size=50000, - model=dict( - obs_shape=[4, 84, 84], - action_shape=6, - encoder_hidden_size_list=[128, 128, 512], - ), - nstep=3, - discount_factor=0.99, - learn=dict( - update_per_collect=3, - batch_size=64, - learning_rate=0.0003, - target_update_freq=100, - ), - collect=dict(n_sample=96, ), - eval=dict(evaluator=dict(eval_freq=4000, )), - other=dict( - eps=dict( - type='exp', - start=1., - end=0.05, - decay=250000, - ), - replay_buffer=dict(replay_buffer_size=100000, ), - ), - ), -) -pong_dqn_envpool_target_update_100_config = EasyDict(pong_dqn_envpool_target_update_100_config) -main_config = pong_dqn_envpool_target_update_100_config -pong_dqn_envpool_create_config = dict( - env=dict( - type='atari', - import_names=['dizoo.atari.envs.atari_env'], - ), - env_manager=dict(type='env_pool'), - policy=dict(type='dqn'), - replay_buffer=dict(type='deque'), -) -pong_dqn_envpool_create_config = EasyDict(pong_dqn_envpool_create_config) -create_config = pong_dqn_envpool_create_config - -if __name__ == '__main__': - # or you can enter `ding -m serial -c pong_dqn_envpool_config.py -s 0` - from ding.entry import serial_pipeline - serial_pipeline((main_config, create_config), seed=0) diff --git a/dizoo/atari/config/serial/pong/pong_dqn_envpool_update_per_collect_5_config.py b/dizoo/atari/config/serial/pong/pong_dqn_envpool_update_per_collect_5_config.py deleted file mode 100644 index 8523b54334..0000000000 --- a/dizoo/atari/config/serial/pong/pong_dqn_envpool_update_per_collect_5_config.py +++ /dev/null @@ -1,63 +0,0 @@ -from easydict import EasyDict - -pong_dqn_envpool_update_per_collect_5_config = dict( - exp_name='pong_dqn_envpool_large_batch_seed0', - env=dict( - collector_env_num=8, - collector_batch_size=8, - evaluator_env_num=8, - evaluator_batch_size=8, - n_evaluator_episode=8, - stop_value=20, - env_id='Pong-v5', - #'ALE/Pong-v5' is available. But special setting is needed after gym make. 
- frame_stack=4, - ), - policy=dict( - cuda=True, - priority=False, - random_collect_size=50000, - model=dict( - obs_shape=[4, 84, 84], - action_shape=6, - encoder_hidden_size_list=[128, 128, 512], - ), - nstep=3, - discount_factor=0.99, - learn=dict( - update_per_collect=5, - batch_size=64, - learning_rate=0.0003, - target_update_freq=500, - ), - collect=dict(n_sample=96, ), - eval=dict(evaluator=dict(eval_freq=4000, )), - other=dict( - eps=dict( - type='exp', - start=1., - end=0.05, - decay=250000, - ), - replay_buffer=dict(replay_buffer_size=100000, ), - ), - ), -) -pong_dqn_envpool_update_per_collect_5_config = EasyDict(pong_dqn_envpool_update_per_collect_5_config) -main_config = pong_dqn_envpool_update_per_collect_5_config -pong_dqn_envpool_create_config = dict( - env=dict( - type='atari', - import_names=['dizoo.atari.envs.atari_env'], - ), - env_manager=dict(type='env_pool'), - policy=dict(type='dqn'), - replay_buffer=dict(type='deque'), -) -pong_dqn_envpool_create_config = EasyDict(pong_dqn_envpool_create_config) -create_config = pong_dqn_envpool_create_config - -if __name__ == '__main__': - # or you can enter `ding -m serial -c pong_dqn_envpool_config.py -s 0` - from ding.entry import serial_pipeline - serial_pipeline((main_config, create_config), seed=0) From ed0f4905a020b6321ca377090dad2e90f46825c6 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 1 Nov 2023 15:49:54 +0800 Subject: [PATCH 231/244] polish code --- ding/envs/env_manager/envpool_env_manager.py | 466 +------------------ ding/example/dqn_envpool.py | 100 ---- ding/example/dqn_envpool_nstep.py | 8 +- ding/example/dqn_envpool_wandb.py | 118 ----- ding/example/dqn_envpool_wandb_main.py | 130 ------ 5 files changed, 6 insertions(+), 816 deletions(-) delete mode 100644 ding/example/dqn_envpool.py delete mode 100644 ding/example/dqn_envpool_wandb.py delete mode 100644 ding/example/dqn_envpool_wandb_main.py diff --git a/ding/envs/env_manager/envpool_env_manager.py b/ding/envs/env_manager/envpool_env_manager.py index e6ad96af3e..5a273507c9 100644 --- a/ding/envs/env_manager/envpool_env_manager.py +++ b/ding/envs/env_manager/envpool_env_manager.py @@ -171,473 +171,11 @@ def action_space(self) -> 'gym.spaces.Space': # noqa return self._action_space -@ENV_MANAGER_REGISTRY.register('env_pool_v2') -class PoolEnvManagerV2: - ''' - Overview: - Envpool now supports Atari, Classic Control, Toy Text, ViZDoom. - Here we list some commonly used env_ids as follows. - For more examples, you can refer to . 
- - - Atari: "Pong-v5", "SpaceInvaders-v5", "Qbert-v5" - - Classic Control: "CartPole-v0", "CartPole-v1", "Pendulum-v1" - ''' - - @classmethod - def default_config(cls) -> EasyDict: - return EasyDict(deepcopy(cls.config)) - - config = dict( - type='envpool', - # Sync mode: batch_size == env_num - # Async mode: batch_size < env_num - env_num=8, - batch_size=8, - ) - - def __init__(self, cfg: EasyDict) -> None: - super().__init__() - self._cfg = cfg - self._env_num = cfg.env_num - self._batch_size = cfg.batch_size - self._ready_obs = {} - self._closed = True - self._seed = None - self._test = False - - def launch(self) -> None: - assert self._closed, "Please first close the env manager" - if self._seed is None: - seed = 0 - else: - seed = self._seed - - kwargs = {} - if "episodic_life" in self._cfg: - kwargs["episodic_life"] = self._cfg.episodic_life - if "reward_clip" in self._cfg: - kwargs["reward_clip"] = self._cfg.reward_clip - if "stack_num" in self._cfg: - kwargs["stack_num"] = self._cfg.stack_num - if "gray_scale" in self._cfg: - kwargs["gray_scale"] = self._cfg.gray_scale - if "frame_skip" in self._cfg: - kwargs["frame_skip"] = self._cfg.frame_skip - if "test" in self._cfg: - self._test = self._cfg.test - - self._envs = envpool.make( - task_id=self._cfg.env_id, - env_type="gym", - num_envs=self._env_num, - batch_size=self._batch_size, - seed=seed, - **kwargs - ) - self._action_space = self._envs.action_space - self._observation_space = self._envs.observation_space - self._closed = False - self.reset() - - def reset(self) -> None: - self._ready_obs = {} - self._envs.async_reset() - while True: - obs, _, _, info = self._envs.recv() - env_id = info['env_id'] - obs = obs.astype(np.float32) - obs /= 255.0 - self._ready_obs = deep_merge_dicts({i: o for i, o in zip(env_id, obs)}, self._ready_obs) - if len(self._ready_obs) == self._env_num: - break - self._eval_episode_return = [0. for _ in range(self._env_num)] - - def step(self, action: Union[List, np.ndarray]) -> Dict[int, namedtuple]: - env_id = np.array(list(self._ready_obs.keys())) - action = np.array(action) - if len(action.shape) == 2: - action = action.squeeze(1) - self._envs.send(action, env_id) - - obs, rew, done, info = self._envs.recv() - if self._test: - assert all(info['env_id'] == env_id) - obs = obs.astype(np.float32) - obs /= 255.0 - rew = rew.astype(np.float32) - env_id = info['env_id'] - timesteps = {} - new_data = [] - self._ready_obs = {} - for i in range(len(env_id)): - d = bool(done[i]) - r = to_ndarray([rew[i]]) - self._eval_episode_return[env_id[i]] += r - info_dict = {'env_id': i} - timesteps[env_id[i]] = BaseEnvTimestep(obs[i], r, d, info=info_dict) - if d: - info_dict['eval_episode_return'] = self._eval_episode_return[env_id[i]] - timesteps[env_id[i]].info['eval_episode_return'] = info_dict['eval_episode_return'] - self._eval_episode_return[env_id[i]] = 0. 
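Note: every PoolEnvManager variant in this file, including the one kept after this commit, drives envpool through the same asynchronous protocol used by the reset and step methods above: async_reset(), then repeated recv() / send(action, env_id) pairs, with info['env_id'] identifying which environments the returned batch belongs to. A stripped-down random-action loop built only from calls that already appear in this file (env id, sizes and step count are arbitrary):

    import numpy as np
    import envpool

    envs = envpool.make(task_id="Pong-v5", env_type="gym", num_envs=8, batch_size=8, seed=0)
    envs.async_reset()
    for _ in range(10):
        obs, rew, done, info = envs.recv()
        env_id = info["env_id"]
        obs = obs.astype(np.float32) / 255.0   # same normalization as the manager above
        action = np.array([envs.action_space.sample() for _ in env_id])
        envs.send(action, env_id)              # answer exactly the envs that just reported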
- new_data.append(tnp.array({'obs': obs[i], 'reward': r, 'done': d, 'info': info_dict, 'env_id': env_id[i]})) - self._ready_obs[env_id[i]] = obs[i] - return new_data - - def step_v2(self, action: Dict[int, np.ndarray]) -> Dict[int, namedtuple]: - # env_id = np.array(list(action.keys())) - # action = np.array(action) - # if len(action.shape) == 2: - # action = action.squeeze(1) - self._envs.send(action) - - obs, rew, done, info = self._envs.recv() - if self._test: - assert all(info['env_id'] == env_id) - obs = obs.astype(np.float32) - obs /= 255.0 - rew = rew.astype(np.float32) - env_id = info['env_id'] - timesteps = {} - new_data = [] - self._ready_obs = {} - for i in range(len(env_id)): - d = bool(done[i]) - r = to_ndarray([rew[i]]) - self._eval_episode_return[env_id[i]] += r - info_dict = {'env_id': i} - timesteps[env_id[i]] = BaseEnvTimestep(obs[i], r, d, info=info_dict) - if d: - info_dict['eval_episode_return'] = self._eval_episode_return[env_id[i]] - timesteps[env_id[i]].info['eval_episode_return'] = info_dict['eval_episode_return'] - self._eval_episode_return[env_id[i]] = 0. - new_data.append(tnp.array({'obs': obs[i], 'reward': r, 'done': d, 'info': info_dict, 'env_id': env_id[i]})) - self._ready_obs[env_id[i]] = obs[i] - return new_data - - def close(self) -> None: - if self._closed: - return - # Envpool has no `close` API - self._closed = True - - @property - def closed(self) -> None: - return self._closed - - def seed(self, seed: int, dynamic_seed=False) -> None: - # The i-th environment seed in Envpool will be set with i+seed, so we don't do extra transformation here - self._seed = seed - logging.warning("envpool doesn't support dynamic_seed in different episode") - - @property - def env_num(self) -> int: - return self._env_num - - @property - def ready_obs(self) -> tnp.array: - if isinstance(self._ready_obs, dict): - obs = [tnp.array(o) for k, o in self._ready_obs.items()] - return tnp.stack(obs) - else: - raise NotImplementedError - - @property - def ready_obs_v2(self) -> tnp.array: - if self._ready_obs is not None: - return self._ready_obs - else: - raise ValueError - - @property - def observation_space(self) -> 'gym.spaces.Space': # noqa - try: - return self._observation_space - except AttributeError: - self.launch() - self.close() - self._ready_obs = {} - return self._observation_space - - @property - def action_space(self) -> 'gym.spaces.Space': # noqa - try: - return self._action_space - except AttributeError: - self.launch() - self.close() - self._ready_obs = {} - return self._action_space - - -@ENV_MANAGER_REGISTRY.register('env_pool_v3') -class PoolEnvManagerV3: - ''' - Overview: - Envpool now supports Atari, Classic Control, Toy Text, ViZDoom. - Here we list some commonly used env_ids as follows. - For more examples, you can refer to . 
- - - Atari: "Pong-v5", "SpaceInvaders-v5", "Qbert-v5" - - Classic Control: "CartPole-v0", "CartPole-v1", "Pendulum-v1" - ''' - - @classmethod - def default_config(cls) -> EasyDict: - return EasyDict(deepcopy(cls.config)) - - config = dict( - type='envpool', - # Sync mode: batch_size == env_num - # Async mode: batch_size < env_num - env_num=8, - batch_size=8, - ) - - def __init__(self, cfg: EasyDict) -> None: - super().__init__() - self._cfg = cfg - self._env_num = cfg.env_num - self._batch_size = cfg.batch_size - self._ready_obs = {} - self._closed = True - self._seed = None - self._test = False - - def launch(self) -> None: - assert self._closed, "Please first close the env manager" - if self._seed is None: - seed = 0 - else: - seed = self._seed - - kwargs = {} - if "episodic_life" in self._cfg: - kwargs["episodic_life"] = self._cfg.episodic_life - if "reward_clip" in self._cfg: - kwargs["reward_clip"] = self._cfg.reward_clip - if "stack_num" in self._cfg: - kwargs["stack_num"] = self._cfg.stack_num - if "gray_scale" in self._cfg: - kwargs["gray_scale"] = self._cfg.gray_scale - if "frame_skip" in self._cfg: - kwargs["frame_skip"] = self._cfg.frame_skip - if "test" in self._cfg: - self._test = self._cfg.test - - self._envs = envpool.make( - task_id=self._cfg.env_id, - env_type="gym", - num_envs=self._env_num, - batch_size=self._batch_size, - seed=seed, - **kwargs - ) - self._action_space = self._envs.action_space - self._observation_space = self._envs.observation_space - self._closed = False - self.reset() - - def reset(self) -> None: - self._ready_obs = {} - self._ready_obs_send = {} - self._ready_action_send = {} - self._envs.async_reset() - while True: - obs, _, _, info = self._envs.recv() - env_id = info['env_id'] - obs = obs.astype(np.float32) - obs /= 255.0 - self._ready_obs = deep_merge_dicts({i: o for i, o in zip(env_id, obs)}, self._ready_obs) - if len(self._ready_obs) == self._env_num: - break - self._eval_episode_return = [0. for _ in range(self._env_num)] - - def step(self, action: Union[List, np.ndarray]) -> Dict[int, namedtuple]: - env_id = np.array(list(self._ready_obs.keys())) - action = np.array(action) - if len(action.shape) == 2: - action = action.squeeze(1) - self._envs.send(action, env_id) - - obs, rew, done, info = self._envs.recv() - if self._test: - assert all(info['env_id'] == env_id) - obs = obs.astype(np.float32) - obs /= 255.0 - rew = rew.astype(np.float32) - env_id = info['env_id'] - timesteps = {} - new_data = [] - self._ready_obs = {} - for i in range(len(env_id)): - d = bool(done[i]) - r = to_ndarray([rew[i]]) - self._eval_episode_return[env_id[i]] += r - info_dict = {'env_id': i} - timesteps[env_id[i]] = BaseEnvTimestep(obs[i], r, d, info=info_dict) - if d: - info_dict['eval_episode_return'] = self._eval_episode_return[env_id[i]] - timesteps[env_id[i]].info['eval_episode_return'] = info_dict['eval_episode_return'] - self._eval_episode_return[env_id[i]] = 0. 
- new_data.append(tnp.array({'obs': obs[i], 'reward': r, 'done': d, 'info': info_dict, 'env_id': env_id[i]})) - self._ready_obs[env_id[i]] = obs[i] - return new_data - - def collect_data(self, num, policy=None, policy_forward_kwargs=None): - if self.closed: - self.launch() - - new_data = [] - - while len(new_data) < num: - - obs_to_send = self._ready_obs - env_id_to_send = list(obs_to_send.keys()) - num_to_send = len(obs_to_send.keys()) - if num_to_send > 0: - if policy: - action_to_send = policy.forward(obs_to_send, **policy_forward_kwargs) - else: - #random policy - action_to_send = {i: {"action": np.array([self._action_space.sample()])} for i in env_id_to_send} - self._ready_obs_send.update(obs_to_send) - self._ready_action_send.update(action_to_send) - action = np.array([action_to_send[i]['action'] for i in env_id_to_send]) - if action.ndim == 2 and action.shape[1] == 1: - action = action.squeeze(1) - env_id = np.array(env_id_to_send) - self._envs.send(action, env_id) - - next_obs, rew, done, info = self._envs.recv() - next_obs = next_obs.astype(np.float32) - next_obs /= 255.0 - rew = rew.astype(np.float32) - env_id = info['env_id'] - - self._ready_obs = {} - for i in range(len(env_id)): - new_data.append( - ttorch.tensor( - { - 'obs': self._ready_obs_send[env_id[i]], - 'action': self._ready_action_send[env_id[i]]['action'], - 'next_obs': next_obs[i], - 'reward': np.array([rew[i]]), - 'done': done[i] - } - ) - ) - self._ready_obs[env_id[i]] = next_obs[i] - - return new_data - - def collect_data_nstep(self, num, n_step=1, policy=None, policy_forward_kwargs=None): - if self.closed: - self.launch() - - new_data = [] - - while len(new_data) < num: - - obs_to_send = self._ready_obs - env_id_to_send = list(obs_to_send.keys()) - num_to_send = len(obs_to_send.keys()) - if num_to_send > 0: - if policy: - action_to_send = policy.forward(obs_to_send, **policy_forward_kwargs) - else: - #random policy - action_to_send = {i: {"action": np.array([self._action_space.sample()])} for i in env_id_to_send} - self._ready_obs_send.update(obs_to_send) - self._ready_action_send.update(action_to_send) - action = np.array([action_to_send[i]['action'] for i in env_id_to_send]) - if action.ndim == 2 and action.shape[1] == 1: - action = action.squeeze(1) - env_id = np.array(env_id_to_send) - self._envs.send(action, env_id) - - next_obs, rew, done, info = self._envs.recv() - next_obs = next_obs.astype(np.float32) - next_obs /= 255.0 - rew = rew.astype(np.float32) - env_id = info['env_id'] - - self._ready_obs = {} - for i in range(len(env_id)): - new_data.append( - ttorch.tensor( - { - 'obs': self._ready_obs_send[env_id[i]], - 'action': self._ready_action_send[env_id[i]]['action'], - 'next_obs': next_obs[i], - 'reward': np.array([rew[i]]), - 'done': done[i] - } - ) - ) - self._ready_obs[env_id[i]] = next_obs[i] - - return new_data - - def close(self) -> None: - if self._closed: - return - # Envpool has no `close` API - self._closed = True - - @property - def closed(self) -> None: - return self._closed - - def seed(self, seed: int, dynamic_seed=False) -> None: - # The i-th environment seed in Envpool will be set with i+seed, so we don't do extra transformation here - self._seed = seed - logging.warning("envpool doesn't support dynamic_seed in different episode") - - @property - def env_num(self) -> int: - return self._env_num - - @property - def ready_obs(self) -> tnp.array: - if isinstance(self._ready_obs, dict): - obs = [tnp.array(o) for k, o in self._ready_obs.items()] - return tnp.stack(obs) - else: - raise 
NotImplementedError - - @property - def ready_obs_v2(self) -> tnp.array: - if self._ready_obs is not None: - return self._ready_obs - else: - raise ValueError - - @property - def observation_space(self) -> 'gym.spaces.Space': # noqa - try: - return self._observation_space - except AttributeError: - self.launch() - self.close() - self._ready_obs = {} - return self._observation_space - - @property - def action_space(self) -> 'gym.spaces.Space': # noqa - try: - return self._action_space - except AttributeError: - self.launch() - self.close() - self._ready_obs = {} - return self._action_space - - @ENV_MANAGER_REGISTRY.register('env_pool_v4') -class PoolEnvManagerV4: +class PoolEnvManagerV2: ''' Overview: + Envpool env manager support new pipeline of DI-engine Envpool now supports Atari, Classic Control, Toy Text, ViZDoom. Here we list some commonly used env_ids as follows. For more examples, you can refer to . diff --git a/ding/example/dqn_envpool.py b/ding/example/dqn_envpool.py deleted file mode 100644 index bef6a72369..0000000000 --- a/ding/example/dqn_envpool.py +++ /dev/null @@ -1,100 +0,0 @@ -import gym -import datetime -import wandb -import numpy as np -from easydict import EasyDict -from ditk import logging -from ding.data.model_loader import FileModelLoader -from ding.data.storage_loader import FileStorageLoader -from ding.model import DQN -from ding.policy import DQNPolicy -from ding.envs import DingEnvWrapper, BaseEnvManagerV2 -from ding.envs.env_manager.envpool_env_manager import PoolEnvManager, PoolEnvManagerV2 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker -from ding.utils import set_pkg_seed - -from dizoo.atari.config.serial import pong_dqn_envpool_config - - -def main(cfg): - logging.getLogger().setLevel(logging.INFO) - cfg.exp_name = 'pong_dqn_envpool_' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - cfg.env.collector_env_num = 8 - cfg.env.collector_batch_size = 8 - collector_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.collector_env_num, - 'batch_size': cfg.env.collector_batch_size, - # env wrappers - 'episodic_life': True, # collector: True - 'reward_clip': False, # collector: True - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["collector_env_cfg"] = collector_env_cfg - evaluator_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.evaluator_env_num, - 'batch_size': cfg.env.evaluator_batch_size, - # env wrappers - 'episodic_life': False, # evaluator: False - 'reward_clip': False, # evaluator: False - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["evaluator_env_cfg"] = evaluator_env_cfg - cfg = compile_config(cfg, PoolEnvManagerV2, DQNPolicy, save_cfg=task.router.node_id == 0) - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = PoolEnvManagerV2(cfg.env.collector_env_cfg) - evaluator_env = PoolEnvManagerV2(cfg.env.evaluator_env_cfg) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - model = DQN(**cfg.policy.model) - buffer_ = 
DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = DQNPolicy(cfg.policy, model=model) - - # Consider the case with multiple processes - if task.router.is_active: - # You can use labels to distinguish between workers with different roles, - # here we use node_id to distinguish. - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - # Sync their context and model between each worker. - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - # Here is the part of single process pipeline. - task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use(StepCollector(cfg, policy.collect_mode, collector_env)) - if "nstep" in cfg.policy and cfg.policy.nstep > 1: - task.use(nstep_reward_enhancer(cfg)) - task.use(data_pusher(cfg, buffer_)) - task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(online_logger(train_show_freq=10)) - #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) - task.use(termination_checker(max_env_step=10000000)) - - task.run() - - -if __name__ == "__main__": - main(pong_dqn_envpool_config) diff --git a/ding/example/dqn_envpool_nstep.py b/ding/example/dqn_envpool_nstep.py index 1580291aae..fb6159922e 100644 --- a/ding/example/dqn_envpool_nstep.py +++ b/ding/example/dqn_envpool_nstep.py @@ -8,7 +8,7 @@ from ditk import logging from ding.model import DQN from ding.policy import DQNFastPolicy -from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV4 +from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV2 from ding.data import DequeBuffer from ding.config import compile_config from ding.framework import task, ding_init @@ -51,11 +51,11 @@ def main(cfg): } ) cfg.env["evaluator_env_cfg"] = evaluator_env_cfg - cfg = compile_config(cfg, PoolEnvManagerV4, DQNFastPolicy, save_cfg=task.router.node_id == 0) + cfg = compile_config(cfg, PoolEnvManagerV2, DQNFastPolicy, save_cfg=task.router.node_id == 0) ding_init(cfg) with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = PoolEnvManagerV4(cfg.env.collector_env_cfg) - evaluator_env = PoolEnvManagerV4(cfg.env.evaluator_env_cfg) + collector_env = PoolEnvManagerV2(cfg.env.collector_env_cfg) + evaluator_env = PoolEnvManagerV2(cfg.env.evaluator_env_cfg) collector_env.seed(cfg.seed) evaluator_env.seed(cfg.seed) set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) diff --git a/ding/example/dqn_envpool_wandb.py b/ding/example/dqn_envpool_wandb.py deleted file mode 100644 index 1e63982039..0000000000 --- a/ding/example/dqn_envpool_wandb.py +++ /dev/null @@ -1,118 +0,0 @@ -import gym -import datetime -import wandb -import numpy as np -from easydict import EasyDict -from ditk import logging -from ding.data.model_loader import FileModelLoader -from ding.data.storage_loader import FileStorageLoader -from ding.model import DQN -from ding.policy import DQNPolicy -from ding.envs import DingEnvWrapper, BaseEnvManagerV2 -from ding.envs.env_manager.envpool_env_manager import PoolEnvManager, PoolEnvManagerV2 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, 
online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger -from ding.utils import set_pkg_seed - -from dizoo.atari.config.serial import pong_dqn_envpool_config - - -def main(cfg): - logging.getLogger().setLevel(logging.INFO) - cfg.exp_name = 'Pong-v5-DQN-envpool-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - cfg.env.collector_env_num = 8 - cfg.env.collector_batch_size = 8 - collector_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.collector_env_num, - 'batch_size': cfg.env.collector_batch_size, - # env wrappers - 'episodic_life': True, # collector: True - 'reward_clip': False, # collector: True - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["collector_env_cfg"] = collector_env_cfg - evaluator_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.evaluator_env_num, - 'batch_size': cfg.env.evaluator_batch_size, - # env wrappers - 'episodic_life': False, # evaluator: False - 'reward_clip': False, # evaluator: False - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["evaluator_env_cfg"] = evaluator_env_cfg - cfg = compile_config(cfg, PoolEnvManagerV2, DQNPolicy, save_cfg=task.router.node_id == 0) - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = PoolEnvManagerV2(cfg.env.collector_env_cfg) - evaluator_env = PoolEnvManagerV2(cfg.env.evaluator_env_cfg) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = DQNPolicy(cfg.policy, model=model) - - # Consider the case with multiple processes - if task.router.is_active: - # You can use labels to distinguish between workers with different roles, - # here we use node_id to distinguish. - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - # Sync their context and model between each worker. - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - # Here is the part of single process pipeline. 
- task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use( - StepCollector( - cfg, - policy.collect_mode, - collector_env, - random_collect_size=cfg.policy.random_collect_size \ - if hasattr(cfg.policy, 'random_collect_size') else 0, - ) - ) - if "nstep" in cfg.policy and cfg.policy.nstep > 1: - task.use(nstep_reward_enhancer(cfg)) - task.use(data_pusher(cfg, buffer_)) - task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use( - wandb_online_logger( - metric_list=policy._monitor_vars_learn(), - model=policy._model, - anonymous=True, - project_name=cfg.exp_name, - wandb_sweep=False, - ) - ) - - #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) - task.use(termination_checker(max_env_step=10000000)) - - task.run() - - -if __name__ == "__main__": - main(pong_dqn_envpool_config) diff --git a/ding/example/dqn_envpool_wandb_main.py b/ding/example/dqn_envpool_wandb_main.py deleted file mode 100644 index 8d5db7ad04..0000000000 --- a/ding/example/dqn_envpool_wandb_main.py +++ /dev/null @@ -1,130 +0,0 @@ -import gym -import datetime -import wandb -import numpy as np -from easydict import EasyDict -from ditk import logging -from ding.data.model_loader import FileModelLoader -from ding.data.storage_loader import FileStorageLoader -from ding.model import DQN -from ding.policy import DQNPolicy -from ding.envs import DingEnvWrapper, BaseEnvManagerV2 -from ding.envs.env_manager.envpool_env_manager import PoolEnvManager, PoolEnvManagerV2 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, nstep_reward_enhancer, \ - termination_checker, wandb_online_logger, epoch_timer -from ding.utils import set_pkg_seed - -from dizoo.atari.config.serial import pong_dqn_envpool_config - - -def main(cfg): - logging.getLogger().setLevel(logging.INFO) - cfg.exp_name = 'Pong-v5-DQN-envpool-' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - - collector_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.collector_env_num, - 'batch_size': cfg.env.collector_batch_size, - # env wrappers - 'episodic_life': True, # collector: True - 'reward_clip': False, # collector: True - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["collector_env_cfg"] = collector_env_cfg - evaluator_env_cfg = EasyDict( - { - 'env_id': cfg.env.env_id, - 'env_num': cfg.env.evaluator_env_num, - 'batch_size': cfg.env.evaluator_batch_size, - # env wrappers - 'episodic_life': False, # evaluator: False - 'reward_clip': False, # evaluator: False - 'gray_scale': cfg.env.get('gray_scale', True), - 'stack_num': cfg.env.get('stack_num', 4), - } - ) - cfg.env["evaluator_env_cfg"] = evaluator_env_cfg - cfg = compile_config(cfg, PoolEnvManagerV2, DQNPolicy, save_cfg=task.router.node_id == 0) - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = PoolEnvManagerV2(cfg.env.collector_env_cfg) - evaluator_env = PoolEnvManagerV2(cfg.env.evaluator_env_cfg) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - model = DQN(**cfg.policy.model) - buffer_ = 
DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = DQNPolicy(cfg.policy, model=model) - - # Consider the case with multiple processes - if task.router.is_active: - # You can use labels to distinguish between workers with different roles, - # here we use node_id to distinguish. - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - # Sync their context and model between each worker. - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - task.use(epoch_timer()) - - # Here is the part of single process pipeline. - task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use( - StepCollector( - cfg, - policy.collect_mode, - collector_env, - random_collect_size=cfg.policy.random_collect_size \ - if hasattr(cfg.policy, 'random_collect_size') else 0, - ) - ) - if "nstep" in cfg.policy and cfg.policy.nstep > 1: - task.use(nstep_reward_enhancer(cfg)) - task.use(data_pusher(cfg, buffer_)) - task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use( - wandb_online_logger( - metric_list=policy._monitor_vars_learn(), - model=policy._model, - anonymous=True, - project_name=cfg.exp_name, - wandb_sweep=False, - ) - ) - - #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) - task.use(termination_checker(max_env_step=10000000)) - - task.run() - - -if __name__ == "__main__": - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("--seed", type=int, default=0, help="random seed") - parser.add_argument("--collector_env_num", type=int, default=8, help="collector env number") - parser.add_argument("--collector_batch_size", type=int, default=8, help="collector batch size") - arg = parser.parse_args() - - pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num - pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size - pong_dqn_envpool_config.seed = arg.seed - - main(pong_dqn_envpool_config) From 8981236c0a566840821ff0fe17dfabe434e06663 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 1 Nov 2023 16:14:47 +0800 Subject: [PATCH 232/244] polish code --- ding/framework/middleware/collector.py | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/ding/framework/middleware/collector.py b/ding/framework/middleware/collector.py index 3ed0773688..273e6c313e 100644 --- a/ding/framework/middleware/collector.py +++ b/ding/framework/middleware/collector.py @@ -53,7 +53,6 @@ def __call__(self, ctx: "OnlineRLContext") -> None: Input of ctx: - env_step (:obj:`int`): The env steps which will increase during collection. 
""" - start = time.time() old = ctx.env_step if self.random_collect_size > 0 and old < self.random_collect_size: target_size = self.random_collect_size - old @@ -72,7 +71,6 @@ def __call__(self, ctx: "OnlineRLContext") -> None: self._transitions.clear() break - ctx.collector_time += time.time() - start class EnvpoolStepCollector: @@ -128,12 +126,7 @@ def __call__(self, ctx: "OnlineRLContext") -> None: counter = 0 - time_send = 0.0 - time_receive = 0.0 - time_process = 0.0 - while True: - start_send = time.time() if len(self._ready_obs_receive.keys()) > 0: if random: action_to_send = { @@ -154,16 +147,12 @@ def __call__(self, ctx: "OnlineRLContext") -> None: action_send = action_send.squeeze(1) env_id_send = np.array(list(action_to_send.keys())) self.env.send_action(action_send, env_id_send) - time_send += time.time() - start_send - start_receive = time.time() next_obs, rew, done, info = self.env.receive_data() env_id_receive = info['env_id'] counter += len(env_id_receive) self._ready_obs_receive.update({i: next_obs[i] for i in range(len(next_obs))}) - time_receive += time.time() - start_receive - start_process = time.time() #todo for i in range(len(env_id_receive)): current_reward = rew[i] @@ -247,13 +236,7 @@ def __call__(self, ctx: "OnlineRLContext") -> None: else: self._trajectory[env_id_receive[i]][-1]['value_gamma'] = self._discount_ratio_list[0] - time_process += time.time() - start_process if counter >= target_size: - # if self._nsteps>1: - # # transform reward to ttorch.tensor - # for i in range(self.env.env_num): - # for j in range(len(self._trajectory[i])): - # self._trajectory[i][j]['reward']=np.concatenate(self._trajectory[env_id_receive[i]][j]['reward']) break ctx.trajectories = [] @@ -263,9 +246,6 @@ def __call__(self, ctx: "OnlineRLContext") -> None: ctx.env_step += len(ctx.trajectories) ctx.collector_time += time.time() - start - print(f'time_send:[{time_send}]') - print(f'time_receive:[{time_receive}]') - print(f'time_process:[{time_process}]') class PPOFStepCollector: From 3a1d98cb1ca56fbe11d3ccb22a868c29c31c0115 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 1 Nov 2023 16:28:09 +0800 Subject: [PATCH 233/244] polish code --- .../middleware/functional/enhancer.py | 4 ---- .../middleware/functional/evaluator.py | 3 --- ding/framework/middleware/learner.py | 21 +------------------ 3 files changed, 1 insertion(+), 27 deletions(-) diff --git a/ding/framework/middleware/functional/enhancer.py b/ding/framework/middleware/functional/enhancer.py index 2804ead95c..cc5d046d38 100644 --- a/ding/framework/middleware/functional/enhancer.py +++ b/ding/framework/middleware/functional/enhancer.py @@ -79,8 +79,6 @@ def nstep_reward_enhancer(cfg: EasyDict) -> Callable: return task.void() def _enhance(ctx: "OnlineRLContext"): - - start = time.time() nstep = cfg.policy.nstep gamma = cfg.policy.discount_factor L = len(ctx.trajectories) @@ -103,8 +101,6 @@ def _enhance(ctx: "OnlineRLContext"): ctx.trajectories[i].reward = nstep_rewards[i] ctx.trajectories[i].value_gamma = value_gamma[i] - ctx.nstep_time += time.time() - start - return _enhance diff --git a/ding/framework/middleware/functional/evaluator.py b/ding/framework/middleware/functional/evaluator.py index b39e16adfb..0166c2de4a 100644 --- a/ding/framework/middleware/functional/evaluator.py +++ b/ding/framework/middleware/functional/evaluator.py @@ -240,7 +240,6 @@ def _evaluate(ctx: Union["OnlineRLContext", "OfflineRLContext"]): """ # evaluation will be executed if the task begins or enough train_iter after last evaluation - start = 
time.time() if ctx.last_eval_iter != -1 and \ (ctx.train_iter - ctx.last_eval_iter < cfg.policy.eval.evaluator.eval_freq): return @@ -305,8 +304,6 @@ def _evaluate(ctx: Union["OnlineRLContext", "OfflineRLContext"]): if stop_flag: task.finish = True - ctx.evaluator_time += time.time() - start - return _evaluate diff --git a/ding/framework/middleware/learner.py b/ding/framework/middleware/learner.py index 1a3692ee84..969640cc59 100644 --- a/ding/framework/middleware/learner.py +++ b/ding/framework/middleware/learner.py @@ -17,26 +17,7 @@ from threading import Thread from ding.policy.common_utils import default_preprocess_learn, fast_preprocess_learn - def data_process_func(data_queue_input, data_queue_output): - while True: - data = data_queue_input.get() - if data is None: - break - else: - #print("get one data") - output_data = fast_preprocess_learn( - data, - use_priority=False, #policy._cfg.priority, - use_priority_IS_weight=False, #policy._cfg.priority_IS_weight, - cuda=True, #policy._cuda, - device="cuda:0", #policy._device, - ) - data_queue_output.put(output_data) - #print("put one data, queue size:{}".format(data_queue_output.qsize())) - - -def data_process_func_v2(data_queue_input, data_queue_output): while True: if data_queue_input.empty(): time.sleep(0.001) @@ -157,7 +138,7 @@ def __init__( self._data_queue_input = Queue() self._data_queue_output = Queue() - self.thread_worker = Thread(target=data_process_func_v2, args=(self._data_queue_input, self._data_queue_output)) + self.thread_worker = Thread(target=data_process_func, args=(self._data_queue_input, self._data_queue_output)) self.thread_worker.start() self._trainer = task.wrap(trainer(cfg, policy.learn_mode, log_freq=log_freq)) From 83ca2178b107895fdd9730ca309b87efe97e4b19 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 1 Nov 2023 16:32:21 +0800 Subject: [PATCH 234/244] polish code --- ding/framework/middleware/learner.py | 10 ---- ding/rl_utils/ppo.py | 68 ---------------------------- 2 files changed, 78 deletions(-) diff --git a/ding/framework/middleware/learner.py b/ding/framework/middleware/learner.py index 969640cc59..cb3ee51143 100644 --- a/ding/framework/middleware/learner.py +++ b/ding/framework/middleware/learner.py @@ -80,27 +80,17 @@ def __call__(self, ctx: "OnlineRLContext") -> None: Output of ctx: - train_output (:obj:`Deque`): The training output in deque. 
""" - start = time.time() - time_fetcher = 0.0 - time_trainer = 0.0 train_output_queue = [] for _ in range(self.cfg.policy.learn.update_per_collect): - start_fetcher = time.time() self._fetcher(ctx) - time_fetcher += time.time() - start_fetcher if ctx.train_data is None: break if self._reward_estimator: self._reward_estimator(ctx) - start_trainer = time.time() self._trainer(ctx) - time_trainer += time.time() - start_trainer train_output_queue.append(ctx.train_output) ctx.train_output_for_post_process = ctx.train_output ctx.train_output = train_output_queue - ctx.learner_time += time.time() - start - print("time_trainer:time_fetcher={}:{}={}".format(time_trainer, time_fetcher, time_trainer / time_fetcher)) - class EnvpoolOffPolicyLearner: """ diff --git a/ding/rl_utils/ppo.py b/ding/rl_utils/ppo.py index 9d26579f2e..4e6f1a02e9 100644 --- a/ding/rl_utils/ppo.py +++ b/ding/rl_utils/ppo.py @@ -20,10 +20,6 @@ ppo_loss = namedtuple('ppo_loss', ['policy_loss', 'value_loss', 'entropy_loss']) ppo_policy_loss = namedtuple('ppo_policy_loss', ['policy_loss', 'entropy_loss']) ppo_info = namedtuple('ppo_info', ['approx_kl', 'clipfrac']) -ppo_data_general = namedtuple( - 'ppo_data_general', ['logp_new', 'logp_old', 'value_new', 'value_old', 'adv', 'return_', 'weight'] -) -ppo_policy_data_general = namedtuple('ppo_policy_data_general', ['logp_new', 'logp_old', 'adv', 'weight']) def shape_fn_ppo(args, kwargs): @@ -368,67 +364,3 @@ def ppo_policy_error_continuous(data: namedtuple, clipfrac = torch.as_tensor(clipped).float().mean().item() return ppo_policy_loss(policy_loss, entropy_loss), ppo_info(approx_kl, clipfrac) - -def ppo_error_general( - data: namedtuple, - entropy, - clip_ratio: float = 0.2, - use_value_clip: bool = True, - dual_clip: Optional[float] = None -) -> Tuple[namedtuple, namedtuple]: - assert dual_clip is None or dual_clip > 1.0, "dual_clip value must be greater than 1.0, but get value: {}".format( - dual_clip - ) - logp_new, logp_old, value_new, value_old, adv, return_, weight = data - if weight is None: - weight = torch.ones_like(adv) - entropy_loss = (entropy * weight).mean() - # policy_loss - ratio = torch.exp(logp_new - logp_old) - surr1 = ratio * adv - surr2 = ratio.clamp(1 - clip_ratio, 1 + clip_ratio) * adv - if dual_clip is not None: - policy_loss = (-torch.max(torch.min(surr1, surr2), dual_clip * adv) * weight).mean() - else: - policy_loss = (-torch.min(surr1, surr2) * weight).mean() - with torch.no_grad(): - approx_kl = (logp_old - logp_new).mean().item() - clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) - clipfrac = torch.as_tensor(clipped).float().mean().item() - # value_loss - if use_value_clip: - value_clip = value_old + (value_new - value_old).clamp(-clip_ratio, clip_ratio) - v1 = (return_ - value_new).pow(2) - v2 = (return_ - value_clip).pow(2) - value_loss = 0.5 * (torch.max(v1, v2) * weight).mean() - else: - value_loss = 0.5 * ((return_ - value_new).pow(2) * weight).mean() - - return ppo_loss(policy_loss, value_loss, entropy_loss), ppo_info(approx_kl, clipfrac) - - -def ppo_policy_error_general(data: namedtuple, - entropy, - clip_ratio: float = 0.2, - dual_clip: Optional[float] = None) -> Tuple[namedtuple, namedtuple]: - assert dual_clip is None or dual_clip > 1.0, "dual_clip value must be greater than 1.0, but get value: {}".format( - dual_clip - ) - logp_new, logp_old, adv, weight = data - if weight is None: - weight = torch.ones_like(adv) - entropy_loss = (entropy * weight).mean() - # policy_loss - ratio = torch.exp(logp_new - logp_old) - surr1 = ratio * 
adv - surr2 = ratio.clamp(1 - clip_ratio, 1 + clip_ratio) * adv - if dual_clip is not None: - policy_loss = (-torch.max(torch.min(surr1, surr2), dual_clip * adv) * weight).mean() - else: - policy_loss = (-torch.min(surr1, surr2) * weight).mean() - with torch.no_grad(): - approx_kl = (logp_old - logp_new).mean().item() - clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) - clipfrac = torch.as_tensor(clipped).float().mean().item() - - return ppo_policy_loss(policy_loss, entropy_loss), ppo_info(approx_kl, clipfrac) From 97360c0a9c3461adfd7eb165c987fc07ba2478cf Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 1 Nov 2023 16:45:54 +0800 Subject: [PATCH 235/244] polish code --- ding/example/apex_dqn.py | 135 ---------------- ding/example/apex_dqn_ddp.py | 54 ------- ding/example/apex_dqn_parallel.py | 109 ------------- ding/example/apex_dqn_parallel_origin.py | 117 -------------- ding/example/apex_dqn_priority.py | 76 --------- ding/example/apex_dqn_priority_parallel.py | 130 --------------- .../apex_dqn_priority_parallel_wandb.py | 152 ------------------ 7 files changed, 773 deletions(-) delete mode 100644 ding/example/apex_dqn.py delete mode 100644 ding/example/apex_dqn_ddp.py delete mode 100644 ding/example/apex_dqn_parallel.py delete mode 100644 ding/example/apex_dqn_parallel_origin.py delete mode 100644 ding/example/apex_dqn_priority.py delete mode 100644 ding/example/apex_dqn_priority_parallel.py delete mode 100644 ding/example/apex_dqn_priority_parallel_wandb.py diff --git a/ding/example/apex_dqn.py b/ding/example/apex_dqn.py deleted file mode 100644 index 8067ceb14e..0000000000 --- a/ding/example/apex_dqn.py +++ /dev/null @@ -1,135 +0,0 @@ -import os -from ditk import logging -from ding.model import DQN -from ding.policy import DQNPolicy -from ding.data import DequeBuffer -from ding.data.buffer.middleware import PriorityExperienceReplay -from ding.envs import setup_ding_env_manager -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework import Parallel -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ModelExchanger, ContextExchanger, online_logger, \ - nstep_reward_enhancer, priority_calculator -from ding.utils import set_pkg_seed - - -def main(): - from ding.config.DQN.gym_lunarlander_v2 import cfg, env - - cfg.exp_name = 'LunarLander-v2-Apex-DQN' - cfg.policy.priority = True - cfg.policy.priority_IS_weight = True - cfg = compile_config(cfg, policy=DQNPolicy, save_cfg=task.router.node_id == 0) - - logging.getLogger().setLevel(logging.INFO) - model_path = os.path.join(cfg.exp_name, 'models') - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - - assert task.router.is_active - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - if task.has_role(task.role.COLLECTOR): - collector_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.collector_env_num, 'collector', debug=True) - evaluator_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.evaluator_env_num, 'evaluator', debug=True) - elif task.has_role(task.role.EVALUATOR): - collector_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.collector_env_num, 'collector', debug=True) - evaluator_env = 
setup_ding_env_manager(env(cfg=cfg.env), cfg.env.evaluator_env_num, 'evaluator', debug=True) - elif task.has_role(task.role.LEARNER): - collector_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.collector_env_num, 'collector', debug=True) - evaluator_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.evaluator_env_num, 'evaluator', debug=True) - - if task.has_role(task.role.LEARNER): - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) - policy = DQNPolicy(cfg.policy, model=model) - task.use(ContextExchanger(skip_n_iter=1)) - #task.use(PeriodicalModelExchanger(model=policy._model, mode="send")) - task.use(ModelExchanger(model)) - - elif task.has_role(task.role.COLLECTOR): - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) - policy = DQNPolicy(cfg.policy, model=model) - #collect_model_loader=FileModelLoader(model=model, dirname=model_path) - task.use(ContextExchanger(skip_n_iter=1)) - #task.use(PeriodicalModelExchanger(model=policy._model, model_loader=collect_model_loader, mode="update")) - task.use(ModelExchanger(model)) - - elif task.has_role(task.role.EVALUATOR): - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) - policy = DQNPolicy(cfg.policy, model=model) - #eval_model_loader=FileModelLoader(model=model, dirname=model_path) - task.use(ContextExchanger(skip_n_iter=1)) - #task.use(PeriodicalModelExchanger(model=policy._model, model_loader=eval_model_loader, mode="update")) - task.use(ModelExchanger(model)) - - # Here is the part of single process pipeline. 
- task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use(StepCollector(cfg, policy.collect_mode, collector_env)) - print(f"cfg.policy.nstep:{cfg.policy.nstep}") - if "nstep" in cfg.policy and cfg.policy.nstep > 1: - if task.has_role(task.role.COLLECTOR): - task.use(nstep_reward_enhancer(cfg)) - - def dqn_priority_calculation(update_target_model_frequency): - last_update_train_iter = 0 - - def _calculate_priority(data): - nonlocal last_update_train_iter - - if (task.ctx.train_iter - last_update_train_iter) % update_target_model_frequency == 0: - update_target_model = True - else: - update_target_model = False - priority = policy.calculate_priority(data, update_target_model=update_target_model)['priority'] - last_update_train_iter = task.ctx.train_iter - return priority - - return _calculate_priority - - task.use( - priority_calculator( - priority_calculation_fn=dqn_priority_calculation( - update_target_model_frequency=cfg.policy.learn.target_update_freq - ), - ) - ) - - task.use(data_pusher(cfg, buffer_)) - task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use(CkptSaver(policy, cfg.exp_name, train_freq=100)) - task.run() - - -if __name__ == "__main__": - - Parallel.runner( - n_parallel_workers=3, - ports=50515, - protocol="tcp", - topology="mesh", - attach_to=None, - address=None, - labels=None, - node_ids=None, - mq_type="nng", - redis_host=None, - redis_port=None, - startup_interval=1 - )(main) diff --git a/ding/example/apex_dqn_ddp.py b/ding/example/apex_dqn_ddp.py deleted file mode 100644 index 5491dbb79c..0000000000 --- a/ding/example/apex_dqn_ddp.py +++ /dev/null @@ -1,54 +0,0 @@ -import gym -from ditk import logging -from ding.model import DQN -from ding.policy import DQNPolicy -from ding.envs import DingEnvWrapper, BaseEnvManagerV2 -from ding.data import DequeBuffer -from ding.data.buffer.middleware import PriorityExperienceReplay -from ding.config import compile_config -from ding.framework import task -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ModelExchanger -from ding.utils import set_pkg_seed -from ding.utils import DDPContext, to_ddp_config -from dizoo.classic_control.cartpole.config.cartpole_dqn_config import main_config, create_config - - -def main(main_config, create_config): - logging.getLogger().setLevel(logging.INFO) - main_config.exp_name = 'cartpole_dqn_per' - main_config.policy.priority = True - main_config.policy.priority_IS_weight = True - cfg = compile_config(main_config, create_cfg=create_config, auto=True) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = BaseEnvManagerV2( - env_fn=[lambda: DingEnvWrapper(gym.make("CartPole-v0")) for _ in range(cfg.env.collector_env_num)], - cfg=cfg.env.manager - ) - evaluator_env = BaseEnvManagerV2( - env_fn=[lambda: DingEnvWrapper(gym.make("CartPole-v0")) for _ in range(cfg.env.evaluator_env_num)], - cfg=cfg.env.manager - ) - - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) - policy = DQNPolicy(cfg.policy, model=model) - - task.use(ModelExchanger(model)) - task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) - 
task.use(eps_greedy_handler(cfg)) - task.use(StepCollector(cfg, policy.collect_mode, collector_env)) - task.use(data_pusher(cfg, buffer_)) - task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(CkptSaver(policy, cfg.exp_name, train_freq=100)) - task.run() - - -if __name__ == "__main__": - with DDPContext(): - main_config = to_ddp_config(main_config) - main(main_config, create_config) diff --git a/ding/example/apex_dqn_parallel.py b/ding/example/apex_dqn_parallel.py deleted file mode 100644 index c050362585..0000000000 --- a/ding/example/apex_dqn_parallel.py +++ /dev/null @@ -1,109 +0,0 @@ -import os -from ditk import logging -from ding.model import DQN -from ding.policy import DQNPolicy -from ding.envs import setup_ding_env_manager -from ding.data import DequeBuffer, FileModelLoader -from ding.data.buffer.middleware import PriorityExperienceReplay -from ding.config import compile_config -from ding.framework import task -from ding.framework import Parallel -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ModelExchanger, PeriodicalModelExchanger, ContextExchanger, online_logger -from ding.utils import set_pkg_seed - - -def main(cfg, env): - - logging.getLogger().setLevel(logging.INFO) - model_path = os.path.join(cfg.exp_name, 'models') - - with task.start(async_mode=False, ctx=OnlineRLContext()): - - assert task.router.is_active - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - if task.has_role(task.role.COLLECTOR): - collector_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.collector_env_num, 'collector') - evaluator_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.evaluator_env_num, 'evaluator') - elif task.has_role(task.role.EVALUATOR): - collector_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.collector_env_num, 'collector') - evaluator_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.evaluator_env_num, 'evaluator') - - if task.has_role(task.role.LEARNER): - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) - policy = DQNPolicy(cfg.policy, model=model, enable_field=['learn']) - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - task.use(PeriodicalModelExchanger(model=policy._model, mode="send")) - - # Here is the part of single process pipeline. - task.use(data_pusher(cfg, buffer_)) - task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use(CkptSaver(policy, cfg.exp_name, train_freq=100)) - - elif task.has_role(task.role.COLLECTOR): - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) - policy = DQNPolicy(cfg.policy, model=model, enable_field=['collect']) - collect_model_loader = FileModelLoader(model=model, dirname=model_path) - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - #task.use(PeriodicalModelExchanger(model=policy._model, model_loader=collect_model_loader, mode="update")) - - # Here is the part of single process pipeline. 
- task.use(eps_greedy_handler(cfg)) - task.use(StepCollector(cfg, policy.collect_mode, collector_env)) - task.use(data_pusher(cfg, buffer_)) - - elif task.has_role(task.role.EVALUATOR): - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) - policy = DQNPolicy(cfg.policy, model=model, enable_field=['eval']) - eval_model_loader = FileModelLoader(model=model, dirname=model_path) - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - #task.use(PeriodicalModelExchanger(model=policy._model, model_loader=eval_model_loader, mode="update")) - - # Here is the part of single process pipeline. - task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(CkptSaver(policy, cfg.exp_name, train_freq=100)) - - task.run() - - -if __name__ == "__main__": - - from ding.config.DQN.gym_lunarlander_v2 import cfg, env - cfg.exp_name = 'LunarLander-v2-Apex-DQN' - cfg.policy.priority = True - cfg.policy.priority_IS_weight = True - cfg = compile_config(cfg, policy=DQNPolicy) - - Parallel.runner( - n_parallel_workers=3, - ports=50515, - protocol="tcp", - topology="mesh", - attach_to=None, - address=None, - labels=None, - node_ids=None, - mq_type="nng", - redis_host=None, - redis_port=None, - startup_interval=1 - )(main, cfg, env) diff --git a/ding/example/apex_dqn_parallel_origin.py b/ding/example/apex_dqn_parallel_origin.py deleted file mode 100644 index c1d909ec19..0000000000 --- a/ding/example/apex_dqn_parallel_origin.py +++ /dev/null @@ -1,117 +0,0 @@ -""" -# Example of DQN pipeline - -Use the pipeline on a single process: - -> python3 -u ding/example/dqn.py - -Use the pipeline on multiple processes: - -We surpose there are N processes (workers) = 1 learner + 1 evaluator + (N-2) collectors - -## First Example —— Execute on one machine with multi processes. - -Execute 4 processes with 1 learner + 1 evaluator + 2 collectors -Remember to keep them connected by mesh to ensure that they can exchange information with each other. - -> ditask --package . --main ding.example.dqn.main --parallel-workers 4 --topology mesh - -## Second Example —— Execute on multiple machines. - -1. Execute 1 learner + 1 evaluator on one machine. - -> ditask --package . --main ding.example.dqn.main --parallel-workers 2 --topology mesh --node-ids 0 --ports 50515 - -2. Execute 2 collectors on another machine. (Suppose the ip of the first machine is 127.0.0.1). - Here we use `alone` topology instead of `mesh` because the collectors do not need communicate with each other. - Remember the `node_ids` cannot be duplicated with the learner, evaluator processes. - And remember to set the `ports` (should not conflict with others) and `attach_to` parameters. - The value of the `attach_to` parameter should be obtained from the log of the - process started earlier (e.g. 'NNG listen on tcp://10.0.0.4:50515'). - -> ditask --package . --main ding.example.dqn.main --parallel-workers 2 --topology alone --node-ids 2 \ - --ports 50517 --attach-to tcp://10.0.0.4:50515,tcp://127.0.0.1:50516 - -3. You can repeat step 2 to start more collectors on other machines. 
-""" -import gym -from ditk import logging -from ding.data.model_loader import FileModelLoader -from ding.data.storage_loader import FileStorageLoader -from ding.model import DQN -from ding.policy import DQNPolicy -from ding.envs import DingEnvWrapper, BaseEnvManagerV2 -from ding.data import DequeBuffer -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger -from ding.utils import set_pkg_seed -from dizoo.classic_control.cartpole.config.cartpole_dqn_config import main_config, create_config - - -def main(): - logging.getLogger().setLevel(logging.INFO) - cfg = compile_config(main_config, create_cfg=create_config, auto=True, save_cfg=task.router.node_id == 0) - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = BaseEnvManagerV2( - env_fn=[lambda: DingEnvWrapper(gym.make("CartPole-v0")) for _ in range(cfg.env.collector_env_num)], - cfg=cfg.env.manager - ) - evaluator_env = BaseEnvManagerV2( - env_fn=[lambda: DingEnvWrapper(gym.make("CartPole-v0")) for _ in range(cfg.env.evaluator_env_num)], - cfg=cfg.env.manager - ) - - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - policy = DQNPolicy(cfg.policy, model=model) - - # Consider the case with multiple processes - if task.router.is_active: - # You can use labels to distinguish between workers with different roles, - # here we use node_id to distinguish. - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - # Sync their context and model between each worker. - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - # Here is the part of single process pipeline. 
- task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use(StepCollector(cfg, policy.collect_mode, collector_env)) - task.use(data_pusher(cfg, buffer_)) - task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use(CkptSaver(policy, cfg.exp_name, train_freq=100)) - - task.run() - - -if __name__ == "__main__": - - from ding.framework import Parallel - Parallel.runner( - n_parallel_workers=3, - ports=50520, - protocol="tcp", - topology="mesh", - attach_to=None, - address=None, - labels=None, - node_ids=None, - mq_type="nng", - redis_host=None, - redis_port=None, - startup_interval=1 - )(main) diff --git a/ding/example/apex_dqn_priority.py b/ding/example/apex_dqn_priority.py deleted file mode 100644 index 1101b1e9eb..0000000000 --- a/ding/example/apex_dqn_priority.py +++ /dev/null @@ -1,76 +0,0 @@ -import gym -from ditk import logging -from ding.model import DQN -from ding.policy import DQNPolicy -from ding.envs import DingEnvWrapper, BaseEnvManagerV2 -from ding.data import DequeBuffer -from ding.data.buffer.middleware import PriorityExperienceReplay -from ding.config import compile_config -from ding.framework import task -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, nstep_reward_enhancer, priority_calculator -from ding.utils import set_pkg_seed -from dizoo.classic_control.cartpole.config.cartpole_dqn_config import main_config, create_config - - -def main(): - logging.getLogger().setLevel(logging.INFO) - main_config.exp_name = 'cartpole_dqn_per' - main_config.policy.priority = True - main_config.policy.priority_IS_weight = True - cfg = compile_config(main_config, create_cfg=create_config, auto=True) - with task.start(async_mode=False, ctx=OnlineRLContext()): - collector_env = BaseEnvManagerV2( - env_fn=[lambda: DingEnvWrapper(gym.make("CartPole-v0")) for _ in range(cfg.env.collector_env_num)], - cfg=cfg.env.manager - ) - evaluator_env = BaseEnvManagerV2( - env_fn=[lambda: DingEnvWrapper(gym.make("CartPole-v0")) for _ in range(cfg.env.evaluator_env_num)], - cfg=cfg.env.manager - ) - - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) - policy = DQNPolicy(cfg.policy, model=model) - - task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use(StepCollector(cfg, policy.collect_mode, collector_env)) - if "nstep" in cfg.policy and cfg.policy.nstep > 1: - task.use(nstep_reward_enhancer(cfg)) - - def dqn_priority_calculation(update_target_model_frequency): - last_update_train_iter = 0 - - def _calculate_priority(data): - nonlocal last_update_train_iter - - if (task.ctx.train_iter - last_update_train_iter) % update_target_model_frequency == 0: - update_target_model = True - else: - update_target_model = False - priority = policy.calculate_priority(data, update_target_model=update_target_model)['priority'] - last_update_train_iter = task.ctx.train_iter - return priority - - return _calculate_priority - - task.use( - priority_calculator( - priority_calculation_fn=dqn_priority_calculation( - update_target_model_frequency=cfg.policy.learn.target_update_freq - ), - ) - ) - task.use(data_pusher(cfg, buffer_)) - 
task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(CkptSaver(policy, cfg.exp_name, train_freq=100)) - task.run() - - -if __name__ == "__main__": - main() diff --git a/ding/example/apex_dqn_priority_parallel.py b/ding/example/apex_dqn_priority_parallel.py deleted file mode 100644 index f306f95083..0000000000 --- a/ding/example/apex_dqn_priority_parallel.py +++ /dev/null @@ -1,130 +0,0 @@ -import os -from ditk import logging -from ding.model import DQN -from ding.policy import DQNPolicy -from ding.data import DequeBuffer -from ding.data.buffer.middleware import PriorityExperienceReplay -from ding.envs import setup_ding_env_manager -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework import Parallel -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ModelExchanger, ContextExchanger, online_logger, \ - nstep_reward_enhancer, priority_calculator -from ding.utils import set_pkg_seed - - -def main(): - from ding.config.DQN.gym_lunarlander_v2 import cfg, env - - cfg.exp_name = 'LunarLander-v2-Apex-DQN' - cfg.policy.priority = True - cfg.policy.priority_IS_weight = True - cfg = compile_config(cfg, policy=DQNPolicy, save_cfg=task.router.node_id == 0) - - logging.getLogger().setLevel(logging.INFO) - model_path = os.path.join(cfg.exp_name, 'models') - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - - assert task.router.is_active - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - if task.has_role(task.role.COLLECTOR): - collector_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.collector_env_num, 'collector', debug=True) - evaluator_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.evaluator_env_num, 'evaluator', debug=True) - elif task.has_role(task.role.EVALUATOR): - collector_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.collector_env_num, 'collector', debug=True) - evaluator_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.evaluator_env_num, 'evaluator', debug=True) - elif task.has_role(task.role.LEARNER): - collector_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.collector_env_num, 'collector', debug=True) - evaluator_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.evaluator_env_num, 'evaluator', debug=True) - - if task.has_role(task.role.LEARNER): - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) - policy = DQNPolicy(cfg.policy, model=model) - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - elif task.has_role(task.role.COLLECTOR): - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) - policy = DQNPolicy(cfg.policy, model=model) - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - elif task.has_role(task.role.EVALUATOR): - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) - policy = 
DQNPolicy(cfg.policy, model=model) - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - # Here is the part of single process pipeline. - task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) - task.use(eps_greedy_handler(cfg)) - task.use(StepCollector(cfg, policy.collect_mode, collector_env)) - print(f"cfg.policy.nstep:{cfg.policy.nstep}") - if "nstep" in cfg.policy and cfg.policy.nstep > 1: - if task.has_role(task.role.COLLECTOR): - task.use(nstep_reward_enhancer(cfg)) - - def dqn_priority_calculation(update_target_model_frequency): - last_update_train_iter = 0 - - def _calculate_priority(data): - nonlocal last_update_train_iter - - if (task.ctx.train_iter - last_update_train_iter) % update_target_model_frequency == 0: - update_target_model = True - else: - update_target_model = False - priority = policy.calculate_priority(data, update_target_model=update_target_model)['priority'] - last_update_train_iter = task.ctx.train_iter - return priority - - return _calculate_priority - - task.use( - priority_calculator( - priority_calculation_fn=dqn_priority_calculation( - update_target_model_frequency=cfg.policy.learn.target_update_freq - ), - ) - ) - - task.use(data_pusher(cfg, buffer_)) - task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use(online_logger(train_show_freq=10)) - task.use(CkptSaver(policy, cfg.exp_name, train_freq=100)) - task.run() - - -if __name__ == "__main__": - - Parallel.runner( - n_parallel_workers=3, - ports=50515, - protocol="tcp", - topology="mesh", - attach_to=None, - address=None, - labels=None, - node_ids=None, - mq_type="nng", - redis_host=None, - redis_port=None, - startup_interval=1 - )(main) diff --git a/ding/example/apex_dqn_priority_parallel_wandb.py b/ding/example/apex_dqn_priority_parallel_wandb.py deleted file mode 100644 index 858091e328..0000000000 --- a/ding/example/apex_dqn_priority_parallel_wandb.py +++ /dev/null @@ -1,152 +0,0 @@ -import os -from ditk import logging -from ding.model import DQN -from ding.policy import DQNPolicy -from ding.data import DequeBuffer -from ding.data.buffer.middleware import PriorityExperienceReplay -from ding.envs import setup_ding_env_manager -from ding.config import compile_config -from ding.framework import task, ding_init -from ding.framework import Parallel -from ding.framework.context import OnlineRLContext -from ding.framework.middleware import OffPolicyLearner, StepCollector, interaction_evaluator, data_pusher, \ - eps_greedy_handler, CkptSaver, ModelExchanger, ContextExchanger, online_logger, \ - nstep_reward_enhancer, priority_calculator, wandb_online_logger -from ding.utils import set_pkg_seed - - -def main(): - from ding.config.DQN.gym_lunarlander_v2 import cfg, env - - cfg.exp_name = 'LunarLander-v2-Apex-DQN' - cfg.policy.priority = True - cfg.policy.priority_IS_weight = True - cfg = compile_config(cfg, policy=DQNPolicy, save_cfg=task.router.node_id == 0) - - logging.getLogger().setLevel(logging.INFO) - model_path = os.path.join(cfg.exp_name, 'models') - ding_init(cfg) - with task.start(async_mode=False, ctx=OnlineRLContext()): - - assert task.router.is_active - if task.router.node_id == 0: - task.add_role(task.role.LEARNER) - elif task.router.node_id == 1: - task.add_role(task.role.EVALUATOR) - else: - task.add_role(task.role.COLLECTOR) - - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - - if task.has_role(task.role.COLLECTOR): - collector_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.collector_env_num, 'collector', debug=True) - 
evaluator_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.evaluator_env_num, 'evaluator', debug=True) - elif task.has_role(task.role.EVALUATOR): - collector_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.collector_env_num, 'collector', debug=True) - evaluator_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.evaluator_env_num, 'evaluator', debug=True) - elif task.has_role(task.role.LEARNER): - collector_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.collector_env_num, 'collector', debug=True) - evaluator_env = setup_ding_env_manager(env(cfg=cfg.env), cfg.env.evaluator_env_num, 'evaluator', debug=True) - - if task.has_role(task.role.LEARNER): - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) - policy = DQNPolicy(cfg.policy, model=model) - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - elif task.has_role(task.role.COLLECTOR): - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) - policy = DQNPolicy(cfg.policy, model=model) - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - elif task.has_role(task.role.EVALUATOR): - model = DQN(**cfg.policy.model) - buffer_ = DequeBuffer(size=cfg.policy.other.replay_buffer.replay_buffer_size) - buffer_.use(PriorityExperienceReplay(buffer_, IS_weight=True)) - policy = DQNPolicy(cfg.policy, model=model) - task.use(ContextExchanger(skip_n_iter=1)) - task.use(ModelExchanger(model)) - - # Here is the part of single process pipeline. - task.use( - interaction_evaluator( - cfg, policy.eval_mode, evaluator_env, render=cfg.policy.eval.render \ - if hasattr(cfg.policy.eval, "render") else False - ) - ) - task.use(eps_greedy_handler(cfg)) - task.use( - StepCollector( - cfg, - policy.collect_mode, - collector_env, - random_collect_size=cfg.policy.random_collect_size \ - if hasattr(cfg.policy, 'random_collect_size') else 0, - ) - ) - print(f"cfg.policy.nstep:{cfg.policy.nstep}") - if "nstep" in cfg.policy and cfg.policy.nstep > 1: - if task.has_role(task.role.COLLECTOR): - task.use(nstep_reward_enhancer(cfg)) - - def dqn_priority_calculation(update_target_model_frequency): - last_update_train_iter = 0 - - def _calculate_priority(data): - nonlocal last_update_train_iter - - if (task.ctx.train_iter - last_update_train_iter) % update_target_model_frequency == 0: - update_target_model = True - else: - update_target_model = False - priority = policy.calculate_priority(data, update_target_model=update_target_model)['priority'] - last_update_train_iter = task.ctx.train_iter - return priority - - return _calculate_priority - - task.use( - priority_calculator( - priority_calculation_fn=dqn_priority_calculation( - update_target_model_frequency=cfg.policy.learn.target_update_freq - ), - ) - ) - - task.use(data_pusher(cfg, buffer_)) - task.use(OffPolicyLearner(cfg, policy.learn_mode, buffer_)) - task.use( - wandb_online_logger( - metric_list=policy.monitor_vars(), - model=policy._model, - anonymous=True, - project_name=cfg.exp_name, - wandb_sweep=False, - ) - ) - task.use(online_logger(train_show_freq=10)) - task.use(CkptSaver(policy, cfg.exp_name, train_freq=100)) - task.run() - - -if __name__ == "__main__": - - Parallel.runner( - n_parallel_workers=3, - ports=50515, - protocol="tcp", - topology="mesh", - attach_to=None, - address=None, - 
labels=None, - node_ids=None, - mq_type="nng", - redis_host=None, - redis_port=None, - startup_interval=1 - )(main) From 25fab568818815c8f6cde7607249547953924697 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 1 Nov 2023 21:06:42 +0800 Subject: [PATCH 236/244] polish code --- ding/framework/context.py | 7 --- ding/model/common/head.py | 58 +++++++++++------------ ding/policy/a2c.py | 3 -- ding/policy/c51.py | 3 -- ding/policy/common_utils.py | 70 ---------------------------- ding/policy/dqn.py | 92 +------------------------------------ 6 files changed, 27 insertions(+), 206 deletions(-) diff --git a/ding/framework/context.py b/ding/framework/context.py index 23ba81ad1e..6fb35eec13 100644 --- a/ding/framework/context.py +++ b/ding/framework/context.py @@ -69,19 +69,12 @@ class OnlineRLContext(Context): eval_output: List = dataclasses.field(default_factory=dict) # wandb wandb_url: str = "" - evaluator_time = 0.0 - collector_time = 0.0 - learner_time = 0.0 - data_pusher_time = 0.0 - nstep_time = 0.0 - total_time = 0.0 def __post_init__(self): # This method is called just after __init__ method. Here, concretely speaking, # this method is called just after the object initialize its fields. # We use this method here to keep the fields needed for each iteration. self.keep('env_step', 'env_episode', 'train_iter', 'last_eval_iter', 'last_eval_value', 'wandb_url') - self.keep('evaluator_time', 'collector_time', 'learner_time', 'data_pusher_time', 'nstep_time', 'total_time') @dataclasses.dataclass diff --git a/ding/model/common/head.py b/ding/model/common/head.py index a666cf0c0a..30f5b58d98 100755 --- a/ding/model/common/head.py +++ b/ding/model/common/head.py @@ -832,38 +832,32 @@ def __init__( v_layer_num = layer_num layer = NoiseLinearLayer if noise else nn.Linear block = noise_block if noise else fc_block - if a_layer_num > 0: - self.A = nn.Sequential( - MLP( - hidden_size, - hidden_size, - hidden_size, - a_layer_num, - layer_fn=layer, - activation=activation, - use_dropout=dropout is not None, - dropout_probability=dropout, - norm_type=norm_type - ), block(hidden_size, output_size) - ) - else: - self.A = block(hidden_size, output_size) - if v_layer_num > 0: - self.V = nn.Sequential( - MLP( - hidden_size, - hidden_size, - hidden_size, - v_layer_num, - layer_fn=layer, - activation=activation, - use_dropout=dropout is not None, - dropout_probability=dropout, - norm_type=norm_type - ), block(hidden_size, 1) - ) - else: - self.V = block(hidden_size, 1) + self.A = nn.Sequential( + MLP( + hidden_size, + hidden_size, + hidden_size, + a_layer_num, + layer_fn=layer, + activation=activation, + use_dropout=dropout is not None, + dropout_probability=dropout, + norm_type=norm_type + ), block(hidden_size, output_size) + ) + self.V = nn.Sequential( + MLP( + hidden_size, + hidden_size, + hidden_size, + v_layer_num, + layer_fn=layer, + activation=activation, + use_dropout=dropout is not None, + dropout_probability=dropout, + norm_type=norm_type + ), block(hidden_size, 1) + ) def forward(self, x: torch.Tensor) -> Dict: """ diff --git a/ding/policy/a2c.py b/ding/policy/a2c.py index da738d9784..6e05f4e712 100644 --- a/ding/policy/a2c.py +++ b/ding/policy/a2c.py @@ -293,6 +293,3 @@ def _forward_eval(self, data: dict) -> dict: def _monitor_vars_learn(self) -> List[str]: return super()._monitor_vars_learn() + ['policy_loss', 'value_loss', 'entropy_loss', 'adv_abs_max', 'grad_norm'] - - def monitor_vars(self) -> List[str]: - return self._monitor_vars_learn() diff --git a/ding/policy/c51.py b/ding/policy/c51.py index 
dbb106bf42..0b6f36d68e 100644 --- a/ding/policy/c51.py +++ b/ding/policy/c51.py @@ -266,6 +266,3 @@ def _get_train_sample(self, data: list) -> Union[None, List[Any]]: """ data = get_nstep_return_data(data, self._nstep, gamma=self._gamma) return get_train_sample(data, self._unroll_len) - - def monitor_vars(self) -> List[str]: - return ['cur_lr', 'total_loss', 'q_value', 'target_q_value'] diff --git a/ding/policy/common_utils.py b/ding/policy/common_utils.py index 23eef01391..40533deab1 100644 --- a/ding/policy/common_utils.py +++ b/ding/policy/common_utils.py @@ -70,7 +70,6 @@ def default_preprocess_learn( return data - def fast_preprocess_learn( data: List[Any], use_priority_IS_weight: bool = False, @@ -138,75 +137,6 @@ def fast_preprocess_learn( return processes_data - -def fast_preprocess_learn_v2( - data: List[Any], - use_priority_IS_weight: bool = False, - use_priority: bool = False, - cuda: bool = False, - device: str = 'cpu', -) -> dict: - # data preprocess - processes_data = {} - - action = torch.stack([data[i]['action'] for i in range(len(data))]) - if cuda: - action = to_device(action, device=device) - if action.ndim == 2 and action.shape[1] == 1: - action = action.squeeze(1) - processes_data['action'] = action - - obs = torch.stack([data[i]['obs'] for i in range(len(data))]) - if cuda: - obs = to_device(obs, device=device) - processes_data['obs'] = obs - - next_obs = torch.stack([data[i]['next_obs'] for i in range(len(data))]) - if cuda: - next_obs = to_device(next_obs, device=device) - processes_data['next_obs'] = next_obs - - if 'next_n_obs' in data[0]: - next_n_obs = torch.stack([data[i]['next_n_obs'] for i in range(len(data))]) - if cuda: - next_n_obs = to_device(next_n_obs, device=device) - processes_data['next_n_obs'] = next_n_obs - - reward = torch.stack([data[i]['reward'] for i in range(len(data))]) - if cuda: - reward = to_device(reward, device=device) - reward = reward.permute(1, 0).contiguous() - processes_data['reward'] = reward - - if 'value_gamma' in data[0]: - value_gamma = torch.stack([data[i]['value_gamma'] for i in range(len(data))]) - if cuda: - value_gamma = to_device(value_gamma, device=device) - processes_data['value_gamma'] = value_gamma - - done = torch.tensor([data[i]['done'] for i in range(len(data))], dtype=torch.float32) - if cuda: - done = to_device(done, device=device) - processes_data['done'] = done - - if use_priority and use_priority_IS_weight: - if 'priority_IS' in data: - weight = data['priority_IS'] - else: # for compability - weight = data['IS'] - else: - if 'weight' in data[0]: - weight = torch.tensor([data[i]['weight'] for i in range(len(data))]) - else: - weight = None - - if weight and cuda: - weight = to_device(weight, device=device) - processes_data['weight'] = weight - - return processes_data - - def single_env_forward_wrapper(forward_fn: Callable) -> Callable: """ Overview: diff --git a/ding/policy/dqn.py b/ding/policy/dqn.py index f5e26e230d..835e34f6e2 100644 --- a/ding/policy/dqn.py +++ b/ding/policy/dqn.py @@ -749,138 +749,48 @@ def _init_learn(self) -> None: ) def _forward_learn(self, data: Dict[str, Any]) -> Dict[str, Any]: - """ - Overview: - Forward computation graph of learn mode(updating policy). - Arguments: - - data (:obj:`Dict[str, Any]`): Dict type data, a batch of data for training, values are torch.Tensor or \ - np.ndarray or dict/list combinations. 
- Returns: - - info_dict (:obj:`Dict[str, Any]`): Dict type data, a info dict indicated training result, which will be \ - recorded in text log and tensorboard, values are python scalar or a list of scalars. - ArgumentsKeys: - - necessary: ``obs``, ``action``, ``reward``, ``next_obs``, ``done`` - - optional: ``value_gamma`` - ReturnsKeys: - - necessary: ``cur_lr``, ``total_loss``, ``priority`` - - optional: ``action_distribution`` - """ - - # data = fast_preprocess_learn( - # data, - # use_priority=self._priority, - # use_priority_IS_weight=self._cfg.priority_IS_weight, - # cuda=self._cuda, - # device=self._device, - # ) - - # if self._cuda: - # for key in data.keys(): - # if isinstance(data[key], torch.Tensor): - # data[key] = to_device(data[key], self._device) - start_total = time.time() # ==================== # Q-learning forward # ==================== - #torch.cuda.synchronize() - start = time.time() self._learn_model.train() self._target_model.train() - #torch.cuda.synchronize() - set_model_train_time = time.time() - start - # Current q value (main model) - start = time.time() + q_value = self._learn_model.forward(data['obs'])['logit'] - #torch.cuda.synchronize() - forward_q_value_time = time.time() - start - start = time.time() # Target q value with torch.no_grad(): target_next_n_q_value = self._target_model.forward(data['next_n_obs'])['logit'] # Max q value action (main model), i.e. Double DQN target_next_n_action = self._learn_model.forward(data['next_n_obs'])['action'] - #torch.cuda.synchronize() - forward_target_next_time = time.time() - start - start = time.time() data_n = q_nstep_td_data( q_value, target_next_n_q_value, data['action'], target_next_n_action, data['reward'], data['done'], data['weight'] ) - #torch.cuda.synchronize() - q_nstep_td_data_time = time.time() - start - start = time.time() if self._cfg.nstep == 1: value_gamma = None else: value_gamma = data.get( 'value_gamma' ) if 'value_gamma' in data else self._cfg.discount_factor * torch.ones_like(data['done']) - #torch.cuda.synchronize() - get_value_gamma_time = time.time() - start - start = time.time() loss, td_error_per_sample = q_nstep_td_error(data_n, self._gamma, nstep=self._nstep, value_gamma=value_gamma) - #torch.cuda.synchronize() - loss_time = time.time() - start # ==================== # Q-learning update # ==================== - start = time.time() self._optimizer.zero_grad() loss.backward() - #torch.cuda.synchronize() - backward_time = time.time() - start if self._cfg.multi_gpu: self.sync_gradients(self._learn_model) - start = time.time() self._optimizer.step() - #torch.cuda.synchronize() - gradient_step_time = time.time() - start # ============= # after update # ============= - start = time.time() self._target_model.update(self._learn_model.state_dict()) - #torch.cuda.synchronize() - target_update_time = time.time() - start - - time_learn_total = time.time() - start_total - - # print(f"set_model_train_time:time_learn={set_model_train_time}:{time_learn_total}={set_model_train_time/time_learn_total}") - # print(f"forward_q_value_time:time_learn={forward_q_value_time}:{time_learn_total}={forward_q_value_time/time_learn_total}") - # print(f"forward_target_next_time:time_learn={forward_target_next_time}:{time_learn_total}={forward_target_next_time/time_learn_total}") - # print(f"q_nstep_td_data_time:time_learn={q_nstep_td_data_time}:{time_learn_total}={q_nstep_td_data_time/time_learn_total}") - # 
print(f"get_value_gamma_time:time_learn={get_value_gamma_time}:{time_learn_total}={get_value_gamma_time/time_learn_total}") - # print(f"loss_time:time_learn={loss_time}:{time_learn_total}={loss_time/time_learn_total}") - # print(f"backward_time:time_learn={backward_time}:{time_learn_total}={backward_time/time_learn_total}") - # print(f"gradient_step_time:time_learn={gradient_step_time}:{time_learn_total}={gradient_step_time/time_learn_total}") - # print(f"target_update_time:time_learn={target_update_time}:{time_learn_total}={target_update_time/time_learn_total}") - # self.time_counter['set_model_train_time'] += set_model_train_time - # self.time_counter['forward_q_value_time'] += forward_q_value_time - # self.time_counter['forward_target_next_time'] += forward_target_next_time - # self.time_counter['q_nstep_td_data_time'] += q_nstep_td_data_time - # self.time_counter['get_value_gamma_time'] += get_value_gamma_time - # self.time_counter['loss_time'] += loss_time - # self.time_counter['backward_time'] += backward_time - # self.time_counter['gradient_step_time'] += gradient_step_time - # self.time_counter['target_update_time'] += target_update_time - # self.time_counter['time_learn_total'] += time_learn_total - # self.time_counter['counter_learn'] += 1 - # print(f"set_model_train_time:time_learn={self.time_counter['set_model_train_time']}:{self.time_counter['time_learn_total']}={self.time_counter['set_model_train_time']/self.time_counter['time_learn_total']}") - # print(f"forward_q_value_time:time_learn={self.time_counter['forward_q_value_time']}:{self.time_counter['time_learn_total']}={self.time_counter['forward_q_value_time']/self.time_counter['time_learn_total']}") - # print(f"forward_target_next_time:time_learn={self.time_counter['forward_target_next_time']}:{self.time_counter['time_learn_total']}={self.time_counter['forward_target_next_time']/self.time_counter['time_learn_total']}") - # print(f"q_nstep_td_data_time:time_learn={self.time_counter['q_nstep_td_data_time']}:{self.time_counter['time_learn_total']}={self.time_counter['q_nstep_td_data_time']/self.time_counter['time_learn_total']}") - # print(f"get_value_gamma_time:time_learn={self.time_counter['get_value_gamma_time']}:{self.time_counter['time_learn_total']}={self.time_counter['get_value_gamma_time']/self.time_counter['time_learn_total']}") - # print(f"loss_time:time_learn={self.time_counter['loss_time']}:{self.time_counter['time_learn_total']}={self.time_counter['loss_time']/self.time_counter['time_learn_total']}") - # print(f"backward_time:time_learn={self.time_counter['backward_time']}:{self.time_counter['time_learn_total']}={self.time_counter['backward_time']/self.time_counter['time_learn_total']}") - # print(f"gradient_step_time:time_learn={self.time_counter['gradient_step_time']}:{self.time_counter['time_learn_total']}={self.time_counter['gradient_step_time']/self.time_counter['time_learn_total']}") - # print(f"target_update_time:time_learn={self.time_counter['target_update_time']}:{self.time_counter['time_learn_total']}={self.time_counter['target_update_time']/self.time_counter['time_learn_total']}") return { 'cur_lr': self._optimizer.defaults['lr'], From ab93b390dd88a033c89626e0a02c68092bb292c8 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 1 Nov 2023 21:09:34 +0800 Subject: [PATCH 237/244] polish code --- ding/worker/collector/sample_serial_collector.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ding/worker/collector/sample_serial_collector.py 
b/ding/worker/collector/sample_serial_collector.py index 35667b1ceb..34374d8630 100644 --- a/ding/worker/collector/sample_serial_collector.py +++ b/ding/worker/collector/sample_serial_collector.py @@ -34,7 +34,8 @@ def __init__( policy: namedtuple = None, tb_logger: 'SummaryWriter' = None, # noqa exp_name: Optional[str] = 'default_experiment', - instance_name: Optional[str] = 'collector' + instance_name: Optional[str] = 'collector', + timer_cuda: bool = False, ) -> None: """ Overview: @@ -51,7 +52,7 @@ def __init__( self._deepcopy_obs = cfg.deepcopy_obs # whether to deepcopy each data self._transform_obs = cfg.transform_obs self._cfg = cfg - self._timer = EasyTimer(cuda=False) + self._timer = EasyTimer(cuda=timer_cuda) self._end_flag = False self._rank = get_rank() self._world_size = get_world_size() From cd762b610266f834705f984e9086dcc47cd0d5ca Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 1 Nov 2023 22:20:01 +0800 Subject: [PATCH 238/244] polish code --- ding/example/dqn_envpool_nstep.py | 2 +- ding/framework/middleware/collector.py | 2 -- .../middleware/functional/data_processor.py | 3 --- .../middleware/functional/evaluator.py | 2 -- ding/framework/middleware/functional/timer.py | 1 - ding/framework/middleware/learner.py | 22 +++++-------------- 6 files changed, 7 insertions(+), 25 deletions(-) diff --git a/ding/example/dqn_envpool_nstep.py b/ding/example/dqn_envpool_nstep.py index fb6159922e..dd64ddde75 100644 --- a/ding/example/dqn_envpool_nstep.py +++ b/ding/example/dqn_envpool_nstep.py @@ -108,7 +108,7 @@ def main(cfg): ) #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) - task.use(termination_checker(max_env_step=10000000)) + task.use(termination_checker(max_env_step=100)) task.run() diff --git a/ding/framework/middleware/collector.py b/ding/framework/middleware/collector.py index 273e6c313e..d5a3f3d79b 100644 --- a/ding/framework/middleware/collector.py +++ b/ding/framework/middleware/collector.py @@ -111,7 +111,6 @@ def __call__(self, ctx: "OnlineRLContext") -> None: Input of ctx: - env_step (:obj:`int`): The env steps which will increase during collection. """ - start = time.time() old = ctx.env_step if self.random_collect_size > 0 and old < self.random_collect_size: @@ -244,7 +243,6 @@ def __call__(self, ctx: "OnlineRLContext") -> None: ctx.trajectories.extend(self._trajectory[i]) self._trajectory[i] = [] ctx.env_step += len(ctx.trajectories) - ctx.collector_time += time.time() - start diff --git a/ding/framework/middleware/functional/data_processor.py b/ding/framework/middleware/functional/data_processor.py index 3c887f0bb1..420af3d4fd 100644 --- a/ding/framework/middleware/functional/data_processor.py +++ b/ding/framework/middleware/functional/data_processor.py @@ -36,7 +36,6 @@ def _push(ctx: "OnlineRLContext"): - trajectories (:obj:`List[Dict]`): Trajectories. - episodes (:obj:`List[Dict]`): Episodes. 
""" - start = time.time() if ctx.trajectories is not None: # each data in buffer is a transition if group_by_env: for i, t in enumerate(ctx.trajectories): @@ -52,8 +51,6 @@ def _push(ctx: "OnlineRLContext"): else: raise RuntimeError("Either ctx.trajectories or ctx.episodes should be not None.") - ctx.data_pusher_time += time.time() - start - return _push diff --git a/ding/framework/middleware/functional/evaluator.py b/ding/framework/middleware/functional/evaluator.py index 0166c2de4a..519a5d7567 100644 --- a/ding/framework/middleware/functional/evaluator.py +++ b/ding/framework/middleware/functional/evaluator.py @@ -335,7 +335,6 @@ def _evaluate(ctx: Union["OnlineRLContext", "OfflineRLContext"]): """ # evaluation will be executed if the task begins or enough train_iter after last evaluation - start = time.time() if ctx.last_eval_iter != -1 and \ (ctx.train_iter - ctx.last_eval_iter < cfg.policy.eval.evaluator.eval_freq): return @@ -426,7 +425,6 @@ def _evaluate(ctx: Union["OnlineRLContext", "OfflineRLContext"]): if stop_flag: task.finish = True - ctx.evaluator_time += time.time() - start return _evaluate diff --git a/ding/framework/middleware/functional/timer.py b/ding/framework/middleware/functional/timer.py index 7c73b9b809..db8a2c0056 100644 --- a/ding/framework/middleware/functional/timer.py +++ b/ding/framework/middleware/functional/timer.py @@ -31,6 +31,5 @@ def _epoch_timer(ctx: "Context"): np.mean(records) * 1000 ) ) - ctx.total_time += time_cost return _epoch_timer diff --git a/ding/framework/middleware/learner.py b/ding/framework/middleware/learner.py index cb3ee51143..413aca0e4c 100644 --- a/ding/framework/middleware/learner.py +++ b/ding/framework/middleware/learner.py @@ -142,29 +142,16 @@ def __call__(self, ctx: "OnlineRLContext") -> None: Output of ctx: - train_output (:obj:`Deque`): The training output in deque. 
""" - start = time.time() - time_fetcher = 0.0 - time_trainer = 0.0 - time_fetch_data = 0.0 - time_get_data = 0.0 - train_output_queue = [] data_counter = 0 - - start_fetcher = time.time() for _ in range(self.cfg.policy.learn.update_per_collect): - start_fetch_data = time.time() self._fetcher(ctx) - time_fetch_data += time.time() - start_fetch_data if ctx.train_data_sample is None: break self._data_queue_input.put(ctx.train_data_sample) data_counter += 1 - time_fetcher += time.time() - start_fetcher - start_trainer = time.time() for _ in range(data_counter): - start_get_data = time.time() while True: if self._data_queue_output.empty(): time.sleep(0.001) @@ -172,17 +159,20 @@ def __call__(self, ctx: "OnlineRLContext") -> None: else: ctx.train_data = self._data_queue_output.get() break - time_get_data += time.time() - start_get_data if self._reward_estimator: self._reward_estimator(ctx) self._trainer(ctx) train_output_queue.append(ctx.train_output) ctx.train_output_for_post_process = ctx.train_output - time_trainer += time.time() - start_trainer ctx.train_output = train_output_queue - ctx.learner_time += time.time() - start + + yield + + if task.finish: + self._data_queue_input.put(None) + self.thread_worker.join() class HERLearner: From d3c9bf85acb3b8095e6969ef2cfdd82b766b4cea Mon Sep 17 00:00:00 2001 From: zjowowen Date: Wed, 1 Nov 2023 23:17:54 +0800 Subject: [PATCH 239/244] polish code --- ding/envs/env_manager/envpool_env_manager.py | 23 +++++++++++++------ ding/example/dqn_envpool_nstep.py | 2 +- ding/framework/middleware/collector.py | 2 -- .../middleware/functional/enhancer.py | 4 +--- .../middleware/functional/evaluator.py | 1 - ding/framework/middleware/learner.py | 2 ++ ding/policy/common_utils.py | 2 ++ ding/rl_utils/ppo.py | 1 - 8 files changed, 22 insertions(+), 15 deletions(-) diff --git a/ding/envs/env_manager/envpool_env_manager.py b/ding/envs/env_manager/envpool_env_manager.py index 5a273507c9..fe321d32f7 100644 --- a/ding/envs/env_manager/envpool_env_manager.py +++ b/ding/envs/env_manager/envpool_env_manager.py @@ -35,6 +35,7 @@ class EnvState(enum.IntEnum): class PoolEnvManager: ''' Overview: + PoolEnvManager supports old pipeline of DI-engine. Envpool now supports Atari, Classic Control, Toy Text, ViZDoom. Here we list some commonly used env_ids as follows. For more examples, you can refer to . @@ -53,10 +54,12 @@ def default_config(cls) -> EasyDict: # Async mode: batch_size < env_num env_num=8, batch_size=8, + image_observation=True, ) def __init__(self, cfg: EasyDict) -> None: - self._cfg = cfg + self._cfg = self.default_config() + self._cfg.update(cfg) self._env_num = cfg.env_num self._batch_size = cfg.batch_size self._ready_obs = {} @@ -102,7 +105,8 @@ def reset(self) -> None: obs, _, _, info = self._envs.recv() env_id = info['env_id'] obs = obs.astype(np.float32) - obs /= 255.0 + if self._cfg.image_observation: + obs /= 255.0 self._ready_obs = deep_merge_dicts({i: o for i, o in zip(env_id, obs)}, self._ready_obs) if len(self._ready_obs) == self._env_num: break @@ -117,7 +121,8 @@ def step(self, action: dict) -> Dict[int, namedtuple]: obs, rew, done, info = self._envs.recv() obs = obs.astype(np.float32) - obs /= 255.0 + if self._cfg.image_observation: + obs /= 255.0 rew = rew.astype(np.float32) env_id = info['env_id'] timesteps = {} @@ -175,7 +180,7 @@ def action_space(self) -> 'gym.spaces.Space': # noqa class PoolEnvManagerV2: ''' Overview: - Envpool env manager support new pipeline of DI-engine + PoolEnvManagerV2 supports new pipeline of DI-engine. 
Envpool now supports Atari, Classic Control, Toy Text, ViZDoom. Here we list some commonly used env_ids as follows. For more examples, you can refer to . @@ -192,11 +197,13 @@ def default_config(cls) -> EasyDict: type='envpool', env_num=8, batch_size=8, + image_observation=True, ) def __init__(self, cfg: EasyDict) -> None: super().__init__() - self._cfg = cfg + self._cfg = self.default_config() + self._cfg.update(cfg) self._env_num = cfg.env_num self._batch_size = cfg.batch_size @@ -245,7 +252,8 @@ def reset(self) -> None: obs, _, _, info = self._envs.recv() env_id = info['env_id'] obs = obs.astype(np.float32) - obs /= 255.0 + if self._cfg.image_observation: + obs /= 255.0 ready_obs = deep_merge_dicts({i: o for i, o in zip(env_id, obs)}, ready_obs) if len(ready_obs) == self._env_num: break @@ -259,7 +267,8 @@ def send_action(self, action, env_id) -> Dict[int, namedtuple]: def receive_data(self): next_obs, rew, done, info = self._envs.recv() next_obs = next_obs.astype(np.float32) - next_obs /= 255.0 + if self._cfg.image_observation: + next_obs /= 255.0 rew = rew.astype(np.float32) return next_obs, rew, done, info diff --git a/ding/example/dqn_envpool_nstep.py b/ding/example/dqn_envpool_nstep.py index dd64ddde75..fb6159922e 100644 --- a/ding/example/dqn_envpool_nstep.py +++ b/ding/example/dqn_envpool_nstep.py @@ -108,7 +108,7 @@ def main(cfg): ) #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) - task.use(termination_checker(max_env_step=100)) + task.use(termination_checker(max_env_step=10000000)) task.run() diff --git a/ding/framework/middleware/collector.py b/ding/framework/middleware/collector.py index d5a3f3d79b..9796764c5b 100644 --- a/ding/framework/middleware/collector.py +++ b/ding/framework/middleware/collector.py @@ -72,7 +72,6 @@ def __call__(self, ctx: "OnlineRLContext") -> None: break - class EnvpoolStepCollector: def __new__(cls, *args, **kwargs): @@ -245,7 +244,6 @@ def __call__(self, ctx: "OnlineRLContext") -> None: ctx.env_step += len(ctx.trajectories) - class PPOFStepCollector: """ Overview: diff --git a/ding/framework/middleware/functional/enhancer.py b/ding/framework/middleware/functional/enhancer.py index cc5d046d38..597a086850 100644 --- a/ding/framework/middleware/functional/enhancer.py +++ b/ding/framework/middleware/functional/enhancer.py @@ -8,8 +8,6 @@ from ding.reward_model import BaseRewardModel, HerRewardModel from ding.data import Buffer -import time - def reward_estimator(cfg: EasyDict, reward_model: "BaseRewardModel") -> Callable: """ @@ -82,7 +80,7 @@ def _enhance(ctx: "OnlineRLContext"): nstep = cfg.policy.nstep gamma = cfg.policy.discount_factor L = len(ctx.trajectories) - reward_template = ctx.trajectories[0]["reward"] + reward_template = ctx.trajectories[0].reward nstep_rewards = [] value_gamma = [] for i in range(L): diff --git a/ding/framework/middleware/functional/evaluator.py b/ding/framework/middleware/functional/evaluator.py index 519a5d7567..37093a3a67 100644 --- a/ding/framework/middleware/functional/evaluator.py +++ b/ding/framework/middleware/functional/evaluator.py @@ -425,7 +425,6 @@ def _evaluate(ctx: Union["OnlineRLContext", "OfflineRLContext"]): if stop_flag: task.finish = True - return _evaluate diff --git a/ding/framework/middleware/learner.py b/ding/framework/middleware/learner.py index 413aca0e4c..980bcb584d 100644 --- a/ding/framework/middleware/learner.py +++ b/ding/framework/middleware/learner.py @@ -17,6 +17,7 @@ from threading import Thread from ding.policy.common_utils import default_preprocess_learn, 
fast_preprocess_learn + def data_process_func(data_queue_input, data_queue_output): while True: if data_queue_input.empty(): @@ -92,6 +93,7 @@ def __call__(self, ctx: "OnlineRLContext") -> None: ctx.train_output_for_post_process = ctx.train_output ctx.train_output = train_output_queue + class EnvpoolOffPolicyLearner: """ Overview: diff --git a/ding/policy/common_utils.py b/ding/policy/common_utils.py index 40533deab1..9dcd2c018a 100644 --- a/ding/policy/common_utils.py +++ b/ding/policy/common_utils.py @@ -70,6 +70,7 @@ def default_preprocess_learn( return data + def fast_preprocess_learn( data: List[Any], use_priority_IS_weight: bool = False, @@ -137,6 +138,7 @@ def fast_preprocess_learn( return processes_data + def single_env_forward_wrapper(forward_fn: Callable) -> Callable: """ Overview: diff --git a/ding/rl_utils/ppo.py b/ding/rl_utils/ppo.py index 4e6f1a02e9..29b441b24a 100644 --- a/ding/rl_utils/ppo.py +++ b/ding/rl_utils/ppo.py @@ -363,4 +363,3 @@ def ppo_policy_error_continuous(data: namedtuple, clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) clipfrac = torch.as_tensor(clipped).float().mean().item() return ppo_policy_loss(policy_loss, entropy_loss), ppo_info(approx_kl, clipfrac) - From 1bd96e049b61a38d9972e60ac7778eb0e928e955 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 16 Nov 2023 23:31:19 +0800 Subject: [PATCH 240/244] polish pr --- ding/envs/env_manager/envpool_env_manager.py | 63 ++++++++----------- ..._envpool_nstep.py => dqn_nstep_envpool.py} | 21 ------- ding/framework/middleware/collector.py | 2 - ding/framework/middleware/learner.py | 2 - ding/policy/common_utils.py | 1 - ding/policy/dqn.py | 17 +---- .../collector/sample_serial_collector.py | 4 ++ .../serial/pong/pong_dqn_envpool_config.py | 8 ++- 8 files changed, 39 insertions(+), 79 deletions(-) rename ding/example/{dqn_envpool_nstep.py => dqn_nstep_envpool.py} (88%) diff --git a/ding/envs/env_manager/envpool_env_manager.py b/ding/envs/env_manager/envpool_env_manager.py index fe321d32f7..715ac63deb 100644 --- a/ding/envs/env_manager/envpool_env_manager.py +++ b/ding/envs/env_manager/envpool_env_manager.py @@ -31,9 +31,9 @@ class EnvState(enum.IntEnum): NEED_RESET = 6 -@ENV_MANAGER_REGISTRY.register('env_pool') +@ENV_MANAGER_REGISTRY.register('envpool') class PoolEnvManager: - ''' + """ Overview: PoolEnvManager supports old pipeline of DI-engine. Envpool now supports Atari, Classic Control, Toy Text, ViZDoom. 
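As a usage note for the asynchronous manager above: unlike the blocking `reset()`/`step()` loop of `PoolEnvManager`, `PoolEnvManagerV2` exchanges data through `send_action`/`receive_data`. The sketch below mirrors the unit test added later in this series; the config keys follow `default_config()` (including the new `image_observation` switch that gates the `obs /= 255.0` scaling), and the random discrete actions are placeholders only.

import numpy as np
from easydict import EasyDict
from ding.envs.env_manager.envpool_env_manager import PoolEnvManagerV2

# Illustrative config: keys left out fall back to default_config();
# image_observation=True keeps the /255.0 normalization for pixel inputs.
cfg = EasyDict(dict(env_id='Pong-v5', env_num=8, batch_size=8, image_observation=True))
env_manager = PoolEnvManagerV2(cfg)
ready_obs = env_manager.launch()                           # first batch of ready observations, keyed by env id
env_id = np.array(list(ready_obs.keys()))
action = np.array([np.random.randint(4) for _ in env_id])  # placeholder random discrete actions
env_manager.send_action(action, env_id)                    # hand the actions to envpool for these env ids
next_obs, rew, done, info = env_manager.receive_data()     # fetch the next ready batch
env_manager.close()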
@@ -42,7 +42,7 @@ class PoolEnvManager: - Atari: "Pong-v5", "SpaceInvaders-v5", "Qbert-v5" - Classic Control: "CartPole-v0", "CartPole-v1", "Pendulum-v1" - ''' + """ @classmethod def default_config(cls) -> EasyDict: @@ -55,6 +55,11 @@ def default_config(cls) -> EasyDict: env_num=8, batch_size=8, image_observation=True, + episodic_life=False, + reward_clip=False, + gray_scale=True, + stack_num=4, + frame_skip=4, ) def __init__(self, cfg: EasyDict) -> None: @@ -73,25 +78,17 @@ def launch(self) -> None: else: seed = self._seed - kwargs = {} - if "episodic_life" in self._cfg: - kwargs["episodic_life"] = self._cfg.episodic_life - if "reward_clip" in self._cfg: - kwargs["reward_clip"] = self._cfg.reward_clip - if "stack_num" in self._cfg: - kwargs["stack_num"] = self._cfg.stack_num - if "gray_scale" in self._cfg: - kwargs["gray_scale"] = self._cfg.gray_scale - if "frame_skip" in self._cfg: - kwargs["frame_skip"] = self._cfg.frame_skip - self._envs = envpool.make( task_id=self._cfg.env_id, env_type="gym", num_envs=self._env_num, batch_size=self._batch_size, seed=seed, - **kwargs + episodic_life=self._cfg.episodic_life, + reward_clip=self._cfg.reward_clip, + stack_num=self._cfg.stack_num, + gray_scale=self._cfg.gray_scale, + frame_skip=self._cfg.frame_skip, ) self._action_space = self._envs.action_space self._observation_space = self._envs.observation_space @@ -176,9 +173,9 @@ def action_space(self) -> 'gym.spaces.Space': # noqa return self._action_space -@ENV_MANAGER_REGISTRY.register('env_pool_v4') +@ENV_MANAGER_REGISTRY.register('envpool_v2') class PoolEnvManagerV2: - ''' + """ Overview: PoolEnvManagerV2 supports new pipeline of DI-engine. Envpool now supports Atari, Classic Control, Toy Text, ViZDoom. @@ -187,17 +184,22 @@ class PoolEnvManagerV2: - Atari: "Pong-v5", "SpaceInvaders-v5", "Qbert-v5" - Classic Control: "CartPole-v0", "CartPole-v1", "Pendulum-v1" - ''' + """ @classmethod def default_config(cls) -> EasyDict: return EasyDict(deepcopy(cls.config)) config = dict( - type='envpool', + type='envpool_v2', env_num=8, batch_size=8, image_observation=True, + episodic_life=False, + reward_clip=False, + gray_scale=True, + stack_num=4, + frame_skip=4, ) def __init__(self, cfg: EasyDict) -> None: @@ -209,7 +211,6 @@ def __init__(self, cfg: EasyDict) -> None: self._closed = True self._seed = None - self._test = False def launch(self) -> None: assert self._closed, "Please first close the env manager" @@ -218,27 +219,17 @@ def launch(self) -> None: else: seed = self._seed - kwargs = {} - if "episodic_life" in self._cfg: - kwargs["episodic_life"] = self._cfg.episodic_life - if "reward_clip" in self._cfg: - kwargs["reward_clip"] = self._cfg.reward_clip - if "stack_num" in self._cfg: - kwargs["stack_num"] = self._cfg.stack_num - if "gray_scale" in self._cfg: - kwargs["gray_scale"] = self._cfg.gray_scale - if "frame_skip" in self._cfg: - kwargs["frame_skip"] = self._cfg.frame_skip - if "test" in self._cfg: - self._test = self._cfg.test - self._envs = envpool.make( task_id=self._cfg.env_id, env_type="gym", num_envs=self._env_num, batch_size=self._batch_size, seed=seed, - **kwargs + episodic_life=self._cfg.episodic_life, + reward_clip=self._cfg.reward_clip, + stack_num=self._cfg.stack_num, + gray_scale=self._cfg.gray_scale, + frame_skip=self._cfg.frame_skip, ) self._action_space = self._envs.action_space self._observation_space = self._envs.observation_space diff --git a/ding/example/dqn_envpool_nstep.py b/ding/example/dqn_nstep_envpool.py similarity index 88% rename from ding/example/dqn_envpool_nstep.py 
rename to ding/example/dqn_nstep_envpool.py index fb6159922e..c5695f900f 100644 --- a/ding/example/dqn_envpool_nstep.py +++ b/ding/example/dqn_nstep_envpool.py @@ -1,9 +1,4 @@ import datetime -import torch -try: - torch.multiprocessing.set_start_method('spawn') -except RuntimeError: - pass from easydict import EasyDict from ditk import logging from ding.model import DQN @@ -17,7 +12,6 @@ eps_greedy_handler, CkptSaver, ContextExchanger, ModelExchanger, online_logger, \ termination_checker, wandb_online_logger, epoch_timer, EnvpoolStepCollector, EnvpoolOffPolicyLearner from ding.utils import set_pkg_seed - from dizoo.atari.config.serial import pong_dqn_envpool_config @@ -78,10 +72,7 @@ def main(cfg): # Sync their context and model between each worker. task.use(ContextExchanger(skip_n_iter=1)) task.use(ModelExchanger(model)) - task.use(epoch_timer()) - - # Here is the part of single process pipeline. task.use(envpool_evaluator(cfg, policy.eval_mode, evaluator_env)) task.use(eps_greedy_handler(cfg)) task.use( @@ -106,10 +97,8 @@ def main(cfg): wandb_sweep=False, ) ) - #task.use(CkptSaver(policy, cfg.exp_name, train_freq=1000)) task.use(termination_checker(max_env_step=10000000)) - task.run() @@ -125,15 +114,5 @@ def main(cfg): pong_dqn_envpool_config.env.collector_env_num = arg.collector_env_num pong_dqn_envpool_config.env.collector_batch_size = arg.collector_batch_size pong_dqn_envpool_config.seed = arg.seed - pong_dqn_envpool_config.env.stop_value = 2000 - pong_dqn_envpool_config.nstep = 3 - pong_dqn_envpool_config.policy.nstep = 3 - pong_dqn_envpool_config.seed = arg.seed - - pong_dqn_envpool_config.policy.learn.update_per_collect = 2 - pong_dqn_envpool_config.policy.learn.batch_size = 32 - pong_dqn_envpool_config.policy.learn.learning_rate = 0.0001 - pong_dqn_envpool_config.policy.learn.target_update_freq = 0 - pong_dqn_envpool_config.policy.learn.target_update = 0.04 main(pong_dqn_envpool_config) diff --git a/ding/framework/middleware/collector.py b/ding/framework/middleware/collector.py index 9796764c5b..0411485219 100644 --- a/ding/framework/middleware/collector.py +++ b/ding/framework/middleware/collector.py @@ -10,8 +10,6 @@ if TYPE_CHECKING: from ding.framework import OnlineRLContext -import time - import numpy as np diff --git a/ding/framework/middleware/learner.py b/ding/framework/middleware/learner.py index 980bcb584d..16182435c6 100644 --- a/ding/framework/middleware/learner.py +++ b/ding/framework/middleware/learner.py @@ -27,7 +27,6 @@ def data_process_func(data_queue_input, data_queue_output): if data is None: break else: - #print("get one data") output_data = fast_preprocess_learn( data, use_priority=False, #policy._cfg.priority, @@ -36,7 +35,6 @@ def data_process_func(data_queue_input, data_queue_output): device="cuda:0", #policy._device, ) data_queue_output.put(output_data) - #print("put one data, queue size:{}".format(data_queue_output.qsize())) class OffPolicyLearner: diff --git a/ding/policy/common_utils.py b/ding/policy/common_utils.py index 9dcd2c018a..6793896613 100644 --- a/ding/policy/common_utils.py +++ b/ding/policy/common_utils.py @@ -5,7 +5,6 @@ import treetensor.torch as ttorch from ding.utils.data import default_collate from ding.torch_utils import to_tensor, to_ndarray, unsqueeze, squeeze, to_device -import time def default_preprocess_learn( diff --git a/ding/policy/dqn.py b/ding/policy/dqn.py index 835e34f6e2..06277b3b20 100644 --- a/ding/policy/dqn.py +++ b/ding/policy/dqn.py @@ -10,9 +10,7 @@ from ding.utils.data import default_collate, default_decollate 
from .base_policy import Policy -from .common_utils import default_preprocess_learn, fast_preprocess_learn - -import time +from .common_utils import default_preprocess_learn @POLICY_REGISTRY.register('dqn') @@ -201,14 +199,14 @@ def _init_learn(self) -> None: # use model_wrapper for specialized demands of different modes self._target_model = copy.deepcopy(self._model) - if 'target_update_freq' in self._cfg.learn: + if 'target_update_freq' in self._cfg.learn and self._cfg.learn.target_update_freq > 0: self._target_model = model_wrap( self._target_model, wrapper_name='target', update_type='assign', update_kwargs={'freq': self._cfg.learn.target_update_freq} ) - elif 'target_theta' in self._cfg.learn: + elif 'target_theta' in self._cfg.learn and self._cfg.learn.target_theta > 0: self._target_model = model_wrap( self._target_model, wrapper_name='target', @@ -251,8 +249,6 @@ def _forward_learn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: For more detailed examples, please refer to our unittest for DQNPolicy: ``ding.policy.tests.test_dqn``. """ - start = time.time() - # Data preprocessing operations, such as stack data, cpu to cuda device data = default_preprocess_learn( data, @@ -262,9 +258,6 @@ def _forward_learn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: use_nstep=True ) - time_data_process = time.time() - start - start = time.time() - if self._cuda: data = to_device(data, self._device) # Q-learning forward @@ -294,10 +287,6 @@ def _forward_learn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: # Postprocessing operations, such as updating target model, return logged values and priority. self._target_model.update(self._learn_model.state_dict()) - time_learn = time.time() - start - # print("time_data_process:",time_data_process) - # print("time_learn:",time_learn) - return { 'cur_lr': self._optimizer.defaults['lr'], 'total_loss': loss.item(), diff --git a/ding/worker/collector/sample_serial_collector.py b/ding/worker/collector/sample_serial_collector.py index 34374d8630..07bac75ae5 100644 --- a/ding/worker/collector/sample_serial_collector.py +++ b/ding/worker/collector/sample_serial_collector.py @@ -45,6 +45,10 @@ def __init__( - env (:obj:`BaseEnvManager`): the subclass of vectorized env_manager(BaseEnvManager) - policy (:obj:`namedtuple`): the api namedtuple of collect_mode policy - tb_logger (:obj:`SummaryWriter`): tensorboard handle + - exp_name (:obj:`Optional[str]`): name of the project folder of this experiment + - instance_name (:obj:`Optional[str]`): instance name, used to specify the saving path of log and model + - timer_cuda (:obj:`bool`): whether to use cuda timer, if True, the timer will measure the time of \ + the forward process on cuda, otherwise, the timer will measure the time of the forward process on cpu. """ self._exp_name = exp_name self._instance_name = instance_name diff --git a/dizoo/atari/config/serial/pong/pong_dqn_envpool_config.py b/dizoo/atari/config/serial/pong/pong_dqn_envpool_config.py index f09cd4c54d..b454412e89 100644 --- a/dizoo/atari/config/serial/pong/pong_dqn_envpool_config.py +++ b/dizoo/atari/config/serial/pong/pong_dqn_envpool_config.py @@ -8,11 +8,12 @@ evaluator_env_num=8, evaluator_batch_size=8, n_evaluator_episode=8, - stop_value=20, + stop_value=21, env_id='Pong-v5', #'ALE/Pong-v5' is available. But special setting is needed after gym make. 
frame_stack=4, ), + nstep = 3, policy=dict( cuda=True, priority=False, @@ -25,10 +26,11 @@ nstep=3, discount_factor=0.99, learn=dict( - update_per_collect=10, + update_per_collect=2, batch_size=32, learning_rate=0.0001, - target_update_freq=500, + target_update_freq=0, + target_theta = 0.04, ), collect=dict(n_sample=96, ), eval=dict(evaluator=dict(eval_freq=4000, )), From 35a2c677e38bd57140ceff040ea5f8536f42ff58 Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 16 Nov 2023 23:45:04 +0800 Subject: [PATCH 241/244] fix bug --- ding/framework/middleware/collector.py | 20 +++++++++++++++----- ding/policy/common_utils.py | 2 +- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/ding/framework/middleware/collector.py b/ding/framework/middleware/collector.py index 0411485219..1d3de2592e 100644 --- a/ding/framework/middleware/collector.py +++ b/ding/framework/middleware/collector.py @@ -11,7 +11,7 @@ from ding.framework import OnlineRLContext import numpy as np - +import torch class StepCollector: """ @@ -132,8 +132,18 @@ def __call__(self, ctx: "OnlineRLContext") -> None: for i in self._ready_obs_receive.keys() } else: - action_to_send = self.policy.forward(self._ready_obs_receive, **ctx.collect_kwargs) - + action_by_policy = self.policy.forward(self._ready_obs_receive, **ctx.collect_kwargs) + + if isinstance(list(action_by_policy.values())[0]['action'], torch.Tensor): + # transfer to numpy + action_to_send = { + i: { + "action": action_by_policy[i]['action'].cpu().numpy() + } + for i in action_by_policy.keys() + } + else: + action_to_send = action_by_policy self._ready_obs_send.update(self._ready_obs_receive) self._ready_obs_receive = {} self._ready_action_send.update(action_to_send) @@ -205,7 +215,7 @@ def __call__(self, ctx: "OnlineRLContext") -> None: if j == 1: self._trajectory[env_id_receive[i]][-j]['reward'].extend( [ - np.zeros_like(current_reward) for _ in + 0.0 for _ in range(self._nsteps - len(self._trajectory[env_id_receive[i]][-j]['reward'])) ] ) @@ -220,7 +230,7 @@ def __call__(self, ctx: "OnlineRLContext") -> None: self._trajectory[env_id_receive[i]][-j]['reward'].append(current_reward) self._trajectory[env_id_receive[i]][-j]['reward'].extend( [ - np.zeros_like(current_reward) for _ in range( + 0.0 for _ in range( self._nsteps - len(self._trajectory[env_id_receive[i]][-j]['reward']) ) ] diff --git a/ding/policy/common_utils.py b/ding/policy/common_utils.py index 6793896613..c3833fd58d 100644 --- a/ding/policy/common_utils.py +++ b/ding/policy/common_utils.py @@ -103,7 +103,7 @@ def fast_preprocess_learn( next_n_obs = to_device(next_n_obs, device=device) processes_data['next_n_obs'] = next_n_obs - reward = torch.tensor(np.array([data[i]['reward'] for i in range(len(data))])) + reward = torch.tensor(np.array([data[i]['reward'] for i in range(len(data))]), dtype=torch.float32) if cuda: reward = to_device(reward, device=device) reward = reward.permute(1, 0).contiguous() From 3687f8b151a2c25a70afc7b27c4a8883bace7bcf Mon Sep 17 00:00:00 2001 From: zjowowen Date: Mon, 20 Nov 2023 16:34:13 +0800 Subject: [PATCH 242/244] polish code --- ding/framework/middleware/collector.py | 1 + ding/framework/middleware/learner.py | 39 +++++++++--- ding/policy/common_utils.py | 29 +++++++-- ding/policy/dqn.py | 12 ++-- ding/policy/tests/test_common_utils.py | 62 +++++++++++++++++++ .../serial/pong/pong_dqn_envpool_config.py | 10 ++- 6 files changed, 131 insertions(+), 22 deletions(-) diff --git a/ding/framework/middleware/collector.py b/ding/framework/middleware/collector.py index 
1d3de2592e..d6a11bcb61 100644 --- a/ding/framework/middleware/collector.py +++ b/ding/framework/middleware/collector.py @@ -13,6 +13,7 @@ import numpy as np import torch + class StepCollector: """ Overview: diff --git a/ding/framework/middleware/learner.py b/ding/framework/middleware/learner.py index 16182435c6..7182a9d5df 100644 --- a/ding/framework/middleware/learner.py +++ b/ding/framework/middleware/learner.py @@ -13,12 +13,19 @@ from queue import Queue import time -import torch.multiprocessing as mp from threading import Thread -from ding.policy.common_utils import default_preprocess_learn, fast_preprocess_learn - - -def data_process_func(data_queue_input, data_queue_output): +from ding.policy.common_utils import fast_preprocess_learn + + +def data_process_func( + data_queue_input: Queue, + data_queue_output: Queue, + use_priority: bool = False, + use_priority_IS_weight: bool = False, + use_nstep: bool = False, + cuda: bool = True, + device: str = "cuda:0", +): while True: if data_queue_input.empty(): time.sleep(0.001) @@ -29,10 +36,11 @@ def data_process_func(data_queue_input, data_queue_output): else: output_data = fast_preprocess_learn( data, - use_priority=False, #policy._cfg.priority, - use_priority_IS_weight=False, #policy._cfg.priority_IS_weight, - cuda=True, #policy._cuda, - device="cuda:0", #policy._device, + use_priority=use_priority, #policy._cfg.priority, + use_priority_IS_weight=use_priority_IS_weight, #policy._cfg.priority_IS_weight, + use_nstep=use_nstep, #policy._cfg.nstep > 1, + cuda=cuda, #policy._cuda, + device=device, #policy._device, ) data_queue_output.put(output_data) @@ -128,7 +136,18 @@ def __init__( self._data_queue_input = Queue() self._data_queue_output = Queue() - self.thread_worker = Thread(target=data_process_func, args=(self._data_queue_input, self._data_queue_output)) + self.thread_worker = Thread( + target=data_process_func, + args=( + self._data_queue_input, + self._data_queue_output, + cfg.policy.priority, + cfg.policy.priority_IS_weight, + cfg.policy.nstep > 1, + cfg.policy.cuda, + policy._device, + ) + ) self.thread_worker.start() self._trainer = task.wrap(trainer(cfg, policy.learn_mode, log_freq=log_freq)) diff --git a/ding/policy/common_utils.py b/ding/policy/common_utils.py index c3833fd58d..97a0084b37 100644 --- a/ding/policy/common_utils.py +++ b/ding/policy/common_utils.py @@ -71,19 +71,37 @@ def default_preprocess_learn( def fast_preprocess_learn( - data: List[Any], + data: List[np.ndarray], use_priority_IS_weight: bool = False, use_priority: bool = False, + use_nstep: bool = False, cuda: bool = False, device: str = 'cpu', ) -> dict: - # data preprocess + """ + Overview: + Fast data pre-processing before policy's ``_forward_learn`` method, including stacking batch data, transform \ + data to PyTorch Tensor and move data to GPU, etc. This function is faster than ``default_preprocess_learn`` \ + but less flexible. This function abandons calling ``default_collate`` to stack data because ``default_collate`` \ + is recursive and cumbersome. In this function, we alternatively stack the data and send it to GPU, so that it \ + is faster. In addition, this function is usually used in a special data process thread in learner. + Arguments: + - data (:obj:`List[np.ndarray]`): The list of a training batch samples, each sample is a dict of PyTorch Tensor. + - use_priority_IS_weight (:obj:`bool`): Whether to use priority IS weight correction, if True, this function \ + will set the weight of each sample to the priority IS weight. 
+ - use_priority (:obj:`bool`): Whether to use priority, if True, this function will set the priority IS weight. + - cuda (:obj:`bool`): Whether to use cuda in policy, if True, this function will move the input data to cuda. + - device (:obj:`str`): The device name to move the input data to. + Returns: + - data (:obj:`dict`): The preprocessed dict data whose values can be directly used for \ + the following model forward and loss computation. + """ processes_data = {} action = torch.tensor(np.array([data[i]['action'] for i in range(len(data))])) if cuda: action = to_device(action, device=device) - if action.ndim == 2 and action.shape[1] == 1: + if action.ndim == 2 and action.shape[1] == 1 and action.dtype in [torch.int64, torch.int32]: action = action.squeeze(1) processes_data['action'] = action @@ -106,7 +124,8 @@ def fast_preprocess_learn( reward = torch.tensor(np.array([data[i]['reward'] for i in range(len(data))]), dtype=torch.float32) if cuda: reward = to_device(reward, device=device) - reward = reward.permute(1, 0).contiguous() + if use_nstep: + reward = reward.permute(1, 0).contiguous() processes_data['reward'] = reward if 'value_gamma' in data[0]: @@ -131,7 +150,7 @@ def fast_preprocess_learn( else: weight = None - if weight and cuda: + if weight is not None and cuda: weight = to_device(weight, device=device) processes_data['weight'] = weight diff --git a/ding/policy/dqn.py b/ding/policy/dqn.py index 06277b3b20..82e54462bf 100644 --- a/ding/policy/dqn.py +++ b/ding/policy/dqn.py @@ -199,14 +199,16 @@ def _init_learn(self) -> None: # use model_wrapper for specialized demands of different modes self._target_model = copy.deepcopy(self._model) - if 'target_update_freq' in self._cfg.learn and self._cfg.learn.target_update_freq > 0: + if 'target_update_freq' in self._cfg.learn and self._cfg.learn.target_update_freq is not None \ + and self._cfg.learn.target_update_freq > 0: self._target_model = model_wrap( self._target_model, wrapper_name='target', update_type='assign', update_kwargs={'freq': self._cfg.learn.target_update_freq} ) - elif 'target_theta' in self._cfg.learn and self._cfg.learn.target_theta > 0: + elif 'target_theta' in self._cfg.learn and self._cfg.learn.target_theta is not None \ + and self._cfg.learn.target_theta > 0.0: self._target_model = model_wrap( self._target_model, wrapper_name='target', @@ -704,14 +706,16 @@ def _init_learn(self) -> None: # use model_wrapper for specialized demands of different modes self._target_model = copy.deepcopy(self._model) - if 'target_update_freq' in self._cfg.learn and self._cfg.learn.target_update_freq > 0: + if 'target_update_freq' in self._cfg.learn and self._cfg.learn.target_update_freq is not None \ + and self._cfg.learn.target_update_freq > 0: self._target_model = model_wrap( self._target_model, wrapper_name='target', update_type='assign', update_kwargs={'freq': self._cfg.learn.target_update_freq} ) - elif 'target_theta' in self._cfg.learn and self._cfg.learn.target_theta > 0: + elif 'target_theta' in self._cfg.learn and self._cfg.learn.target_theta is not None \ + and self._cfg.learn.target_theta > 0.0: self._target_model = model_wrap( self._target_model, wrapper_name='target', diff --git a/ding/policy/tests/test_common_utils.py b/ding/policy/tests/test_common_utils.py index 96fbde0963..a7e279b6a0 100644 --- a/ding/policy/tests/test_common_utils.py +++ b/ding/policy/tests/test_common_utils.py @@ -5,6 +5,7 @@ import treetensor.torch as ttorch from ding.policy.common_utils import default_preprocess_learn +from 
ding.policy.common_utils import fast_preprocess_learn shape_test = [ [2], @@ -173,3 +174,64 @@ def test_default_preprocess_learn_nstep(): assert data['reward'][0][0] == torch.tensor(1.0) assert data['reward'][1][0] == torch.tensor(2.0) assert data['reward'][2][0] == torch.tensor(0.0) + + +@pytest.mark.unittest +def test_fast_preprocess_learn_action(): + + for shape in shape_test: + for dtype in dtype_test: + data = [ + { + 'obs': np.random.randn(4, 84, 84), + 'action': np.random.randn(*shape).astype(dtype), + 'reward': 1.0, + 'next_obs': np.random.randn(4, 84, 84), + 'done': False, + 'weight': 1.0, + } for _ in range(10) + ] + use_priority_IS_weight = False + use_priority = False + use_nstep = False + data = fast_preprocess_learn( + data, use_priority_IS_weight, use_priority, use_nstep, cuda=False, device="cpu" + ) + + assert data['obs'].shape == torch.Size([10, 4, 84, 84]) + if dtype in ["int64"] and shape[0] == 1: + assert data['action'].shape == torch.Size([10]) + else: + assert data['action'].shape == torch.Size([10, *shape]) + assert data['reward'].shape == torch.Size([10]) + assert data['next_obs'].shape == torch.Size([10, 4, 84, 84]) + assert data['done'].shape == torch.Size([10]) + assert data['weight'].shape == torch.Size([10]) + + +@pytest.mark.unittest +def test_fast_preprocess_learn_nstep(): + + data = [ + { + 'obs': np.random.randn(4, 84, 84), + 'action': np.random.randn(2), + 'reward': np.array([1.0, 2.0, 0.0]), + 'next_obs': np.random.randn(4, 84, 84), + 'done': False, + 'weight': 1.0, + } for _ in range(10) + ] + use_priority_IS_weight = False + use_priority = False + use_nstep = True + data = fast_preprocess_learn(data, use_priority_IS_weight, use_priority, use_nstep, cuda=False, device="cpu") + + assert data['reward'].shape == torch.Size([3, 10]) + assert data['reward'][0][0] == torch.tensor(1.0) + assert data['reward'][1][0] == torch.tensor(2.0) + assert data['reward'][2][0] == torch.tensor(0.0) + + +if __name__ == "__main__": + test_fast_preprocess_learn_nstep() diff --git a/dizoo/atari/config/serial/pong/pong_dqn_envpool_config.py b/dizoo/atari/config/serial/pong/pong_dqn_envpool_config.py index b454412e89..8a9f9c1721 100644 --- a/dizoo/atari/config/serial/pong/pong_dqn_envpool_config.py +++ b/dizoo/atari/config/serial/pong/pong_dqn_envpool_config.py @@ -29,8 +29,12 @@ update_per_collect=2, batch_size=32, learning_rate=0.0001, - target_update_freq=0, - target_theta = 0.04, + # If updating target network by replacement, \ + # target_update_freq should be larger than 0. \ + # If updating target network by changing several percentage of the origin weights, \ + # target_update_freq should be 0 and target_theta should be set. 
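To make the two options in the comment above concrete, the `learn` section can be filled in in either of the following ways; this is a sketch based on the target-wrapper checks added to `ding/policy/dqn.py` in this patch, and the numeric values are illustrative only.

# Option 1: hard update, copy the learn network into the target network
# every `target_update_freq` training iterations.
learn=dict(
    # ... other learn settings ...
    target_update_freq=500,
),

# Option 2: soft update, leave target_update_freq unset (None or 0) and blend
# the learn network into the target network with ratio `target_theta` at each update.
learn=dict(
    # ... other learn settings ...
    target_update_freq=None,
    target_theta=0.04,
),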
+ target_update_freq=None, + target_theta=0.04, ), collect=dict(n_sample=96, ), eval=dict(evaluator=dict(eval_freq=4000, )), @@ -52,7 +56,7 @@ type='atari', import_names=['dizoo.atari.envs.atari_env'], ), - env_manager=dict(type='env_pool'), + env_manager=dict(type='envpool'), policy=dict(type='dqn'), replay_buffer=dict(type='deque'), ) From 4fb85b05af0b05858de8372276235fcdf19b162a Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 23 Nov 2023 14:56:04 +0800 Subject: [PATCH 243/244] polish code --- ding/envs/env_manager/envpool_env_manager.py | 3 +- .../tests/test_envpool_env_manager.py | 52 +++++++++++++++---- 2 files changed, 45 insertions(+), 10 deletions(-) diff --git a/ding/envs/env_manager/envpool_env_manager.py b/ding/envs/env_manager/envpool_env_manager.py index 715ac63deb..bcfa5ae3ce 100644 --- a/ding/envs/env_manager/envpool_env_manager.py +++ b/ding/envs/env_manager/envpool_env_manager.py @@ -245,7 +245,8 @@ def reset(self) -> None: obs = obs.astype(np.float32) if self._cfg.image_observation: obs /= 255.0 - ready_obs = deep_merge_dicts({i: o for i, o in zip(env_id, obs)}, ready_obs) + for i in range(len(list(env_id))): + ready_obs[env_id[i]] = obs[i] if len(ready_obs) == self._env_num: break self._eval_episode_return = [0. for _ in range(self._env_num)] diff --git a/ding/envs/env_manager/tests/test_envpool_env_manager.py b/ding/envs/env_manager/tests/test_envpool_env_manager.py index 9ac7730773..3d9e0dd5de 100644 --- a/ding/envs/env_manager/tests/test_envpool_env_manager.py +++ b/ding/envs/env_manager/tests/test_envpool_env_manager.py @@ -3,7 +3,7 @@ import numpy as np from easydict import EasyDict -from ..envpool_env_manager import PoolEnvManager +from ding.envs.env_manager.envpool_env_manager import PoolEnvManager, PoolEnvManagerV2 env_num_args = [[16, 8], [8, 8]] @@ -30,17 +30,51 @@ def test_naive(self, env_num, batch_size): env_manager = PoolEnvManager(env_manager_cfg) assert env_manager._closed env_manager.launch() - # Test step - start_time = time.time() - for count in range(20): + for count in range(5): env_id = env_manager.ready_obs.keys() action = {i: np.random.randint(4) for i in env_id} timestep = env_manager.step(action) assert len(timestep) == env_manager_cfg.batch_size - print('Count {}'.format(count)) - print([v.info for v in timestep.values()]) - end_time = time.time() - print('total step time: {}'.format(end_time - start_time)) - # Test close env_manager.close() assert env_manager._closed + + +@pytest.mark.envpooltest +@pytest.mark.parametrize('env_num, batch_size', env_num_args) +class TestPoolEnvManagerV2: + + def test_naive(self, env_num, batch_size): + env_manager_cfg = EasyDict( + { + 'env_id': 'Pong-v5', + 'env_num': env_num, + 'batch_size': batch_size, + 'seed': 3, + # env wrappers + 'episodic_life': False, + 'reward_clip': False, + 'gray_scale': True, + 'stack_num': 4, + 'frame_skip': 4, + } + ) + env_manager = PoolEnvManagerV2(env_manager_cfg) + assert env_manager._closed + ready_obs = env_manager.launch() + env_id = list(ready_obs.keys()) + for count in range(5): + action = {i: np.random.randint(4) for i in env_id} + action_send = np.array([action[i] for i in action.keys()]) + env_id_send = np.array(list(action.keys())) + env_manager.send_action(action_send, env_id_send) + next_obs, rew, done, info = env_manager.receive_data() + assert next_obs.shape == (env_manager_cfg.batch_size, 4, 84, 84) + assert rew.shape == (env_manager_cfg.batch_size, ) + assert done.shape == (env_manager_cfg.batch_size, ) + assert info['env_id'].shape == 
(env_manager_cfg.batch_size, ) + env_manager.close() + assert env_manager._closed + + +if __name__ == "__main__": + TestPoolEnvManagerV2().test_naive(16, 8) From 48ee6da58ba460e25f4010149b5d81d1d978539a Mon Sep 17 00:00:00 2001 From: zjowowen Date: Thu, 23 Nov 2023 15:03:28 +0800 Subject: [PATCH 244/244] polish code --- ding/example/dqn_nstep_envpool.py | 7 +++---- ding/framework/middleware/collector.py | 8 ++++---- ding/framework/middleware/functional/data_processor.py | 1 - ding/framework/middleware/functional/evaluator.py | 2 +- ding/framework/middleware/learner.py | 10 +++++----- ding/policy/common_utils.py | 4 ++-- 6 files changed, 15 insertions(+), 17 deletions(-) diff --git a/ding/example/dqn_nstep_envpool.py b/ding/example/dqn_nstep_envpool.py index c5695f900f..7ab7a74677 100644 --- a/ding/example/dqn_nstep_envpool.py +++ b/ding/example/dqn_nstep_envpool.py @@ -80,10 +80,9 @@ def main(cfg): cfg, policy.collect_mode, collector_env, - random_collect_size=cfg.policy.random_collect_size \ - if hasattr(cfg.policy, 'random_collect_size') else 0, - ) - ) + random_collect_size=cfg.policy.random_collect_size if hasattr(cfg.policy, 'random_collect_size') else 0, + ) + ) task.use(data_pusher(cfg, buffer_)) task.use(EnvpoolOffPolicyLearner(cfg, policy, buffer_)) task.use(online_logger(train_show_freq=10)) diff --git a/ding/framework/middleware/collector.py b/ding/framework/middleware/collector.py index d6a11bcb61..ed58c80993 100644 --- a/ding/framework/middleware/collector.py +++ b/ding/framework/middleware/collector.py @@ -187,7 +187,7 @@ def __call__(self, ctx: "OnlineRLContext") -> None: ) if self._nsteps > 1: - if done[i] == False and counter < target_size: + if done[i] is False and counter < target_size: reverse_record_position = min(self._nsteps, len(self._trajectory[env_id_receive[i]])) real_reverse_record_position = reverse_record_position @@ -195,7 +195,7 @@ def __call__(self, ctx: "OnlineRLContext") -> None: if j == 1: pass else: - if self._trajectory[env_id_receive[i]][-j]['done'] == True: + if self._trajectory[env_id_receive[i]][-j]['done'] is True: real_reverse_record_position = j - 1 break else: @@ -207,7 +207,7 @@ def __call__(self, ctx: "OnlineRLContext") -> None: self._trajectory[env_id_receive[i]][-real_reverse_record_position][ 'value_gamma'] = self._discount_ratio_list[real_reverse_record_position - 1] - else: # done[i] == True or counter >= target_size + else: # done[i] is True or counter >= target_size reverse_record_position = min(self._nsteps, len(self._trajectory[env_id_receive[i]])) real_reverse_record_position = reverse_record_position @@ -224,7 +224,7 @@ def __call__(self, ctx: "OnlineRLContext") -> None: self._trajectory[env_id_receive[i]][-j]['value_gamma'] = self._discount_ratio_list[j - 1] else: - if self._trajectory[env_id_receive[i]][-j]['done'] == True: + if self._trajectory[env_id_receive[i]][-j]['done'] is True: real_reverse_record_position = j break else: diff --git a/ding/framework/middleware/functional/data_processor.py b/ding/framework/middleware/functional/data_processor.py index 420af3d4fd..e254e4ad3b 100644 --- a/ding/framework/middleware/functional/data_processor.py +++ b/ding/framework/middleware/functional/data_processor.py @@ -284,7 +284,6 @@ def offline_data_fetcher_from_mem(cfg: EasyDict, dataset: Dataset) -> Callable: from threading import Thread from queue import Queue - import time stream = torch.cuda.Stream() def producer(queue, dataset, batch_size, device): diff --git a/ding/framework/middleware/functional/evaluator.py 
b/ding/framework/middleware/functional/evaluator.py
index 37093a3a67..31b70153a7 100644
--- a/ding/framework/middleware/functional/evaluator.py
+++ b/ding/framework/middleware/functional/evaluator.py
@@ -385,7 +385,7 @@ def _evaluate(ctx: Union["OnlineRLContext", "OfflineRLContext"]):
                     }
                 )
 
-                if done[i] == True:
+                if done[i] is True:
                     episode_return_i = 0.0
                     for item in trajectory[env_id_receive[i]]:
                         episode_return_i += item['reward'][0]
diff --git a/ding/framework/middleware/learner.py b/ding/framework/middleware/learner.py
index 7182a9d5df..8779ced315 100644
--- a/ding/framework/middleware/learner.py
+++ b/ding/framework/middleware/learner.py
@@ -36,11 +36,11 @@ def data_process_func(
             else:
                 output_data = fast_preprocess_learn(
                     data,
-                    use_priority=use_priority, #policy._cfg.priority,
-                    use_priority_IS_weight=use_priority_IS_weight, #policy._cfg.priority_IS_weight,
-                    use_nstep=use_nstep, #policy._cfg.nstep > 1,
-                    cuda=cuda, #policy._cuda,
-                    device=device, #policy._device,
+                    use_priority=use_priority,
+                    use_priority_IS_weight=use_priority_IS_weight,
+                    use_nstep=use_nstep,
+                    cuda=cuda,
+                    device=device,
                 )
                 data_queue_output.put(output_data)
diff --git a/ding/policy/common_utils.py b/ding/policy/common_utils.py
index 97a0084b37..a8e095974c 100644
--- a/ding/policy/common_utils.py
+++ b/ding/policy/common_utils.py
@@ -1,7 +1,6 @@
 from typing import List, Any, Dict, Callable
 import numpy as np
 import torch
-import numpy as np
 import treetensor.torch as ttorch
 from ding.utils.data import default_collate
 from ding.torch_utils import to_tensor, to_ndarray, unsqueeze, squeeze, to_device
@@ -82,7 +81,8 @@ def fast_preprocess_learn(
     Overview:
         Fast data pre-processing before policy's ``_forward_learn`` method, including stacking batch data, transform \
         data to PyTorch Tensor and move data to GPU, etc. This function is faster than ``default_preprocess_learn`` \
-        but less flexible. This function abandons calling ``default_collate`` to stack data because ``default_collate`` \
+        but less flexible.
+        This function abandons calling ``default_collate`` to stack data because ``default_collate`` \
         is recursive and cumbersome. In this function, we alternatively stack the data and send it to GPU, so that it \
         is faster. In addition, this function is usually used in a special data process thread in learner.
     Arguments: