Describe the bug

In the cleanRL example, the returns of two independent episodes are added together because the episode return counter is not reset at time-out.
To Reproduce
The following is a minimal example that reproduces the bug. I tried the RecordEpisodeStatistics wrapper from both the envpool repo and the cleanRL repo, and both have this bug.
import gym
import numpy as np

import envpool

is_legacy_gym = True


# From: https://github.com/sail-sg/envpool/blob/main/examples/cleanrl_examples/ppo_atari_envpool.py
class RecordEpisodeStatistics(gym.Wrapper):
    def __init__(self, env, deque_size=100):
        super(RecordEpisodeStatistics, self).__init__(env)
        self.num_envs = getattr(env, "num_envs", 1)
        self.episode_returns = None
        self.episode_lengths = None
        # get if the env has lives
        self.has_lives = False
        env.reset()
        info = env.step(np.zeros(self.num_envs, dtype=int))[-1]
        if info["lives"].sum() > 0:
            self.has_lives = True
            print("env has lives")

    def reset(self, **kwargs):
        if is_legacy_gym:
            observations = super(RecordEpisodeStatistics, self).reset(**kwargs)
        else:
            observations, _ = super(RecordEpisodeStatistics, self).reset(**kwargs)
        self.episode_returns = np.zeros(self.num_envs, dtype=np.float32)
        self.episode_lengths = np.zeros(self.num_envs, dtype=np.int32)
        self.lives = np.zeros(self.num_envs, dtype=np.int32)
        self.returned_episode_returns = np.zeros(self.num_envs, dtype=np.float32)
        self.returned_episode_lengths = np.zeros(self.num_envs, dtype=np.int32)
        return observations

    def step(self, action):
        if is_legacy_gym:
            observations, rewards, dones, infos = super(
                RecordEpisodeStatistics, self
            ).step(action)
        else:
            observations, rewards, term, trunc, infos = super(
                RecordEpisodeStatistics, self
            ).step(action)
            dones = term + trunc
        self.episode_returns += infos["reward"]
        self.episode_lengths += 1
        self.returned_episode_returns[:] = self.episode_returns
        self.returned_episode_lengths[:] = self.episode_lengths
        all_lives_exhausted = infos["lives"] == 0
        if self.has_lives:
            self.episode_returns *= 1 - all_lives_exhausted
            self.episode_lengths *= 1 - all_lives_exhausted
        else:
            self.episode_returns *= 1 - dones
            self.episode_lengths *= 1 - dones
        infos["r"] = self.returned_episode_returns
        infos["l"] = self.returned_episode_lengths
        return (
            observations,
            rewards,
            dones,
            infos,
        )


# From: https://github.com/vwxyzjn/cleanrl/blob/master/cleanrl/ppo_atari_envpool.py
# class RecordEpisodeStatistics(gym.Wrapper):
#     def __init__(self, env, deque_size=100):
#         super().__init__(env)
#         self.num_envs = getattr(env, "num_envs", 1)
#         self.episode_returns = None
#         self.episode_lengths = None
#
#     def reset(self, **kwargs):
#         observations = super().reset(**kwargs)
#         self.episode_returns = np.zeros(self.num_envs, dtype=np.float32)
#         self.episode_lengths = np.zeros(self.num_envs, dtype=np.int32)
#         self.lives = np.zeros(self.num_envs, dtype=np.int32)
#         self.returned_episode_returns = np.zeros(self.num_envs, dtype=np.float32)
#         self.returned_episode_lengths = np.zeros(self.num_envs, dtype=np.int32)
#         return observations
#
#     def step(self, action):
#         observations, rewards, dones, infos = super().step(action)
#         self.episode_returns += infos["reward"]
#         self.episode_lengths += 1
#         self.returned_episode_returns[:] = self.episode_returns
#         self.returned_episode_lengths[:] = self.episode_lengths
#         self.episode_returns *= 1 - infos["terminated"]
#         self.episode_lengths *= 1 - infos["terminated"]
#         infos["r"] = self.returned_episode_returns
#         infos["l"] = self.returned_episode_lengths
#         return (
#             observations,
#             rewards,
#             dones,
#             infos,
#         )


if __name__ == "__main__":
    np.random.seed(1)
    envs = envpool.make(
        "UpNDown-v5",
        env_type="gym",
        num_envs=1,
        episodic_life=True,  # Espeholt et al., 2018, Tab. G.1
        repeat_action_probability=0,  # Hessel et al., 2022 (Muesli) Tab. 10
        full_action_space=False,  # Espeholt et al., 2018, Appendix G., "Following related work, experts use game-specific action sets."
        max_episode_steps=30,  # Set as 50 to hit timelimit faster
        reward_clip=True,
        seed=1,
    )
    envs = RecordEpisodeStatistics(envs)

    num_episodes = 2
    episode_count = 0
    cur_episode_len = 0
    cur_episode_return = 0
    my_episode_returns = []
    my_episode_lens = []
    # Track episode returns here to compare with the ones recorded with `RecordEpisodeStatistics`
    recorded_episode_returns = []
    recorded_episode_lens = []

    obs = envs.reset()
    while episode_count < num_episodes:
        action = np.random.randint(0, envs.action_space.n, 1)
        obs, reward, done, info = envs.step(action)
        cur_episode_return += info["reward"][0]
        cur_episode_len += 1
        print(
            f"Ep={episode_count}, EpStep={cur_episode_len}, Return={info['r']}, "
            f"MyReturn={cur_episode_return}, Terminated={info['terminated']}, "
            f"Timeout={info['TimeLimit.truncated']}, Lives={info['lives']}"
        )
        # info["terminated"] = True: Game over.
        # info["TimeLimit.truncated"] = True: Timeout, the environment will be reset
        # (so the episode return should be reset too).
        if info["terminated"][0] or info["TimeLimit.truncated"][0]:
            # Append the episode return/length recorded in `RecordEpisodeStatistics`
            recorded_episode_returns.append(info["r"][0])
            recorded_episode_lens.append(info["l"][0])
            my_episode_returns.append(cur_episode_return)
            my_episode_lens.append(cur_episode_len)
            print(
                f"Episode {episode_count}'s length is {cur_episode_len} "
                f"(terminated={info['terminated']}, timeout={info['TimeLimit.truncated']})"
            )
            episode_count += 1
        cur_episode_return *= 1 - (info["terminated"][0] | info["TimeLimit.truncated"][0])
        cur_episode_len *= 1 - (info["terminated"][0] | info["TimeLimit.truncated"][0])

    for episode_idx in range(num_episodes):
        print(
            f"Episode {episode_idx}'s return is supposed to be {my_episode_returns[episode_idx]}, "
            f"but the wrapper `RecordEpisodeStatistics` gives {recorded_episode_returns[episode_idx]}"
        )
        print(
            f"Episode {episode_idx}'s len is supposed to be {my_episode_lens[episode_idx]}, "
            f"but the wrapper `RecordEpisodeStatistics` gives {recorded_episode_lens[episode_idx]}"
        )
Executing the above code snippet, you should see the following printout:

Expected behavior

See the above example's output: the return in the new episode (Ep=1) is not reset to zero but carries over the return from the old episode. The expected behavior is to reset the return counter to zero upon time-out.
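For intuition, here is a minimal, self-contained numpy sketch of the accounting problem (toy numbers, not output from the repro above), assuming the reset mask is built only from the terminated flag, as in the cleanRL wrapper's `self.episode_returns *= 1 - infos["terminated"]`:

import numpy as np

# Toy illustration: one env, two 3-step episodes separated by a time-out.
episode_return = np.zeros(1, dtype=np.float32)
rewards    = [1.0, 1.0, 1.0, 2.0, 2.0, 2.0]  # episode 0, then episode 1
terminated = [0, 0, 0, 0, 0, 1]              # game over only at the very end
truncated  = [0, 0, 1, 0, 0, 0]              # time-out ends episode 0

for r, term, trunc in zip(rewards, terminated, truncated):
    episode_return += r
    if term or trunc:
        print("reported return:", episode_return[0])
    # Buggy reset: ignores truncation, so the counter survives the time-out.
    episode_return *= 1 - term

# Episode 0 is reported correctly as 3.0, but episode 1 is reported as 9.0
# (3.0 + 6.0) instead of 6.0, because the return accumulated before the
# time-out is carried into the next episode.

The fix sketched under Reason and Possible fixes below simply includes the truncation signal in the reset mask.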
Screenshots
N/A
System info
Describe the characteristic of your environment:
Describe how the library was installed (pip, source, ...)
Additional context
N/A
Reason and Possible fixes
Change both lines 251 and 252 of envpool/examples/cleanrl_examples/ppo_atari_envpool.py (commit f411fc2) to the following:
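A rough sketch of the change (assuming the referenced lines are the episode-counter reset inside the wrapper's step(), and reusing the all_lives_exhausted, dones, and infos["TimeLimit.truncated"] values already available there, as in the repro above) is to also zero the counters on time-out:

# Sketch only, not necessarily the exact patch: treat a time-out as the end of
# an episode when resetting the counters, since envpool auto-resets the
# environment in that case as well.
timeout = infos["TimeLimit.truncated"]
if self.has_lives:
    episode_over = all_lives_exhausted | timeout
else:
    episode_over = dones | timeout
self.episode_returns *= 1 - episode_over
self.episode_lengths *= 1 - episode_over

The same idea applies to the cleanRL wrapper: reset on infos["terminated"] | infos["TimeLimit.truncated"] rather than on infos["terminated"] alone.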
Checklist