Time_step doesn't match 'time_step_spec' in a custom py_environment

Hi everyone, I get an error while validating my custom py_environment with utils.validate_py_environment. It throws the following error:

in validate_py_environment
    if not array_spec.check_arrays_nest(time_step, batched_time_step_spec):
      raise ValueError(
          'Given `time_step`: %r does not match expected '
          '`time_step_spec`: %r' % (time_step, batched_time_step_spec))


ValueError: Given `time_step`: TimeStep(
{'discount': array(0., dtype=float32),
 'observation': array([[0.9375, 0.9694037, 0.7618361, 0.0593321]], dtype=float32),
 'reward': array(-0.9964797, dtype=float32),
 'step_type': array(2)})
does not match expected `time_step_spec`: TimeStep(
{'discount': BoundedArraySpec(shape=(), dtype=dtype('float32'), name='discount', minimum=0.0, maximum=1.0),
 'observation': BoundedArraySpec(shape=(4,), dtype=dtype('float32'), name='observation', minimum=[0. 0. 0. 0.], maximum=[1. 1. 1. 1.]),
 'reward': ArraySpec(shape=(), dtype=dtype('float32'), name='reward'),
 'step_type': ArraySpec(shape=(), dtype=dtype('int32'), name='step_type')})

I don't know where I can change the time_step_spec.
Can anyone help me?

Here are my specs:

action_spec: BoundedArraySpec(shape=(3,), dtype=dtype('int32'), name='action', minimum=[0 0 0], maximum=[1 1 1])
time_step_spec.observation: BoundedArraySpec(shape=(4,), dtype=dtype('float32'), name='observation', minimum=[0. 0. 0. 0.], maximum=[1. 1. 1. 1.])
time_step_spec.step_type: ArraySpec(shape=(), dtype=dtype('int32'), name='step_type')
time_step_spec.discount: BoundedArraySpec(shape=(), dtype=dtype('float32'), name='discount', minimum=0.0, maximum=1.0)
time_step_spec.reward: ArraySpec(shape=(), dtype=dtype('float32'), name='reward')

If code is needed, I can provide it.
Thanks in advance

Hi @Chris_Aachen
My take, just based on the error message:

'step_type': array(2)}) does not match expected `time_step_spec`: TimeStep( {'discount': BoundedArraySpec(shape=(), dtype=dtype('float32')
Aren't you feeding your RL model an integer (2) while your bounded array spec requires a float? (Basically, probably, just look at the data types in your numpy arrays.)
It would be easier for people to help if you provided code / a minimal reproducible example, I guess.
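For example, this is the kind of dtype/shape check I mean (a minimal, self-contained sketch; the spec is copied from your post and the value mimics the reward in the traceback):

import numpy as np
from tf_agents.specs import array_spec

# Spec copied from the post above; the array mimics the reward value in the traceback.
reward_spec = array_spec.ArraySpec(shape=(), dtype=np.float32, name='reward')
reward = np.array(-0.9964797, dtype=np.float32)

# Both comparisons must hold for the value to satisfy the spec.
print(reward.dtype == reward_spec.dtype)
print(reward.shape == reward_spec.shape)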

That is a good idea. I simplified my env and added it here.
This simplified env still throws the same error.

Maybe I use the wrong time_step; I import tf_agents.trajectories.time_step.

Here is a simplified env that throws the error.

# Import all relevant libs
import numpy as np
from tf_agents.environments import py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts

class SimpleTestEnv(py_environment.PyEnvironment):
  def __init__(self):
    self._action_spec = array_spec.BoundedArraySpec(
        shape=(3,), dtype=np.int32, minimum=[0, 0, 0], maximum=[1, 1, 1], name='action')
    self._observation_spec = array_spec.BoundedArraySpec(
        shape=(4,), dtype=np.float32, minimum=[0, 0, 0, 0], maximum=[1, 1, 1, 1], name='observation')
    self._state = np.array([1, 1, 1, 0], dtype=np.float32)  # initial state
    self._episode_ended = False

  def action_spec(self):
    return self._action_spec

  def observation_spec(self):
    return self._observation_spec

  def scale_value(self, normed_value, min_value, max_value):
    scaled_value = min_value + (normed_value * (max_value - min_value))
    return scaled_value

  def normalize_value(self, value, min_value, max_value):
    normalized_value = (value - min_value) / (max_value - min_value)
    return normalized_value

  # resets everything to the initial state
  def _reset(self):
    self._state = [1, 1, 1, 0]  # initial state, everything starts at 1
    self._episode_ended = False
    return ts.restart(np.array(self._state, dtype=np.float32))

  # gets new actions
  def _step(self, action):
    if self._episode_ended:
      # The last action ended the episode. Ignore the current action and start
      # a new episode.
      return self.reset()

    # scale the current state to its real values
    current_state1 = self.scale_value(self._state[0], 0, 0.3)
    current_state2 = self.scale_value(self._state[1], 0, 0.3)
    current_state3 = self.scale_value(self._state[2], 20, 100)
    current_state4 = self._state[3]

    # calc the new state based on the actions
    new_state1 = 0.2 * current_state1 * action[0]
    new_state3 = current_state3 - action[1] * 20  # just some simple calcs
    new_state4 = current_state4 + 0.1
    if action[2] == 1:  # check if the material was rotated or not
      new_state2 = current_state1
    else:
      new_state2 = current_state2

    # norm the new state
    new_normed_state1 = self.normalize_value(new_state1, 0, 0.3)
    new_normed_state2 = self.normalize_value(new_state2, 0.1, 0.3)
    new_normed_state3 = self.normalize_value(new_state3, 20, 100)

    # calc the reward
    reward = new_state3**2 - 1  # just an example

    # return the new state and reward & if not termination then discount, too
    self._state = np.array(
        [new_normed_state1, new_normed_state2, new_normed_state3, new_state4],
        dtype=np.float32)

    # Make sure episodes don't go on forever.
    if new_normed_state1 <= 0.15:
      self._episode_ended = True

    if self._episode_ended:
      return ts.termination(np.array([self._state], dtype=np.float32), reward)
    else:
      return ts.transition(
          np.array([self._state], dtype=np.float32), reward, discount=1.0)


environment = SimpleTestEnv()
print('action_spec:', environment.action_spec())
print('time_step_spec.observation:', environment.time_step_spec().observation)
print('time_step_spec.step_type:', environment.time_step_spec().step_type)
print('time_step_spec.discount:', environment.time_step_spec().discount)
print('time_step_spec.reward:', environment.time_step_spec().reward)
utils.validate_py_environment(environment, episodes=3)

Hi @Chris_Aachen.

After playing for some time with your code using utils.validate_py_environment(environment, episodes=10), it seems to me the issue is that the state goes beyond the minimum and maximum values you set when defining self._observation_spec.

Can you please extend the range of acceptable values? And also simplify your rules for setting the state?
Also, maybe in the definition of _step, replace self.reset() with self._reset().
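If it helps, here is the kind of bounds check I mean (a minimal sketch, assuming the SimpleTestEnv posted above; it only compares one observation against the spec's minimum/maximum):

import numpy as np

env = SimpleTestEnv()
spec = env.observation_spec()
obs = env.reset().observation

# BoundedArraySpec exposes its limits as spec.minimum / spec.maximum.
print('observation:', obs)
print('within bounds:', np.all(obs >= spec.minimum) and np.all(obs <= spec.maximum))

The same check can be repeated on every observation returned by _step.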

Hi @tagoma,

thanks for helping me.

That is an interesting point about self._observation_spec.
I changed self.reset() to self._reset().
Moreover, I simplified the rules for setting the new state.

new_state1 = current_state1 - 0.01  # starts at 1
new_state2 = current_state1 - 0.01  # starts at 1
new_state3 = current_state1 - 0.01  # starts at 1
new_state4 = current_state1 + 0.01  # starts at 0

I even extended the range to -100 and 100.
But the same error still occurs.

And if you track the new state, do all the values in [new_normed_state1, new_normed_state2, new_normed_state3, new_state4] stay within the range defined by your BoundedArraySpec?
Unfortunately, I deleted the notebook I used to work with your code, but if I remember correctly, new_normed_state2 was the one causing the issue by going out of range.

Thanks, that is interesting!

The error is thrown during the reset function, more concretely in this line:

return ts.restart(np.array(self._state, dtype=np.float32))

I do not reach the step function while debugging.
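To narrow it down, this is the isolated check I run (a sketch using the SimpleTestEnv from above; it only compares the observation returned by reset() against the observation spec):

env = SimpleTestEnv()
first_obs = env.reset().observation
spec = env.observation_spec()

# A mismatch in either line would explain the ValueError from validate_py_environment.
print('shape:', first_obs.shape, 'vs spec:', spec.shape)
print('dtype:', first_obs.dtype, 'vs spec:', spec.dtype)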

The problem seems to be related to the observation.
I took the simple CardGameEnv from the tutorial, simplified it, and extended the observation state to an array.

class CardGameEnv(py_environment.PyEnvironment):

  def __init__(self):
    self._action_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.int32, minimum=0, maximum=1, name='action')
    self._observation_spec = array_spec.BoundedArraySpec(
        shape=(4,), dtype=np.int32, minimum=0, name='observation')
    self._state = [0, 0, 0, 0]
    self._episode_ended = False

  def action_spec(self):
    return self._action_spec

  def observation_spec(self):
    return self._observation_spec

  def _reset(self):
    self._state = [0, 0, 0, 0]
    self._episode_ended = False
    return ts.restart(np.array([self._state], dtype=np.int32))

  def _step(self, action):

    if self._episode_ended:
      # The last action ended the episode. Ignore the current action and start
      # a new episode.
      return self.reset()

    # Make sure episodes don't go on forever.
    self._state[0] += 5

    if self._state[0] >= 21:
      self._episode_ended = True
      reward = self._state[0] - 21
      return ts.termination(np.array([self._state], dtype=np.int32), reward)
    else:
      return ts.transition(
          np.array([self._state], dtype=np.int32), reward=0.0, discount=1.0)

So I guess the observation state has to be defined differently, because if I change the class back to a single observation, then the code works.

I have now found the problem. It was due to two things.

  1. The previous state definition was wrong. If self._state is multidimensional, then apparently the observation spec must be defined as follows:
self._observation_spec = array_spec.BoundedArraySpec(
        shape=(1,4), dtype=np.float32, minimum=0, name='observation')
  2. The state itself does not have to be a numpy array, but the time_step needs a numpy array. You have to make sure that the types and shapes are correct (see the short shape check below).
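As a pure-numpy illustration of point 1 (a sketch, not TF-Agents specific): wrapping the state list in an extra pair of brackets adds a leading dimension, so the observation only matches a spec whose shape includes that dimension.

import numpy as np

state = [0, 0, 0, 0]
print(np.array(state, dtype=np.float32).shape)    # (4,)  -> matches shape=(4,)
print(np.array([state], dtype=np.float32).shape)  # (1, 4) -> matches shape=(1, 4)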

Here is a simple env that works:

class CardGameEnv(py_environment.PyEnvironment):

  def __init__(self):
    self._action_spec = array_spec.BoundedArraySpec(
        shape=(3,), dtype=np.int32, minimum=[0,0,0], maximum=[1,1,1], name='action')
    self._observation_spec = array_spec.BoundedArraySpec(
        shape=(1,4), dtype=np.float32, minimum=0, name='observation')
    
    self._state = [0, 0, 0, 0]
    self._episode_ended = False

  def action_spec(self):
    return self._action_spec

  def observation_spec(self):
    return self._observation_spec

  def _reset(self):
    self._state = [0, 0, 0, 0]
    self._episode_ended = False
    return ts.restart(np.array([self._state], dtype=np.float32))

  def _step(self, action):

    if self._episode_ended:
      # The last action ended the episode. Ignore the current action and start
      # a new episode.
      return self.reset()

    # check action
    print('Action: ', action)
    print('Action data type: ', type(action))

    # calc state
    print('State before: ', self._state)
    self._state[0] = self._state[0] + action[0]  
    self._state[1] = self._state[1] + action[1]
    self._state[2] = self._state[2] + action[2]
    self._state[3] = self._state[3] + action[2]
    print('State after: ', self._state)    

    if self._state[3] >= 5.0:
      self._episode_ended = True
      reward = self._state[0] - 21
      return ts.termination(np.array([self._state], dtype=np.float32), reward)
    else:
      return ts.transition(
          np.array([self._state], dtype=np.float32), reward=0.0, discount=1.0)

environment = CardGameEnv()
utils.validate_py_environment(environment, episodes=5)