Problems with utils.validate_py_environment in Custom Environment

Hi Community! It’s nice to be here!

Today I come to the forum for asking some help understanding what reason I cannot execute utils.validate_py_environment in my custom environment, although If I create a sequence of steps manually my environment works well.

Here is my environment class (inspired in the environment tutorial). This class represent a microgrid including a house consumption, a PV generator and a Battery Bank:

from __future__ import absolute_import

from future import division
from future import print_function

import abc
import tensorflow as tf
import numpy as np

from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.environments import suite_gym
from tf_agents.trajectories import time_step as ts

tf.compat.v1.enable_v2_behavior()

class MicroGridEnv(py_environment.PyEnvironment):
“”"
This class simulates the net amount of energy of the microgrid viewed from
the battery. That is the difference between generation and consumption of
energy

1. Actions: We have 2 actions. Action 0: charge the battery, and action 1: discharge the battery
2. Observations: [Net Load, Energy available in battery, prognostic of Net Load (next 24 h),
   n° of cycles]
3. Reward: - 1 per each cycle reached, - 2 if there's a lack of supply for the customer
"""
def __init__(self):
    self._action_spec = array_spec.BoundedArraySpec(
        shape=(),
        dtype=np.int32,
        minimum=0, maximum= 1,
        name='action'
    )
    self._observation_spec =  array_spec.BoundedArraySpec(
        shape=(4,),
        dtype=np.float32,
        minimum=[-1.0, 0.0, -1.0, 0],
        maximum=[1.0, 1.0, 1.0, 1e35],
        name='observation'
    )
    self._state = [0.0, 0.0, 0.0, 0.0]
    self._episode_ended = False

def action_spec(self):
    return self._action_spec

def observation_spec(self):
    return self._observation_spec

def _reset(self):
    self._state = [0.0, 0.0, 0.0, 0.0]
    self._episode_ended = False
    return ts.restart(np.array([self._state], dtype=np.float32))

def _step(self, action):

    if self._episode_ended:
        # The last action ended the episode. Ignore the current action and start
        # a new episode.
        return self.reset()

    # First, we see if the episode has ended
    if self._state[0] < 0:
        self._episode_ended = True
    
    # Make sure battery actions are witten here
    if action == 1:
        self._state += np.array([0.1, -0.1, 0.0, 0.0], dtype=np.float32)
    elif action == 0:
        self._state += np.array([-0.1, 0.1, 0.0, 0.0], dtype=np.float32)
    else:
        raise ValueError('action should be 0 or 1')

    print(self._state)

    # After battery acts, we have to update system dynamics:
    # - Shutdown condition (episode ends if there's a lack of supply)
    if self._state[0] < 0:
        self._episode_ended = True

    # - Cycle counter
    # - Net Load Prognostic
    if self._episode_ended:
        reward = np.array([self._state[0] - self._state[3]], dtype=np.float32)
        return ts.termination(
            self._state,
            reward
        )
    else:
        return ts.transition(
            self._state,
            reward=0
        )

Here is my main code:

environment = MicroGridEnv()

print(f"environment MicrogridEnv created")
print(f"action_spec: {environment.action_spec()}“)
print(f"time_step_spec.observation: {environment.time_step_spec().observation}”)
print(f"time_step_spec.step_type: {environment.time_step_spec().step_type}“)
print(f"time_step_spec.discount: {environment.time_step_spec().discount}”)
print(f"time_step_spec.reward: {environment.time_step_spec().reward}")

print(‘Simulating 5 actions’)

action = np.array(1, dtype=np.int32)
time_step = environment.reset()
print(time_step)
time_step = environment.step(action)
print(time_step)
time_step = environment.step(action)
print(time_step)
action = np.array(0, dtype=np.int32)
time_step = environment.step(action)
print(time_step)
time_step = environment.step(action)
print(time_step)
time_step = environment.step(action)
print(time_step)

utils.validate_py_environment(environment, episodes=5)

and here is the output in my terminal:

environment MicrogridEnv created

action_spec: BoundedArraySpec(shape=(), dtype=dtype(‘int32’), name=‘action’, minimum=0, maximum=1)
time_step_spec.observation: BoundedArraySpec(shape=(4,), dtype=dtype(‘float32’), name=‘observation’, minimum=[-1. 0. -1. 0.], maximum=[1.e+00 1.e+00 1.e+00 1.e+35])
time_step_spec.step_type: ArraySpec(shape=(), dtype=dtype(‘int32’), name=‘step_type’)
time_step_spec.discount: BoundedArraySpec(shape=(), dtype=dtype(‘float32’), name=‘discount’, minimum=0.0, maximum=1.0)
time_step_spec.reward: ArraySpec(shape=(), dtype=dtype(‘float32’), name=‘reward’)
Simulating 5 actions
TimeStep(
{‘discount’: array(1., dtype=float32),
‘observation’: array([[0., 0., 0., 0.]], dtype=float32),
‘reward’: array(0., dtype=float32),
‘step_type’: array(0)})
[ 0.1 -0.1 0. 0. ]
TimeStep(
{‘discount’: array(1., dtype=float32),
‘observation’: array([ 0.1, -0.1, 0. , 0. ]),
‘reward’: array(0., dtype=float32),
‘step_type’: array(1)})
[ 0.2 -0.2 0. 0. ]
TimeStep(
{‘discount’: array(1., dtype=float32),
‘observation’: array([ 0.2, -0.2, 0. , 0. ]),
‘reward’: array(0., dtype=float32),
‘step_type’: array(1)})
[ 0.1 -0.1 0. 0. ]
TimeStep(
{‘discount’: array(1., dtype=float32),
‘observation’: array([ 0.1, -0.1, 0. , 0. ]),
‘reward’: array(0., dtype=float32),
‘step_type’: array(1)})
[0. 0. 0. 0.]
TimeStep(
{‘discount’: array(1., dtype=float32),
‘observation’: array([0., 0., 0., 0.]),
‘reward’: array(0., dtype=float32),
‘step_type’: array(1)})
[-0.1 0.1 0. 0. ]
TimeStep(
{‘discount’: array([0.], dtype=float32),
‘observation’: array([-0.1, 0.1, 0. , 0. ]),
‘reward’: array([-0.1], dtype=float32),
‘step_type’: array([2])})
Traceback (most recent call last):
File “c:/Users/evilc/PycharmProjects/MG_Research/MG_environment.py”, line 126, in
utils.validate_py_environment(environment, episodes=5)
File “C:\Users\evilc\anaconda3\envs\research_2021\lib\site-packages\tf_agents\environments\utils.py”, line 72, in validate_py_environment
raise ValueError(
ValueError: Given time_step: TimeStep(
{‘discount’: array(1., dtype=float32),
‘observation’: array([[0., 0., 0., 0.]], dtype=float32),
‘reward’: array(0., dtype=float32),
‘step_type’: array(0)}) does not match expected time_step_spec: TimeStep(
{‘discount’: BoundedArraySpec(shape=(), dtype=dtype(‘float32’), name=‘discount’, minimum=0.0, maximum=1.0),
‘observation’: BoundedArraySpec(shape=(4,), dtype=dtype(‘float32’), name=‘observation’, minimum=[-1. 0. -1. 0.], maximum=[1.e+00 1.e+00 1.e+00 1.e+35]),
‘reward’: ArraySpec(shape=(), dtype=dtype(‘float32’), name=‘reward’),
‘step_type’: ArraySpec(shape=(), dtype=dtype(‘int32’), name=‘step_type’)})

I really appreciate any help you can bring me.

Best Regards!