Hi all,
I have (or I believe I have) built a custom environment for a 2D edge-matching board game. When I run the RL agent against the environment it looks to have learned not to take illegal actions, but after the first few moves it just repeats the same action until the episode ends.
So what am I missing here?
Have I got the reward system wrong? The idea was: if an action increases the number of solved edges, it gets a reward equal to the increase (max 8); a decrease in solved edges gives the matching negative reward (min -8); and no change in solved edges gives zero.
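In other words the step reward is just the change in the solved-edge count between checks, roughly like this toy sketch (reward_for_step is only for illustration, the real calculation lives in _check_state below):

def reward_for_step(solved_before, solved_after):
    # Delta in solved edges: +n for an improvement, -n for a regression, 0 for no change.
    # A single swap touches at most 8 edges, so this stays within [-8, 8].
    return solved_after - solved_before

assert reward_for_step(5, 7) == 2    # two new matching edges
assert reward_for_step(7, 3) == -4   # four matches broken
assert reward_for_step(7, 7) == 0    # no change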
Here is my current environment:
import copy
import numpy as np
import os
import string
import TileImages
from PIL import Image, ImageDraw, ImageFont
from tf_agents.environments import py_environment
from tf_agents.specs import BoundedArraySpec
from tf_agents.trajectories.time_step import StepType
from tf_agents.trajectories.time_step import TimeStep
BOARD_WIDTH = 4
BOARD_HEIGHT = 4

class Tile:
    def __init__(self, id, sides):
        self.id = id
        self.orientation = 0
        self.sides = sides

    def rotate(self, rotation):
        if rotation == 0:
            self.orientation = (self.orientation + 1) % 4
            self.sides = np.roll(self.sides, 1)
        elif rotation == 1:
            self.orientation = (self.orientation - 1) % 4
            self.sides = np.roll(self.sides, -1)
        elif rotation == 2:
            self.orientation = (self.orientation + 2) % 4
            self.sides = np.roll(self.sides, 2)

    def render(self):
        side_string = ""
        for side in self.sides:
            side_string += string.ascii_lowercase[side]
        return side_string

class puzzleEnv(py_environment.PyEnvironment):
    def __init__(self, tile_set, tile_images_path, discount=0.95):
        super(puzzleEnv, self).__init__(handle_auto_reset=True)
        self._tile_set = tile_set
        self._tile_images_path = tile_images_path
        self._tile_images = TileImages.TileImages(tile_images_path).tile_images
        # self._tile_images = self._tile_images.tile_images
        self._font_path = os.path.join(os.path.dirname(__file__), 'Roboto-Regular.ttf')
        self.board = self._generate_initial_state(tile_set=tile_set)
        self._number_of_steps = 0
        self._reward = 0
        self._solved_edges = 0
        self._action = [0, 0, 0]
        # self._action_spec = BoundedArraySpec(
        #     shape=(3, ), dtype=np.float32, minimum=0, maximum=[3,15,15], name='action')
        # 0 - 262143 action 0-3 >> 16, tile_1 0-15 >> 8, tile 0-15
        self._action_spec = BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0, maximum=262143, name='action')
        self._observation_spec = BoundedArraySpec(
            shape=(4, 4, 4), dtype=np.int32, minimum=0, maximum=22, name='observation')
        self._discount = np.asarray(discount, dtype=np.float32)
        self._state = np.zeros((4, 4, 4), dtype=np.int32)
        self._board_to_state()
        self._episode_ended = False

    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec

    def _reset(self):
        """Return initial_time_step."""
        self._number_of_steps = 0
        self._solved_edges = 0
        self._episode_ended = False
        self.board = self._generate_initial_state(self._tile_set)
        _, _, _ = self._check_state()
        return TimeStep(StepType.FIRST, np.asarray(0.0, dtype=np.float32),
                        self._discount, self._state)

    def _step(self, action):
        """Apply action and return new time_step."""
        action = [(action >> 16) & 0xFF, (action >> 8) & 0xFF, action & 0xFF]
        self._action = action
        self._number_of_steps += 1
        is_final, reward, _ = self._check_state()
        if is_final or self._episode_ended:
            return TimeStep(StepType.LAST, reward, self._discount, self._state)
        # Illegal action
        illegal_tile = action[1] < 0 or action[1] > 15 or action[2] < 0 or action[2] > 15
        illegal_action = action[0] < 0 or action[0] > 3
        if illegal_tile or illegal_action:
            return TimeStep(StepType.MID, np.asarray(-0.01, dtype=np.float32), self._discount, self._state)
        if action[0] == 3:
            self._swap_tiles(action[1], action[2])
        else:
            self._rotate_tile(action[0], action[1])
        is_final, reward, _ = self._check_state()
        step_type = StepType.MID
        if is_final:
            step_type = StepType.LAST
        return TimeStep(step_type, reward, self._discount, self._state)

    def _check_state(self):
        self._episode_ended = self._number_of_steps >= (4 * 4 * 3 * 2)
        self._board_to_state()
        flat_board = self.board.flatten()
        top_edges = np.asarray([e.sides[0] for e in flat_board]).reshape(4, 4)
        right_edges = np.asarray([e.sides[1] for e in flat_board]).reshape(4, 4)
        bottom_edges = np.asarray([e.sides[2] for e in flat_board]).reshape(4, 4)
        left_edges = np.asarray([e.sides[3] for e in flat_board]).reshape(4, 4)
        # Shift so each tile's top/right edge lines up with its neighbour's
        # bottom/left edge (np.roll wraps around the board).
        bottom_edges = np.roll(bottom_edges, 1, axis=0)
        left_edges = np.roll(left_edges, -1, axis=1)
        solved_edges = np.count_nonzero(
            top_edges == bottom_edges) + np.count_nonzero(right_edges == left_edges)
        # Reward is the change in solved edges since the last check.
        reward = solved_edges - self._solved_edges
        self._reward = reward
        if self._solved_edges != solved_edges:
            self._solved_edges = solved_edges
        is_final = solved_edges >= (4 * 4 * 2)
        if is_final:
            board_img = self.render(mode='human')
            image_path = os.path.join(self._tile_images_path, 'complete.png')
            board_img.save(image_path)
        return is_final, np.asarray(reward, dtype=np.float32), solved_edges

    def _generate_initial_state(self, tile_set, board_size=(4, 4)):
        tiles = np.ndarray((board_size[0] * board_size[1], ), dtype=object)
        for i, tile in enumerate(tile_set):
            tiles[i] = Tile(i, tile)
        np.random.shuffle(tiles)
        board = tiles.reshape(board_size)
        return board

    def _board_to_state(self):
        for i in range(4):
            for j in range(4):
                self._state[i, j, :] = self.board[i, j].sides

    def _rotate_tile(self, rotation, board_position):
        tile1_x, tile1_y = np.unravel_index(board_position, self.board.shape)
        self.board[tile1_x, tile1_y].rotate(rotation)

    def _swap_tiles(self, board_position_1, board_position_2):
        tile1_x, tile1_y = np.unravel_index(board_position_1, self.board.shape)
        tile2_x, tile2_y = np.unravel_index(board_position_2, self.board.shape)
        # swap the tiles
        self.board[tile1_x, tile1_y], self.board[tile2_x, tile2_y] = \
            self.board[tile2_x, tile2_y], self.board[tile1_x, tile1_y]

    def get_state(self) -> TimeStep:
        # Returning an unmodifiable copy of the state.
        return copy.deepcopy(self._current_time_step)

    def set_state(self, time_step: TimeStep):
        self._current_time_step = time_step
        # self._state = time_step.observation
        self._board_to_state()

    def render(self, mode='bucas'):
        if mode == 'bucas':
            board_edges = ""
            board_pieces = ""
            for i, tile in enumerate(self.board.flatten()):
                board_pieces += str(tile.id).zfill(3)
                board_edges += tile.render()
            return board_edges, board_pieces
        elif mode == 'human':
            board = np.empty((4, 4), dtype=object)
            for i in range(4):
                for j in range(4):
                    new_img = Image.new('RGBA', (64, 64))
                    tile = self.board[j][i].sides
                    north = self._tile_images[tile[0]]
                    new_img.paste(north, (0, 0), north)
                    east = self._tile_images[tile[1]].rotate(270, expand=True)
                    new_img.paste(east, (32, 0), east)
                    south = self._tile_images[tile[2]].rotate(180, expand=True)
                    new_img.paste(south, (0, 32), south)
                    west = self._tile_images[tile[3]].rotate(90, expand=True)
                    new_img.paste(west, (0, 0), west)
                    board[i][j] = new_img
            # build the board image
            board_img = Image.new('RGBA', (4 * 64, 4 * 64))
            for i in range(4):
                for j in range(4):
                    board_img.paste(board[i][j], (i * 64, j * 64))
            # add stats
            new_img = Image.new('RGB', (265 + 160, 256))
            new_img.paste(board_img, (160, 0))
            font = ImageFont.truetype(self._font_path, 16)
            draw = ImageDraw.Draw(new_img)
            text = "\nStep: {}\nSolved Edges: {}\nReward: {}\nAction: {}".format(
                self._number_of_steps, self._solved_edges, self._reward, self._action)
            draw.text((0, 0), text, (255, 255, 255), font=font)
            # draw.textsize(text, font=font)
            return new_img
        else:
            raise ValueError("Invalid render mode: {}".format(mode))
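For reference, the single integer action gets unpacked by the bit shifts in _step into (action type, position 1, position 2); roughly, the packing works like this (pack_action is just for illustration, it is not in my code):

# Worked example of the action encoding used by _step (pack_action is illustrative only).
def pack_action(action_type, tile_1, tile_2):
    return (action_type << 16) | (tile_1 << 8) | tile_2

a = pack_action(3, 5, 12)   # "swap the tile at position 5 with the tile at position 12"
assert [(a >> 16) & 0xFF, (a >> 8) & 0xFF, a & 0xFF] == [3, 5, 12]
# The largest legal board action is pack_action(3, 15, 15) == 200463, but the action spec
# allows values up to 262143, so some sampled integers decode to tile indices above 15
# and fall into the illegal-action branch in _step.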
Have I broken the agent? I just took reinforce_agent.py from the tensorflow/agents repo and modified it to work with my environment, so maybe I broke something or missed setting something up?
My current agent:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import Environment as Environment
import numpy as np
import os
import time
from PIL import Image, ImageDraw, ImageFont
from absl import app
from absl import flags
from absl import logging
from six.moves import range
import tensorflow as tf # pylint: disable=g-explicit-tensorflow-version-import
from tf_agents.agents.reinforce import reinforce_agent
from tf_agents.drivers import dynamic_episode_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment, ActionDiscretizeWrapper
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import actor_distribution_network
from tf_agents.networks import value_network
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.utils import common
flags.DEFINE_string('root_dir', os.getenv('TEST_UNDECLARED_OUTPUTS_DIR'),
                    'Root directory for writing logs/summaries/checkpoints.')
flags.DEFINE_integer('num_iterations', 500,
                     'Total number of train/eval iterations to perform.')
FLAGS = flags.FLAGS

def load_pieces(file_path):
    pieces = []
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if line:
                pieces.append(np.array([int(x) for x in line.split(' ')]))
    return pieces
tile_set = load_pieces('/workspaces/git/puzzle/data/pieces4x4.txt')
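# NOTE (assumption, the pieces file itself is not shown here): load_pieces() expects one
# tile per line as space-separated integer edge ids, e.g. a line like "0 1 4 4" becomes
# the 4-element sides array handed to Tile() in the environment.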

def train_eval(
        root_dir,
        env_name='CartPole-v0',
        num_iterations=1000,
        train_sequence_length=1,
        actor_fc_layers=(100,),
        value_net_fc_layers=(100,),
        use_value_network=False,
        use_tf_functions=True,
        # Params for collect
        collect_episodes_per_iteration=2,
        replay_buffer_capacity=2000,
        # Params for train
        learning_rate=1e-4,
        gamma=0.9,
        gradient_clipping=None,
        normalize_returns=True,
        value_estimation_loss_coef=0.2,
        batch_size=1,
        # Params for eval
        num_eval_episodes=10,
        eval_interval=100,
        # Params for checkpoints, summaries, and logging
        train_checkpoint_interval=1000,
        policy_checkpoint_interval=1000,
        rb_checkpoint_interval=1000,
        log_interval=100,
        summary_interval=100,
        summaries_flush_secs=1,
        debug_summaries=True,
        summarize_grads_and_vars=False,
        eval_metrics_callback=None):
    """A simple train and eval for Reinforce."""
    root_dir = os.path.expanduser(root_dir)
    train_dir = os.path.join(root_dir, 'train')
    eval_dir = os.path.join(root_dir, 'eval')
    image_dir = os.path.join(root_dir, 'images')
    font_path = os.path.join(root_dir, 'data', 'Roboto-Regular.ttf')
    os.makedirs(image_dir, exist_ok=True)  # make sure the episode-image output dir exists
    train_summary_writer = tf.compat.v2.summary.create_file_writer(
        train_dir, flush_millis=summaries_flush_secs * 1000)
    train_summary_writer.set_as_default()
    eval_summary_writer = tf.compat.v2.summary.create_file_writer(
        eval_dir, flush_millis=summaries_flush_secs * 1000)
    eval_metrics = [
        tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
        tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes),
    ]
    with tf.compat.v2.summary.record_if(
            lambda: tf.math.equal(global_step % summary_interval, 0)):
        # tf_env = tf_py_environment.TFPyEnvironment(suite_gym.load(env_name))
        # eval_tf_env = tf_py_environment.TFPyEnvironment(suite_gym.load(env_name))
        train_py_env = Environment.puzzleEnv(
            tile_set=tile_set, tile_images_path='/workspaces/git/puzzle/data/E2_Edges.png')
        eval_py_env = Environment.puzzleEnv(
            tile_set=tile_set, tile_images_path='/workspaces/git/puzzle/data/E2_Edges.png')
        tf_env = tf_py_environment.TFPyEnvironment(train_py_env)
        eval_tf_env = tf_py_environment.TFPyEnvironment(eval_py_env)
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            tf_env.time_step_spec().observation,
            tf_env.action_spec(),
            fc_layer_params=actor_fc_layers)
        if use_value_network:
            value_net = value_network.ValueNetwork(
                tf_env.time_step_spec().observation,
                fc_layer_params=value_net_fc_layers)
        global_step = tf.compat.v1.train.get_or_create_global_step()
        tf_agent = reinforce_agent.ReinforceAgent(
            tf_env.time_step_spec(),
            tf_env.action_spec(),
            actor_network=actor_net,
            value_network=value_net if use_value_network else None,
            value_estimation_loss_coef=value_estimation_loss_coef,
            gamma=gamma,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate),
            normalize_returns=normalize_returns,
            gradient_clipping=gradient_clipping,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            train_step_counter=global_step)
        replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
            tf_agent.collect_data_spec,
            batch_size=tf_env.batch_size,
            max_length=replay_buffer_capacity)
        tf_agent.initialize()
        train_metrics = [
            tf_metrics.NumberOfEpisodes(),
            tf_metrics.EnvironmentSteps(),
            tf_metrics.AverageReturnMetric(),
            tf_metrics.AverageEpisodeLengthMetric(),
        ]
        eval_policy = tf_agent.policy
        collect_policy = tf_agent.collect_policy
        collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
            tf_env,
            collect_policy,
            observers=[replay_buffer.add_batch] + train_metrics,
            num_episodes=collect_episodes_per_iteration)

        # # Dataset generates trajectories with shape [Bx2x...]
        # dataset = replay_buffer.as_dataset(
        #     num_parallel_calls=1,
        #     sample_batch_size=batch_size,
        #     num_steps=train_sequence_length + 1).prefetch(3)
        # iterator = iter(dataset)
        # def train_step():
        #     experience, _ = next(iterator)
        #     return tf_agent.train(experience)
        def train_step():
            experience = replay_buffer.gather_all()
            return tf_agent.train(experience)

        if use_tf_functions:
            # To speed up collect use TF function.
            collect_driver.run = common.function(collect_driver.run)
            # To speed up train use TF function.
            tf_agent.train = common.function(tf_agent.train)
            train_step = common.function(train_step)

        # Compute evaluation metrics.
        metrics = metric_utils.eager_compute(
            eval_metrics,
            eval_tf_env,
            eval_policy,
            num_episodes=num_eval_episodes,
            train_step=global_step,
            summary_writer=eval_summary_writer,
            summary_prefix='Metrics',
        )
        # TODO(b/126590894): Move this functionality into eager_compute_summaries
        if eval_metrics_callback is not None:
            eval_metrics_callback(metrics, global_step.numpy())

        time_step = None
        policy_state = collect_policy.get_initial_state(tf_env.batch_size)
        timed_at_step = global_step.numpy()
        time_acc = 0
        train_checkpointer = common.Checkpointer(
            ckpt_dir=train_dir,
            max_to_keep=1,
            agent=tf_agent,
            global_step=global_step,
            metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics'))
        policy_checkpointer = common.Checkpointer(
            ckpt_dir=os.path.join(train_dir, 'policy'),
            max_to_keep=1,
            policy=eval_policy,
            global_step=global_step)
        rb_checkpointer = common.Checkpointer(
            ckpt_dir=os.path.join(train_dir, 'replay_buffer'),
            max_to_keep=1,
            replay_buffer=replay_buffer)
        train_checkpointer.initialize_or_restore()
        rb_checkpointer.initialize_or_restore()

        for _ in range(num_iterations):
            start_time = time.time()
            time_step, policy_state = collect_driver.run(
                time_step=time_step,
                policy_state=policy_state,
            )
            total_loss = train_step()
            replay_buffer.clear()
            time_acc += time.time() - start_time
            global_step_val = global_step.numpy()
            if global_step_val % log_interval == 0:
                logging.info('step = %d, loss = %f', global_step_val, total_loss.loss)
                steps_per_sec = (global_step_val - timed_at_step) / time_acc
                logging.info('%.3f steps/sec', steps_per_sec)
                tf.compat.v2.summary.scalar(
                    name='global_steps_per_sec', data=steps_per_sec, step=global_step)
                timed_at_step = global_step_val
                time_acc = 0
            for train_metric in train_metrics:
                train_metric.tf_summaries(
                    train_step=global_step, step_metrics=train_metrics[:2])
            if global_step.numpy() % train_checkpoint_interval == 0:
                train_checkpointer.save(global_step=global_step.numpy())
            if global_step.numpy() % policy_checkpoint_interval == 0:
                policy_checkpointer.save(global_step=global_step.numpy())
            if global_step.numpy() % rb_checkpoint_interval == 0:
                rb_checkpointer.save(global_step=global_step.numpy())
            if global_step_val % eval_interval == 0:
                metrics = metric_utils.eager_compute(
                    eval_metrics,
                    eval_tf_env,
                    eval_policy,
                    num_episodes=num_eval_episodes,
                    train_step=global_step,
                    summary_writer=eval_summary_writer,
                    summary_prefix='Metrics',
                )
                # TODO(b/126590894): Move this functionality into
                # eager_compute_summaries.
                if eval_metrics_callback is not None:
                    eval_metrics_callback(metrics, global_step_val)

        train_checkpointer.save(global_step=global_step.numpy())
        policy_checkpointer.save(global_step=global_step.numpy())
        rb_checkpointer.save(global_step=global_step.numpy())

        def run_episodes_and_create_video(policy, eval_tf_env, eval_py_env):
            num_episodes = 3
            frames = []
            global_step_val = global_step.numpy()
            image_path = os.path.join(image_dir, 'environment_{}.png'.format(global_step_val))
            for episode in range(num_episodes):
                time_step = eval_tf_env.reset()
                frames.append(eval_py_env.render(mode='human'))
                while not time_step.is_last():
                    action_step = policy.action(time_step)
                    time_step = eval_tf_env.step(action_step.action)
                    img = eval_py_env.render(mode='human')
                    font = ImageFont.truetype(font_path, 16)
                    draw = ImageDraw.Draw(img)
                    text = "Episode: {}".format(episode)
                    draw.text((0, 0), text, (255, 255, 255), font=font)
                    # image_debug_path = os.path.join(image_dir, 'debug_{}_{}.png'.format(episode, step))
                    # img.save(image_debug_path)
                    frames.append(img)
            frames[0].save(image_path, save_all=True, append_images=frames[1:],
                           optimize=True, duration=len(frames) / 8, loop=0, format="PNG")

        run_episodes_and_create_video(tf_agent.policy, eval_tf_env, eval_py_env)

def main(_):
    tf.compat.v1.enable_eager_execution(
        config=tf.compat.v1.ConfigProto(allow_soft_placement=True))
    tf.compat.v1.enable_v2_behavior()
    logging.set_verbosity(logging.INFO)
    train_eval(FLAGS.root_dir, num_iterations=FLAGS.num_iterations)


if __name__ == '__main__':
    flags.mark_flag_as_required('root_dir')
    app.run(main)