Agent keeps doing the same action

Hi All,
I have, or believe I have, built a custom environment for a 2D edge-matching board game. When I run the RL agent against the environment it looks to have learned not to take illegal actions, but after the first few moves it just repeats the same action until the episode ends.

(attached: environment_75020 — animation of the agent's eval episodes rendered by the environment)
So what am I missing here?

Have I got the reward system wrong? The idea was: if an action increases the number of solved edges, it gets a reward equal to the increase (max 8); a decrease in solved edges gets the equivalent negative reward (min -8); and no change gets zero.
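
To make the intent concrete, here is the reward rule in isolation (a minimal sketch, not the actual environment code; count_solved_edges stands in for the edge check in _check_state):

# Minimal sketch of the intended reward bookkeeping (mirrors _check_state):
# the reward is the change in solved edges since the previous step, so it is
# positive (up to +8) for improvements, negative (down to -8) for regressions,
# and 0 when nothing changes.
def reward_for_step(prev_solved, count_solved_edges, board):
    solved = count_solved_edges(board)   # count_solved_edges is a stand-in
    return solved - prev_solved, solved  # caller keeps `solved` as the new prev_solved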

Here is my current environment:

import copy
import numpy as np
import os
import string
import TileImages

from PIL import Image, ImageDraw, ImageFont
from tf_agents.environments import py_environment
from tf_agents.specs import BoundedArraySpec
from tf_agents.trajectories.time_step import StepType
from tf_agents.trajectories.time_step import TimeStep

BOARD_WIDTH = 4
BOARD_HEIGHT = 4

class Tile:
    def __init__(self, id, sides):
        self.id = id
        self.orientation = 0
        self.sides = sides

    def rotate(self, rotation):
        if rotation == 0:
            self.orientation = (self.orientation + 1) % 4
            self.sides = np.roll(self.sides, 1)
        elif rotation == 1:
            self.orientation = (self.orientation - 1) % 4
            self.sides = np.roll(self.sides, -1)
        elif rotation == 2:
            self.orientation = (self.orientation + 2) % 4
            self.sides = np.roll(self.sides, 2)

    def render(self):
        side_string = ""
        for side in self.sides:
            side_string += string.ascii_lowercase[side]
        return side_string


class puzzleEnv(py_environment.PyEnvironment):
    def __init__(self, tile_set, tile_images_path, discount=0.95):
        super(puzzleEnv, self).__init__(handle_auto_reset=True)

        self._tile_set = tile_set
        self._tile_images_path = tile_images_path
        self._tile_images = TileImages.TileImages(tile_images_path).tile_images
        # self._tile_images= self._tile_images.tile_images
        self._font_path = os.path.join(os.path.dirname(__file__), 'Roboto-Regular.ttf')

        self.board = self._generate_initial_state(tile_set=tile_set)

        self._number_of_steps = 0
        self._reward = 0
        self._solved_edges = 0
        self._action = [0,0,0]

        # self._action_spec = BoundedArraySpec(
        #     shape=(3, ), dtype=np.float32, minimum=0, maximum=[3,15,15], name='action')
        # Packed action in [0, 262143]: op (0-3) << 16 | tile_1 (0-15) << 8 | tile_2 (0-15)
        self._action_spec = BoundedArraySpec(
            shape=( ), dtype=np.int32, minimum=0, maximum=262143, name='action')
        self._observation_spec = BoundedArraySpec(
            shape=(4, 4, 4), dtype=np.int32, minimum=0, maximum=22, name='observation')


        self._discount = np.asarray(discount, dtype=np.float32)
        self._state = np.zeros((4, 4, 4), dtype=np.int32)
        self._board_to_state()
        self._episode_ended = False

    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec

    def _reset(self):
        """Return initial_time_step."""
        self._number_of_steps = 0
        self._solved_edges = 0
        self._episode_ended = False
        self.board = self._generate_initial_state(self._tile_set)
        _, _, _ = self._check_state()
        return TimeStep(StepType.FIRST, np.asarray(0.0, dtype=np.float32),
                        self._discount, self._state)

    def _step(self, action):
        """Apply action and return new time_step."""
        action = [(action >> 16) & 0xFF, (action >> 8) & 0xFF, action & 0xFF]
        self._action = action
        self._number_of_steps += 1

        is_final, reward, _ = self._check_state()
        if is_final or self._episode_ended:
            return TimeStep(StepType.LAST, reward, self._discount, self._state)

        # Illegal action
        illegal_tile = action[1] < 0 or action[1] > 15 or action[2] < 0 or action[2] > 15
        illegal_action = action[0] < 0 or action[0] > 3
        if illegal_tile or illegal_action:
            return TimeStep(StepType.MID, np.asarray(-0.01, dtype=np.float32), self._discount, self._state)

        if action[0] == 3:
            self._swap_tiles(action[1], action[2])
        else:
            self._rotate_tile(action[0], action[1])

        is_final, reward, _ = self._check_state()

        step_type = StepType.MID
        if is_final:
            step_type = StepType.LAST

        return TimeStep(step_type, reward, self._discount, self._state)

    def _check_state(self):
        self._episode_ended = self._number_of_steps >= (4 * 4 * 3 * 2)
        self._board_to_state()
        flat_board = self.board.flatten()
        top_edges = np.asarray([e.sides[0] for e in flat_board]).reshape(4, 4)
        right_edges = np.asarray([e.sides[1] for e in flat_board]).reshape(4, 4)
        bottom_edges = np.asarray([e.sides[2] for e in flat_board]).reshape(4, 4)
        left_edges = np.asarray([e.sides[3] for e in flat_board]).reshape(4, 4)

        bottom_edges = np.roll(bottom_edges, 1, axis=0)
        left_edges = np.roll(left_edges, -1, axis=1)

        solved_edges = np.count_nonzero(
            top_edges == bottom_edges) + np.count_nonzero(right_edges == left_edges)

        reward = solved_edges - self._solved_edges
        self._reward = reward
        if self._solved_edges != solved_edges:
            self._solved_edges = solved_edges
        is_final = solved_edges >= (4 * 4 * 2)

        if is_final:
            board_img = self.render(mode='human')
            image_path = os.path.join(self._tile_images_path, 'complete.png')
            board_img.save(image_path)

        return is_final, np.asarray(reward, dtype=np.float32), solved_edges

    def _generate_initial_state(self, tile_set, board_size=(4, 4)):
        tiles = np.ndarray((board_size[0]*board_size[1], ), dtype=object)
        for i, tile in enumerate(tile_set):
            tiles[i] = Tile(i, tile)
        np.random.shuffle(tiles)
        board = tiles.reshape(board_size)
        return board

    def _board_to_state(self):
        for i in range(4):
            for j in range(4):
                self._state[i, j, :] = self.board[i, j].sides

    def _rotate_tile(self, rotation, board_position):
        tile1_x, tile1_y = np.unravel_index(board_position, self.board.shape)
        self.board[tile1_x, tile1_y].rotate(rotation)

    def _swap_tiles(self, board_position_1, board_position_2):
        tile1_x, tile1_y = np.unravel_index(board_position_1, self.board.shape)
        tile2_x, tile2_y = np.unravel_index(board_position_2, self.board.shape)
        # swap the tiles
        self.board[tile1_x, tile1_y], self.board[tile2_x,
                                                 tile2_y] = self.board[tile2_x, tile2_y], self.board[tile1_x, tile1_y]

    def get_state(self) -> TimeStep:
        # Returning an unmodifiable copy of the state.
        return copy.deepcopy(self._current_time_step)

    def set_state(self, time_step: TimeStep):
        self._current_time_step = time_step
        # self._state = time_step.observation
        self._board_to_state()

    def render(self, mode='bucas'):
        if mode == 'bucas':
            board_edges = ""
            board_pieces = ""
            for i, tile in enumerate(self.board.flatten()):
                board_pieces += str(tile.id).zfill(3)
                board_edges += tile.render()
            return board_edges, board_pieces
        elif mode == 'human':
            board = np.empty((4,4), dtype=object)
            for i in range(4):
                for j in range(4):
                    new_img = Image.new('RGBA', (64, 64))
                    tile = self.board[j][i].sides
                    north = self._tile_images[tile[0]]
                    new_img.paste(north, (0, 0), north)
                    east = self._tile_images[tile[1]].rotate(270, expand=True)
                    new_img.paste(east, (32, 0), east)
                    south = self._tile_images[tile[2]].rotate(180, expand=True)
                    new_img.paste(south, (0, 32), south)
                    west = self._tile_images[tile[3]].rotate(90, expand=True)
                    new_img.paste(west, (0, 0), west)
                    board[i][j] = new_img

            # build the board image
            board_img = Image.new('RGBA', (4*64, 4*64))
            for i in range(4):
                for j in range(4):
                    board_img.paste(board[i][j], (i*64, j*64))
            # add stats
            new_img = Image.new('RGB', (256 + 160, 256))
            new_img.paste(board_img, (160, 0))
            font = ImageFont.truetype(self._font_path, 16)
            draw = ImageDraw.Draw(new_img)
            text = "\nStep: {}\nSolved Edges: {}\nReward: {}\nAction: {}".format(self._number_of_steps, self._solved_edges, self._reward, self._action)
            draw.text((0, 0),text,(255,255,255),font=font)
            # draw.textsize(text, font=font)
            return new_img
        else:
            raise ValueError("Invalid render mode: {}".format(mode))
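
For reference, the packed integer action the spec expects can be built and decoded like this (a small sketch matching the decode in _step; encode_action is just a helper name for illustration, not part of the environment):

# Sketch of the packed action layout from the action-spec comment above:
# bits 16+ hold the operation (0/1/2 = rotations, 3 = swap), bits 8-15 the
# first tile index, and bits 0-7 the second tile index. Note the spec's
# maximum of 262143 (0x3FFFF) also allows encodings with tile fields > 15,
# which is what the illegal-action branch in _step catches.
def encode_action(op, tile_1, tile_2):
    return (op << 16) | (tile_1 << 8) | tile_2

def decode_action(a):
    return [(a >> 16) & 0xFF, (a >> 8) & 0xFF, a & 0xFF]

assert decode_action(encode_action(3, 5, 12)) == [3, 5, 12]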

Have I broken the agent? I just took reinforce_agent.py from the tensorflow/agents repo and modified it to work with my environment, so maybe I broke something or missed setting something up?
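
I haven't ruled out a spec problem either; one cheap check would be to run the environment through TF-Agents' validator, roughly like this (a sketch, assuming tile_set is loaded the same way as in the script below):

# Sketch: step the environment with random spec-conforming actions and check
# that the returned TimeSteps match the declared specs.
from tf_agents.environments import utils
import Environment

env = Environment.puzzleEnv(
    tile_set=tile_set,
    tile_images_path='/workspaces/git/puzzle/data/E2_Edges.png')
utils.validate_py_environment(env, episodes=5)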

My current agent:

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import Environment as Environment
import numpy as np
import os
import time

from PIL import Image, ImageDraw, ImageFont

from absl import app
from absl import flags
from absl import logging

from six.moves import range
import tensorflow as tf  # pylint: disable=g-explicit-tensorflow-version-import

from tf_agents.agents.reinforce import reinforce_agent
from tf_agents.drivers import dynamic_episode_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment, ActionDiscretizeWrapper
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import actor_distribution_network
from tf_agents.networks import value_network
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.utils import common

flags.DEFINE_string('root_dir', os.getenv('TEST_UNDECLARED_OUTPUTS_DIR'),
                    'Root directory for writing logs/summaries/checkpoints.')
flags.DEFINE_integer('num_iterations', 500,
                     'Total number train/eval iterations to perform.')
FLAGS = flags.FLAGS

def load_pieces(file_path):
  pieces = []
  with open(file_path, 'r') as f:
    for line in f:
      line = line.strip()
      if line:
        pieces.append(np.array([int(x) for x in line.split(' ')]))
  return pieces

tile_set = load_pieces('/workspaces/git/puzzle/data/pieces4x4.txt')

def train_eval(
    root_dir,
    env_name='CartPole-v0',
    num_iterations=1000,
    train_sequence_length=1,
    actor_fc_layers=(100,),
    value_net_fc_layers=(100,),
    use_value_network=False,
    use_tf_functions=True,
    # Params for collect
    collect_episodes_per_iteration=2,
    replay_buffer_capacity=2000,
    # Params for train
    learning_rate=1e-4,
    gamma=0.9,
    gradient_clipping=None,
    normalize_returns=True,
    value_estimation_loss_coef=0.2,
    batch_size=1,
    # Params for eval
    num_eval_episodes=10,
    eval_interval=100,
    # Params for checkpoints, summaries, and logging
    train_checkpoint_interval=1000,
    policy_checkpoint_interval=1000,
    rb_checkpoint_interval=1000,
    log_interval=100,
    summary_interval=100,
    summaries_flush_secs=1,
    debug_summaries=True,
    summarize_grads_and_vars=False,
    eval_metrics_callback=None):
  """A simple train and eval for Reinforce."""
  root_dir = os.path.expanduser(root_dir)
  train_dir = os.path.join(root_dir, 'train')
  eval_dir = os.path.join(root_dir, 'eval')
  image_dir = os.path.join(root_dir, 'images')
  font_path = os.path.join(root_dir, 'data', 'Roboto-Regular.ttf')

  train_summary_writer = tf.compat.v2.summary.create_file_writer(
      train_dir, flush_millis=summaries_flush_secs * 1000)
  train_summary_writer.set_as_default()

  eval_summary_writer = tf.compat.v2.summary.create_file_writer(
      eval_dir, flush_millis=summaries_flush_secs * 1000)
  eval_metrics = [
      tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
      tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes),
  ]

  # Create global_step up front so the summary record_if condition can use it.
  global_step = tf.compat.v1.train.get_or_create_global_step()
  with tf.compat.v2.summary.record_if(
      lambda: tf.math.equal(global_step % summary_interval, 0)):
    # tf_env = tf_py_environment.TFPyEnvironment(suite_gym.load(env_name))
    # eval_tf_env = tf_py_environment.TFPyEnvironment(suite_gym.load(env_name))

    train_py_env = Environment.puzzleEnv(tile_set=tile_set, tile_images_path='/workspaces/git/puzzle/data/E2_Edges.png')
    eval_py_env = Environment.puzzleEnv(tile_set=tile_set, tile_images_path='/workspaces/git/puzzle/data/E2_Edges.png')

    tf_env = tf_py_environment.TFPyEnvironment(train_py_env)
    eval_tf_env = tf_py_environment.TFPyEnvironment(eval_py_env)

    actor_net = actor_distribution_network.ActorDistributionNetwork(
        tf_env.time_step_spec().observation,
        tf_env.action_spec(),
        fc_layer_params=actor_fc_layers)

    if use_value_network:
      value_net = value_network.ValueNetwork(
          tf_env.time_step_spec().observation,
          fc_layer_params=value_net_fc_layers)

    tf_agent = reinforce_agent.ReinforceAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        actor_network=actor_net,
        value_network=value_net if use_value_network else None,
        value_estimation_loss_coef=value_estimation_loss_coef,
        gamma=gamma,
        optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate),
        normalize_returns=normalize_returns,
        gradient_clipping=gradient_clipping,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=global_step
    )

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        tf_agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=replay_buffer_capacity)

    tf_agent.initialize()

    train_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(),
        tf_metrics.AverageEpisodeLengthMetric(),
    ]

    eval_policy = tf_agent.policy
    collect_policy = tf_agent.collect_policy

    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        tf_env,
        collect_policy,
        observers=[replay_buffer.add_batch] + train_metrics,
        num_episodes=collect_episodes_per_iteration)

    # # Dataset generates trajectories with shape [Bx2x...]
    # dataset = replay_buffer.as_dataset(
    #     num_parallel_calls=1,
    #     sample_batch_size=batch_size,
    #     num_steps=train_sequence_length + 1).prefetch(3)
    # iterator = iter(dataset)

    # def train_step():
    #   experience, _ = next(iterator)
    #   return tf_agent.train(experience)
    def train_step():
      experience = replay_buffer.gather_all()
      return tf_agent.train(experience)

    if use_tf_functions:
      # To speed up collect use TF function.
      collect_driver.run = common.function(collect_driver.run)
      # To speed up train use TF function.
      tf_agent.train = common.function(tf_agent.train)
      train_step = common.function(train_step)

    # Compute evaluation metrics.
    metrics = metric_utils.eager_compute(
        eval_metrics,
        eval_tf_env,
        eval_policy,
        num_episodes=num_eval_episodes,
        train_step=global_step,
        summary_writer=eval_summary_writer,
        summary_prefix='Metrics',
    )
    # TODO(b/126590894): Move this functionality into eager_compute_summaries
    if eval_metrics_callback is not None:
      eval_metrics_callback(metrics, global_step.numpy())

    time_step = None
    policy_state = collect_policy.get_initial_state(tf_env.batch_size)

    timed_at_step = global_step.numpy()
    time_acc = 0

    train_checkpointer = common.Checkpointer(
        ckpt_dir=train_dir,
        max_to_keep=1,
        agent=tf_agent,
        global_step=global_step,
        metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics'))
    policy_checkpointer = common.Checkpointer(
        ckpt_dir=os.path.join(train_dir, 'policy'),
        max_to_keep=1,
        policy=eval_policy,
        global_step=global_step)
    rb_checkpointer = common.Checkpointer(
        ckpt_dir=os.path.join(train_dir, 'replay_buffer'),
        max_to_keep=1,
        replay_buffer=replay_buffer)

    train_checkpointer.initialize_or_restore()
    rb_checkpointer.initialize_or_restore()

    for _ in range(num_iterations):
      start_time = time.time()
      time_step, policy_state = collect_driver.run(
          time_step=time_step,
          policy_state=policy_state,
      )
      total_loss = train_step()
      replay_buffer.clear()
      time_acc += time.time() - start_time

      global_step_val = global_step.numpy()
      if global_step_val % log_interval == 0:
        logging.info('step = %d, loss = %f', global_step_val, total_loss.loss)
        steps_per_sec = (global_step_val - timed_at_step) / time_acc
        logging.info('%.3f steps/sec', steps_per_sec)
        tf.compat.v2.summary.scalar(
            name='global_steps_per_sec', data=steps_per_sec, step=global_step)
        timed_at_step = global_step_val
        time_acc = 0

      for train_metric in train_metrics:
        train_metric.tf_summaries(
            train_step=global_step, step_metrics=train_metrics[:2])

      if global_step.numpy() % train_checkpoint_interval == 0:
        train_checkpointer.save(global_step=global_step.numpy())

      if global_step.numpy() % policy_checkpoint_interval == 0:
        policy_checkpointer.save(global_step=global_step.numpy())

      if global_step.numpy() % rb_checkpoint_interval == 0:
        rb_checkpointer.save(global_step=global_step.numpy())

      if global_step_val % eval_interval == 0:
        metrics = metric_utils.eager_compute(
            eval_metrics,
            eval_tf_env,
            eval_policy,
            num_episodes=num_eval_episodes,
            train_step=global_step,
            summary_writer=eval_summary_writer,
            summary_prefix='Metrics',
        )
        # TODO(b/126590894): Move this functionality into
        # eager_compute_summaries.
        if eval_metrics_callback is not None:
          eval_metrics_callback(metrics, global_step_val)

    train_checkpointer.save(global_step=global_step.numpy())
    policy_checkpointer.save(global_step=global_step.numpy())
    rb_checkpointer.save(global_step=global_step.numpy())

    def run_episodes_and_create_video(policy, eval_tf_env, eval_py_env):
      num_episodes = 3
      frames = []
      global_step_val = global_step.numpy()
      image_path = os.path.join(image_dir, 'environment_{}.png'.format(global_step_val))
      for episode in range(num_episodes):
        time_step = eval_tf_env.reset()
        frames.append(eval_py_env.render(mode='human'))
        while not time_step.is_last():
          action_step = policy.action(time_step)
          time_step = eval_tf_env.step(action_step.action)
          img = eval_py_env.render(mode='human')
          font = ImageFont.truetype(font_path, 16)
          draw = ImageDraw.Draw(img)
          text = "Episode: {}".format(episode)
          draw.text((0, 0),text,(255,255,255),font=font)
          # image_debug_path = os.path.join(image_dir, 'debug_{}_{}.png'.format(episode, step))
          # img.save(image_debug_path)
          frames.append(img)

      frames[0].save(image_path, save_all=True, append_images=frames[1:], optimize=True, duration=len(frames)/8, loop=0, format="PNG")

    run_episodes_and_create_video(tf_agent.policy, eval_tf_env, eval_py_env)

def main(_):
  tf.compat.v1.enable_eager_execution(
      config=tf.compat.v1.ConfigProto(allow_soft_placement=True))
  tf.compat.v1.enable_v2_behavior()
  logging.set_verbosity(logging.INFO)
  train_eval(FLAGS.root_dir, num_iterations=FLAGS.num_iterations)


if __name__ == '__main__':
  flags.mark_flag_as_required('root_dir')
  app.run(main)