Chess AI with DQN

hello everyone,
I'm sorry in advance if my question is too broad. I'm doing my thesis on AI and machine learning, and I'm building a model that learns to play chess with reinforcement learning (double DQN). I currently have two issues with my code, which are the following:

  • Whenever I write from x import y, Visual Studio underlines it in yellow and tells me it can't find the module x, but when I actually run the code the import works. So I'm confused: is this just a Visual Studio thing, or is it a real error and not all of my imports are being picked up? (I've put a rough sketch of the project layout right after this list, in case it helps.)
    PS: all the code files are in the same directory, and the directory also has an __init__.py file.

  • I tried refining my step method, which is responsible for taking a step in the environment, but it kept giving a -10 reward on every iteration. After the latest modification I now get the error “Error in step method: ‘Board’ object is not subscriptable” and it still gives -10s. I don't know what to make of it, so a bit of insight would be very helpful. (A tiny snippet that reproduces the same error message is below, after the layout sketch.)
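
For reference, the project layout looks roughly like this (the folder name here is just a placeholder; everything sits in that one directory, and I'm calling the training script train.py):

chess_rl/
    __init__.py
    chess_env.py       (environment + QNetwork, posted below)
    train.py           (training script, posted below)
    rook.py, pawn.py, bishop.py, knight.py, queen.py, king.py   (piece-move helpers)

The lines that get the yellow underline are the plain imports like from rook import generate_rook_moves, even though they resolve fine at runtime.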
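
For the second issue, the smallest snippet I found that raises the same "not subscriptable" error is just indexing the python-chess board directly (I'm not sure this is actually what happens inside my step method, but the wording of the error is identical):

import chess

board = chess.Board()
piece = board["e2"]   # raises TypeError: 'Board' object is not subscriptable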

I'll paste the code of the chess environment and the training script, without the piece-move code, to save space and time for you.
chess_env:

import torch
import chess
import chess.engine
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from rook import generate_rook_moves
from pawn import generate_pawn_moves
from bishop import generate_bishop_moves
from knight import generate_knight_moves
from queen import generate_queen_moves
from king import generate_king_moves

# Define QNetwork here 
class QNetwork(torch.nn.Module):
    def __init__(self, input_size, output_size):
        super(QNetwork, self).__init__()
        self.fc1 = torch.nn.Linear(input_size, 128)
        self.relu1 = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(128, 64)
        self.relu2 = torch.nn.ReLU()
        self.fc3 = torch.nn.Linear(64, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        return x

class ChessEnvironment:
    def __init__(self, q_network, actions):
        self.q_network = q_network
        self.board = chess.Board()
        self.turn = 'w'
        self.actions = self.generate_all_moves()

    def initialize_board(self):
        return {
            'a1': 'wr', 'b1': 'wn', 'c1': 'wb', 'd1': 'wq', 'e1': 'wk', 'f1': 'wb', 'g1': 'wn', 'h1': 'wr',
            'a2': 'wp', 'b2': 'wp', 'c2': 'wp', 'd2': 'wp', 'e2': 'wp', 'f2': 'wp', 'g2': 'wp', 'h2': 'wp',
            'a8': 'br', 'b8': 'bn', 'c8': 'bb', 'd8': 'bq', 'e8': 'bk', 'f8': 'bb', 'g8': 'bn', 'h8': 'br',
            'a7': 'bp', 'b7': 'bp', 'c7': 'bp', 'd7': 'bp', 'e7': 'bp', 'f7': 'bp', 'g7': 'bp', 'h7': 'bp',
        }

    def generate_all_moves(self):
        # Generate all possible legal moves in UCI format
        return [move.uci() for move in self.board.legal_moves]

    def reset(self):
        self.board.reset()
        self.turn = 'w'
        return self.get_state()

    def get_state(self):
        # Convert the board state to a 1D array of length 64
        state = np.zeros(64, dtype=int)
        for square, piece in self.board.piece_map().items():
            state[square] = piece.piece_type if piece.color == chess.WHITE else -piece.piece_type
        return state
        #return torch.tensor(board_state, dtype=torch.float32).unsqueeze(0)  # Add batch dimension

    def legal_moves(self):
        legal_moves = []
        for square, piece in self.board.items():
            if piece[0] == self.turn:
                moves = self.generate_valid_moves(square, piece)
                legal_moves.extend(self.filter_moves_check(square, moves))
        return legal_moves

    def generate_valid_moves(self, square, piece):
        if piece[1] == 'r':
            return generate_rook_moves(self.board, square, piece[0])
        elif piece[1] == 'p':
            return generate_pawn_moves(self.board, square, piece[0])
        elif piece[1] == 'b' :
            return generate_bishop_moves(self.board, square, piece[0])
        elif piece[1] == 'kn':
            return generate_knight_moves(self.board, square, piece[0])
        elif piece[1] == 'k':
            return generate_king_moves(self.board, square, piece[0])
        elif piece[1] == 'q':
            return generate_queen_moves(self.board, square, piece[0])
        return []

    def filter_moves_check(self, square, moves):
        valid_moves = []
        for move in moves:
            self.make_move(square, move)
            if not self.is_check(self.turn):
                valid_moves.append(move)
            self.undo_move(square, move)
        return valid_moves

    def make_move(self, source_square, destination_square):
        self.board[destination_square] = self.board[source_square]
        del self.board[source_square]
        self.turn = 'b' if self.turn == 'w' else 'w'

    def undo_move(self, source_square, destination_square):
        self.board[source_square] = self.board[destination_square]
        del self.board[destination_square]
        self.turn = 'b' if self.turn == 'w' else 'w'

    def is_check(self, color):
        king_square = None
        for square, piece in self.board.items():
            if piece == color + 'k':
                king_square = square
                break
        if king_square:
            for square, piece in self.board.items():
                if piece[0] != color:
                    if king_square in self.generate_valid_moves(square, piece):
                        return True
        return False

    def is_checkmate(self):
        return self.is_check(self.turn) and not self.legal_moves()

    def is_stalemate(self):
        return not self.is_check(self.turn) and not self.legal_moves()

    def is_insufficient_material(self):
        pieces = [piece[1] for piece in self.board.values()]
        return pieces in [['k', 'k'], ['k', 'kn', 'k'], ['k', 'kb', 'k']]

    def step(self, action):
        move = self.actions[action]
        source_square = move[:2]
        destination_square = move[2:]
        try:
            self.make_move(source_square, destination_square)
        except Exception as e:
            print(f"Error in step method: {e}")
            return self.get_state(), -10, True

        next_state = self.get_state()
        reward = self.calculate_reward(source_square, destination_square)
        done = self.is_checkmate() or self.is_stalemate() or self.is_insufficient_material()

        if self.is_checkmate():
            reward += 100
        elif self.is_stalemate() or self.is_insufficient_material():
            reward += 0

        return next_state, reward, done
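        # alternative version of step() using python-chess Move objects, kept commented out for reference: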
     """try:
            # Extract source and destination squares from the action index
            move_uci = self.actions[action]
            move = chess.Move.from_uci(move_uci)

            # Check if the move is legal
            if move not in self.board.legal_moves:
                return self.get_state(), -10, True  # Invalid move with high negative reward, and end episode

            # Make the move on the board
            self.make_move(move)

            # Get the new state after the move
            next_state = self.get_state()

            # Calculate reward based on the board state after the move
            reward = self.calculate_reward()

            # Check if the game is done (checkmate, stalemate, or insufficient material)
            done = self.board.is_checkmate() or self.board.is_stalemate() or self.is_insufficient_material()

            if self.board.is_checkmate():
                reward += 100  # Reward for achieving checkmate
            elif self.board.is_stalemate() or self.is_insufficient_material():
                reward += 0  # No reward for stalemate or insufficient material

            # Update the actions list
            self.actions = self.generate_all_moves()

            return next_state, reward, done
     except Exception as e:
            print(f"Error in step method: {e}")
            return self.get_state(), -10, True"""
     
    def calculate_reward(self, source_square, destination_square):
        # Reward function for specific events
        reward = 0

        # Reward for capturing a piece
        if destination_square in self.board:
            captured_piece = self.board[destination_square]
            piece_value = {'p': 1, 'n': 3, 'b': 3, 'r': 5, 'q': 9, 'k': 0}
            reward += piece_value[captured_piece[1]]

        # Reward for putting the opponent in check
        if self.is_check(self.turn):
            reward += 5

        # Penalize for each move to encourage faster resolution
        reward -= 1

        return reward

train:

import torch
import torch.nn as nn
import torch.optim as optim
import chess
import chess.engine
import random
import numpy as np
from chess_env import ChessEnvironment, QNetwork


# Define the actions (all possible moves in UCI notation)
actions = [move for move in chess.Board().legal_moves]

# Initialize Q-Network with the correct input and output sizes
input_size = 64  # Since the state is represented by an array of 64 integers
output_size = len(actions)  # Number of possible actions

q_network = QNetwork(input_size=input_size, output_size=output_size)
env = ChessEnvironment(q_network, actions)

# Hyperparameters
gamma = 0.99  # Discount factor for future rewards
epsilon = 1.0  # Initial exploration rate
epsilon_min = 0.01  # Minimum exploration rate
epsilon_decay = 0.995  # Decay rate for exploration probability
learning_rate = 0.001  # Learning rate for the optimizer

optimizer = optim.Adam(env.q_network.parameters(), lr=learning_rate)
criterion = nn.MSELoss()

# Training loop
num_episodes = 1000
for episode in range(num_episodes):
    state = env.reset()
    total_reward = 0
    done = False

    while not done:
        if random.uniform(0, 1) < epsilon:
            action = random.randint(0, len(env.actions)-1) # Explore: select a random action
        else:
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            q_values = env.q_network(state_tensor)
            action = torch.argmax(q_values).item() # Exploit: select the action with max Q-value

        next_state, reward, done = env.step(action) # Take a step in the environment
        total_reward += reward

        # Update Q-values using Bellman equation
        next_state_tensor = torch.FloatTensor(next_state).unsqueeze(0)
        q_values_next = env.q_network(next_state_tensor)
        target = reward + (gamma * torch.max(q_values_next).item() * (1 - done))

        # Convert state to tensor
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        q_values = env.q_network(state_tensor)

        # Make a copy of the current Q-values to modify the target
        target_f = q_values.clone().detach()

        # Ensure the action index is within bounds
        if action >= len(env.actions):
            raise ValueError(f"Action index {action} out of bounds for actions list of length {len(env.actions)}")
        
        target_f[0][action] = target

        loss = criterion(q_values, target_f)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        state = next_state

    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    print(f"Episode {episode + 1}/{num_episodes}, Total Reward: {total_reward}")