I need help creating a CNN for an online game

So there is this game called Arras, and I wanted to try making a CNN for it. I didn't know where to start, so I first wrote code that takes grayscale screenshots (240p, 15 fps) and stacks them in groups of 3, one frame per colour channel, so that moving things show up coloured. Each saved image is about 300 KB, and I get roughly 300 images every minute, i.e. about 1.4-1.5 MB per second. This is the Python code for it:

import cv2
import numpy as np
import time
from mss import mss
import os
import csv
from pynput import keyboard, mouse

# ===== Settings =====
# Root output folder; record() writes the stacked frame_NNNNN.npy files here.
SAVE_DIR = "recorded_frames"
# Upper bound on captured frames per second (record() waits 1 / FPS between grabs).
FPS = 15
# Number of consecutive grayscale frames combined into one saved array.
STACK_SIZE = 3
OUTPUT_RES = (426, 240)  # width, height (240p)
# Screen region handed to mss for every grab, in pixels.
MONITOR = {"top": 0, "left": 0, "width": 2560, "height": 1440}  # adjust if needed
# CSV with one row of keyboard/mouse state per saved stack.
INPUT_LOG = os.path.join(SAVE_DIR, "recorded_inputs.csv")
# Folder for the PNG previews that record() writes every 20th stack.
PREVIEW_DIR = os.path.join(SAVE_DIR, "preview_frames")

# Create the output folders up front so the recorder can write immediately.
os.makedirs(SAVE_DIR, exist_ok=True)
os.makedirs(PREVIEW_DIR, exist_ok=True)

# ===== Input State =====
# Module-level state shared between the listener callbacks below (writers)
# and record() (reader).
keys_pressed = set()      # str(key) of keys reported as pressed by on_press
mouse_pos = (0, 0)        # last (x, y) reported by on_move
mouse_buttons = set()     # str(button) of mouse buttons currently held
recording = False   # starts paused
stop_program = False      # set to True (F9, or 'q' in the preview) to end record()

def on_press(key):
    """Keyboard-press callback: handle the F8/F9 hotkeys or track the key.

    F8 switches recording on. F9 requests shutdown and returns False, which
    stops the keyboard listener. Every other key is remembered in
    ``keys_pressed`` as ``str(key)``.

    The previous ``try/except AttributeError`` wrapper was dropped: nothing in
    the body accesses an attribute that can be missing, so the handler was
    unreachable and only duplicated the fall-through branch.
    """
    global recording, stop_program

    if key == keyboard.Key.f8:   # start recording
        print(" Recording started!")
        recording = True
        return None

    if key == keyboard.Key.f9:   # stop recording
        print("⏹ Recording stopped!")
        stop_program = True
        return False  # stop keyboard listener

    keys_pressed.add(str(key))
    return None

def on_release(key):
    """Keyboard-release callback: forget a key that is no longer held.

    on_press stores every key as ``str(key)``, so the same representation has
    to be removed here. For character keys pynput renders ``str(key)`` with
    quotes (e.g. ``"'a'"``), which never equals ``key.char`` (``"a"``); the
    old code discarded ``key.char`` and therefore left every character key
    stuck in ``keys_pressed`` for the rest of the session.
    """
    keys_pressed.discard(str(key))

    # Also drop the bare character, in case an entry was stored in that form.
    char = getattr(key, "char", None)
    if char is not None:
        keys_pressed.discard(char)

def on_click(x, y, button, pressed):
    """Mouse-button callback: keep ``mouse_buttons`` in sync with held buttons.

    The button is tracked by its string form; it is added on press and
    removed on release. The click coordinates are not used.
    """
    label = str(button)
    update = mouse_buttons.add if pressed else mouse_buttons.discard
    update(label)

def on_move(x, y):
    """Mouse-move callback: remember the most recent pointer coordinates."""
    global mouse_pos
    mouse_pos = x, y

# ===== Recorder =====
def record():
    """Capture the screen and log inputs until F9 (or 'q' in the preview).

    Each captured frame is converted to grayscale and resized to OUTPUT_RES.
    Every run of STACK_SIZE consecutive frames is stacked along the last axis
    and saved as ``frame_NNNNN.npy`` in SAVE_DIR, and one row with the
    keyboard/mouse state at that moment is written to INPUT_LOG. Every 20th
    stack also writes its newest frame as a PNG into PREVIEW_DIR.

    Nothing is captured until F8 sets ``recording``.

    Changes compared with the first version:
    * the FPS throttle sleeps for the remaining time instead of spinning in a
      tight ``continue`` loop, which burned a full CPU core next to the game;
    * the CSV log is opened once and flushed per row rather than reopened for
      every stack;
    * the listeners, the preview window, the screen-capture handle and the
      log file are released even if the loop raises.
    """
    global recording, stop_program
    frame_interval = 1 / FPS
    frame_count = 0
    frame_stack = []
    # perf_counter is monotonic, so the pacing is not disturbed by clock changes.
    last_time = time.perf_counter()

    with mss() as sct, open(INPUT_LOG, "w", newline="") as log_file:
        # Prepare CSV file
        writer = csv.writer(log_file)
        writer.writerow(["frame_id", "keys_pressed", "mouse_x", "mouse_y", "mouse_buttons"])
        log_file.flush()

        # Start input listeners
        kb_listener = keyboard.Listener(on_press=on_press, on_release=on_release)
        ms_listener = mouse.Listener(on_click=on_click, on_move=on_move)
        kb_listener.start()
        ms_listener.start()

        print("Press F8 to start recording, F9 to stop.")

        try:
            while not stop_program:
                if not recording:
                    time.sleep(0.05)
                    continue

                # Throttle to FPS: sleep off the rest of the frame interval,
                # then loop again so stop_program is re-checked before grabbing.
                remaining = frame_interval - (time.perf_counter() - last_time)
                if remaining > 0:
                    time.sleep(remaining)
                    continue
                last_time = time.perf_counter()

                # Screenshot
                img = np.array(sct.grab(MONITOR))
                gray = cv2.cvtColor(img, cv2.COLOR_BGRA2GRAY)
                resized = cv2.resize(gray, OUTPUT_RES, interpolation=cv2.INTER_AREA)

                frame_stack.append(resized)

                if len(frame_stack) == STACK_SIZE:
                    frame_count += 1
                    # One channel per frame, oldest first; with the default
                    # settings the shape is (240, 426, 3).
                    stacked = np.stack(frame_stack, axis=-1)

                    # Save frames
                    frame_file = os.path.join(SAVE_DIR, f"frame_{frame_count:05d}.npy")
                    np.save(frame_file, stacked)

                    # Save input log; flush so the row survives a crash.
                    writer.writerow([
                        frame_count,
                        list(keys_pressed),
                        mouse_pos[0],
                        mouse_pos[1],
                        list(mouse_buttons),
                    ])
                    log_file.flush()

                    # Save preview every 20th stack
                    if frame_count % 20 == 0:
                        preview_file = os.path.join(PREVIEW_DIR, f"preview_{frame_count:05d}.png")
                        cv2.imwrite(preview_file, frame_stack[-1])
                        print(f"Preview saved: {preview_file}")

                    print(f"Saved {frame_file} with inputs")

                    # Stacks do not overlap: start the next one from scratch.
                    frame_stack = []

                # Show preview in a window
                cv2.imshow("Recording (240p grayscale)", resized)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    stop_program = True
                    break
        finally:
            # Always release the window and the listeners, even on an error.
            cv2.destroyAllWindows()
            kb_listener.stop()
            ms_listener.stop()

# Start the recorder only when this file is run as a script, not on import.
if __name__ == "__main__":
    record()

So that takes the screenshots, and to visualize them you can use this code:

import numpy as np
import matplotlib.pyplot as plt
from PIL import Image   # for saving as PNG/JPG

# --- Load one recorded stack from disk ---
frame = np.load("frame_00001.npy")   # replace with your filename or frame number

print("Shape:", frame.shape)
print("Dtype:", frame.dtype)

# --- Show the array as-is (a 3-channel stack is rendered as a colour image) ---
plt.imshow(frame)
plt.axis("off")
plt.show()

# --- Grayscale view ---
# For a 3-channel stack keep only the first channel; any other array is
# used unchanged.
gray_frame = frame[..., 0] if (frame.ndim == 3 and frame.shape[2] == 3) else frame

plt.imshow(gray_frame, cmap="gray")
plt.axis("off")
plt.show()

# --- Export both views as image files ---
rgb_image = Image.fromarray(frame)
rgb_image.save("frame_rgb.png")

gray_image = Image.fromarray(gray_frame)
gray_image.save("frame_gray.png")

So if anyone knows how to train a CNN on images, please teach me; I would be very grateful.