General Purpose Language Translator

This should be a pretty good language translator once I find the right corpus to train on. Right now it's hardwired to the corpus below, but I had a general-purpose version earlier and it's easy to modify it back to that. If anyone with the skills can point me to a corpus that covers multiple languages, please let me know; I'm stuck.

import os
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import mysql.connector

# Set environment variable for increased timeout
os.environ['HF_DATASETS_DOWNLOAD_TIMEOUT'] = '600'

# Global database settings
DB_NAME = "translation_db"  # Hardcoded database name

# Connect to MySQL
db_config = {
    'user': 'root',
    'password': '',
    'host': 'localhost',
    'database': DB_NAME
}
conn = mysql.connector.connect(**db_config)
cursor = conn.cursor()

# Download and prepare the dataset using the specified Hugging Face dataset (using a subset)
def download_dataset():
    dataset_name = "NickyNicky/Colossal_Translation_Spanish_to_English_AND_English_to_Spanish_ORPO_DPO_Gemma"
    train_dataset = load_dataset(dataset_name, split='train[:10%]')  # Load a smaller subset
    val_dataset = load_dataset(dataset_name, split='validation[:10%]')  # Load a smaller subset
    return train_dataset, val_dataset
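
# (Sketch) The intro above mentions an earlier general-purpose version. One way back to
# that is to parameterize the loader: dataset name, optional config, and split strings
# become arguments, assuming the replacement corpus also exposes a 'translation' dict
# keyed by language codes. The defaults below are placeholders, not recommendations.
def download_dataset_generic(dataset_name, config_name=None,
                             train_split='train[:10%]', val_split='validation[:10%]'):
    if config_name:
        train_dataset = load_dataset(dataset_name, config_name, split=train_split)
        val_dataset = load_dataset(dataset_name, config_name, split=val_split)
    else:
        train_dataset = load_dataset(dataset_name, split=train_split)
        val_dataset = load_dataset(dataset_name, split=val_split)
    return train_dataset, val_dataset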

# Preprocess the dataset and split into training and validation sets
def preprocess_data(train_dataset, val_dataset, source_lang, dest_lang, num_words=30000, max_len=100):
    train_source = []
    train_target = []
    val_source = []
    val_target = []

    for example in train_dataset:
        train_source.append(example['translation'][source_lang])
        train_target.append(example['translation'][dest_lang])

    for example in val_dataset:
        val_source.append(example['translation'][source_lang])
        val_target.append(example['translation'][dest_lang])

    # Tokenize and pad the sequences for the training data
    source_tokenizer = Tokenizer(num_words=num_words, oov_token="<OOV>")
    source_tokenizer.fit_on_texts(train_source)
    train_source_seq = source_tokenizer.texts_to_sequences(train_source)
    train_source_seq = pad_sequences(train_source_seq, maxlen=max_len, padding='post')

    target_tokenizer = Tokenizer(num_words=num_words, oov_token="<OOV>")
    target_tokenizer.fit_on_texts(train_target)
    train_target_seq = target_tokenizer.texts_to_sequences(train_target)
    train_target_seq = pad_sequences(train_target_seq, maxlen=max_len, padding='post')

    # Tokenize and pad the sequences for the validation data
    val_source_seq = source_tokenizer.texts_to_sequences(val_source)
    val_source_seq = pad_sequences(val_source_seq, maxlen=max_len, padding='post')

    val_target_seq = target_tokenizer.texts_to_sequences(val_target)
    val_target_seq = pad_sequences(val_target_seq, maxlen=max_len, padding='post')

    return train_source_seq, train_target_seq, val_source_seq, val_target_seq, source_tokenizer, target_tokenizer
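
# (Sketch) The fitted tokenizers only live in memory, so a model reloaded in a later
# session cannot translate without them. Keras tokenizers can be round-tripped as JSON;
# the file paths below are just examples.
def save_tokenizers(source_tokenizer, target_tokenizer, path="models/tokenizers"):
    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, "source_tokenizer.json"), "w", encoding="utf-8") as f:
        f.write(source_tokenizer.to_json())
    with open(os.path.join(path, "target_tokenizer.json"), "w", encoding="utf-8") as f:
        f.write(target_tokenizer.to_json())

def load_tokenizers(path="models/tokenizers"):
    from tensorflow.keras.preprocessing.text import tokenizer_from_json
    with open(os.path.join(path, "source_tokenizer.json"), encoding="utf-8") as f:
        source_tokenizer = tokenizer_from_json(f.read())
    with open(os.path.join(path, "target_tokenizer.json"), encoding="utf-8") as f:
        target_tokenizer = tokenizer_from_json(f.read())
    return source_tokenizer, target_tokenizer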

# Create the translation model
def create_translation_model(vocab_size=30000, embedding_dim=512, num_heads=8, ff_dim=1024, num_layers=4):
    inputs = tf.keras.layers.Input(shape=(None,))
    embedding_layer = tf.keras.layers.Embedding(vocab_size, embedding_dim)(inputs)
    
    # Stack of Transformer-style encoder blocks: self-attention and a feed-forward
    # network, each with dropout, a residual connection, and layer normalization.
    # Note there is no positional encoding, so word order is not modeled explicitly.
    for _ in range(num_layers):
        attention_output = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)(embedding_layer, embedding_layer)
        attention_output = tf.keras.layers.Dropout(0.1)(attention_output)
        out1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)(embedding_layer + attention_output)
        
        ffn_output = tf.keras.layers.Dense(ff_dim, activation="relu")(out1)
        ffn_output = tf.keras.layers.Dense(embedding_dim)(ffn_output)
        ffn_output = tf.keras.layers.Dropout(0.1)(ffn_output)
        embedding_layer = tf.keras.layers.LayerNormalization(epsilon=1e-6)(out1 + ffn_output)

    # One softmax distribution over the vocabulary per input position.
    outputs = tf.keras.layers.Dense(vocab_size, activation="softmax")(embedding_layer)
    
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    
    return model
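
# (Sketch) Quick shape check: the model emits one softmax over the vocabulary per input
# position, which is why translate() below takes an argmax along axis 2. For example:
#     model = create_translation_model()
#     dummy = np.zeros((2, 100), dtype=np.int32)
#     print(model(dummy).shape)  # -> (2, 100, 30000)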

# Function to create table for storing weights if it doesn't exist
def create_table_if_not_exists(table_name):
    cursor.execute(f"""
    CREATE TABLE IF NOT EXISTS {table_name} (
        id SERIAL PRIMARY KEY,
        layer_name TEXT,
        weight_matrix LONGBLOB
    )
    """)
    conn.commit()

# Function to store weights
def store_weights(model, table_name):
    for layer in model.layers:
        if len(layer.get_weights()) > 0:
            weights = layer.get_weights()[0]  # Storing only the first weight matrix
            weights_bytea = weights.tobytes()  # Convert to binary
            
            # Insert into MySQL
            cursor.execute(
                f"INSERT INTO {table_name} (layer_name, weight_matrix) VALUES (%s, %s)", 
                (layer.name, weights_bytea)
            )
    conn.commit()

# Function to load weights
def load_weights(model, table_name):
    for layer in model.layers:
        cursor.execute(f"SELECT weight_matrix FROM {table_name} WHERE layer_name = %s", (layer.name,))
        row = cursor.fetchone()
        if row:
            weight_matrix = np.frombuffer(row[0], dtype=np.float32).reshape(layer.get_weights()[0].shape)
            layer.set_weights([weight_matrix] + layer.get_weights()[1:])
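
# (Sketch) load_weights() above is defined but never called in the main block below.
# Restoring a model from MySQL instead of from disk would look roughly like this
# (note the round trip only covers the first float32 weight array of each layer,
# e.g. kernels but not biases):
#     restored_model = create_translation_model()
#     load_weights(restored_model, "es_en_model_weights")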

# Train the model on the preprocessed data (with validation) and store the weights in MySQL
def train_model(model, train_source_seq, train_target_seq, val_source_seq, val_target_seq, table_name, epochs=10, batch_size=64):
    model.fit(train_source_seq, train_target_seq, validation_data=(val_source_seq, val_target_seq), epochs=epochs, batch_size=batch_size)
    store_weights(model, table_name)

# Translator class with language and database management
class Translator:
    def __init__(self, source_lang, dest_lang, model=None, model_path=None, source_tokenizer=None, target_tokenizer=None, max_len=100):
        self.source_lang = source_lang
        self.dest_lang = dest_lang
        self.table_name = f"{self.source_lang}_{self.dest_lang}_model_weights"
        self.model = model
        self.model_path = model_path
        self.source_tokenizer = source_tokenizer
        self.target_tokenizer = target_tokenizer
        self.max_len = max_len

        if self.model_path:
            self.load_model(self.model_path)

    def load_model(self, model_path=None):
        if model_path is None:
            model_path = f"models/{self.source_lang}_{self.dest_lang}_translation_model"
        self.model = tf.keras.models.load_model(model_path)
        print(f"Model loaded from {model_path}")

    def save_model(self, save_dir=None):
        if save_dir is None:
            save_dir = f"models/{self.source_lang}_{self.dest_lang}_translation_model"
        os.makedirs(save_dir, exist_ok=True)
        # Save in SavedModel format at the same path that load_model() reads back.
        self.model.save(save_dir)
        print(f"Model saved to {save_dir}")

    def translate(self, input_text):
        input_seq = self.source_tokenizer.texts_to_sequences([input_text])
        input_seq = pad_sequences(input_seq, maxlen=self.max_len, padding='post')
        
        # Greedy decoding: pick the most likely target word at each position.
        output_seq = self.model.predict(input_seq)
        output_text = self.target_tokenizer.sequences_to_texts(output_seq.argmax(axis=2))[0]
        
        return output_text

if __name__ == "__main__":
    # Language pair selection (hardcoded here; change for other pairs)
    source_lang = "es"  # Example: Spanish
    dest_lang = "en"  # Example: English

    # Download dataset
    train_dataset, val_dataset = download_dataset()

    # Preprocess data
    train_source_seq, train_target_seq, val_source_seq, val_target_seq, source_tokenizer, target_tokenizer = preprocess_data(
        train_dataset, val_dataset, source_lang, dest_lang
    )

    # Create translation model
    translation_model = create_translation_model()

    # Create table for weights if it doesn't exist
    table_name = f"{source_lang}_{dest_lang}_model_weights"
    create_table_if_not_exists(table_name)

    # Train model
    train_model(translation_model, train_source_seq, train_target_seq, val_source_seq, val_target_seq, table_name)

    # Initialize the Translator with the trained model and language settings
    translator = Translator(
        source_lang=source_lang, 
        dest_lang=dest_lang, 
        model=translation_model, 
        source_tokenizer=source_tokenizer, 
        target_tokenizer=target_tokenizer
    )

    # Save the trained model
    translator.save_model()

    # Load the model back (for demonstration)
    translator.load_model()

    # Translate a sample text
    text_to_translate = "Hola, ¿cómo estás? ¿Cómo te llamas?"
    translated_text = translator.translate(text_to_translate)

    print("Translated text:", translated_text)