This should be a pretty good language translator if I could find the right corpus to train on. Right now it's hardwired to this one corpus, but I had a general-purpose version, and it's easy to modify it back to that. If anyone with the skills can find a corpus that covers multiple languages, please let me know; I'm stuck. (See the next two replies for the rest of the code.)
import os
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import mysql.connector

# Set environment variable for increased timeout
os.environ['HF_DATASETS_DOWNLOAD_TIMEOUT'] = '600'

# Global database settings
DB_NAME = "translation_db"  # Hardcoded database name

# Connect to MySQL
db_config = {
    'user': 'root',
    'password': '',
    'host': 'localhost',
    'database': DB_NAME
}
conn = mysql.connector.connect(**db_config)
cursor = conn.cursor()


# Download and prepare the dataset using the specified Hugging Face dataset (using a subset)
def download_dataset():
    dataset_name = "NickyNicky/Colossal_Translation_Spanish_to_English_AND_English_to_Spanish_ORPO_DPO_Gemma"
    train_dataset = load_dataset(dataset_name, split='train[:10%]')      # Load a smaller subset
    val_dataset = load_dataset(dataset_name, split='validation[:10%]')   # Load a smaller subset
    return train_dataset, val_dataset


# Preprocess the dataset and split into training and validation sets
def preprocess_data(train_dataset, val_dataset, source_lang, dest_lang, num_words=30000, max_len=100):
    train_source = []
    train_target = []
    val_source = []
    val_target = []

    for example in train_dataset:
        train_source.append(example['translation'][source_lang])
        train_target.append(example['translation'][dest_lang])

    for example in val_dataset:
        val_source.append(example['translation'][source_lang])
        val_target.append(example['translation'][dest_lang])

    # Tokenize and pad the sequences for the training data
    source_tokenizer = Tokenizer(num_words=num_words, oov_token="<OOV>")
    source_tokenizer.fit_on_texts(train_source)
    train_source_seq = source_tokenizer.texts_to_sequences(train_source)
    train_source_seq = pad_sequences(train_source_seq, maxlen=max_len, padding='post')

    target_tokenizer = Tokenizer(num_words=num_words, oov_token="<OOV>")
    target_tokenizer.fit_on_texts(train_target)
    train_target_seq = target_tokenizer.texts_to_sequences(train_target)
    train_target_seq = pad_sequences(train_target_seq, maxlen=max_len, padding='post')

    # Tokenize and pad the sequences for the validation data
    val_source_seq = source_tokenizer.texts_to_sequences(val_source)
    val_source_seq = pad_sequences(val_source_seq, maxlen=max_len, padding='post')

    val_target_seq = target_tokenizer.texts_to_sequences(val_target)
    val_target_seq = pad_sequences(val_target_seq, maxlen=max_len, padding='post')

    return train_source_seq, train_target_seq, val_source_seq, val_target_seq, source_tokenizer, target_tokenizer

# Continue with the rest of your model code...
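The only part of the pipeline above that is hardwired to the Spanish/English corpus is download_dataset(). To make it language-pair agnostic again, something along these lines should work against any Hugging Face corpus that exposes the standard 'translation' dictionary. The opus_books dataset and its alphabetically ordered "xx-yy" config names are just my example here, not something the original script uses:

def download_dataset_generic(source_lang, dest_lang, dataset_name="opus_books", fraction=10):
    # ASSUMPTION: "opus_books" is only an example corpus; its configs are
    # alphabetically ordered language pairs (e.g. ("es", "en") -> "en-es") and it
    # ships only a "train" split, so a validation slice is carved out manually.
    config = "-".join(sorted([source_lang, dest_lang]))
    dataset = load_dataset(dataset_name, config, split=f"train[:{fraction}%]")
    split = dataset.train_test_split(test_size=0.1, seed=42)
    return split["train"], split["test"]  # same (train, val) pair preprocess_data expects

With a loader like that in place, preprocess_data(train_dataset, val_dataset, source_lang, dest_lang) runs unchanged, because each example still carries a 'translation' dict keyed by language code.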
# Create the translation model
def create_translation_model(vocab_size=30000, embedding_dim=512, num_heads=8, ff_dim=1024, num_layers=4):
    inputs = tf.keras.layers.Input(shape=(None,))
    embedding_layer = tf.keras.layers.Embedding(vocab_size, embedding_dim)(inputs)

    # Stack of encoder-style self-attention + feed-forward blocks with residual connections
    for _ in range(num_layers):
        attention_output = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)(embedding_layer, embedding_layer)
        attention_output = tf.keras.layers.Dropout(0.1)(attention_output)
        out1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)(embedding_layer + attention_output)

        ffn_output = tf.keras.layers.Dense(ff_dim, activation="relu")(out1)
        ffn_output = tf.keras.layers.Dense(embedding_dim)(ffn_output)
        ffn_output = tf.keras.layers.Dropout(0.1)(ffn_output)
        embedding_layer = tf.keras.layers.LayerNormalization(epsilon=1e-6)(out1 + ffn_output)

    # Per-position softmax over the target vocabulary
    outputs = tf.keras.layers.Dense(vocab_size, activation="softmax")(embedding_layer)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    return model


# Function to create table for storing weights if it doesn't exist
def create_table_if_not_exists(table_name):
    cursor.execute(f"""
        CREATE TABLE IF NOT EXISTS {table_name} (
            id SERIAL PRIMARY KEY,
            layer_name TEXT,
            weight_matrix LONGBLOB
        )
    """)
    conn.commit()


# Function to store weights
def store_weights(model, table_name):
    for layer in model.layers:
        if len(layer.get_weights()) > 0:
            weights = layer.get_weights()[0]   # Storing only the first weight matrix
            weights_bytea = weights.tobytes()  # Convert to binary

            # Insert into MySQL
            cursor.execute(
                f"INSERT INTO {table_name} (layer_name, weight_matrix) VALUES (%s, %s)",
                (layer.name, weights_bytea)
            )
    conn.commit()


# Function to load weights
def load_weights(model, table_name):
    for layer in model.layers:
        cursor.execute(f"SELECT weight_matrix FROM {table_name} WHERE layer_name = %s", (layer.name,))
        row = cursor.fetchone()
        if row:
            weight_matrix = np.frombuffer(row[0], dtype=np.float32).reshape(layer.get_weights()[0].shape)
            layer.set_weights([weight_matrix] + layer.get_weights()[1:])


# Update train_model to use preprocessed data and include validation
def train_model(model, train_source_seq, train_target_seq, val_source_seq, val_target_seq, table_name, epochs=10, batch_size=64):
    model.fit(train_source_seq, train_target_seq,
              validation_data=(val_source_seq, val_target_seq),
              epochs=epochs, batch_size=batch_size)
    store_weights(model, table_name)


# Translator class with language and database management
class Translator:
    def __init__(self, source_lang, dest_lang, model=None, model_path=None, source_tokenizer=None, target_tokenizer=None, max_len=100):
        self.source_lang = source_lang
        self.dest_lang = dest_lang
        self.table_name = f"{self.source_lang}_{self.dest_lang}_model_weights"
        self.model = model
        self.model_path = model_path
        self.source_tokenizer = source_tokenizer
        self.target_tokenizer = target_tokenizer
        self.max_len = max_len

        if self.model_path:
            self.load_model(self.model_path)

    def load_model(self, model_path=None):
        if model_path is None:
            model_path = f"models/{self.source_lang}_{self.dest_lang}_translation_model"
        self.model = tf.keras.models.load_model(model_path)
        print(f"Model loaded from {model_path}")

    def save_model(self, save_dir=None):
        if save_dir is None:
            save_dir = f"models/{self.source_lang}_{self.dest_lang}_translation_model"
        os.makedirs(save_dir, exist_ok=True)
        # Save once, to the same directory that load_model() reads back by default
        self.model.save(save_dir)
        print(f"Model saved to {save_dir}")

    def translate(self, input_text):
        input_seq = self.source_tokenizer.texts_to_sequences([input_text])
        input_seq = pad_sequences(input_seq, maxlen=self.max_len, padding='post')
        output_seq = self.model.predict(input_seq)
        # Greedy decode: take the highest-probability token at each position
        output_text = self.target_tokenizer.sequences_to_texts(output_seq.argmax(axis=2))[0]
        return output_text


if __name__ == "__main__":
    # Dynamic language selection
    source_lang = "es"  # Example: Spanish
    dest_lang = "en"    # Example: English

    # Download dataset
    train_dataset, val_dataset = download_dataset()

    # Preprocess data
    train_source_seq, train_target_seq, val_source_seq, val_target_seq, source_tokenizer, target_tokenizer = preprocess_data(
        train_dataset, val_dataset, source_lang, dest_lang
    )

    # Create translation model
    translation_model = create_translation_model()

    # Create table for weights if it doesn't exist
    table_name = f"{source_lang}_{dest_lang}_model_weights"
    create_table_if_not_exists(table_name)

    # Train model
    train_model(translation_model, train_source_seq, train_target_seq, val_source_seq, val_target_seq, table_name)

    # Initialize the Translator with the trained model and language settings
    translator = Translator(
        source_lang=source_lang,
        dest_lang=dest_lang,
        model=translation_model,
        source_tokenizer=source_tokenizer,
        target_tokenizer=target_tokenizer
    )

    # Save the trained model
    translator.save_model()

    # Load the model back (for demonstration)
    translator.load_model()

    # Translate a sample text
    text_to_translate = "Hola, ¿cómo estás? ¿Cómo te llamas?"
    translated_text = translator.translate(text_to_translate)
    print("Translated text:", translated_text)
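One gap worth flagging: save_model() and load_model() round-trip the network, but the two Tokenizer objects only exist in memory, so translate() cannot work in a fresh session after reloading. A minimal sketch of one way to close that gap, using the JSON serialization Keras tokenizers already support (the *_tokenizer.json file names are my own choice, not part of the script above):

from tensorflow.keras.preprocessing.text import tokenizer_from_json

# Sketch: persist the two tokenizers alongside the saved model so translate()
# can be used in a later session without retraining.
def save_tokenizers(translator, save_dir):
    for name, tok in (("source", translator.source_tokenizer),
                      ("target", translator.target_tokenizer)):
        with open(os.path.join(save_dir, f"{name}_tokenizer.json"), "w", encoding="utf-8") as f:
            f.write(tok.to_json())

def load_tokenizers(save_dir):
    tokenizers = []
    for name in ("source", "target"):
        with open(os.path.join(save_dir, f"{name}_tokenizer.json"), "r", encoding="utf-8") as f:
            tokenizers.append(tokenizer_from_json(f.read()))
    return tokenizers[0], tokenizers[1]  # source_tokenizer, target_tokenizer

Calling save_tokenizers(translator, "models/es_en_translation_model") right after translator.save_model(), and load_tokenizers() on the same directory before building a Translator in a new session, would be the matching usage.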