The model is overfitting very fast. With just 10 epochs, my model gets 97% accuracy on the training data but only 62% on the test data. I have tried changing the activation function and both reducing and increasing the learning rate, but the results are even worse, and it never passes 62% accuracy. Is there any technique to increase the accuracy?
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
Dataset = pd.read_csv('train.csv')
Dataset.dropna(inplace=True)
X = Dataset[["text"]]
y = Dataset[["target"]]
import re
def Remove_Url(string):
    # Strip http/https URLs, e.g. shortened t.co links
    return re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%|\-)*\b', '', string)
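A quick sanity check of the URL stripper (the sample tweet is made up for illustration):

print(Remove_Url("check this https://t.co/abc123 out"))
# -> 'check this  out' (the URL is gone; a double space remains)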
def Handle_Tags(string):
    # [@#] matches a literal '@' or '#'; note that '|' inside a character
    # class is matched literally, not treated as alternation
    pattern = re.compile(r'[@#][^\s]+')
    matches = pattern.findall(string)
    tags = [match[1:] for match in matches]
    # Removing tags from main string
    string = re.sub(pattern, '', string)
    # More weightage to tag by adding them 3 times
    return string + ' ' + ' '.join(tags) + ' ' + ' '.join(tags) + ' ' + ' '.join(tags)
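For example, on a made-up tweet the tag is stripped from its position and appended three times:

print(Handle_Tags("Forest fire near #California"))
# -> 'Forest fire near  California California California'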
import demoji
demoji.download_codes()
def Handle_emoji(string):
    # Replace each emoji with its text description
    return demoji.replace_with_desc(string)
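demoji replaces each emoji with its description wrapped in the default ':' separator, so (made-up example):

print(Handle_emoji("Fire 🔥 everywhere"))
# -> 'Fire :fire: everywhere'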
def Remove_html(string):
    # Strip HTML tags and character entities such as &amp; or &#39;
    return re.sub(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});', '', str(string))
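For instance (sample string invented for illustration):

print(Remove_html("<div>fire &amp; smoke</div>"))
# -> 'fire  smoke' (tags and the entity removed, leaving a double space)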
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
nltk.download('punkt')
nltk.download('stopwords')
stemmer = SnowballStemmer('english')
stopword = stopwords.words('english')
def Remove_StopAndStem(string):
    # Drop English stop words, then stem what remains
    string_list = string.split()
    return ' '.join([stemmer.stem(i) for i in string_list if i not in stopword])
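For example (output shown as the Snowball stemmer typically produces it; exact stems can vary between stemmer versions):

print(Remove_StopAndStem("this is a disaster happening now"))
# -> 'disast happen' (stop words dropped, remaining words stemmed)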
def Remove_UC(string):
    # Keep only letters and whitespace, drop 1-2 letter words, collapse spaces
    thestring = re.sub(r'[^a-zA-Z\s]', '', string)
    thestring = re.sub(r'\b\w{1,2}\b', '', thestring)
    return re.sub(' +', ' ', thestring)
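For example (made-up input):

print(Remove_UC("Fire!!! near 221B Baker"))
# -> 'Fire near Baker' (punctuation, digits, and the short token 'B' removed)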
# Step 1. Remove URLs (later steps chain on the cleaned series)
X_cleaned = X['text'].apply(Remove_Url)
# Step 2. Handle tags
X_cleaned = X_cleaned.apply(Handle_Tags)
# Step 3. Handle emojis
X_cleaned = X_cleaned.apply(Handle_emoji)
# Step 4. Remove HTML tags
X_cleaned = X_cleaned.apply(Remove_html)
# Step 5. Remove stop words and stem
X_cleaned = X_cleaned.apply(Remove_StopAndStem)
# Step 6. Remove useless characters
X_cleaned = X_cleaned.apply(Remove_UC)
X_new_Data = X_cleaned.to_frame()
# Note: get_dummies treats each unique cleaned string as its own category,
# producing one indicator column per distinct row of text
X_one_hot = pd.get_dummies(X_new_Data)
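One-hot encoding whole strings gives the model no way to generalize across tweets that share words. A common alternative (not in the original code; shown here only as a hedged sketch) is a bag-of-words featurization such as scikit-learn's TfidfVectorizer, where the vocabulary cap is an illustrative choice:

# Hedged alternative sketch: TF-IDF bag-of-words features instead of
# one-hot encoding each unique string (max_features value is illustrative)
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=10000)
X_tfidf = vectorizer.fit_transform(X_cleaned).toarray()  # dense array for Keras

X_tfidf could then be passed to train_test_split in place of X_one_hot.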
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_one_hot, y, test_size=0.2, random_state=42)
# Set random seed
tf.random.set_seed(42)
# 1. Create the model
model_4 = tf.keras.Sequential([
    tf.keras.layers.Dense(4, activation=tf.keras.activations.relu),
    tf.keras.layers.Dense(4, activation=tf.keras.activations.relu),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
# 2. Compile the model
model_4.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])
# 3. Fit the model
history_4 = model_4.fit(X_train, y_train, epochs=10, batch_size=96,
                        validation_data=(X_test, y_test))
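Since the question is about overfitting, two standard countermeasures are dropout and early stopping. A hedged sketch, where the layer sizes, dropout rate, and patience are illustrative assumptions rather than tuned values:

# Hedged sketch: dropout plus early stopping as overfitting countermeasures;
# all hyperparameters here are illustrative, not tuned
model_5 = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),   # randomly zero half the activations each step
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model_5.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3,
                                              restore_best_weights=True)
history_5 = model_5.fit(X_train, y_train, epochs=50, batch_size=96,
                        validation_data=(X_test, y_test), callbacks=[early_stop])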