Model overfitting very fast

The model is overfitting very fast. With just 10 epochs it reaches 97% accuracy on the training data but only 62% on the test data. I have tried changing the activation function and both lowering and raising the learning rate, but the results are even worse and it never passes 62% test accuracy. Is there any technique to increase the accuracy?

import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

Dataset = pd.read_csv('train.csv')
Dataset.dropna(inplace=True)

X = Dataset[["text"]]
y = Dataset[["target"]]

import re 

def Remove_Url(string):
    # Strip http/https URLs
    return re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%|\-)*\b', '', string)

def Handle_Tags(string):
    pattern = re.compile(r'[@#][^\s]+')
    matches = pattern.findall(string)
    tags = [match[1:] for match in matches]
    # Remove the tags from the main string
    string = pattern.sub('', string)
    # Give the tags more weight by repeating them three times
    return string + ' ' + ' '.join(tags) + ' ' + ' '.join(tags) + ' ' + ' '.join(tags)

import demoji
demoji.download_codes()  # one-time download of the emoji code table

def Handle_emoji(string):
    return demoji.replace_with_desc(string)

def Remove_html(string):
    # Strip HTML tags and character entities
    return re.sub(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});', '', str(string))

import nltk 
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

nltk.download('punkt')
nltk.download('stopwords')
stemmer = SnowballStemmer('english')

# A set makes the stopword lookup in Remove_StopAndStem fast
stopword = set(stopwords.words('english'))

def Remove_StopAndStem(string):
    string_list = string.split()
    return ' '.join([stemmer.stem(i) for i in string_list if i not in stopword])

def Remove_UC(string):
    # Keep only letters and whitespace
    thestring = re.sub(r'[^a-zA-Z\s]', '', string)
    # Drop words shorter than three characters
    thestring = re.sub(r'\b\w{1,2}\b', '', thestring)
    # Collapse repeated spaces
    return re.sub(' +', ' ', thestring)
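# Illustrative behaviour of the helpers above (hypothetical inputs, outputs approximate):
#   Remove_Url("new post https://t.co/abc123")  -> "new post "
#   Handle_Tags("wildfire in #California")      -> "wildfire in  California California California"
#   Remove_StopAndStem("the forest is burning") -> "forest burn"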

# Each step must be applied to the output of the previous one,
# otherwise only the last step's result survives.
# Step 1. Remove URLs
X_cleaned = X['text'].apply(Remove_Url)
# Step 2. Handle tags
X_cleaned = X_cleaned.apply(Handle_Tags)
# Step 3. Handle emojis
X_cleaned = X_cleaned.apply(Handle_emoji)
# Step 4. Remove HTML tags
X_cleaned = X_cleaned.apply(Remove_html)
# Step 5. Remove stopwords and stem
X_cleaned = X_cleaned.apply(Remove_StopAndStem)
# Step 6. Remove useless characters
X_cleaned = X_cleaned.apply(Remove_UC)

X_new_Data = X_cleaned.to_frame()
# NOTE: get_dummies one-hot encodes each unique cleaned string as its own column
X_one_hot = pd.get_dummies(X_new_Data)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_one_hot, y, test_size=0.2, random_state=42)

# Set random seed
tf.random.set_seed(42)

# 1. Create the model 
model_4 = tf.keras.Sequential([
  tf.keras.layers.Dense(4, activation='relu'),
  tf.keras.layers.Dense(4, activation='relu'),
  tf.keras.layers.Dense(1, activation='sigmoid')
])

# 2. Compile the model
model_4.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

# 3. Fit the model
history_4 = model_4.fit(X_train, y_train,
                        epochs=10,
                        batch_size=96,
                        validation_data=(X_test, y_test))

If you are working with text, you can probably reach higher accuracy by using a pretrained transformer model; it usually achieves better results than a simple sequential model. You can find pretrained models on TF Hub. For examples of using them and fine-tuning them on your specific task, look at Kaggle or the Keras code examples (Natural Language Processing).
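For example, a minimal fine-tuning sketch with a small BERT encoder from TF Hub could look like the one below. The model handles, hyperparameters, and the train_texts/test_texts variables are illustrative assumptions (you would feed the raw tweet strings, not the one-hot features), and it needs the tensorflow-hub and tensorflow-text packages:

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text  # registers the ops the preprocessing model needs

# Example TF Hub handles; any matching preprocessor/encoder pair from tfhub.dev works
preprocessor = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
encoder = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/2",
    trainable=True)

# BERT has its own tokenizer, so the model takes raw strings as input
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name="text")
encoder_outputs = encoder(preprocessor(text_input))
x = tf.keras.layers.Dropout(0.1)(encoder_outputs["pooled_output"])
output = tf.keras.layers.Dense(1, activation="sigmoid")(x)
bert_model = tf.keras.Model(text_input, output)

bert_model.compile(loss="binary_crossentropy",
                   optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
                   metrics=["accuracy"])

# train_texts/test_texts are hypothetical: the raw tweet strings split the same way
# bert_model.fit(train_texts, y_train, validation_data=(test_texts, y_test), epochs=3)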
