Loss Not Decreasing for a BERT-from-Scratch PyTorch Model

I followed Aladdin Persson's YouTube video to code up just the encoder portion of the transformer in PyTorch, except that I used PyTorch's built-in nn.MultiheadAttention layer instead of writing my own attention. The model seems to produce output of the correct shape, but during training the loss does not drop and the trained model predicts the same value of roughly 0.4761 for every input (a quick check of this is shown after the training script below). The training data is the Sarcasm Detection dataset from Kaggle. I would appreciate any help spotting the errors I have made.

import pandas as pd
from transformers import BertTokenizer
import torch.nn as nn
import torch
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import ReduceLROnPlateau
import math

df = pd.read_json("Sarcasm_Headlines_Dataset_v2.json", lines=True)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encoded_input = tokenizer(df['headline'].tolist(), return_tensors='pt',padding=True)

X = encoded_input['input_ids']
y = torch.tensor(df['is_sarcastic'].values).float()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify = y)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
torch.cuda.empty_cache()

class TransformerBlock(nn.Module):
    def __init__(self,embed_dim, num_heads, dropout, expansion_ratio):
        super(TransformerBlock, self).__init__()
        self.attention = nn.MultiheadAttention(embed_dim, num_heads)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_dim, expansion_ratio*embed_dim),
            nn.ReLU(),
            nn.Linear(expansion_ratio*embed_dim,embed_dim)
        )
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, value, key, query):
        attention, _ = self.attention(value, key, query)
        x=self.dropout(self.norm1(attention+query))
        forward = self.feed_forward(x)
        out=self.dropout(self.norm2(forward+x))
        return out

class Encoder(nn.Module):
    #the vocab size is one more than the max value in the X matrix.
    def __init__(self,vocab_size=30109,embed_dim=128,num_layers=1,num_heads=4,device="cpu",expansion_ratio=4,dropout=0.1,max_length=193):
        super(Encoder,self).__init__()
        
        self.device = device
        self.word_embedding = nn.Embedding(vocab_size,embed_dim)
        self.position_embedding = nn.Embedding(max_length,embed_dim)
        self.layers = nn.ModuleList(
            [
                TransformerBlock(embed_dim,num_heads,dropout,expansion_ratio) for _ in range(num_layers)
            ]
        )
        
        self.dropout = nn.Dropout(dropout)
        self.classifier1 = nn.Linear(embed_dim,embed_dim)
        self.classifier2 = nn.Linear(embed_dim,1)
        self.relu = nn.ReLU()
    
    def forward(self,x):
        N, seq_length = x.shape
        positions = torch.arange(0,seq_length).expand(N, seq_length).to(self.device)
        out = self.dropout(self.word_embedding(x) + self.position_embedding(positions))
        
        for layer in self.layers:
            #print(out.shape)
            out = layer(out,out,out)
        
        #Get the first output for classification
        #Pooled output from hugging face is: Last layer hidden-state of the first token of the sequence (classification token) further processed by a Linear layer and a Tanh activation function.
        #Pooled output from hugging face will be different from out[:,0,:], which is the output from the CLS token.
        out = self.relu(self.classifier1(out[:,0,:]))
        out = self.classifier2(out)
        
        return out

torch.cuda.empty_cache()
net = Encoder(device=device)
net.to(device)
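
# Shape sanity check: the encoder emits one logit per headline, e.g.
# net(X_train[:8].to(device)).shape == torch.Size([8, 1])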

batch_size = 32
num_train_samples = X_train.shape[0]
num_val_samples = X_test.shape[0]

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(net.parameters(),lr=1e-5)
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=5)

val_loss_hist=[]
loss_hist=[]
epoch = 0
min_val_loss = math.inf

print("Training Started")

patience = 0

for _ in range(100):
    
    epoch += 1
        
    net.train()
    epoch_loss = 0
    
    permutation = torch.randperm(X_train.size()[0])
    
    for i in range(0,X_train.size()[0], batch_size):
        
        indices = permutation[i:i+batch_size]
        
        features=X_train[indices].to(device)
        labels=y_train[indices].reshape(-1,1).to(device)
        
        output = net.forward(features)
        loss = criterion(output, labels)

        optimizer.zero_grad() 
        loss.backward()
        optimizer.step()
        
        epoch_loss+=loss.item()
        
    epoch_loss = epoch_loss / num_train_samples * num_val_samples
    loss_hist.append(epoch_loss)
    
    #print("Eval")
    net.eval()
    epoch_val_loss = 0
    
    permutation = torch.randperm(X_test.size()[0])
    
    for i in range(0,X_test.size()[0], batch_size):
        
        indices = permutation[i:i+batch_size]
        
        features=X_test[indices].to(device)
        labels = y_test[indices].reshape(-1,1).to(device)
        
        output = net.forward(features)
        loss = criterion(output, labels)        

        epoch_val_loss+=loss.item()
    
    val_loss_hist.append(epoch_val_loss)
    
    scheduler.step(epoch_val_loss)
    
    #if epoch % 5 == 0:
    print("Epoch: " + str(epoch) + " Train Loss: " + format(epoch_loss, ".4f") + ". Val Loss: " + format(epoch_val_loss, ".4f") + " LR: " + str(optimizer.param_groups[0]['lr']))
            
    if epoch_val_loss < min_val_loss:
        min_val_loss = epoch_val_loss
        torch.save(net.state_dict(), "torchmodel/weights_best.pth")
        print('\033[93m'+"Model Saved"+'\033[0m')
        patience = 0
        
    else:
        patience += 1
    
    if (patience == 10):
        break
        
print("Training Ended")


