Why doesn't my BERT pre-trained model's loss decrease?

I am trying to fine-tune a pre-trained BERT model for a specific task. Here is my model:

import torch
import torch.nn as nn
import transformers

model_path = config.model_path  # config here is a project-level settings object
class BertForCWS(transformers.BertPreTrainedModel):
    def __init__(self,config):
        super(BertForCWS,self).__init__(config)
        self.bert = transformers.BertModel.from_pretrained(model_path,config=config)
        self.softmax = nn.Softmax(dim=-1)
        self.lossCal = nn.CrossEntropyLoss()
        
    def forward(self,input,input_labels=None,tags=None,tag2vec=None,id2tag=None,attention_mask=None):
        output = self.bert(input,attention_mask=attention_mask)
        sequence = output[0]
        # strip the [CLS] and [SEP] tokens
        sequence_out = [sen[1:-1] for sen in sequence]
        sequence_out= torch.tensor([item.detach().numpy() for item in sequence_out])
        score = []
        for sen in sequence_out:
            sen_score=[]
            for word in sen:
                word_score = []
                word_temp = word.unsqueeze(0)
                for tag,vec in tag2vec.items():
                    word_score.append(torch.mm(vec,word_temp.transpose(0,1)))
                sen_score.append(word_score)
            score.append(sen_score)
        score = torch.tensor(score)
        score = self.softmax(score)
        loss = 0
        for i in range(len(score)):
            for j in range(len(score[i])):
                loss_temp = 0
                for k in range(len(score[i][j])):
                    if(k == input_labels[i][j]):
                        loss_temp += (1-(score[i][j][k]))* (1-(score[i][j][k]))
                    else :
                        loss_temp += (score[i][j][k]) * (score[i][j][k])
            loss += loss_temp      
        return sequence_out,score,loss

I simply take each token vector output by BERT and compute its dot product with each target-label vector (the label vectors also come from BERT output) to score the four labels.
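In other words, the score I want is the dot product of each token vector with each tag vector. A vectorized sketch of that scoring (tag_matrix, hidden_size and num_tags are just stand-in names here, not from my code above) looks like this:

import torch

hidden_size = 768      # assumed BERT hidden size
num_tags = 4           # the four labels mentioned above

# Stand-in for tag2vec: one vector per tag, stacked into a matrix.
tag_matrix = torch.randn(num_tags, hidden_size)

def score_tokens(sequence_output):
    # sequence_output: (batch, seq_len, hidden_size) token vectors from BERT,
    # with [CLS]/[SEP] already stripped. The result holds one dot-product
    # score per token per tag: shape (batch, seq_len, num_tags).
    return torch.matmul(sequence_output, tag_matrix.t())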
Here is how I test it with just one input:

test = "我是高"
test_id = tokenizer.encode(test)
print("test_id:",test_id)
test_input = torch.tensor([test_id])
print("test_input:",test_input)
label = [2,2,0]
label = torch.tensor([label])
print(label.size())
for ep in range(200):
    model.train()   
    model.zero_grad()
    test_output = model(test_input,input_labels=label,tag2vec=tag2vec)
    loss = test_output[2]
    loss = loss.requires_grad_()
    print(loss)
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=1)
    optimizer.step()
    scheduler.step()
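(This snippet assumes tokenizer, model, optimizer, scheduler and tag2vec are already defined. A rough setup along these lines, where the checkpoint name, learning rate and schedule are only placeholders and not my exact values, would be:)

import torch
import transformers

model_path = "bert-base-chinese"   # placeholder checkpoint name
config = transformers.BertConfig.from_pretrained(model_path)
tokenizer = transformers.BertTokenizer.from_pretrained(model_path)
model = BertForCWS(config)         # the class defined above

# tag2vec maps each tag id to a (1, hidden_size) vector; random stand-ins here.
tag2vec = {t: torch.randn(1, config.hidden_size) for t in range(4)}

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=200)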

I got the loss:

tensor(2., requires_grad=True)
tensor(2., requires_grad=True)
tensor(1.9999, requires_grad=True)
tensor(2., requires_grad=True)
tensor(2., requires_grad=True)
tensor(2.0000, requires_grad=True)
tensor(2.0000, requires_grad=True)
tensor(2., requires_grad=True)
tensor(2.0000, requires_grad=True)
tensor(1.9981, requires_grad=True)
tensor(2.0000, requires_grad=True)
tensor(1.9860, requires_grad=True)
tensor(2.0000, requires_grad=True)
tensor(2., requires_grad=True)
tensor(2., requires_grad=True)
tensor(1.9971, requires_grad=True)
tensor(2., requires_grad=True)
tensor(1.9723, requires_grad=True)
tensor(2., requires_grad=True)
tensor(2.0000, requires_grad=True)

Why does the loss always stay around 2 and never decrease? Thanks a lot!



Source: Stack Overflow, licensed under CC BY-SA 3.0.