RuntimeError: Function MmBackward returned an invalid gradient at index 0 - got [4096, 32] but expected shape compatible with [4096, 512]

Error when replacing nn.Linear with Mlp.

I am replacing an nn.Linear module in my model with Mlp from timm. With nn.Linear the model trains fine; with Mlp I get the error above. I tried changing the dimensions of the data fed into the model, but that only caused more problems. How do I rewrite Mlp, or convert the data, so that training works?

import collections.abc
from itertools import repeat

import torch
import torch.nn as nn


def _ntuple(n):
    # Repeat a scalar n times; pass iterables through unchanged.
    def parse(x):
        if isinstance(x, collections.abc.Iterable):
            return x
        return tuple(repeat(x, n))
    return parse


to_1tuple = _ntuple(1)
to_2tuple = _ntuple(2)
to_3tuple = _ntuple(3)
to_4tuple = _ntuple(4)
to_ntuple = _ntuple
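
For context, to_2tuple just normalizes a scalar into a pair (an iterable passes through unchanged), which is how a single drop probability becomes one value per dropout layer:

print(to_2tuple(0.1))         # (0.1, 0.1)
print(to_2tuple((0.1, 0.2)))  # (0.1, 0.2)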


def make_divisible(v, divisor=8, min_value=None, round_limit=.9):
    min_value = min_value or divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    # Make sure that round down does not go down by more than 10%.
    if new_v < round_limit * v:
        new_v += divisor
    return new_v
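
make_divisible is not actually used in the model below, but for reference it rounds a value to the nearest multiple of divisor, bumping up whenever rounding down would lose more than ~10%:

print(make_divisible(30))  # 32 (nearest multiple of 8)
print(make_divisible(25))  # 24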

class Mlp(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        drop_probs = to_2tuple(drop)

        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.drop1 = nn.Dropout(drop_probs[0])
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop2 = nn.Dropout(drop_probs[1])

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop1(x)
        x = self.fc2(x)
        x = self.drop2(x)
        return x
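
Note the defaults above: out_features falls back to in_features, so Mlp(512, 32) maps 512 -> 32 -> 512, whereas nn.Linear(512, 32) maps straight to 32. A quick shape check (hypothetical dimensions chosen to match the classifier below, where lstm_hidden * 2 = 512 and time_feature_count = 32):

mlp = Mlp(in_features=512, hidden_features=32)
x = torch.randn(4, 512)
print(mlp(x).shape)  # torch.Size([4, 512]) -- 512 features out, not 32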

class DenseNetAtt(nn.Module):
    def __init__(
        self, number_class_symbols, time_feature_count=32, lstm_hidden=256,
        lstm_len=2,
    ):
        super().__init__()
        self.feature_extractor = get_densenet201_backbone(pretrained=True)

        self.avg_pool = nn.AdaptiveAvgPool2d(
            (time_feature_count, time_feature_count))
        
        self.bilstm = BiLSTM(time_feature_count, lstm_hidden, lstm_len)

        self.classifier = nn.Sequential(
            Mlp(lstm_hidden * 2, time_feature_count),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(time_feature_count, number_class_symbols)
        )

    def forward(self, x):
        x = self.feature_extractor(x)
        b, c, h, w = x.size()
        x = x.view(b, c * h, w)
        x = self.avg_pool(x)
        x = x.transpose(1, 2)
        x = self.bilstm(x)
        x = self.classifier(x)
        x = nn.functional.log_softmax(x, dim=2).permute(1, 0, 2)
        return x
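
If the goal was to keep the old 512 -> 32 mapping in front of the final nn.Linear, one possible rewrite of the classifier (a sketch, assuming that intent) is to pass out_features explicitly instead of relying on the default:

self.classifier = nn.Sequential(
    Mlp(lstm_hidden * 2, hidden_features=time_feature_count,
        out_features=time_feature_count),  # 512 -> 32 -> 32 instead of 512 -> 32 -> 512
    nn.GELU(),
    nn.Dropout(0.1),
    nn.Linear(time_feature_count, number_class_symbols)
)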

Code where the error occurs

def train_loop(data_loader, model, criterion, optimizer, epoch):
    loss_avg = AverageMeter()
    model.train()
    print("train loop")
    for images, texts, enc_pad_texts, text_lens in tqdm.tqdm(data_loader):
        model.zero_grad()
        
        images = images.to(DEVICE)
        batch_size = len(texts)
        
        output = model(images)
        output_lenghts = torch.full(
            size=(output.size(1),),
            fill_value=output.size(0),
            dtype=torch.long
        )
        loss = criterion(output, enc_pad_texts, output_lenghts, text_lens)
        loss.backward(retain_graph=True)  # <- the RuntimeError below is raised here
        
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
            
        loss_avg.update(loss.item(), batch_size)

        torch.nn.utils.clip_grad_norm_(model.parameters(), 2)

        optimizer.step()
    optimizer.sync_lookahead()
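
As an aside, the loop above runs backward twice: once directly via loss.backward(retain_graph=True) and again through amp.scale_loss. For reference, the usual NVIDIA apex pattern does a single scaled backward (a sketch, assuming model and optimizer were wrapped with amp.initialize):

optimizer.zero_grad()
output = model(images)
loss = criterion(output, enc_pad_texts, output_lenghts, text_lens)
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), 2)
optimizer.step()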

RuntimeError: Function MmBackward returned an invalid gradient at index 0 - got [2048, 32] but expected shape compatible with [2048, 512]
/tmp/ipykernel_6070/4094404913.py in <module>
----> 1 train(config_json)

/tmp/ipykernel_6070/872149847.py in train(config)
    141         timer = time.time()
    142         print("\nEpoch", epoch, "Previous took", epoch_time_m, "minutes")
--> 143         loss_avg = train_loop(train_loader, model, criterion, optimizer, epoch)
    144         acc_avg, cer_avg = val_loop(val_loader, model, tokenizer, DEVICE)
    145         print(f'acc: {acc_avg}; cer: {cer_avg};')

/tmp/ipykernel_6070/872149847.py in train_loop(data_loader, model, criterion, optimizer, epoch)
     56         )
     57         loss = criterion(output, enc_pad_texts, output_lenghts, text_lens)
---> 58         loss.backward(retain_graph=True)
     59 
     60         with amp.scale_loss(loss, optimizer) as scaled_loss:

/opt/conda/lib/python3.7/site-packages/torch/_tensor.py in backward(self, gradient, retain_graph, create_graph, inputs)
    253                 create_graph=create_graph,
    254                 inputs=inputs)
--> 255         torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
    256 
    257     def register_hook(self, hook):

/opt/conda/lib/python3.7/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
    147     Variable._execution_engine.run_backward(
    148         tensors, grad_tensors_, retain_graph, create_graph, inputs,
--> 149         allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
    150 
    151 

RuntimeError: Function MmBackward returned an invalid gradient at index 0 - got [2048, 32] but expected shape compatible with [2048, 512]



