CUDA error: device-side assert triggered on AWS EC2

I am trying to run a CNN model on an AWS EC2 instance, and it is giving me this error. The same script runs on a different EC2 instance with a different GPU, but not on this one. I have not been able to solve it, even though I have restarted the kernel, checked the output layer's neuron count, etc. Please help me!

RuntimeError                              Traceback (most recent call last)
/tmp/ipykernel_4008/1924527283.py in <cell line: 3>()
      5     start_time = time.monotonic()
      6 
----> 7     train_loss, train_acc = train(model, loader_train, optimizer, criterion, device="cpu")
      8     valid_loss, valid_acc = evaluate(model, loader_valid, criterion, device="cpu")
      9 

/tmp/ipykernel_4008/585199812.py in train(model, iterator, optimizer, criterion, device)
     13 
     14         with autocast(enabled=use_amp):
---> 15             y_pred = model(x)
     16 
     17             loss = criterion(y_pred, y)

~/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1100         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1101                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102             return forward_call(*input, **kwargs)
   1103         # Do not call functions when jit is used
   1104         full_backward_hooks, non_full_backward_hooks = [], []

/tmp/ipykernel_4008/2249971542.py in forward(self, input)
    148 
    149         #MAIN Branch
--> 150         x = self.qcfem(input)
    151         #print(x.shape)
    152         residual = x

~/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1100         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1101                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102             return forward_call(*input, **kwargs)
   1103         # Do not call functions when jit is used
   1104         full_backward_hooks, non_full_backward_hooks = [], []

/tmp/ipykernel_4008/2249971542.py in forward(self, input)
     44     def forward(self, input):
     45 
---> 46         w = self.relu(self.batch_norm(self.dilated_conv1(input)))
     47 
     48         x = self.relu(self.batch_norm(self.dilated_conv2(input)))

~/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1100         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1101                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102             return forward_call(*input, **kwargs)
   1103         # Do not call functions when jit is used
   1104         full_backward_hooks, non_full_backward_hooks = [], []

~/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages/torch/nn/modules/conv.py in forward(self, input)
    444 
    445     def forward(self, input: Tensor) -> Tensor:
--> 446         return self._conv_forward(input, self.weight, self.bias)
    447 
    448 class Conv3d(_ConvNd):

~/anaconda3/envs/pytorch_p38/lib/python3.8/site-packages/torch/nn/modules/conv.py in _conv_forward(self, input, weight, bias)
    440                             weight, bias, self.stride,
    441                             _pair(0), self.dilation, self.groups)
--> 442         return F.conv2d(input, weight, bias, self.stride,
    443                         self.padding, self.dilation, self.groups)
    444 

RuntimeError: CUDA error: device-side assert triggered


Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source