PyTorch CUDA error: an illegal memory access was encountered

I'm relatively new to using CUDA. I keep getting the following error after a seemingly random amount of time:

RuntimeError: CUDA error: an illegal memory access was encountered

I have seen people suggest things such as using torch.cuda.set_device() rather than torch.cuda.device() and setting torch.backends.cudnn.benchmark = False, but I can't get the error to go away. Here are some pieces of my code:

import numpy as np
import torch
import torch.nn as nn

torch.cuda.set_device(torch.device('cuda:0'))
torch.backends.cudnn.benchmark = False

class LSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super(LSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=0.2)

        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).requires_grad_().cuda()
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).requires_grad_().cuda()

        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))

        out = self.fc(out[:, -1, :]) 
        
        return out

    def pred(self, x):
        return self(x) > 0

def train(model, loss_fn, optimizer, num_epochs, x_train, y_train, x_val, y_val, loss_stop=60):
    cur_best_loss = 999
    loss_recur_count = 0
    best_model = None
    for t in range(num_epochs):
        model.train()

        y_train_pred = model(x_train)

        train_loss = loss_fn(y_train_pred, y_train)

        tr_l = train_loss.item()
        
        optimizer.zero_grad()

        train_loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():  
            y_val_pred = model(x_val)

            val_loss = loss_fn(y_val_pred, y_val)

            va_l = val_loss.item()
            
            if va_l < cur_best_loss:
                cur_best_loss = va_l
                best_model = model
                loss_recur_count = 0
            else:
                loss_recur_count += 1

        if loss_recur_count == loss_stop:
            break
    if best_model is None:
        print("model is None.")
    return best_model

def lstm_test(cols, df, test_percent, test_bal, initial_shares_test, max_price, last_sell_day):
    wdw = 20
    x_train, y_train, x_test, y_test, x_val, y_val = load_data(df, wdw, test_percent, cols)

    x_train = torch.from_numpy(x_train).type(torch.Tensor).cuda()
    x_test = torch.from_numpy(x_test).type(torch.Tensor).cuda()
    x_val = torch.from_numpy(x_val).type(torch.Tensor).cuda()
    y_train = torch.from_numpy(y_train).type(torch.Tensor).cuda()
    y_test = torch.from_numpy(y_test).type(torch.Tensor).cuda()
    y_val = torch.from_numpy(y_val).type(torch.Tensor).cuda()

    input_dim = x_train.shape[-1]
    hidden_dim = 32
    num_layers = 2
    output_dim = 1
    y_preds_dict = {}
    for i in range(11):
        model = LSTM(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim, num_layers=num_layers).cuda()

        r = (y_train.cpu().shape[0] - np.count_nonzero(y_train.cpu()))/np.count_nonzero(y_train.cpu())/2
        pos_w = torch.tensor([r]).cuda()

        loss_fn = torch.nn.BCEWithLogitsLoss(pos_weight=pos_w).cuda()

        optimizer = torch.optim.AdamW(model.parameters(), lr=0.01)

        best_model = train(model, loss_fn, optimizer, 300, x_train, y_train, x_val, y_val)
        
        y_test_pred = get_predictions(best_model, x_test)
        y_preds_dict[i] = y_test_pred.cpu().detach().numpy().flatten()

And here is the error message:

<ipython-input-5-c52edc2c0508> in train(model, loss_fn, optimizer, num_epochs, x_train, y_train, x_val, y_val, loss_stop)
     19         model.eval()
     20         with torch.no_grad():
---> 21             y_val_pred = model(x_val)
     22 
     23             val_loss = loss_fn(y_val_pred, y_val)

~\anaconda3\lib\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
   1049         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1050                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051             return forward_call(*input, **kwargs)
   1052         # Do not call functions when jit is used
   1053         full_backward_hooks, non_full_backward_hooks = [], []

<ipython-input-4-9da8c811c037> in forward(self, x)
     10 
     11     def forward(self, x):
---> 12         h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).requires_grad_().cuda()
     13         c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).requires_grad_().cuda()
     14 

RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
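
As the message itself notes, the reported line may not be where the fault actually occurred. A minimal sketch of applying its CUDA_LAUNCH_BLOCKING=1 suggestion, assuming a fresh process or restarted kernel (the variable must be set before torch initializes CUDA):

# Sketch: force synchronous CUDA kernel launches so the stack trace
# points at the op that actually faulted. Set this before any CUDA
# work happens in the process, ideally before importing torch.
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import torch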


Solution 1:[1]

It turns out that calling torch.cuda.empty_cache() at the end of the for i in range(11): loop solved the problem.
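
A minimal sketch of where that call would go, reusing the loop from the question. The del of the per-iteration objects is an assumption on my part: empty_cache() can only release cached blocks that no live tensor still references.

for i in range(11):
    model = LSTM(input_dim=input_dim, hidden_dim=hidden_dim,
                 output_dim=output_dim, num_layers=num_layers).cuda()
    # ... training and prediction exactly as in the question ...
    y_preds_dict[i] = y_test_pred.cpu().detach().numpy().flatten()

    # Drop references to this iteration's GPU objects, then return the
    # cached blocks to the driver before the next model is allocated.
    del model, best_model, y_test_pred
    torch.cuda.empty_cache()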

Solution 2:[2]

I had an issue where PyTorch Geometric was giving a similar error. Reducing the batch size worked for me (source).
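
For the code in the question, which pushes the entire x_train tensor through the model in one shot, a hedged sketch of what "reducing the batch size" could look like with a plain DataLoader (the batch size of 32 is an arbitrary starting point, not a recommendation from either answer):

from torch.utils.data import DataLoader, TensorDataset

# Sketch: iterate over mini-batches instead of the whole training set
# at once; smaller batches mean smaller per-step GPU allocations.
train_loader = DataLoader(TensorDataset(x_train, y_train),
                          batch_size=32, shuffle=True)

for xb, yb in train_loader:
    optimizer.zero_grad()
    loss = loss_fn(model(xb), yb)
    loss.backward()
    optimizer.step()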

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution 1: Anthony Arena
Solution 2: (unattributed)