RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same (another case)

When I run my script, I run into:

RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same

while both my model and my input seem to be on CUDA.

Check for the model:

In [5]: [i.device for i in net.parameters()]
Out[5]: 
[device(type='cuda', index=0),
 device(type='cuda', index=0),
 device(type='cuda', index=0),
 device(type='cuda', index=0),
 device(type='cuda', index=0),
 device(type='cuda', index=0),
 device(type='cuda', index=0),
 device(type='cuda', index=0)]

Check for the input:

In [14]: [i for i in iter(train_loader)][0][0].device
context has already been set
[W CudaIPCTypes.cpp:15] Producer process has been terminated before all shared CUDA tensors released. See Note [Sharing CUDA tensors]
Out[14]: device(type='cuda', index=0)
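
For completeness, net.parameters() does not cover registered buffers (e.g. BatchNorm running statistics), which also live on a device; a minimal extra check along the same lines, using the same net:

# parameters and buffers together; everything should report cuda:0
devices = {p.device for p in net.parameters()} | {b.device for b in net.buffers()}
print(devices)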

The script (training.py):

import os
os.environ['MKL_THREADING_LAYER'] = 'GNU'

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch import optim

# from torch.multiprocessing import Pool, Process, set_start_method
from torch.multiprocessing import set_start_method
try:
    set_start_method('spawn')
except RuntimeError as rte:
    print(rte)


import matplotlib.pyplot as plt

import unet
from datenaufbereitung import *
from datetime import datetime
from softcatch import CatchSIGINT

# net = NotImplemented
net = unet.FlexibleUNet(2, 4, bilinear = False, volume = 32, depth = 4)

lr = 1e-1
# criterion = nn.CrossEntropyLoss()
criterion = nn.MSELoss()
# optimizer = optim.Adam(net.parameters(), lr = lr)
optimizer = optim.AdamW(net.parameters(), lr = lr)
# optimizer = optim.SGD(net.parameters(), lr = lr, momentum = 0.7, dampening = 0.3)


checkpoint_path = 'checkpoints'
runs_path = 'runs'

def save_checkpoint(unique=True):
    if unique:
        isotime = datetime.now().isoformat()
        fname = f'unet-{isotime}.pt'
    else:
        fname = 'unet-cpt.pt'
    torch.save(net, f'{checkpoint_path}/{fname}')
    
def load_newest():
    global net
    import os
    from glob import glob
    flist = glob(f'{checkpoint_path}/*')
    if flist:
        newest = max(flist, key=os.path.getctime)
        net = torch.load(newest)

def train(trainloader, n_epochs = 50, stride = 1, testset = None):
    # if not testset is None:
    #     fig = plt.figure()
    with CatchSIGINT() as sigint:
        for epoch in range(n_epochs):  # loop over the dataset multiple times
            # config net for training
            net.train()
        
            running_loss = 0.0
            for i, data in enumerate(trainloader, 0):
                # get the inputs
                inputs, targets = data
        
                # zero the parameter gradients
                optimizer.zero_grad()
        
                # forward + backward + optimize
                outputs = net(inputs)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()
                
                # print statistics
                running_loss += loss.item()
                if i % stride == stride-1:    # print every <stride> mini-batches
                    print('[%d, %5d] loss: %.3f' %
                          (epoch + 1, i + 1, running_loss / stride))
                    running_loss = 0.0
                if sigint.caught:
                    break
                    
            if sigint.caught:
                break

            # save checkpoint
            save_checkpoint(unique = False)
    
            # # config net for evaluation
            # if not testset is None:
            #     net.eval()
            #     ip, tg = testset[0]
            #     output = net(ip)
            #     imshow(output[0])

    save_checkpoint()            
    

if __name__ == '__main__':
    # __spec__ = None
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # device = torch.device("cpu")

    wdh = ' '
    while not wdh in 'jn':
        wdh = str.lower(input('Letzten Stand wiederherstellen (j/n)? '))
    make_data()

    # trainset, testset = make_datasets(train_fname = 'train_short.lx', test_fname = 'train_short.lx', in_transform = in_transform)
    # trainset, testset = make_datasets(train_fname = '2px_train.lx', test_fname = '2px_test.lx', maxlen = 10, in_transform = in_transform)
    trainset, testset = make_datasets(maxlen = 10)

    trainset.to(device)
    testset.to(device)
    net.to(device)
    criterion.to(device)

    train_loader, test_loader = make_dataloaders(trainset, testset)
    if wdh == 'j':
        load_newest()
    print('## Trainiere Netz')
    train(train_loader, 500)

In another script named "datenaufbereitung.py", I gave my datasets a "to" method to ensure they move their output to CUDA:

class DisplacementDataset(Dataset):
    def __init__(self, base_path, lexicon, transform=None, target_transform=None, maxlen=None, device=None):
        self.base_path = base_path
        self.lexicon = lexicon
        self.transform = transform
        self.target_transform = target_transform
        if not maxlen is None:
            self.length = min(len(lexicon), maxlen)
        else:
            self.length = len(lexicon)
        self.device = device
    
    def to(self, device):
        self.device = device

    def __len__(self):
        return self.length
    
    def __getitem__(self, idx):
        src_fn, warp = self.lexicon[idx]
        warp = deserialize_warp(warp)
        
        src = np.load(os.path.join(self.base_path,src_fn))
        src = np.squeeze(src)
        dst = warp.warp(src)
        
        images = warp.valid_roi(np.stack((src,dst)))
        shifts = warp.valid_roi(np.concatenate((warp.shiftmesh(), warp.inverse_shiftmesh())))
        
        if self.transform:
            images = self.transform(images)
        if self.target_transform:
            shifts = self.target_transform(shifts)
        if not self.device is None:
            return images.to(self.device), shifts.to(self.device)
        return images, shifts
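
With this design, dataset.to(device) only records the target device; the actual move happens sample by sample in __getitem__. A minimal usage sketch (base_path, lexicon, in_transform and tgt_transform are placeholders here, and the transforms are assumed to return torch tensors, otherwise the .to() calls in __getitem__ would fail on numpy arrays):

ds = DisplacementDataset(base_path, lexicon,
                         transform=in_transform, target_transform=tgt_transform)
ds.to(torch.device('cuda', 0))        # only stores the device; nothing moves yet
images, shifts = ds[0]                # the actual .to(device) calls happen per sample here
print(images.device, shifts.device)   # both should report cuda:0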

So, since I seem to have sent my model, my optimizer, and my input/target data to CUDA, I would not expect this behavior.

What did I miss?
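
For reference, a common alternative is to move every batch onto the device inside the training loop itself, regardless of what the dataset yields. A sketch of the inner loop from train() above with the extra .to(device) calls added (this is not what my script currently does):

for i, data in enumerate(trainloader, 0):
    inputs, targets = data
    # move the batch explicitly, independent of what the Dataset already did
    inputs, targets = inputs.to(device), targets.to(device)

    optimizer.zero_grad()
    outputs = net(inputs)
    loss = criterion(outputs, targets)
    loss.backward()
    optimizer.step()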

EDIT (2022-05-20):

Today I tried to reproduce the error to get a stack trace, but the errors have changed; my installation seems to be somehow unstable. The error I now get reproducibly is this one:

/home/burkhard/anaconda3/envs/ml-lab/lib/python3.7/site-packages/torch/cuda/__init__.py:82: UserWarning: CUDA initialization: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero. (Triggered internally at ../c10/cuda/CUDAFunctions.cpp:109.)
  return torch._C._cuda_getDeviceCount() > 0

This happens at "import unet" or at "net = FlexibleUNet", i.e. one of the first times I actually use torch. The warning comes without a stack trace, so I had to verify this by inserting prints before and after.
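
To narrow this down independently of my project code, here is a minimal standalone check for the same environment (the comments note what each line should report when CUDA initializes correctly):

import torch
print(torch.__version__)           # 1.12.0.dev20220518+cu116 in this environment
print(torch.version.cuda)          # CUDA version the wheel was built against (11.6 here)
print(torch.cuda.is_available())   # presumably False, given the warning above
print(torch.cuda.device_count())   # 0 when initialization fails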

Might this depend on the versions in my installation? After I installed the newest driver capable of handling the RTX 3080 on my Ubuntu system, I had to resort to nightly builds and pre-release PyPI packages.

(ml-lab) burkhard@pc:~$ nvidia-smi 
Fri May 20 11:20:48 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.73.05    Driver Version: 510.73.05    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0  On |                  N/A |
|  0%   57C    P8    28W / 350W |    532MiB / 12288MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|    0   N/A  N/A      1036      G   /usr/lib/xorg/Xorg                102MiB |
|    0   N/A  N/A      1664      G   /usr/lib/xorg/Xorg                326MiB |
|    0   N/A  N/A      1794      G   /usr/bin/gnome-shell               77MiB |
|    0   N/A  N/A      2024      G   ...bexec/gnome-initial-setup        3MiB |
|    0   N/A  N/A     26082      G   ...a3/envs/ml-lab/bin/python        3MiB |
+-----------------------------------------------------------------------------+
(ml-lab) burkhard@pc:~$ dkms status
nvidia, 510.73.05, 5.4.0-110-generic, x86_64: installed
(ml-lab) burkhard@pc:~$ cat /proc/driver/nvidia/version
NVRM version: NVIDIA UNIX x86_64 Kernel Module  510.73.05  Sat May  7 05:30:26 UTC 2022
GCC version:  gcc version 9.4.0 (Ubuntu 9.4.0-1ubuntu1~20.04.1) 
(ml-lab) burkhard@pc:~$ conda list | grep torch
_pytorch_select           0.2                       gpu_0  
torch                     1.12.0.dev20220518+cu116          pypi_0    pypi
torchaudio                0.12.0.dev20220518+cu116          pypi_0    pypi
torchvision               0.13.0.dev20220518+cu116          pypi_0    pypi


Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow
