RuntimeError: The size of tensor a (124) must match the size of tensor b (128) at non-singleton dimension 3

I'm implementing a convolutional autoencoder in PyTorch for a class, and the output comes out at a slightly different size than expected, and I don't know why. The input and gold output are both 128x128 images (NumPy arrays), but the network currently outputs 124x124 images. The forward pass runs all the way through the network; the error above occurs when calculating the loss.

For reference, the encoder/decoder blocks and train loop are as follows:

import torch
import torch.nn as nn
from tqdm import tqdm

class Encoder(nn.Module):
    
    def __init__(self, encoding_len):
        super().__init__()
        
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 8, 3, stride=2, padding=1),
            nn.ReLU(True),
            nn.Conv2d(8, 16, 3, stride=2, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(True),
            nn.Conv2d(16, 32, 3, stride=2, padding=0),
            nn.ReLU(True)
        )
        
        self.flatten = nn.Flatten(start_dim=1)
    
        self.dense_net = nn.Sequential(
            nn.Linear(32 * 15 * 15, 128), # would like this not to be hard-coded, since I don't know where this number comes from
            nn.ReLU(True),
            nn.Linear(128, encoding_len)
        )
        
    def forward(self, x):
        
        x = self.cnn(x)
        print(x.shape)
        x = self.flatten(x)
        print(x.shape)
        x = self.dense_net(x)
        
        return x

class Decoder(nn.Module): 
    
    def __init__(self, encoding_len):
        super().__init__()
        self.dense_net = nn.Sequential(
            nn.Linear(encoding_len, 128),
            nn.ReLU(True),
            nn.Linear(128, 32 * 15 * 15),
            nn.ReLU(True)
        )
        
        self.unflatten = nn.Unflatten(dim=1, unflattened_size=(32, 15, 15))
        
        self.cnn = nn.Sequential(
            nn.ConvTranspose2d(32, 16, 3, stride=2, output_padding=0),
            nn.BatchNorm2d(16),
            nn.ReLU(True),
            nn.ConvTranspose2d(16, 8, 3, stride=2, padding=1, output_padding=1),
            nn.BatchNorm2d(8),
            nn.ReLU(True),
            nn.ConvTranspose2d(8, 1, 3, stride=2, padding=1, output_padding=1)   
        )
        
    def forward(self, x):
        x = self.dense_net(x)
        x = self.unflatten(x)
        x = self.cnn(x)
        
        x = torch.sigmoid(x)
        print("output shape: {}".format(x.shape))
        return x

def train_encoder_decoder(train, lr=0.0001, n_epochs=25):
    criterion = nn.MSELoss()
    
    # might actually be the number of dimensions rather than the length
    encoding_len = 4
    
    encoder = Encoder(encoding_len)
    decoder = Decoder(encoding_len)
    
    params_to_optimize = [
        {'params': encoder.parameters()},
        {'params': decoder.parameters()}
    ]
    
    optim = torch.optim.Adam(params_to_optimize, lr=lr, weight_decay=1e-05)
    
    loss_vals = []
    
    # check for GPU
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    encoder.to(device)
    decoder.to(device)
    
    for epoch in range(n_epochs):
        epoch_loss = 0
        
        for x, y in tqdm(train):
            x, y = x.to(device), y.to(device)
            x = torch.unsqueeze(x, 0)
            optim.zero_grad()
            
            print("input shape: {}".format(x.shape))
            
            enc_out = encoder(x)
            out = decoder(enc_out)
            
            loss = criterion(out, y)
            loss.backward()  # backprop through the computed loss (criterion itself has no .backward())
            
            optim.step()
            epoch_loss += loss.item()
            
        loss_vals.append(epoch_loss/len(train))
        
    print(loss_vals)
    
    return encoder, decoder
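
To make the mismatch easier to reproduce outside the training loop, here is a small sanity check I put together (the check_shapes helper below is just for illustration and isn't part of my actual code):

def check_shapes():
    # Push a dummy 1x1x128x128 image through the untrained autoencoder
    # and compare the input and output shapes.
    encoder = Encoder(encoding_len=4)
    decoder = Decoder(encoding_len=4)
    x = torch.randn(1, 1, 128, 128)   # (batch, channels, height, width)
    with torch.no_grad():
        out = decoder(encoder(x))
    print(x.shape)     # torch.Size([1, 1, 128, 128])
    print(out.shape)   # torch.Size([1, 1, 124, 124])

check_shapes()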

I think it has something to do with some of the conv layer shapes, but I'm not sure which ones.
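
For reference, here is the shape math as far as I can follow it, using the output-size formulas from the PyTorch Conv2d / ConvTranspose2d docs (the conv_out / convT_out helpers below are just for this write-up, with dilation assumed to be 1):

# Conv2d:          out = floor((in + 2*padding - kernel) / stride) + 1
# ConvTranspose2d: out = (in - 1)*stride - 2*padding + kernel + output_padding

def conv_out(size, kernel, stride, padding):
    return (size + 2 * padding - kernel) // stride + 1

def convT_out(size, kernel, stride, padding, output_padding):
    return (size - 1) * stride - 2 * padding + kernel + output_padding

# Encoder: 128 -> 64 -> 32 -> 15, which is presumably where 32 * 15 * 15 comes from
size = 128
for kernel, stride, padding in [(3, 2, 1), (3, 2, 1), (3, 2, 0)]:
    size = conv_out(size, kernel, stride, padding)
    print(size)   # 64, 32, 15

# Decoder: 15 -> 31 -> 62 -> 124, instead of the 15 -> 32 -> 64 -> 128 I expected
size = 15
for kernel, stride, padding, output_padding in [(3, 2, 0, 0), (3, 2, 1, 1), (3, 2, 1, 1)]:
    size = convT_out(size, kernel, stride, padding, output_padding)
    print(size)   # 31, 62, 124

If that arithmetic is right it would explain the 124x124 output, but I'm not sure which of the decoder's stride/padding/output_padding choices to change to get back to 128.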


