How to solve an invalid input size in .reshape() in PyTorch text summarization

I have a model based on Aladdin Persson's tutorial (https://www.youtube.com/watch?v=U0s0f995w14). When the model is given real data (text), it raises an invalid input size error in one of the DecoderBlock layers.

If anything is unclear, please ask. Any suggestions are much appreciated, thanks!

Model structure:

Transformer(
  (encoder): Encoder(
    (word_embedding): Embedding(94197, 256)
    (position_embedding): Embedding(100, 256)
    (layers): ModuleList(
      (0): TransformerBlock(
        (attention): SelfAttention(
          (values): Linear(in_features=32, out_features=32, bias=False)
          (keys): Linear(in_features=32, out_features=32, bias=False)
          (queries): Linear(in_features=32, out_features=32, bias=False)
          (fc_out): Linear(in_features=256, out_features=256, bias=True)
        )
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (feed_forward): Sequential(
          (0): Linear(in_features=256, out_features=0, bias=True)
          (1): ReLU()
          (2): Linear(in_features=0, out_features=256, bias=True)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerBlock(
        (attention): SelfAttention(
          (values): Linear(in_features=32, out_features=32, bias=False)
          (keys): Linear(in_features=32, out_features=32, bias=False)
          (queries): Linear(in_features=32, out_features=32, bias=False)
          (fc_out): Linear(in_features=256, out_features=256, bias=True)
        )
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (feed_forward): Sequential(
          (0): Linear(in_features=256, out_features=0, bias=True)
          (1): ReLU()
          (2): Linear(in_features=0, out_features=256, bias=True)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (2): TransformerBlock(
        (attention): SelfAttention(
          (values): Linear(in_features=32, out_features=32, bias=False)
          (keys): Linear(in_features=32, out_features=32, bias=False)
          (queries): Linear(in_features=32, out_features=32, bias=False)
          (fc_out): Linear(in_features=256, out_features=256, bias=True)
        )
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (feed_forward): Sequential(
          (0): Linear(in_features=256, out_features=0, bias=True)
          (1): ReLU()
          (2): Linear(in_features=0, out_features=256, bias=True)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (3): TransformerBlock(
        (attention): SelfAttention(
          (values): Linear(in_features=32, out_features=32, bias=False)
          (keys): Linear(in_features=32, out_features=32, bias=False)
          (queries): Linear(in_features=32, out_features=32, bias=False)
          (fc_out): Linear(in_features=256, out_features=256, bias=True)
        )
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (feed_forward): Sequential(
          (0): Linear(in_features=256, out_features=0, bias=True)
          (1): ReLU()
          (2): Linear(in_features=0, out_features=256, bias=True)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (4): TransformerBlock(
        (attention): SelfAttention(
          (values): Linear(in_features=32, out_features=32, bias=False)
          (keys): Linear(in_features=32, out_features=32, bias=False)
          (queries): Linear(in_features=32, out_features=32, bias=False)
          (fc_out): Linear(in_features=256, out_features=256, bias=True)
        )
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (feed_forward): Sequential(
          (0): Linear(in_features=256, out_features=0, bias=True)
          (1): ReLU()
          (2): Linear(in_features=0, out_features=256, bias=True)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (5): TransformerBlock(
        (attention): SelfAttention(
          (values): Linear(in_features=32, out_features=32, bias=False)
          (keys): Linear(in_features=32, out_features=32, bias=False)
          (queries): Linear(in_features=32, out_features=32, bias=False)
          (fc_out): Linear(in_features=256, out_features=256, bias=True)
        )
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (feed_forward): Sequential(
          (0): Linear(in_features=256, out_features=0, bias=True)
          (1): ReLU()
          (2): Linear(in_features=0, out_features=256, bias=True)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (decoder): Decoder(
    (word_embedding): Embedding(94197, 256)
    (position_embedding): Embedding(100, 256)
    (layers): ModuleList(
      (0): DecoderBlock(
        (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (attention): SelfAttention(
          (values): Linear(in_features=32, out_features=32, bias=False)
          (keys): Linear(in_features=32, out_features=32, bias=False)
          (queries): Linear(in_features=32, out_features=32, bias=False)
          (fc_out): Linear(in_features=256, out_features=256, bias=True)
        )
        (transformer_block): TransformerBlock(
          (attention): SelfAttention(
            (values): Linear(in_features=32, out_features=32, bias=False)
            (keys): Linear(in_features=32, out_features=32, bias=False)
            (queries): Linear(in_features=32, out_features=32, bias=False)
            (fc_out): Linear(in_features=256, out_features=256, bias=True)
          )
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (feed_forward): Sequential(
            (0): Linear(in_features=256, out_features=0, bias=True)
            (1): ReLU()
            (2): Linear(in_features=0, out_features=256, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): DecoderBlock(
        (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (attention): SelfAttention(
          (values): Linear(in_features=32, out_features=32, bias=False)
          (keys): Linear(in_features=32, out_features=32, bias=False)
          (queries): Linear(in_features=32, out_features=32, bias=False)
          (fc_out): Linear(in_features=256, out_features=256, bias=True)
        )
        (transformer_block): TransformerBlock(
          (attention): SelfAttention(
            (values): Linear(in_features=32, out_features=32, bias=False)
            (keys): Linear(in_features=32, out_features=32, bias=False)
            (queries): Linear(in_features=32, out_features=32, bias=False)
            (fc_out): Linear(in_features=256, out_features=256, bias=True)
          )
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (feed_forward): Sequential(
            (0): Linear(in_features=256, out_features=0, bias=True)
            (1): ReLU()
            (2): Linear(in_features=0, out_features=256, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (2): DecoderBlock(
        (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (attention): SelfAttention(
          (values): Linear(in_features=32, out_features=32, bias=False)
          (keys): Linear(in_features=32, out_features=32, bias=False)
          (queries): Linear(in_features=32, out_features=32, bias=False)
          (fc_out): Linear(in_features=256, out_features=256, bias=True)
        )
        (transformer_block): TransformerBlock(
          (attention): SelfAttention(
            (values): Linear(in_features=32, out_features=32, bias=False)
            (keys): Linear(in_features=32, out_features=32, bias=False)
            (queries): Linear(in_features=32, out_features=32, bias=False)
            (fc_out): Linear(in_features=256, out_features=256, bias=True)
          )
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (feed_forward): Sequential(
            (0): Linear(in_features=256, out_features=0, bias=True)
            (1): ReLU()
            (2): Linear(in_features=0, out_features=256, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (3): DecoderBlock(
        (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (attention): SelfAttention(
          (values): Linear(in_features=32, out_features=32, bias=False)
          (keys): Linear(in_features=32, out_features=32, bias=False)
          (queries): Linear(in_features=32, out_features=32, bias=False)
          (fc_out): Linear(in_features=256, out_features=256, bias=True)
        )
        (transformer_block): TransformerBlock(
          (attention): SelfAttention(
            (values): Linear(in_features=32, out_features=32, bias=False)
            (keys): Linear(in_features=32, out_features=32, bias=False)
            (queries): Linear(in_features=32, out_features=32, bias=False)
            (fc_out): Linear(in_features=256, out_features=256, bias=True)
          )
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (feed_forward): Sequential(
            (0): Linear(in_features=256, out_features=0, bias=True)
            (1): ReLU()
            (2): Linear(in_features=0, out_features=256, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (4): DecoderBlock(
        (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (attention): SelfAttention(
          (values): Linear(in_features=32, out_features=32, bias=False)
          (keys): Linear(in_features=32, out_features=32, bias=False)
          (queries): Linear(in_features=32, out_features=32, bias=False)
          (fc_out): Linear(in_features=256, out_features=256, bias=True)
        )
        (transformer_block): TransformerBlock(
          (attention): SelfAttention(
            (values): Linear(in_features=32, out_features=32, bias=False)
            (keys): Linear(in_features=32, out_features=32, bias=False)
            (queries): Linear(in_features=32, out_features=32, bias=False)
            (fc_out): Linear(in_features=256, out_features=256, bias=True)
          )
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (feed_forward): Sequential(
            (0): Linear(in_features=256, out_features=0, bias=True)
            (1): ReLU()
            (2): Linear(in_features=0, out_features=256, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (5): DecoderBlock(
        (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (attention): SelfAttention(
          (values): Linear(in_features=32, out_features=32, bias=False)
          (keys): Linear(in_features=32, out_features=32, bias=False)
          (queries): Linear(in_features=32, out_features=32, bias=False)
          (fc_out): Linear(in_features=256, out_features=256, bias=True)
        )
        (transformer_block): TransformerBlock(
          (attention): SelfAttention(
            (values): Linear(in_features=32, out_features=32, bias=False)
            (keys): Linear(in_features=32, out_features=32, bias=False)
            (queries): Linear(in_features=32, out_features=32, bias=False)
            (fc_out): Linear(in_features=256, out_features=256, bias=True)
          )
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (feed_forward): Sequential(
            (0): Linear(in_features=256, out_features=0, bias=True)
            (1): ReLU()
            (2): Linear(in_features=0, out_features=256, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (fc_out): Linear(in_features=256, out_features=94197, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)
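
One thing stands out in this printout, separate from the reshape error: every feed_forward block contains Linear(in_features=256, out_features=0). Assuming the model builds its feed-forward network the way the tutorial does, out_features=0 implies forward_expansion was set to 0, which collapses the hidden layer to width zero (the tutorial uses forward_expansion=4). A sketch of the assumed construction:

import torch.nn as nn

# Assumed construction, following the tutorial's TransformerBlock.
embed_size = 256
forward_expansion = 0  # what Linear(256, 0) in the printout implies; the tutorial uses 4
feed_forward = nn.Sequential(
    nn.Linear(embed_size, forward_expansion * embed_size),  # prints as Linear(256, 0)
    nn.ReLU(),
    nn.Linear(forward_expansion * embed_size, embed_size),  # prints as Linear(0, 256)
)
print(feed_forward)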

output:

Model initialization finished
Model training...
  0%|          | 0/805 [00:00<?, ?it/s]
=> Saving checkpoint
Starting new loop!
Traceback (most recent call last):

  File "E:\transformer_summarization\train.py", line 144, in <module>
    train()

  File "E:\transformer_summarization\train.py", line 124, in train
    outputs = model(introductions, abstracts[:-1])

  File "C:\Users\Home\anaconda3\envs\pytorch2\lib\site-packages\torch\nn\modules\module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)

  File "E:\transformer_summarization\model2.py", line 268, in forward
    out = self.decoder(trg, enc_src, src_mask, trg_mask)

  File "C:\Users\Home\anaconda3\envs\pytorch2\lib\site-packages\torch\nn\modules\module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)

  File "E:\transformer_summarization\model2.py", line 200, in forward
    x = layer(x, enc_out, enc_out, src_mask, trg_mask)

  File "C:\Users\Home\anaconda3\envs\pytorch2\lib\site-packages\torch\nn\modules\module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)

  File "E:\transformer_summarization\model2.py", line 164, in forward
    out = self.transformer_block(value, key, query, src_mask)

  File "C:\Users\Home\anaconda3\envs\pytorch2\lib\site-packages\torch\nn\modules\module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)

  File "E:\transformer_summarization\model2.py", line 94, in forward
    attention = self.attention(value, key, query, mask)

  File "C:\Users\Home\anaconda3\envs\pytorch2\lib\site-packages\torch\nn\modules\module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)

  File "E:\transformer_summarization\model2.py", line 36, in forward
    values = values.reshape(N, value_len, self.heads, self.head_dim)

RuntimeError: shape '[249, 8, 8, 32]' is invalid for input of size 3375104
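
The numbers in the error are enough to diagnose the problem. The requested shape 249 × 8 × 8 × 32 only accounts for 509,952 elements, but the tensor being reshaped holds 3,375,104 = 1648 × 8 × 256 elements. That is exactly what you get if the encoder output is shaped [src_len, batch, embed] = [1648, 8, 256] while the attention code reads the batch size from dim 0 of the query and the sequence length from dim 1 of the values. A minimal sketch that reproduces the failure, assuming SelfAttention.forward follows the tutorial's implementation:

import torch

# Hypothetical shapes inferred from the error message. The attention code
# (model2.py line 36) takes the batch size from query.shape[0] and the
# value length from values.shape[1].
heads, head_dim = 8, 32              # embed_size = heads * head_dim = 256
enc_out = torch.randn(1648, 8, 256)  # [src_len, batch, embed] - sequence-first
trg_emb = torch.randn(249, 8, 256)   # [trg_len, batch, embed]

N = trg_emb.shape[0]          # 249: actually trg_len, not the batch size
value_len = enc_out.shape[1]  # 8: actually the batch size, not src_len

# Raises: RuntimeError: shape '[249, 8, 8, 32]' is invalid for input of size 3375104
values = enc_out.reshape(N, value_len, heads, head_dim)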


How do I fix this invalid input size error?
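
If the diagnosis above is right, the fix is to make the batches batch-first before they reach the model. This sketch assumes the tensors come out of the data iterator sequence-first ([seq_len, batch]), as torchtext's BucketIterator produces by default; the variable names follow the traceback:

# Sketch, not a drop-in fix: permute to [batch, seq_len] before the forward pass.
introductions = introductions.permute(1, 0)        # [src_len, N] -> [N, src_len]
abstracts = abstracts.permute(1, 0)                # [trg_len, N] -> [N, trg_len]
outputs = model(introductions, abstracts[:, :-1])  # was abstracts[:-1], which sliced dim 0

Note that abstracts[:-1] slices dimension 0, so after the permute it has to become abstracts[:, :-1]. Also note position_embedding: Embedding(100, 256) in the printed model: a 1648-token source indexes past the 100 learned positions, so the inputs would additionally need to be truncated to 100 tokens (or max_length raised) to avoid the next error.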



Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow
