ValueError: Shapes (None, None) and (None, 28, 28, 10) are incompatible

I am working on a neural network to recognize handwritten digits using the MNIST digits dataset. I wanted to use ImageDataGenerator from Keras to see whether it could improve the score of the predictions. But when I actually try to run the model, I get this error: ValueError: Shapes (None, None) and (None, 28, 28, 10) are incompatible. The relevant code is:

# Augmentation generator: feature-wise centering and std-normalisation plus a
# rotation range of 20. `validation_split` is set, but no `subset` is passed to
# flow() further down.
datagen = ImageDataGenerator(
    featurewise_center=True,
    featurewise_std_normalization=True,
    rotation_range=20,
    validation_split=0.2)

# Fit the generator's feature-wise statistics on the images viewed as
# (126000, 28, 28, 1).
datagen.fit(X_train.reshape(126000, 28, 28, 1))
print(X_train.shape)
print(y_train.shape)
model = keras.Sequential([
    # NOTE(review): the first Dense layer is given the (28, 28, 1) image
    # directly; there is no Flatten layer in front of it. The reported error
    # shows a model output of shape (None, 28, 28, 10) instead of (None, 10).
    layers.Dense(784, activation='relu', input_shape=(28, 28, 1)),
    layers.BatchNormalization(),
    layers.Dropout(rate=0.5),
    layers.Dense(784, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(rate=0.5),
    layers.Dense(784, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(rate=0.5),
    layers.Dense(784, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(rate=0.5),
    # Output layer: 10 units with softmax.
    layers.Dense(10, activation='softmax'),
])
model.compile(
    optimizer="Adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)
# Give the training images an explicit (N, 28, 28, 1) shape for the generator.
X_train = np.reshape(X_train, (X_train.shape[0], 28, 28, 1))
# Train on batches produced by the generator.
# NOTE(review): `batch_size` is passed to fit() here, not to datagen.flow();
# presumably it was meant for flow() - confirm against the Keras docs.
history = model.fit(datagen.flow(
    X_train,
    y_train),
    validation_data=(X_test, y_test),
    batch_size=640,
    epochs=100,
)

And this is the error I get:

ValueError: in user code:

    File "/Users/martdejager/miniforge3/envs/env_tf/lib/python3.9/site-packages/keras/engine/training.py", line 1021, in train_function  *
        return step_function(self, iterator)
    File "/Users/martdejager/miniforge3/envs/env_tf/lib/python3.9/site-packages/keras/engine/training.py", line 1010, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/martdejager/miniforge3/envs/env_tf/lib/python3.9/site-packages/keras/engine/training.py", line 1000, in run_step  **
        outputs = model.train_step(data)
    File "/Users/martdejager/miniforge3/envs/env_tf/lib/python3.9/site-packages/keras/engine/training.py", line 860, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/Users/martdejager/miniforge3/envs/env_tf/lib/python3.9/site-packages/keras/engine/training.py", line 918, in compute_loss
        return self.compiled_loss(
    File "/Users/martdejager/miniforge3/envs/env_tf/lib/python3.9/site-packages/keras/engine/compile_utils.py", line 201, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/Users/martdejager/miniforge3/envs/env_tf/lib/python3.9/site-packages/keras/losses.py", line 141, in __call__
        losses = call_fn(y_true, y_pred)
    File "/Users/martdejager/miniforge3/envs/env_tf/lib/python3.9/site-packages/keras/losses.py", line 245, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/Users/martdejager/miniforge3/envs/env_tf/lib/python3.9/site-packages/keras/losses.py", line 1789, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "/Users/martdejager/miniforge3/envs/env_tf/lib/python3.9/site-packages/keras/backend.py", line 5083, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, None) and (None, 28, 28, 10) are incompatible

Complete code:

# %%
import pandas as pd
from scipy.ndimage import shift
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.preprocessing.image import ImageDataGenerator
from keras import layers
from sklearn.model_selection import train_test_split
import numpy as np

# Setup plotting
import matplotlib.pyplot as plt

# Apply the 'seaborn-whitegrid' style sheet to all figures.
plt.style.use('seaborn-whitegrid')
# Set Matplotlib defaults: automatic layout, bold and larger axis labels/titles.
plt.rc('figure', autolayout=True)
plt.rc('axes', labelweight='bold', labelsize='large',
       titleweight='bold', titlesize=18, titlepad=10)


# Enable memory growth on the first GPU, if there is one. This is best-effort:
# the notebook must keep working on machines without a usable GPU.
physical_devices = tf.config.list_physical_devices('GPU')
try:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
except (IndexError, ValueError, RuntimeError):
    # IndexError: no GPU in the list, so there is nothing to configure.
    # ValueError / RuntimeError: invalid device, or virtual devices cannot be
    # modified once initialized.
    # Only these expected failures are ignored; a bare `except:` would also
    # hide KeyboardInterrupt and genuine programming errors.
    pass

# %%
# train = pd.read_csv("../data/train.csv")
# test = pd.read_csv("../data/test.csv")

# Load the training CSV as float32; the `label` column is the target and every
# other column is a feature.
train = pd.read_csv('../data/train.csv').astype('float32')
X, y = train.drop('label', axis = 1), train.label

# First train/test split (no explicit test_size, fixed random_state).
# NOTE(review): X_train / y_train are split again further down, which rebinds
# X_test and y_test and discards the test portion created here.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# Second CSV, loaded without dtype conversion.
# NOTE(review): presumably the unlabelled data to predict on; X_predict is not
# referenced again in the visible code.
X_predict = pd.read_csv('../data/test.csv')

# %%
# Display names for the ten classes: the strings "0" .. "9".
class_names = [str(i) for i in range(10)]
print(class_names)

# %% [markdown]
# # Separating targets and features
# 
# `X_train` contains the features: all 784 pixels from the images in an array
# 
# `y_train` contains the targets: the numerical labels from 0-9

# %%
# y_train = train["label"]
# X_train = train.drop(labels = ["label"],axis = 1)

# %%
# Show the first training sample, reshaping its flat row into a 28x28 image.
plt.figure()
plt.imshow(X_train.iloc[0].values.reshape(28,28))
plt.colorbar()
plt.grid(False)
plt.show()


# %% [markdown]
# # Distribution
# 
# Data is more or less evenly distributed. Category 5 contains some 1000 elements less than Category 1 but it's still quite a lot of training examples.

# %%
# Number of training samples per label.
y_train.value_counts()

# %%
# Histogram of the training labels.
y_train.hist()
plt.show()

# %% [markdown]
# # Data preparation
# 
# 1. Transform our data from 0-255 pixel value to 0-1 pixel value
# 2. Transform our labels from numerical values to one-hot encoded values

# %%
# Rescale pixel values from the 0-255 range to 0-1.
X_train = X_train / 255.0
X_test = X_test / 255.0

# %%
# Preview the first 25 training images in a 5x5 grid, without ticks or grid.
plt.figure(figsize=(10,10))
for i in range(25):
    plt.subplot(5,5,i+1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    # Each row holds one flattened image; reshape it to 28x28 for display.
    plt.imshow(X_train.iloc[i].values.reshape(28,28), cmap=plt.cm.binary)
plt.show()

# %%
# y_train = keras.utils.to_categorical(y_train, num_classes=10)

# %%
# y_train[:5]

# %% [markdown]
# # Splitting our data
# 
# We use a 80/20 test split since there's quite a lot of data and we just want to learn on as much as possible. Random state of The One True Number so we can safely repeat the split.

# %%
# Split the current training data again, holding out 20%. This rebinds
# X_test / y_test, replacing the ones from the earlier split.
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = 0.2, random_state=0)

# %%
# One-hot encode the labels into 10 classes, as required by the
# categorical_crossentropy loss used below.
y_train = keras.utils.to_categorical(y_train, num_classes=10)
y_test = keras.utils.to_categorical(y_test, num_classes=10)

# %%
print(X_train.shape)

# %%
# Replace the DataFrame with its underlying NumPy array.
# NOTE(review): X_test is not converted here and stays a DataFrame.
X_train = X_train.values # This is why I absolutely detest pandas. 3 hours of debugging, oh it's a dataframe haha.
print(X_train.shape)
print(y_train.shape)

# %%
# Method to shift the image by given dimension
# def shift_image(image, dx, dy):
#     image = image.reshape((28, 28))
#     shifted_image = shift(image, [dy, dx], cval=0, mode="constant")
#     return shifted_image.reshape([-1])

# %%
# Creating Augmented Dataset
# X_train_augmented = [image for image in X_train]
# y_train_augmented = [image for image in y_train]

# for dx, dy in ((1,0), (-1,0), (0,1), (0,-1)):
#      for image, label in zip(X_train, y_train):
#              X_train_augmented.append(shift_image(image, dx, dy))
#              y_train_augmented.append(label)

# %%
# Shuffle the dataset
# shuffle_idx = np.random.permutation(len(X_train_augmented))
# X_train = np.array(X_train_augmented)[shuffle_idx]
# y_train = np.array(y_train_augmented)[shuffle_idx]

# %%
# Sanity-check the array shapes before building the model.
print(X_train.shape)
print(y_train.shape)

# %%
print(X_train.shape)

# %% [markdown]
# # Images of our data
#
# Which ones could the model have trouble with?

# %%
# Show the first 30 training images in a 3x10 grid, each with its own colorbar.
plt.figure(figsize=(10, 4))
for i in range(30):
    plt.subplot(3, 10, i+1)
    # X_train is a NumPy array here, so plain integer indexing selects a sample.
    plt.imshow(X_train[i].reshape((28,28)))
    plt.colorbar()
    plt.axis('off')
plt.show()

# %%
# Check the container type of X_train.
type(X_train)

# %% [markdown]
# # Building our network
# 
# We use a variation on 3Blue1Brown's model here. The 28x28x1 input is flattened to 784 values (1 per pixel), followed by four hidden layers of 784 neurons with ReLU activation (each followed by batch normalization and 50% dropout) and finally a 10-neuron SoftMax layer.

# %%
# Augmentation generator: feature-wise centering and std-normalisation plus a
# rotation range of 20. `validation_split` is set, but no `subset` is passed to
# flow() when training.
datagen = ImageDataGenerator(
    featurewise_center=True,
    featurewise_std_normalization=True,
    rotation_range=20,
    validation_split=0.2)

# Fit the generator's feature-wise statistics on the images viewed as
# (N, 28, 28, 1); X_train itself is not rebound by this call.
datagen.fit(X_train.reshape(X_train.shape[0], 28, 28, 1))
print(X_train.shape)
print(y_train.shape)

# %%
# Now rebind X_train to the (N, 28, 28, 1) layout used by the generator and model.
# NOTE(review): X_test gets no matching reshape anywhere in the visible code.
X_train = X_train.reshape(X_train.shape[0], 28, 28, 1)

# %%
# Fully connected classifier: Flatten, then four Dense(784, relu) layers each
# followed by BatchNormalization and Dropout(0.5), then a 10-unit softmax output.
model = keras.Sequential([
    # Previous version, without a Flatten layer in front of the first Dense:
    # layers.Dense(784, activation='relu', input_shape=(28, 28, 1)),
    # layers.BatchNormalization(),
    # layers.Dropout(rate=0.5),
    # layers.Dense(784, activation='relu'),
    # layers.BatchNormalization(),
    # layers.Dropout(rate=0.5),
    # layers.Dense(784, activation='relu'),
    # layers.BatchNormalization(),
    # layers.Dropout(rate=0.5),
    # layers.Dense(784, activation='relu'),
    # layers.BatchNormalization(),
    # layers.Dropout(rate=0.5),
    # layers.Dense(10, activation='softmax'),

    # Current version: flatten the (28, 28, 1) input before the Dense stack.
    layers.Flatten(input_shape=(28, 28, 1,)),
    layers.Dense(784, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(rate=0.5),
    layers.Dense(784, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(rate=0.5),
    layers.Dense(784, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(rate=0.5),
    layers.Dense(784, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(rate=0.5),
    # One output unit per class.
    layers.Dense(10, activation='softmax'),
])


# %% [markdown]
# # Compiling the model
# 
# Adam optimizer is generally the best, but every year new ones come out so challenge it!
# 
# Loss and metrics are hopefully obvious.

# %%
# Adam optimizer with categorical cross-entropy (the labels were one-hot
# encoded earlier); accuracy is tracked as the metric.
model.compile(
    optimizer="Adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

# %% [markdown]
# # Training the model
# 
# Generally 5 epochs is enough to do better than most humans. Let's do 50 in the lecture as well to see the difference.

# %%
# Train on batches produced by the generator; validate on the hold-out split.
# NOTE(review): X_test is still a DataFrame of flat rows at this point (only
# X_train was reshaped to (N, 28, 28, 1)) and it is not passed through
# `datagen`, so the validation inputs differ from the training batches in shape
# and preprocessing - confirm.
# NOTE(review): `batch_size` is passed to fit() here, not to datagen.flow();
# presumably it was meant for flow() - confirm against the Keras docs.
history = model.fit(datagen.flow(
    X_train,
    y_train),
    validation_data=(X_test, y_test),
    batch_size=640,
    epochs=100,
)

# %%
# Plot training vs. validation loss per epoch and print the extremes.
history_df = pd.DataFrame(history.history)
history_df.loc[:, ['loss', 'val_loss']].plot()
print("Maximum Loss is: {:0.4f}".format(history_df['loss'].max()))
print("Minimum Validation Loss: {:0.4f}".format(history_df['val_loss'].min()))

# %%
# Same for accuracy. Note that the second print reports the *minimum*
# validation accuracy, exactly as its label says.
history_df = pd.DataFrame(history.history)
history_df.loc[:, ['accuracy', 'val_accuracy']].plot()
print("Maximum accuracy is: {:0.4f}".format(history_df['accuracy'].max()))
print("Minimum Validation accuracy: {:0.4f}".format(history_df['val_accuracy'].min()))

# %% [markdown]
# # The predictions
# 
# Unfortunately, the `predict` method just returns the SoftMax layer's values, so we still need to retrieve the actual prediction from that. `np.argmax` does that: it returns the index of the highest value in each array, and because the index equals the digit, that index is the predicted number. The parameter `axis=1` makes it work per row, i.e. once per image.

# %%
# Per-class softmax outputs for the hold-out samples.
# NOTE(review): X_test.values has flat rows, while the model was declared with
# input_shape=(28, 28, 1) - confirm the shapes are accepted.
predictions = model.predict(X_test.values)

print(predictions[:5])

# Index of the largest output per row, i.e. the predicted class.
results = np.argmax(predictions ,axis = 1)

print(results[:5])

# %%
def plot_image(i, predictions_array, true_label, img):
    """Draw sample ``i`` on the current axes with a colour-coded caption.

    ``true_label`` and ``img`` are indexed with ``i``; ``predictions_array`` is
    the score vector for that one sample. The x-label shows the predicted class
    name, the highest score as a percentage and the true class name, in blue
    when the prediction matches the true label and in red otherwise.
    """
    actual = true_label[i]
    picture = img[i]

    # Bare image: no grid lines, no tick marks.
    plt.grid(False)
    plt.xticks([])
    plt.yticks([])
    plt.imshow(picture, cmap=plt.cm.binary)

    guess = np.argmax(predictions_array)
    caption_color = 'blue' if guess == actual else 'red'
    caption = "{} {:2.0f}% ({})".format(
        class_names[guess],
        100*np.max(predictions_array),
        class_names[actual],
    )
    plt.xlabel(caption, color=caption_color)

def plot_value_array(i, predictions_array, true_label):
    """Draw the ten class scores of one sample as a bar chart.

    All bars start out gray. The bar of the highest score is then coloured red
    and the bar of the true class (``true_label[i]``) blue; when both are the
    same bar it ends up blue, because blue is applied last.
    """
    actual = true_label[i]

    plt.grid(False)
    plt.xticks(range(10))
    plt.yticks([])
    bars = plt.bar(range(10), predictions_array, color="#777777")
    plt.ylim([0, 1])

    guess = np.argmax(predictions_array)
    bars[guess].set_color('red')
    bars[actual].set_color('blue')


# %%
# Turn the one-hot labels back into integer class indices for plotting.
y_test = y_test.argmax(axis=1)

# %%
y_train = y_train.argmax(axis=1)

# %%
# Keep X_train in the (N, 28, 28, 1) layout.
X_train = np.reshape(X_train, (X_train.shape[0], 28, 28, 1))

# %%
# Image and score chart, side by side, for the first hold-out sample.
# NOTE(review): X_test is passed to plot_image as-is; it was never converted
# from a DataFrame or reshaped to 28x28 in the visible code, and plot_image
# indexes it with img[i] - confirm this selects an image.
i = 0
plt.figure(figsize=(6,3))
plt.subplot(1,2,1)
plot_image(i, predictions[i], y_test, X_test)
plt.subplot(1,2,2)
plot_value_array(i, predictions[i],  y_test)
plt.show()

# %%
# Plot the first num_rows*num_cols test images, their predicted labels, and the true labels.
# Color correct predictions in blue and incorrect predictions in red.
num_rows = 5
num_cols = 3
num_images = num_rows*num_cols
# Every sample takes two adjacent subplot cells: the image, then its score chart.
plt.figure(figsize=(2*2*num_cols, 2*num_rows))
for i in range(num_images):
    plt.subplot(num_rows, 2*num_cols, 2*i+1)
    plot_image(i, predictions[i], y_test, X_test)
    plt.subplot(num_rows, 2*num_cols, 2*i+2)
    plot_value_array(i, predictions[i], y_test)
plt.tight_layout()
plt.show()



# %%
# Build the submission table with 1-based image ids and the predicted labels.
# NOTE(review): `results` holds the predictions for X_test (the hold-out split),
# not for X_predict loaded from ../data/test.csv - confirm this is intended.
my_submission = pd.DataFrame({'ImageId': list(range(1,len(results)+1)), 'label': results})

# you could use any filename. We choose submission here
my_submission.to_csv('submission.csv', index=False)

# %%


Solution 1:[1]

You should convert your labels to one-hot encoded labels if you want to use categorical_crossentropy. Also, use a Flatten layer in the beginning of your model. Here is a working example:

import tensorflow as tf
import numpy as np

# Same augmentation settings as in the question.
datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    featurewise_center=True,
    featurewise_std_normalization=True,
    rotation_range=20,
    validation_split=0.2)

# Use the MNIST copy bundled with Keras and append a trailing channel axis to
# the training images before fitting the generator's statistics.
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()
X_train = np.expand_dims(X_train, axis=-1)
datagen.fit(X_train)

model = tf.keras.Sequential([
    # Flatten the (28, 28, 1) image so the Dense stack produces one score
    # vector per sample.
    tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
    tf.keras.layers.Dense(784, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(rate=0.5),
    tf.keras.layers.Dense(784, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(rate=0.5),
    tf.keras.layers.Dense(784, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(rate=0.5),
    tf.keras.layers.Dense(784, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(rate=0.5),
    tf.keras.layers.Dense(10, activation='softmax'),
])

model.compile(
    optimizer="Adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)
# Labels are one-hot encoded inline for both the training and validation data.
# NOTE(review): the log printed for this run shows 1875 steps per epoch, which
# would match 60000 samples / 32 rather than / 64, so `batch_size=64` given to
# fit() apparently does not reach the generator; passing it to datagen.flow()
# is presumably what was intended - confirm.
# NOTE(review): the validation images are not passed through `datagen`, so they
# do not get the feature-wise normalisation applied to the training batches;
# this is a likely cause of the very large val_loss in the printed log - confirm.
history = model.fit(datagen.flow(
    X_train,
    tf.keras.utils.to_categorical(y_train, 10)),
    validation_data=(np.expand_dims(X_test, axis=-1), tf.keras.utils.to_categorical(y_test, 10)),
    batch_size=64,
    epochs=2,
)

Results for 2 epochs:

Epoch 1/2
1875/1875 [==============================] - 124s 65ms/step - loss: 0.5946 - accuracy: 0.8242 - val_loss: 82.8574 - val_accuracy: 0.7302
Epoch 2/2
1875/1875 [==============================] - 65s 35ms/step - loss: 0.3617 - accuracy: 0.8899 - val_loss: 112.3475 - val_accuracy: 0.7139

Update 1:

Regarding your dimensions issue, I think you forgot to reshape X_test in your code:

X_test = X_test.reshape(X_test.shape[0], 28, 28, 1)

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1