ValueError: Data cardinality is ambiguous in model.fit()

I am trying to train a Keras Sequential model on data taken from a pandas DataFrame.
Here is the code:

    import io
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import tensorflow as tf
    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.preprocessing.sequence import pad_sequences

    df = pd.read_csv(io.BytesIO(uploaded['20200325_counsel_chat.csv']))
    df_test = pd.read_csv(io.BytesIO(uploaded_test['counselchat-data.csv']))

    # Defining the data
    # 1) Sorting the data according to topic

    train_size = 0.8
    valid_size = 0.1

    train_index = int(len(df)*train_size)

    df.sort_values(by='topics', ascending=True, inplace=True)
    train = df[0:train_index]
    df_remaining = df[train_index:]
    valid_index = int(len(df)*valid_size)
    validation = df[train_index:train_index+valid_index]
    test = df[train_index+valid_index:]


    train = train.fillna({'questionText': '0'})
    train = train.fillna({'topics': '0'})
    test = test.fillna({'questionText': '0'})
    test = test.fillna({'topics': '0'})


    def get_names(data_set):
        questions = data_set['questionText']
        labels = data_set['topics']
        return questions, labels

    # Retrieving questions and labels
    questions, labels = get_names(train)
    print(questions[0])
    print(labels[0])

    # Tokenization of the questions
    tokenizer = Tokenizer(num_words=100000, oov_token='<UNK>', lower=None, split=' ',
                          filters='!@#$%^&*()_+-=[]{}\|;:"<,>.?/`~')
    tokenizer.fit_on_texts(questions)
    tokenizer.texts_to_sequences(questions)

    # Determining the length (in words) of each question
    lengths = [len(t.split(' ')) for t in questions]
    plt.hist(lengths, bins=len(set(lengths)))
    plt.show()

    maxlen = 70

    def get_sequences(_questions):
        # Making all sequences the same length
        sequences = tokenizer.texts_to_sequences(_questions)
        padded = pad_sequences(sequences, truncating='post', padding='post', maxlen=maxlen)
        return padded

    padded_training_seq = get_sequences(questions)
    padded_training_seq[0]

    classes = set(labels)
    print(classes)

    # Setting class_to_index and vice versa
    class_to_index = dict((c, i) for i, c in enumerate(classes))
    index_to_class = dict((v, k) for k, v in class_to_index.items())

    class_to_index
    index_to_class

    names_to_ids = lambda labels: np.array([class_to_index.get(x) for x in labels])
    train_labels = names_to_ids(labels)
    train_labels = np.asarray(train_labels).astype(np.float32)

    from sklearn.preprocessing import LabelEncoder
    lb = LabelEncoder()
    train_labels = lb.fit_transform(train_labels)

    # DEFINING A KERAS SEQUENTIAL MODEL
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(100000, 16, input_length=maxlen),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20, return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20)),
        tf.keras.layers.Dense(20000, activation='softmax')
    ])

    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
    )

    model.summary()

    # Setting parameters to fit the model
    val_questions, val_labels = get_names(validation)
    val_seq = get_sequences(val_questions)

    val_labels = names_to_ids(val_labels)
    val_labels = np.asarray(val_labels).astype(np.float32)

    from sklearn.preprocessing import LabelEncoder
    lb = LabelEncoder()
    labels = lb.fit_transform(labels)

    val_questions = val_questions.dropna()
    val_labels = val_labels[~np.isnan(val_labels)]
    train_labels = np.array(train_labels)
    padded_training_seq = np.array(padded_training_seq)
    val_labels = lb.fit_transform(val_labels)
    display(val_labels)
    val_questions, val_labels

    x = padded_training_seq
    y = train_labels
    eps = 20

    display(val_labels)
    h = model.fit(
        np.array(x), np.array(y),
        validation_data=(np.array(val_seq), np.array(val_labels)),
        epochs=eps,
        callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=2)]
    )

This is the error that I'm getting:

    ValueError: Data cardinality is ambiguous:
      x sizes: 212
      y sizes: 134
    Make sure all arrays contain the same number of samples.

How can I rectify this error?



Solution 1:[1]

Your validation data has different shapes for X and y; you reduced val_labels here:

    val_labels = val_labels[~np.isnan(val_labels)]

but you did not remove the corresponding X values (val_seq), so x ends up with 212 samples while y has only 134.
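
One way to fix it is to build a single NaN mask from the labels and apply it to both arrays. Here is a minimal sketch, reusing the variable names from the question (val_seq, val_labels), meant to replace the two separate dropna/isnan lines:

    import numpy as np

    # Build one boolean mask from the labels and apply it to BOTH arrays,
    # so X and y keep the same number of samples.
    mask = ~np.isnan(val_labels)   # val_labels as produced by names_to_ids(...)
    val_seq = val_seq[mask]        # drop the same rows from the padded sequences
    val_labels = val_labels[mask]

    print(val_seq.shape[0], val_labels.shape[0])  # the sample counts now match

With the two arrays aligned, model.fit no longer raises the cardinality error. Note that val_questions.dropna() in the question has a related problem: it runs after val_seq was already built from val_questions, so it has no effect on the arrays actually passed to fit.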

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution 1: desertnaut