ValueError: Data cardinality is ambiguous in model.fit()
I am trying to train a Keras Sequential model on data taken from a pandas DataFrame.
Here is the code:
import io
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 'uploaded' and 'uploaded_test' are assumed to be the dicts returned by google.colab's files.upload()
df = pd.read_csv(io.BytesIO(uploaded['20200325_counsel_chat.csv']))
df_test = pd.read_csv(io.BytesIO(uploaded_test['counselchat-data.csv']))
#defining data
#1)Sorting the data according to topic
train_size = 0.8
valid_size = 0.1
train_index = int(len(df)*train_size)
df.sort_values(by='topics', ascending=True, inplace=True)
train = df[0:train_index]
df_remaining = df[train_index:]
valid_index = int(len(df)*valid_size)
validation = df[train_index:train_index+valid_index]
test = df[train_index+valid_index:]
train = train.fillna({'questionText':'0'})
train = train.fillna({'topics':'0'})
test = test.fillna({'questionText':'0'})
test = test.fillna({'topics':'0'})
def get_names(data_set):
    questions = data_set['questionText']
    labels = data_set['topics']
    return questions, labels
#Retrieving names and labels
questions , labels = get_names(train)
print(questions[0])
print(labels[0])
#Tokenization of questions/names
tokenizer = Tokenizer(num_words=100000, oov_token='<UNK>',lower=None, split=' ',filters='!@#$%^&*()_+-=[]{}\|;:"<,>.?/`~')
tokenizer.fit_on_texts(questions)
tokenizer.texts_to_sequences(questions)
#determining length of each token
lengths = [len(t.split(' ')) for t in questions]
plt.hist(lengths, bins=len(set(lengths)))
plt.show()
maxlen = 70
def get_sequences(_questions):
    #Making all sequences of same length
    sequences = tokenizer.texts_to_sequences(_questions)
    padded = pad_sequences(sequences, truncating='post', padding='post', maxlen=maxlen)
    return padded
padded_training_seq = get_sequences(questions)
padded_training_seq[0]
classes = set(labels)
print(classes)
#setting class_to_index and vice versa
class_to_index = dict((c,i) for i, c in enumerate(classes))
index_to_class = dict((v,k) for k, v in class_to_index.items())
class_to_index
index_to_class
names_to_ids = lambda labels: np.array([class_to_index.get(x) for x in labels])
train_labels = names_to_ids(labels)
train_labels=np.asarray(train_labels).astype(np.float32)
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
train_labels = lb.fit_transform(train_labels)
#DEFINING A KERAS SEQUENTIAL MODEL
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(100000, 16, input_length=maxlen),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20)),
    tf.keras.layers.Dense(20000, activation='softmax')
])
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)
model.summary()
#Setting parameters to fit the model
val_questions , val_labels = get_names(validation)
val_seq = get_sequences(val_questions)
val_labels = names_to_ids(val_labels)
val_labels=np.asarray(val_labels).astype(np.float32)
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
labels = lb.fit_transform(labels)
val_questions = val_questions.dropna()
val_labels = val_labels[~np.isnan(val_labels)]
train_labels=np.array(train_labels)
padded_training_seq=np.array(padded_training_seq)
val_labels=lb.fit_transform(val_labels)
display(val_labels)
val_questions,val_labels
x=padded_training_seq
y=train_labels
eps=20
display(val_labels)
h = model.fit(
    np.array(x), np.array(y),
    validation_data=(np.array(val_seq), np.array(val_labels)),
    epochs=eps,
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=2)]
)
This is the error that I'm getting:
ValueError: Data cardinality is ambiguous:
x sizes: 212
y sizes: 134
Make sure all arrays contain the same number of samples.
How can I rectify this error?
Solution 1:[1]
Your validation data has different shapes for X and y. You reduced val_labels here:
val_labels = val_labels[~np.isnan(val_labels)]
but you did not remove the corresponding rows from the X values (val_seq), so the two arrays no longer contain the same number of samples.
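A minimal sketch of one way to keep the two aligned, assuming val_seq and val_labels are the same length before filtering (as they are in the code above): build the NaN mask once, at the point where the question currently filters val_labels, and apply it to both arrays.
mask = ~np.isnan(val_labels)    # True where a label is present
val_seq = val_seq[mask]         # drop the same rows from X ...
val_labels = val_labels[mask]   # ... and from y, so the sample counts match
Alternatively, drop the incomplete rows from the validation DataFrame first, e.g. validation.dropna(subset=['questionText', 'topics']), before tokenizing and encoding, so that X and y are built from exactly the same rows.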
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | desertnaut |
