custom loss function outputs nan in model.fit()
I am trying to implement YOLOv1 and its loss function using TensorFlow. The YOLOv1 architecture and the loss function implementation are based on this video by Aladdin Persson: https://www.youtube.com/watch?v=n9_XyCGr-MI.
I think I managed to implement it in TensorFlow, and I am trying to train the YOLOv1 model using model.fit(). Training runs fine and I don't get any error or exception, but the loss and val_loss always come out as nan, and I don't know where it went wrong.
This is the code implementation of the YOLOv1 architecture:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, InputLayer, Dropout, Flatten, Reshape
from tensorflow.keras.layers import Conv2D, MaxPooling2D, GlobalMaxPooling2D, Activation, LeakyReLU
from tensorflow.keras.regularizers import l2

lrelu = tf.keras.layers.LeakyReLU(alpha=0.1)
nb_boxes = 1
grid_w = 7
grid_h = 7
cell_w = 64
cell_h = 64
img_w = grid_w * cell_w
img_h = grid_h * cell_h
model = Sequential()
model.add(Conv2D(filters=64, kernel_size=(7, 7), strides=(1, 1), input_shape=(img_h, img_w, 3), padding='same', activation=lrelu, kernel_regularizer=l2(5e-4)))
model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same'))
model.add(Conv2D(filters=192, kernel_size=(3, 3), padding='same', activation=lrelu, kernel_regularizer=l2(5e-4)))
model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same'))
model.add(Conv2D(filters=128, kernel_size=(1, 1), padding='same', activation=lrelu, kernel_regularizer=l2(5e-4)))
model.add(Conv2D(filters=256, kernel_size=(3, 3), padding='same', activation=lrelu, kernel_regularizer=l2(5e-4)))
model.add(Conv2D(filters=256, kernel_size=(1, 1), padding='same', activation=lrelu, kernel_regularizer=l2(5e-4)))
model.add(Conv2D(filters=512, kernel_size=(3, 3), padding='same', activation=lrelu, kernel_regularizer=l2(5e-4)))
model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same'))
model.add(Conv2D(filters=256, kernel_size=(1, 1), padding='same', activation=lrelu, kernel_regularizer=l2(5e-4)))
model.add(Conv2D(filters=512, kernel_size=(3, 3), padding='same', activation=lrelu, kernel_regularizer=l2(5e-4)))
model.add(Conv2D(filters=256, kernel_size=(1, 1), padding='same', activation=lrelu, kernel_regularizer=l2(5e-4)))
model.add(Conv2D(filters=512, kernel_size=(3, 3), padding='same', activation=lrelu, kernel_regularizer=l2(5e-4)))
model.add(Conv2D(filters=256, kernel_size=(1, 1), padding='same', activation=lrelu, kernel_regularizer=l2(5e-4)))
model.add(Conv2D(filters=512, kernel_size=(3, 3), padding='same', activation=lrelu, kernel_regularizer=l2(5e-4)))
model.add(Conv2D(filters=256, kernel_size=(1, 1), padding='same', activation=lrelu, kernel_regularizer=l2(5e-4)))
model.add(Conv2D(filters=512, kernel_size=(3, 3), padding='same', activation=lrelu, kernel_regularizer=l2(5e-4)))
model.add(Conv2D(filters=512, kernel_size=(1, 1), padding='same', activation=lrelu, kernel_regularizer=l2(5e-4)))
model.add(Conv2D(filters=1024, kernel_size=(3, 3), padding='same', activation=lrelu, kernel_regularizer=l2(5e-4)))
model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same'))
model.add(Conv2D(filters=512, kernel_size=(1, 1), padding='same', activation=lrelu, kernel_regularizer=l2(5e-4)))
model.add(Conv2D(filters=1024, kernel_size=(3, 3), padding='same', activation=lrelu, kernel_regularizer=l2(5e-4)))
model.add(Conv2D(filters=512, kernel_size=(1, 1), padding='same', activation=lrelu, kernel_regularizer=l2(5e-4)))
model.add(Conv2D(filters=1024, kernel_size=(3, 3), padding='same', activation=lrelu, kernel_regularizer=l2(5e-4)))
model.add(Conv2D(filters=1024, kernel_size=(3, 3), padding='same', activation=lrelu, kernel_regularizer=l2(5e-4)))
model.add(Conv2D(filters=1024, kernel_size=(3, 3), strides=(2, 2), padding='same'))
model.add(Conv2D(filters=1024, kernel_size=(3, 3), activation=lrelu, kernel_regularizer=l2(5e-4)))
model.add(Conv2D(filters=1024, kernel_size=(3, 3), activation=lrelu, kernel_regularizer=l2(5e-4)))
model.add(Flatten())
model.add(Dense(512))
model.add(Dense(1024))
model.add(Dropout(0.5))
model.add(LeakyReLU(0.1))
model.add(Dense(1470))
model.summary()
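The final Dense(1470) corresponds to the flattened S x S x (C + 5B) prediction tensor that the loss below expects; a quick sanity check of that arithmetic (assuming the S=7, B=2, C=20 used in YOLO_loss):

```python
# The head emits S*S*(C + 5*B) values per image: 7 * 7 * (20 + 5*2) = 1470.
S, B, C = 7, 2, 20
assert S * S * (C + 5 * B) == 1470
```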
The implementation of its loss function:
import tensorflow as tf
from tensorflow.keras import backend as K

batch_size = 4

def intersection_over_union(boxes_preds, boxes_labels, box_format="midpoint"):
    if box_format == "midpoint":
        box1_x1 = boxes_preds[..., 0:1] - boxes_preds[..., 2:3] / 2  ## ==> x - w / 2 for each grid cell in each image
        box1_y1 = boxes_preds[..., 1:2] - boxes_preds[..., 3:4] / 2  ## ==> y - h / 2 for each grid cell in each image
        box1_x2 = boxes_preds[..., 0:1] + boxes_preds[..., 2:3] / 2  ## ==> x + w / 2 for each grid cell in each image
        box1_y2 = boxes_preds[..., 1:2] + boxes_preds[..., 3:4] / 2  ## ==> y + h / 2 for each grid cell in each image
        box2_x1 = boxes_labels[..., 0:1] - boxes_labels[..., 2:3] / 2
        box2_y1 = boxes_labels[..., 1:2] - boxes_labels[..., 3:4] / 2
        box2_x2 = boxes_labels[..., 0:1] + boxes_labels[..., 2:3] / 2
        box2_y2 = boxes_labels[..., 1:2] + boxes_labels[..., 3:4] / 2
    if box_format == "corners":
        box1_x1 = boxes_preds[..., 0:1]
        box1_y1 = boxes_preds[..., 1:2]
        box1_x2 = boxes_preds[..., 2:3]
        box1_y2 = boxes_preds[..., 3:4]  # (N, 1)
        box2_x1 = boxes_labels[..., 0:1]
        box2_y1 = boxes_labels[..., 1:2]
        box2_x2 = boxes_labels[..., 2:3]
        box2_y2 = boxes_labels[..., 3:4]
    x1 = K.max((box1_x1, box2_x1))
    y1 = K.max((box1_y1, box2_y1))
    x2 = K.min((box1_x2, box2_x2))
    y2 = K.min((box1_y2, box2_y2))
    intersection = K.clip((x2 - x1), min_value=0, max_value=(x2 - x1)) * K.clip((y2 - y1), min_value=0, max_value=(y2 - y1))
    box1_area = abs((box1_x2 - box1_x1) * (box1_y2 - box1_y1))
    box2_area = abs((box2_x2 - box2_x1) * (box2_y2 - box2_y1))
    return intersection / (box1_area + box2_area - intersection + 1e-6)
def my_sum_mse(y_actual, y_pred):
    # sum of squared errors over all elements (a sum, not a mean, despite the name)
    mse = tf.reduce_sum(K.square(y_pred - y_actual))
    return mse
def YOLO_loss(y_actual, y_pred):
    S = 7
    B = 2
    C = 20
    lambda_coord = 5
    lambda_noobj = 0.5
    y_pred = K.reshape(y_pred, (batch_size, S, S, C + 5 * B))
    target_box = y_actual[..., 21:25]
    pred_box1 = y_pred[..., 21:25]
    pred_box2 = y_pred[..., 26:30]
    #mse = tf.keras.losses.MeanSquaredError(tf.keras.losses.Reduction.SUM)
    iou_box1 = intersection_over_union(pred_box1, target_box, box_format="midpoint")
    iou_box2 = intersection_over_union(pred_box2, target_box, box_format="midpoint")
    ious = K.concatenate((K.expand_dims(iou_box1, 0), K.expand_dims(iou_box2, 0)), axis=0)
    best_box = K.argmax(ious, axis=0)  ## index, integer dtype
    exists_box = y_actual[..., 20:21]  ## in the paper this is Iobj_i: whether the ground-truth cell contains an object (0 or 1)
    #print(best_box.shape)
    #print(exists_box.shape)
    best_box = K.cast(best_box, tf.float32)
    exists_box = K.cast(exists_box, tf.float32)

    ## BOX LOSS ##
    box_pred_xy = exists_box * (pred_box1[..., :2] * (1 - best_box) + pred_box2[..., :2] * best_box)
    box_actual_xy = exists_box * target_box[..., :2]
    box_pred_wh = exists_box * (pred_box1[..., 2:4] * (1 - best_box) + pred_box2[..., 2:4] * best_box)
    box_pred_wh = K.sign(box_pred_wh) * K.sqrt(K.abs(box_pred_wh + 1e-6))
    box_actual_wh = exists_box * target_box[..., 2:4]
    box_actual_wh = K.sqrt(K.abs(box_actual_wh))
    box_pred = K.concatenate((box_pred_xy, box_pred_wh), axis=-1)
    box_actual = K.concatenate((box_actual_xy, box_actual_wh), axis=-1)
    print(box_pred.shape)
    print(box_actual.shape)
    """
    box_pred = box_pred.numpy()
    box_actual = box_actual.numpy()
    box_pred[..., 2:4] = K.sign(box_pred[..., 2:4]) * K.sqrt(K.abs(box_pred[..., 2:4] + 1e-6))
    box_actual[..., 2:4] = K.sqrt(K.abs(box_actual[..., 2:4]))
    box_pred = tf.constant(box_pred)
    box_actual = tf.constant(box_actual)
    """
    box_loss = my_sum_mse(
        K.reshape(box_actual, (batch_size * S * S, 4)),
        K.reshape(box_pred, (batch_size * S * S, 4))
    )

    ## OBJECT LOSS ##
    conf_pred = exists_box * (y_pred[..., 20:21] * (1 - best_box) + y_pred[..., 25:26] * best_box)
    conf_actual = exists_box * y_actual[..., 20:21]
    object_loss = my_sum_mse(
        K.reshape(conf_actual, (-1, S * S * 1)),
        K.reshape(conf_pred, (-1, S * S * 1))
    )

    ## NO OBJECT LOSS ##
    no_conf_pred1 = (1 - exists_box) * y_pred[..., 20:21]
    no_conf_pred2 = (1 - exists_box) * y_pred[..., 25:26]
    no_conf_actual = (1 - exists_box) * y_actual[..., 20:21]
    no_object_loss = my_sum_mse(
        K.reshape(no_conf_actual, (-1, S * S * 1)),
        K.reshape(no_conf_pred1, (-1, S * S * 1))
    )
    no_object_loss += my_sum_mse(
        K.reshape(no_conf_actual, (-1, S * S * 1)),
        K.reshape(no_conf_pred2, (-1, S * S * 1))
    )

    ## CLASSES LOSS ##
    classes_pred = exists_box * y_pred[..., :20]
    classes_actual = exists_box * y_actual[..., :20]
    classes_loss = my_sum_mse(
        K.reshape(classes_actual, (batch_size * S * S, C)),
        K.reshape(classes_pred, (batch_size * S * S, C))
    )

    loss = (
        lambda_coord * box_loss +
        object_loss +
        lambda_noobj * no_object_loss +
        classes_loss
    )
    return loss
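Before wiring the loss into model.fit(), it can be smoke-tested eagerly on random tensors, to see whether it already produces nan on well-behaved inputs (a minimal sketch, reusing the batch_size, S, B, C values above):

```python
import numpy as np

# Eager smoke test: random inputs in [0, 1) should yield a finite scalar loss.
y_true = tf.constant(np.random.rand(batch_size, 7, 7, 30), dtype=tf.float32)
y_fake = tf.constant(np.random.rand(batch_size, 7 * 7 * 30), dtype=tf.float32)
loss_value = YOLO_loss(y_true, y_fake)
tf.debugging.check_numerics(loss_value, "YOLO_loss produced NaN/Inf")
print(loss_value)
```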
And this is the code I use to compile and fit the model, along with several lines of its output:
from tensorflow import keras

model.compile(loss=YOLO_loss, optimizer='adam')
model.fit(x=my_training_batch_generator,
          steps_per_epoch=int(len(X_train) // batch_size),
          epochs=135,
          verbose=1,
          workers=4,
          validation_data=my_validation_batch_generator,
          validation_steps=int(len(X_val) // batch_size),
          callbacks=[
              CustomLearningRateScheduler(lr_schedule),
              mcp_save
          ])
Output:
Epoch 1/135
Epoch 00000: Learning rate is 0.0100.
(4, 7, 7, None)
(None, None, None, None)
(4, 7, 7, 4)
(None, None, None, None)
(4, 7, 7, None)
(None, None, None, None)
(4, 7, 7, 4)
(None, None, None, None)
6/142 [>.............................] - ETA: 43s - loss: nanWARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.1375s vs `on_train_batch_end` time: 0.1875s). Check your callbacks.
142/142 [==============================] - ETA: 0s - loss: nan(4, 7, 7, None)
(None, None, None, None)
(4, 7, 7, 4)
(None, None, None, None)
142/142 [==============================] - 134s 929ms/step - loss: nan - val_loss: nan
Epoch 2/135
Epoch 00001: Learning rate is 0.0100.
142/142 [==============================] - 139s 981ms/step - loss: nan - val_loss: nan
Epoch 3/135
Epoch 00002: Learning rate is 0.0100.
142/142 [==============================] - 140s 992ms/step - loss: nan - val_loss: nan
Epoch 4/135
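The (None, None, None, None) shapes in the output above come from Keras tracing the loss function symbolically before running it. One way to make the print() calls inside YOLO_loss show concrete shapes and values while debugging is to compile with eager execution (a debugging sketch; this makes training much slower):

```python
# Debugging aid: run each training step eagerly so print() inside the loss
# reports concrete tensor shapes/values instead of symbolic placeholders.
model.compile(loss=YOLO_loss, optimizer='adam', run_eagerly=True)
```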
Solution 1:
A loss function outputting nan is usually caused by one of the following (a sketch addressing each follows the list):
- Data problems. Your data may contain NaN values; you could check your data.
- Gradient explosion. You could use a lower learning rate.
- A denominator of 0 somewhere in the loss function.
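A minimal sketch of how you might act on each of these, assuming the question's model, YOLO_loss, and my_training_batch_generator are in scope and the generator is a keras.utils.Sequence (so it can be indexed):

```python
import numpy as np
import tensorflow as tf

# 1. Data problems: scan one batch for NaN/Inf before training.
x_batch, y_batch = my_training_batch_generator[0]
print("NaN in x:", np.isnan(x_batch).any(), "NaN in y:", np.isnan(y_batch).any())
print("Inf in x:", np.isinf(x_batch).any(), "Inf in y:", np.isinf(y_batch).any())

# 2. Gradient explosion: lower the learning rate and clip gradient norms.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4, clipnorm=1.0)
model.compile(loss=YOLO_loss, optimizer=optimizer)

# 3. Stop at the first NaN loss instead of training through it, so the
#    offending batch is easier to locate.
model.fit(my_training_batch_generator,
          epochs=1,
          callbacks=[tf.keras.callbacks.TerminateOnNaN()])
```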
Based on your problem description, I suspect your data contains NaN values. I am sorry I couldn't give you an accurate answer; there are many possible reasons for this problem.
I am a non-native English speaker. I hope you can understand my answer and that it helps you.
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | Mason Ma |
