Loss function has negative values

I'm training a CNN for binary segmentation with a loss function that combines the Dice coefficient and cross entropy. During training the loss becomes negative. The Dice term is always in the range 0-1, and the binary cross entropy (I use a sigmoid output) should also be non-negative, so I don't see how their combination can go below zero. The training images were standardized to zero mean and unit standard deviation; even when I normalize them to the range 0-1 instead, the loss is still negative. I've truncated the output below after the first epoch, but the loss is negative in all the other epochs as well. Does anyone know why the loss is negative? Thanks for considering my request.
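
For reference, cross entropy computed on probabilities in (0, 1) with 0/1 targets is non-negative; a quick toy check with the Keras backend (just illustrative):

from tensorflow.keras import backend as K

y_true = K.constant([[1.0], [0.0]])
y_pred = K.constant([[0.8], [0.3]])
print(K.eval(K.mean(K.binary_crossentropy(y_true, y_pred))))  # ~0.29, never negative

and the Dice term is bounded between 0 and 1, so I would expect their weighted sum to stay non-negative as well.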

My code:

import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.layers import (Input, Conv2D, Conv2DTranspose, MaxPooling2D,
                                     BatchNormalization, ReLU, concatenate)
from tensorflow.keras.models import Model

def Unet(input_size1=(160, 160, 1), num_class=2, n_filt=32):
  
  input_model1 = Input(input_size1)

  #layer1 2D
  x1 = ReLU()(BatchNormalization()(Conv2D(n_filt, 3, padding = 'same', kernel_initializer = 'he_normal')(input_model1)))
  conv1 = ReLU()(BatchNormalization()(Conv2D(n_filt, 3, padding = 'same', kernel_initializer = 'he_normal')(x1)))
  pool1 = MaxPooling2D(pool_size=(2, 2))(conv1)
  
  #layer2 2D
  conv2 = ReLU()(BatchNormalization()(Conv2D(n_filt*2, 3, padding = 'same', kernel_initializer = 'he_normal')(pool1)))
  conv2 = ReLU()(BatchNormalization()(Conv2D(n_filt*2, 3, padding = 'same', kernel_initializer = 'he_normal')(conv2)))
  pool2 = MaxPooling2D(pool_size=(2, 2))(conv2)

  #layer3 2D
  conv3 = ReLU()(BatchNormalization()(Conv2D(n_filt*4, 3, padding = 'same', kernel_initializer = 'he_normal')(pool2)))
  conv3 = ReLU()(BatchNormalization()(Conv2D(n_filt*4, 3, padding = 'same', kernel_initializer = 'he_normal')(conv3)))
  pool3 = MaxPooling2D(pool_size=(2, 2))(conv3)

  #layer4 2D
  conv4 = ReLU()(BatchNormalization()(Conv2D(n_filt*8, 3, padding = 'same', kernel_initializer = 'he_normal')(pool3)))
  conv4 = ReLU()(BatchNormalization()(Conv2D(n_filt*8, 3, padding = 'same', kernel_initializer = 'he_normal')(conv4)))
  pool4 = MaxPooling2D(pool_size=(2, 2))(conv4)
  #layer5 2D
  conv5 = ReLU()(BatchNormalization()(Conv2D(n_filt*16, 3, padding = 'same', kernel_initializer = 'he_normal')(pool4)))
  conv5 = ReLU()(BatchNormalization()(Conv2D(n_filt*16, 3, padding = 'same', kernel_initializer = 'he_normal')(conv5)))

  conv_up5 = ReLU()(BatchNormalization()(Conv2DTranspose(num_class, 4, strides=(2, 2), padding='same',activation = 'relu',kernel_initializer = 'he_normal')(conv5)))

  merge6 = concatenate([conv_up5,conv4], axis = 3)
  conv6 = ReLU()(BatchNormalization()(Conv2D(n_filt*8, 3, padding = 'same', kernel_initializer = 'he_normal')(merge6)))
  conv6 = ReLU()(BatchNormalization()(Conv2D(n_filt*8, 3, padding = 'same', kernel_initializer = 'he_normal')(conv6)))

  conv_up6 = ReLU()(BatchNormalization()(Conv2DTranspose(num_class, 4, strides=(2, 2), padding='same',activation = 'relu',kernel_initializer = 'he_normal')(conv6)))

  merge7 = concatenate([conv_up6,conv3], axis = 3)
  conv7 = ReLU()(BatchNormalization()(Conv2D(n_filt*4, 3, padding = 'same', kernel_initializer = 'he_normal')(merge7)))
  conv7 = ReLU()(BatchNormalization()(Conv2D(n_filt*4, 3, padding = 'same', kernel_initializer = 'he_normal')(conv7)))

  conv_up7 = ReLU()(BatchNormalization()(Conv2DTranspose(num_class, 4, strides=(2, 2), padding='same',activation = 'relu',kernel_initializer = 'he_normal')(conv7)))

  merge8 = concatenate([conv_up7,conv2], axis = 3)
  conv8 = ReLU()(BatchNormalization()(Conv2D(n_filt*2, 3, padding = 'same', kernel_initializer = 'he_normal')(merge8)))
  conv8 = ReLU()(BatchNormalization()(Conv2D(n_filt*2, 3, padding = 'same', kernel_initializer = 'he_normal')(conv8)))

  conv_up8 = ReLU()(BatchNormalization()(Conv2DTranspose(num_class, 4, strides=(2, 2), padding='same',activation = 'relu',kernel_initializer = 'he_normal')(conv8)))

  merge9 = concatenate([conv_up8,conv1], axis = 3)
  conv9 = ReLU()(BatchNormalization()(Conv2D(n_filt, 3, padding = 'same', kernel_initializer = 'he_normal')(merge9)))
  conv9 = ReLU()(BatchNormalization()(Conv2D(n_filt, 3, padding = 'same', kernel_initializer = 'he_normal')(conv9)))
  
  if num_class>2:
      output=Conv2D(num_class, 1, activation = 'softmax', padding = 'same', kernel_initializer = 'he_normal')(conv9)
  else:
      output=Conv2D(num_class-1, 1, activation = 'sigmoid', padding = 'same', kernel_initializer = 'he_normal')(conv9)

  model = Model(inputs=input_model1, outputs=output)
  return model

model = Unet()

def dice_loss(delta = 0.5, smooth = 0.000001):
    def loss_function(y_true, y_pred):
        axis = identify_axis(y_true.get_shape())
        # Calculate true positives (tp), false negatives (fn) and false positives (fp)
        tp = K.sum(y_true * y_pred, axis=axis)
        fn = K.sum(y_true * (1-y_pred), axis=axis)
        fp = K.sum((1-y_true) * y_pred, axis=axis)
        # Calculate Dice score
        dice_class = (tp + smooth)/(tp + delta*fn + (1-delta)*fp + smooth)
        # Average class scores
        dice_loss = K.mean(1-dice_class)

        return dice_loss
        
    return loss_function
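
# identify_axis isn't shown in this post; a minimal version consistent with how
# it is used here (returning the spatial axes to sum over, based on tensor rank)
# would be:
def identify_axis(shape):
    if len(shape) == 5:
        return [1, 2, 3]   # 3D volumes: (batch, x, y, z, channels)
    elif len(shape) == 4:
        return [1, 2]      # 2D images: (batch, x, y, channels)
    else:
        raise ValueError('Metric: Shape of tensor is neither 2D nor 3D.')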

def combo_loss(alpha=0.5, beta=0.6):
    def loss_function(y_true,y_pred):
        dice = dice_loss()(y_true, y_pred)
        axis = identify_axis(y_true.get_shape())
        # Clip values to prevent division by zero error
        epsilon = K.epsilon()
        y_pred = K.clip(y_pred, epsilon, 1. - epsilon)
        cross_entropy = -y_true * (K.log(y_pred)) + (1-y_true) * (-K.log(1-y_pred))
        if beta is not None:
            beta_weight = np.array([beta, 1-beta])
            cross_entropy = beta_weight * cross_entropy
        # sum over classes
        cross_entropy = K.mean(K.sum(cross_entropy, axis=[-1]))
        if alpha is not None:
            combo_loss = (alpha * cross_entropy) - ((1 - alpha) * dice)
        else:
            combo_loss = cross_entropy - dice
        return combo_loss

    return loss_function

# each img has zero mean and unit standard deviation
img_train = train_img[...,np.newaxis].astype('float32')
mask_train = train_label[...,np.newaxis].astype('float32')
img_val = val_img[...,np.newaxis].astype('float32')
mask_val = val_label[...,np.newaxis].astype('float32')

batch_size = 2
epochs = 10
initial_learning_rate = 1e-3

opt = tf.keras.optimizers.Adam(learning_rate=initial_learning_rate)

model.compile(optimizer=opt, loss=combo_loss(), 
              metrics = [dice_coef])

results = model.fit(img_train, mask_train, batch_size=batch_size, epochs=epochs, validation_data=(img_val, mask_val), callbacks=[stopping, checkpoint, callback], shuffle=True)


Fitting model...
Epoch 1/10
23/23 [==============================] - ETA: 0s - loss: -0.0231 - dice_coef: 0.4117


Solution 1:[1]

The dice_loss seems wrong, if I remember correctly.

In your code the Dice coefficient is $(TP)/(TP+FN+FP)$, but it should be $(2 \cdot TP)/(2 \cdot TP+FN+FP)$.

And this is a better way to implement it:

def dice_coeff(input, target):
    # Expects PyTorch-style tensors shaped (batch, classes, spatial...)
    inputs = input.float()
    target = target.float()
    # Reduce over the batch and spatial dims, keeping one score per class
    noreducedim = [0] + list(range(2, len(inputs.shape)))

    intersect = (inputs * target).sum(dim=noreducedim)     # TP
    denominator = (inputs + target).sum(dim=noreducedim)   # 2*TP + FN + FP
    dices = (2 * intersect + 1e-6) / (denominator + 1e-6)  # 2*TP / (2*TP + FN + FP)
    return dices
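
If you want to stay with the Keras code from the question, the same formula can be dropped into dice_loss; a sketch reusing identify_axis, K and the smooth constant from the question:

def dice_loss(smooth=1e-6):
    def loss_function(y_true, y_pred):
        axis = identify_axis(y_true.get_shape())
        tp = K.sum(y_true * y_pred, axis=axis)
        fn = K.sum(y_true * (1 - y_pred), axis=axis)
        fp = K.sum((1 - y_true) * y_pred, axis=axis)
        # Standard Dice: 2*TP / (2*TP + FN + FP), averaged over classes
        dice_class = (2 * tp + smooth) / (2 * tp + fn + fp + smooth)
        return K.mean(1 - dice_class)
    return loss_function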

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1