tensorflow.python.framework.errors_impl.InternalError: Blas GEMM launch failed: a.shape=(8, 459), b.shape=(8, 2), m=459, n=2, k=8

I am trying to train a CNN model on GPU with tensorflow-gpu 2.0.0, and an error occurred as mentioned in the title; it runs fine on CPU with tensorflow 2.0.0. I use CUDA 10.0, cuDNN 7.6.5, and an RTX 3080 (10 GB). I couldn't figure out which part is wrong. I saw some similar issues; they said it was caused by the memory occupation of the process, so I tried to add

# Cap TensorFlow's GPU memory so the process does not grab the whole card.
gpus = tf.config.experimental.list_physical_devices('GPU')
# FIX: the method is `set_virtual_device_configuration` — the original
# `set_virtual_device_device_configuration` does not exist on
# tf.config.experimental and raises AttributeError.
tf.config.experimental.set_virtual_device_configuration(
    gpus[0],
    [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)])

after `import tensorflow as tf`, but it didn't help; reducing the batch_size didn't work either.

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from tensorflow.keras import layers
import time
import datetime
import os
import tensorflow_probability as tfp
import pickle
import numpy as np
from Fusion_layer import perception_layer_topologyV1_template
from loss_2 import Custom_loss
import math



class am_lfs():
    """Builds and trains candidate CNN models for an AM-LFS-style loss search.

    Every candidate shares the same Inception-flavoured architecture
    (``Mymodel``) but is compiled with ``Custom_loss`` parameterised by a
    sampled ``theta`` vector, so different candidates optimise different
    loss-function shapes.
    """

    def __init__(self, M=6, sigma=0.2, epochs=1, batch_size=128, B=4, num_class=2, Dropout_rate=0.3):
        # M: dimensionality factor of the loss parameterisation;
        # sigma: std-dev of the theta sampling distribution;
        # B: number of candidate models trained per search step.
        self.M = M
        self.sigma = sigma
        self.batch_size = batch_size
        self.Dropout_rate = Dropout_rate
        self.B = B
        self.epochs = epochs
        self.num_class = num_class
        # NOTE(review): these read the module-level globals `train_dataset` /
        # `test_dataset`, so the class can only be instantiated after both are
        # defined in __main__ — consider passing them in as parameters.
        self.train_set = train_dataset
        self.test_set = test_dataset

    def train_model(self, model, early_stopping, csv_logger, train_set, test_set, sear, i, train_dataset, test_dataset):
        """Fit one candidate model, save its weights, and return the History.

        `sear` (search iteration) and `i` (candidate index) only name the
        saved checkpoint file: ``model{i}_{sear}.h5``.
        """
        print("training model...")
        history = model.fit(train_dataset,
                            steps_per_epoch=max(1, math.ceil(len(train_set) / self.batch_size)),
                            # BUG FIX: was the bare global `epochs`; use the
                            # value configured on this instance instead.
                            epochs=self.epochs,
                            validation_data=test_dataset,
                            validation_steps=max(1, math.ceil(len(test_set) / self.batch_size)),
                            callbacks=[early_stopping, csv_logger])
        # Persist this candidate so the best one can be reloaded later.
        # NOTE(review): `result_path` is a module-level global set in __main__.
        path1 = 'model{}_{}.h5'.format(i, sear)
        model.save(os.path.join(result_path, path1))
        return history

    def Mymodel(self, theta):
        """Build and compile one candidate model for loss parameters `theta`.

        Architecture: stem convolutions -> three Inception-style fusion
        modules (project-local `perception_layer_topologyV1_template`) ->
        average pool -> dropout -> Dense(2) head.
        """
        inputs = layers.Input(shape=(112, 112, 1), name='input')  # grayscale 112x112 input
        x = inputs
        # Stem: ordinary convolution / pooling layers.
        output = layers.Conv2D(137, (7, 7), strides=2, padding='same', activation='relu')(x)

        output = layers.MaxPooling2D(pool_size=(3, 3), strides=2, padding='same')(output)

        output = layers.Conv2D(137, (1, 1))(output)

        output = layers.Conv2D(144, (3, 3), padding='same')(output)

        output = layers.MaxPooling2D(pool_size=(3, 3), strides=2, padding='same')(output)

        # Inception module (fusion 3)
        output = perception_layer_topologyV1_template(output, output_size=245, convMM_size=2, convMM_num=24,
                                             num_of_c11_1=308, conv1_size=6, conv1_num=77,
                                             num_of_c11_2=24, conv2_size=7, conv2_num=24,
                                             pool1_size=2, num_of_pool1c1_max=40, num_of_pool1c1_min=40,
                                             pool2_size=5, num_of_pool2c1_max=20, num_of_pool2c1_min=20,
                                             name="fusion_3")

        # Inception module (fusion 4)
        output = perception_layer_topologyV1_template(output, output_size=228, convMM_size=2, convMM_num=35,
                                             num_of_c11_1=354, conv1_size=6, conv1_num=15,
                                             num_of_c11_2=281, conv2_size=7, conv2_num=13,
                                             pool1_size=2, num_of_pool1c1_max=24, num_of_pool1c1_min=23,
                                             pool2_size=5, num_of_pool2c1_max=59, num_of_pool2c1_min=59,
                                             name="fusion_4")

        # max_pool
        output = layers.MaxPooling2D(pool_size=[3, 3], strides=2, padding='same', name='pool2')(output)

        # Inception module (fusion 5)
        output = perception_layer_topologyV1_template(output, output_size=459, convMM_size=2, convMM_num=14,
                                             num_of_c11_1=46, conv1_size=3, conv1_num=25,
                                             num_of_c11_2=170, conv2_size=5, conv2_num=116,
                                             pool1_size=4, num_of_pool1c1_max=107, num_of_pool1c1_min=107,
                                             pool2_size=5, num_of_pool2c1_max=45, num_of_pool2c1_min=45,
                                             name="fusion_5")
        # Global 7x7 average pooling collapses the spatial dims to 1x1.
        output = layers.AveragePooling2D(pool_size=[7, 7], strides=1, name='aver_pool_1')(output)
        # Dropout (rate configured on the instance).
        output = layers.Dropout(self.Dropout_rate)(output)
        # Squeeze the two singleton spatial axes left by the average pool,
        # producing a (batch, channels) feature vector.
        output = tf.squeeze(output, axis=2)
        output1 = tf.squeeze(output, axis=1, name='output1')
        # Classification head: 2 units with elu activation.
        logits = layers.Dense(2, activation='elu', name='output')(output1)

        # Build model
        model = keras.Model(inputs, logits)
        model.summary()
        # Capture the Dense layer's freshly-initialised kernel; it is handed
        # to Custom_loss together with theta and the pre-Dense features.
        # NOTE(review): this snapshots the *initial* weights only — confirm
        # Custom_loss is meant to see them frozen rather than tracked.
        w = model.layers[-1].get_weights()
        w_1 = tf.cast(w[0], dtype=tf.float32, name='w_1')
        model.compile(optimizer=keras.optimizers.Adam(), loss=Custom_loss(theta, w_1, output1), metrics=['accuracy'],
                      experimental_run_tf_function=False
                      )
        return model

class idk():
    """REINFORCE-style updater for the loss-distribution parameters `mus`.

    Samples B theta vectors from `dist` and, after the candidates are
    evaluated, takes one policy-gradient step on `mus` using the recorded
    validation accuracies as the reward signal.
    """

    def __init__(self, mus, B, M, sigma, dist):
        # NOTE(review): M and sigma are accepted for call-site symmetry but
        # are not stored or used by this class.
        self.mus = mus
        self.B = B
        self.optim_mus = tf.keras.optimizers.Adam(lr=0.05)
        self.dist = dist

    def sample(self):
        """Draw B theta vectors from the current distribution."""
        self.thetas = self.dist.sample((self.B,))
        return self.thetas

    def run(self, dict_m, thetas):
        """One gradient step on `self.mus` from the accuracies in `dict_m`.

        For each candidate i, the advantage is its best validation accuracy
        minus the running mean of all accuracies seen so far, normalised by
        their running std (epsilon-guarded against zero).
        """
        accs = []
        loss_mu = 0
        with tf.GradientTape() as Tape:
            Tape.watch(self.mus)
            for i in range(self.B):
                max_acc = dict_m['Max_acc{}'.format(i)]
                acc = dict_m['acc{}'.format(i)]
                accs += acc  # extend the running list with per-epoch accuracies
                # BUG FIX: was the module-level global `dist`; use the
                # distribution held by this instance so the class works
                # outside the __main__ script too.
                loss_mu -= self.dist.log_prob(thetas[i]) * (max_acc - np.mean(accs)) / (np.std(accs) + np.finfo(np.float32).eps.item())
            loss_mu = loss_mu / self.B
        grad = Tape.gradient(loss_mu, [self.mus])
        self.optim_mus.apply_gradients(zip(grad, [self.mus, ]))

if __name__ == '__main__':
    if not os.path.exists(r'.\logs\search\{}'.format(datetime.date.today().strftime("%Y%m%d"))):
        os.mkdir(r'.\logs\search\{}'.format(datetime.date.today().strftime("%Y%m%d")))
    result_path = r'.\logs\search\{}'.format(datetime.date.today().strftime("%Y%m%d"))

    start_time = time.time()
    batch_size = 128
    epochs = 1
    last_acc = 0.995
    B=4
    M=6
    sigma=0.25
    sear = 0
    dict_m = {}  
    
    train_dataset =...
    print("train data loaded...")

    # callbacks
    early_stopping = EarlyStopping(monitor='val_accuracy', min_delta=0.003, patience=3, verbose=1)        

    for i in range(B):
        dict_m['csv_logger{}'.format(str(i))] = tf.keras.callbacks.CSVLogger(r".\logs\csv\training_model{}_{}.log.csv".format(str(i), datetime.datetime.now().strftime("%Y%m%d%H%M%S")), append=True)
      

    # start training
    Am_Lfs = am_lfs(M=6, sigma=0.2, epochs=1, batch_size=128, B=4, num_class=2, Dropout_rate=0.3)
    mus = tf.Variable(tf.convert_to_tensor(np.concatenate([np.ones([6, ]), np.zeros([6, ])])), dtype=tf.float64, name='mus')
    dist = tfp.distributions.MultivariateNormalDiag(mus, tf.cast(np.ones(2 * M) * sigma, dtype=tf.float64))
    thetas = idk(mus, B, M, sigma, dist).sample()

    for i in range(B):
        model = Am_Lfs.Mymodel(thetas[i])
        dict_m["m{}".format(str(i))] = model

    for epo in range(200):
        sear = sear + 1
        max_acc_list = []
        for i in range(B):
            history = Am_Lfs.train_model(dict_m["m{}".format(str(i))], early_stopping, dict_m["csv_logger{}".format(str(i))], train_set, test_set, sear, i,
                                                        train_dataset, test_dataset)
            val_acc = history.history['val_accuracy']
            dict_m['acc{}'.format(i)] = val_acc
            dict_m['Max_acc{}'.format(i)] = max(val_acc)
            max_acc_list.append(dict_m['Max_acc{}'.format(i)])
          
        idk(mus, B, M, sigma, dist).run(dict_m, thetas)  
        dist = tfp.distributions.MultivariateNormalDiag(mus, tf.cast(np.ones(2 * M) * sigma, dtype=tf.float64))  
        thetas = idk(mus, B, M, sigma, dist).sample()   

        index, value = np.argmax(max_acc_list), np.max(max_acc_list)
        model_path = os.path.join(result_path, 'model{}_{}.h5'.format(index, sear))

        for i in range(B):
            dict_m["m{}".format(str(i))].load_weights(model_path)

    end_time = time.time()
    print('time_cost(s):', end_time - start_time)

Here is the full error output, in case it helps:

2022-02-25 10:05:38.104299: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cublas64_100.dll
2022-02-25 10:06:35.446853: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cudnn64_7.dll
2022-02-25 10:17:34.713995: W tensorflow/stream_executor/cuda/redzone_allocator.cc:312] Internal: Invoking ptxas not supported on Windows
Relying on driver to perform ptx compilation. This message will be only logged once.
2022-02-25 10:17:34.956997: E tensorflow/stream_executor/cuda/cuda_blas.cc:428] failed to run cuBLAS routine: CUBLAS_STATUS_EXECUTION_FAILED
2022-02-25 10:17:34.957108: W tensorflow/core/common_runtime/base_collective_executor.cc:216] BaseCollectiveExecutor::StartAbort Internal: Blas GEMM launch failed : a.shape=(8, 459), b.shape=(8, 2), m=459, n=2, k=8
  [[{{node training/Adam/gradients/gradients/output/MatMul_grad/MatMul_1}}]]
Traceback (most recent call last):
  File "D:\anaconda3\envs\py37tf\lib\site-packages\tensorflow_core\python\eager\function.py", line 511, in call
    ctx=ctx)
  File "D:\anaconda3\envs\py37tf\lib\site-packages\tensorflow_core\python\eager\execute.py", line 67, in quick_execute
    six.raise_from(core._status_to_exception(e.code, message), None)
  File "<string>", line 3, in raise_from
tensorflow.python.framework.errors_impl.InternalError:  Blas GEMM launch failed : a.shape=(8, 459), b.shape=(8, 2), m=459, n=2, k=8
  [[node training/Adam/gradients/gradients/output/MatMul_grad/MatMul_1 (defined at \anaconda3\envs\py37tf\lib\site-packages\tensorflow_core\python\framework\ops.py:1751) ]] [Op:__inference_keras_scratch_graph_5416]

Function call stack:
keras_scratch_graph





Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source