tensorflow.python.framework.errors_impl.InternalError: Blas GEMM launch failed: a.shape=(8, 459), b.shape=(8, 2), m=459, n=2, k=8
I am trying to train a CNN model on GPU with tensorflow-gpu 2.0.0, and an error occurred as the title mentions. It runs well on CPU with tensorflow 2.0.0. I use CUDA 10.0, cuDNN 7.6.5, RTX 3080 (10GB). I couldn't find out which part is wrong. I saw some similar issues; they said it was caused by the memory occupation of another process, so I tried to add
gpus = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_virtual_device_configuration(gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)])
right after `import tensorflow as tf`, but it didn't help; reducing batch_size didn't work either.
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from tensorflow.keras import layers
import time
import datetime
import os
import tensorflow_probability as tfp
import pickle
import numpy as np
from Fusion_layer import perception_layer_topologyV1_template
from loss_2 import Custom_loss
import math
class am_lfs():
    """AM-LFS-style search wrapper: builds and trains candidate CNN models
    whose loss function is parameterized by a sampled ``theta`` vector.

    NOTE(review): relies on the module-level globals ``train_dataset``,
    ``test_dataset`` and ``result_path`` being defined before use — TODO
    confirm and preferably pass them in explicitly.
    """

    def __init__(self, M=6, sigma=0.2, epochs=1, batch_size=128, B=4,
                 num_class=2, Dropout_rate=0.3):
        """Store the search hyper-parameters.

        M: number of loss-parameter components (theta has 2*M entries at
           the call sites); B: population size per search step.
        """
        self.M = M
        self.sigma = sigma
        self.batch_size = batch_size
        self.Dropout_rate = Dropout_rate
        self.B = B
        self.epochs = epochs
        self.num_class = num_class
        # Taken from module-level globals, not constructor arguments —
        # fragile; presumably intentional in this script. TODO confirm.
        self.train_set = train_dataset
        self.test_set = test_dataset

    def train_model(self, model, early_stopping, csv_logger, train_set,
                    test_set, sear, i, train_dataset, test_dataset):
        """Fit one candidate model, save it, and return the Keras History.

        ``sear`` is the search-iteration counter and ``i`` the candidate
        index; both only affect the saved-model filename.
        """
        print("training model...")
        history = model.fit(
            train_dataset,
            steps_per_epoch=max(1, math.ceil(len(train_set) / self.batch_size)),
            # BUG FIX: this previously used the module-level global
            # ``epochs`` and ignored the value this instance was
            # configured with.
            epochs=self.epochs,
            validation_data=test_dataset,  # remember to change it
            validation_steps=max(1, math.ceil(len(test_set) / self.batch_size)),
            callbacks=[early_stopping, csv_logger])
        # Save per candidate/iteration so the main loop can reload the best.
        path1 = 'model{}_{}.h5'.format(i, sear)
        model.save(os.path.join(result_path, path1))
        return history

    def Mymodel(self, theta):
        """Build and compile the CNN for one sampled loss parameter ``theta``.

        Returns a compiled ``keras.Model`` whose loss is ``Custom_loss``
        parameterized by ``theta`` and the final Dense layer's weights.
        """
        inputs = layers.Input(shape=(112, 112, 1), name='input')  # shape of input
        x = inputs
        # Normal convolution layers.
        output = layers.Conv2D(137, (7, 7), strides=2, padding='same', activation='relu')(x)
        output = layers.MaxPooling2D(pool_size=(3, 3), strides=2, padding='same')(output)
        output = layers.Conv2D(137, (1, 1))(output)
        output = layers.Conv2D(144, (3, 3), padding='same')(output)
        output = layers.MaxPooling2D(pool_size=(3, 3), strides=2, padding='same')(output)
        # Inception module (fusion 3).
        output = perception_layer_topologyV1_template(output, output_size=245, convMM_size=2, convMM_num=24,
                                                      num_of_c11_1=308, conv1_size=6, conv1_num=77,
                                                      num_of_c11_2=24, conv2_size=7, conv2_num=24,
                                                      pool1_size=2, num_of_pool1c1_max=40, num_of_pool1c1_min=40,
                                                      pool2_size=5, num_of_pool2c1_max=20, num_of_pool2c1_min=20,
                                                      name="fusion_3")
        # Inception module (fusion 4).
        output = perception_layer_topologyV1_template(output, output_size=228, convMM_size=2, convMM_num=35,
                                                      num_of_c11_1=354, conv1_size=6, conv1_num=15,
                                                      num_of_c11_2=281, conv2_size=7, conv2_num=13,
                                                      pool1_size=2, num_of_pool1c1_max=24, num_of_pool1c1_min=23,
                                                      pool2_size=5, num_of_pool2c1_max=59, num_of_pool2c1_min=59,
                                                      name="fusion_4")
        # Max pool.
        output = layers.MaxPooling2D(pool_size=[3, 3], strides=2, padding='same', name='pool2')(output)
        # Inception module (fusion 5).
        output = perception_layer_topologyV1_template(output, output_size=459, convMM_size=2, convMM_num=14,
                                                      num_of_c11_1=46, conv1_size=3, conv1_num=25,
                                                      num_of_c11_2=170, conv2_size=5, conv2_num=116,
                                                      pool1_size=4, num_of_pool1c1_max=107, num_of_pool1c1_min=107,
                                                      pool2_size=5, num_of_pool2c1_max=45, num_of_pool2c1_min=45,
                                                      name="fusion_5")
        # Average pooling down to a 1x1 spatial map (presumably the input
        # is 7x7x459 at this point — TODO confirm against layer summary).
        output = layers.AveragePooling2D(pool_size=[7, 7], strides=1, name='aver_pool_1')(output)
        # Dropout (Flatten replaced by the two squeezes below).
        output = layers.Dropout(self.Dropout_rate)(output)
        # Drop the two singleton spatial axes to get (batch, channels).
        output = tf.squeeze(output, axis=2)
        output1 = tf.squeeze(output, axis=1, name='output1')
        # Final classifier head.
        logits = layers.Dense(2, activation='elu', name='output')(output1)
        # Build model.
        model = keras.Model(inputs, logits)
        model.summary()
        # The custom loss needs the Dense layer's kernel; note this captures
        # the *initial* weights, not a live reference — TODO confirm intent.
        w = model.layers[-1].get_weights()
        w_1 = tf.cast(w[0], dtype=tf.float32, name='w_1')
        model.compile(optimizer=keras.optimizers.Adam(),
                      loss=Custom_loss(theta, w_1, output1),
                      metrics=['accuracy'],
                      experimental_run_tf_function=False)
        return model
class idk():
    """REINFORCE-style updater for the loss-distribution mean ``mus``.

    Samples B theta vectors from ``dist`` and nudges ``mus`` in the
    direction that increases the log-probability of high-accuracy thetas.
    """

    def __init__(self, mus, B, M, sigma, dist):
        # NOTE(review): ``M`` and ``sigma`` are accepted but unused here;
        # kept for signature compatibility with the call sites.
        self.mus = mus
        self.B = B
        self.optim_mus = tf.keras.optimizers.Adam(lr=0.05)
        self.dist = dist

    def sample(self):
        """Draw B theta vectors from the current distribution."""
        self.thetas = self.dist.sample((self.B,))
        return self.thetas

    def run(self, dict_m, thetas):
        """One policy-gradient step on ``mus`` from recorded accuracies.

        ``dict_m`` must hold 'acc{i}' (list of val accuracies) and
        'Max_acc{i}' for each i in range(B).
        """
        accs = []
        loss_mu = 0
        with tf.GradientTape() as tape:
            tape.watch(self.mus)
            for i in range(self.B):
                max_acc = dict_m['Max_acc{}'.format(i)]
                # Growing baseline: mean/std over all accuracies seen so far.
                accs += dict_m['acc{}'.format(i)]
                # BUG FIX: previously called ``dist.log_prob`` on the
                # module-level global instead of the distribution this
                # instance was constructed with.
                advantage = (max_acc - np.mean(accs)) / (np.std(accs) + np.finfo(np.float32).eps.item())
                loss_mu -= self.dist.log_prob(thetas[i]) * advantage
            loss_mu = loss_mu / self.B
        grad = tape.gradient(loss_mu, [self.mus])
        self.optim_mus.apply_gradients(zip(grad, [self.mus]))
if __name__ == '__main__':
    # BUG FIX: os.mkdir fails when the parent .\logs\search is missing and
    # the exists()/mkdir pair is racy; makedirs(..., exist_ok=True) handles
    # both the missing parents and the already-exists case atomically.
    result_path = r'.\logs\search\{}'.format(datetime.date.today().strftime("%Y%m%d"))
    os.makedirs(result_path, exist_ok=True)
    start_time = time.time()
    # Search hyper-parameters.
    batch_size = 128
    epochs = 1
    last_acc = 0.995  # NOTE(review): never read below — TODO confirm intent
    B = 4
    M = 6
    sigma = 0.25
    sear = 0
    dict_m = {}
    # Dataset loading elided in the original post. NOTE(review):
    # ``test_dataset``, ``train_set`` and ``test_set`` are referenced later
    # but never defined here — they must be loaded alongside train_dataset.
    train_dataset = ...
    print("train data loaded...")
    # Callbacks: early stopping shared by all candidates, one CSV logger each.
    early_stopping = EarlyStopping(monitor='val_accuracy', min_delta=0.003, patience=3, verbose=1)
    for i in range(B):
        dict_m['csv_logger{}'.format(str(i))] = tf.keras.callbacks.CSVLogger(
            r".\logs\csv\training_model{}_{}.log.csv".format(
                str(i), datetime.datetime.now().strftime("%Y%m%d%H%M%S")),
            append=True)
    # Start training: one model per sampled theta.
    Am_Lfs = am_lfs(M=6, sigma=0.2, epochs=1, batch_size=128, B=4, num_class=2, Dropout_rate=0.3)
    mus = tf.Variable(tf.convert_to_tensor(np.concatenate([np.ones([6, ]), np.zeros([6, ])])),
                      dtype=tf.float64, name='mus')
    dist = tfp.distributions.MultivariateNormalDiag(mus, tf.cast(np.ones(2 * M) * sigma, dtype=tf.float64))
    # BUG FIX: a brand-new ``idk`` (and therefore a brand-new Adam optimizer
    # with empty momentum state) was constructed at every use; reuse one
    # instance so the optimizer state on ``mus`` accumulates across steps.
    searcher = idk(mus, B, M, sigma, dist)
    thetas = searcher.sample()
    for i in range(B):
        model = Am_Lfs.Mymodel(thetas[i])
        dict_m["m{}".format(str(i))] = model
    for epo in range(200):
        sear = sear + 1
        max_acc_list = []
        for i in range(B):
            history = Am_Lfs.train_model(dict_m["m{}".format(str(i))], early_stopping,
                                         dict_m["csv_logger{}".format(str(i))],
                                         train_set, test_set, sear, i,
                                         train_dataset, test_dataset)
            val_acc = history.history['val_accuracy']
            dict_m['acc{}'.format(i)] = val_acc
            dict_m['Max_acc{}'.format(i)] = max(val_acc)
            max_acc_list.append(dict_m['Max_acc{}'.format(i)])
        # Policy-gradient update of mus from this step's accuracies.
        searcher.run(dict_m, thetas)
        # Rebuild the sampling distribution around the updated mus and
        # resample the thetas for the next step.
        dist = tfp.distributions.MultivariateNormalDiag(mus, tf.cast(np.ones(2 * M) * sigma, dtype=tf.float64))
        searcher.dist = dist
        thetas = searcher.sample()
        # Reload every candidate with the best weights found this step.
        index, value = np.argmax(max_acc_list), np.max(max_acc_list)
        model_path = os.path.join(result_path, 'model{}_{}.h5'.format(index, sear))
        for i in range(B):
            dict_m["m{}".format(str(i))].load_weights(model_path)
    end_time = time.time()
    print('time_cost(s):', end_time - start_time)
here is the error if it helps
2022-02-25 10:05:38.104299: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cublas64_100.dll
2022-02-25 10:06:35.446853: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cudnn64_7.dll
2022-02-25 10:17:34.713995: W tensorflow/stream_executor/cuda/redzone_allocator.cc:312] Internal: Invoking ptxas not supported on Windows
Relying on driver to perform ptx compilation. This message will be only logged once.
2022-02-25 10:17:34.956997: E tensorflow/stream_executor/cuda/cuda_blas.cc:428] failed to run cuBLAS routine: CUBLAS_STATUS_EXECUTION_FAILED
2022-02-25 10:17:34.957108: W tensorflow/core/common_runtime/base_collective_executor.cc:216] BaseCollectiveExecutor::StartAbort Internal: Blas GEMM launch failed : a.shape=(8, 459), b.shape=(8, 2), m=459, n=2, k=8
[[{{node training/Adam/gradients/gradients/output/MatMul_grad/MatMul_1}}]]
Traceback (most recent call last):
File "D:\anaconda3\envs\py37tf\lib\site-packages\tensorflow_core\python\eager\function.py", line 511, in call
ctx=ctx)
File "D:\anaconda3\envs\py37tf\lib\site-packages\tensorflow_core\python\eager\execute.py", line 67, in quick_execute
six.raise_from(core._status_to_exception(e.code, message), None)
File "<string>", line 3, in raise_from
tensorflow.python.framework.errors_impl.InternalError: Blas GEMM launch failed : a.shape=(8, 459), b.shape=(8, 2), m=459, n=2, k=8
[[node training/Adam/gradients/gradients/output/MatMul_grad/MatMul_1 (defined at \anaconda3\envs\py37tf\lib\site-packages\tensorflow_core\python\framework\ops.py:1751) ]] [Op:__inference_keras_scratch_graph_5416]
Function call stack:
keras_scratch_graph
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
