Distributed TensorFlow with ParameterServerStrategy on GCP: instances stuck after initializing GrpcChannelCache
I'm trying to run distributed TensorFlow training with ParameterServerStrategy. Every instance gets stuck after printing the log output shown below the code, and I don't know why. This is my code for the chief; I run corresponding code on each ps and worker. Is there anything I can do?
import os
import json

import tensorflow as tf

import mnist_setup

per_worker_batch_size = 64

os.environ["TF_CONFIG"] = json.dumps({
    'cluster': {
        'ps': ['10.128.0.8:9090'],
        'worker': ['10.128.0.9:9090'],
        'chief': ['10.128.0.10:9090']
    },
    'task': {'type': 'chief', 'index': 0}
})

cluster_spec = tf.train.ClusterSpec({
    'ps': ['10.128.0.8:9090'],
    'worker': ['10.128.0.9:9090'],
    'chief': ['10.128.0.10:9090']
})

cluster_resolver = tf.distribute.cluster_resolver.SimpleClusterResolver(
    cluster_spec, task_type="chief", task_id=0)

tf_config = json.loads(os.environ['TF_CONFIG'])
num_workers = len(tf_config['cluster']['worker'])

strategy = tf.distribute.experimental.ParameterServerStrategy(cluster_resolver)

global_batch_size = per_worker_batch_size * num_workers
multi_worker_dataset = mnist_setup.mnist_dataset(global_batch_size)
dc = tf.keras.utils.experimental.DatasetCreator(multi_worker_dataset)

with strategy.scope():
    # Model building/compiling need to be within `strategy.scope()`.
    multi_worker_model = mnist_setup.build_and_compile_cnn_model()

working_dir = '/my_working_dir'
log_dir = os.path.join(working_dir, 'log')
ckpt_filepath = os.path.join(working_dir, 'ckpt')
backup_dir = os.path.join(working_dir, 'backup')

callbacks = [
    tf.keras.callbacks.TensorBoard(log_dir=log_dir),
    tf.keras.callbacks.ModelCheckpoint(filepath=ckpt_filepath),
    tf.keras.callbacks.BackupAndRestore(backup_dir=backup_dir),
]

result = multi_worker_model.fit(dc, epochs=3)
2022-05-19 04:51:49.861861: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:272] Initialize GrpcChannelCache for job chief -> {0 -> 10.128.0.10:9090}
2022-05-19 04:51:49.862023: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:272] Initialize GrpcChannelCache for job ps -> {0 -> 10.128.0.8:9090}
2022-05-19 04:51:49.862083: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:272] Initialize GrpcChannelCache for job worker -> {0 -> 10.128.0.9:9090}
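For context, the question says corresponding code runs on each ps and worker, but that script is not shown. In a typical ParameterServerStrategy deployment, the ps and worker processes do nothing except start a blocking tf.distribute.Server that the chief (coordinator) connects to when the strategy is created. A minimal sketch of such a server script, assuming the same cluster addresses as above (this is an illustration, not the asker's actual code):

# Hypothetical minimal server script for a ps or worker VM.
import tensorflow as tf

cluster_spec = tf.train.ClusterSpec({
    'ps': ['10.128.0.8:9090'],
    'worker': ['10.128.0.9:9090'],
    'chief': ['10.128.0.10:9090']
})

# Start a gRPC server for this task and block forever; change
# job_name to 'ps' on the parameter-server VM.
server = tf.distribute.Server(
    cluster_spec,
    job_name='worker',
    task_index=0,
    protocol='grpc',
    start=True)
server.join()

If these server processes are not running, or port 9090 is not reachable between the VMs (for example due to GCP firewall rules), the chief will block while trying to connect, which is consistent with hanging right after the GrpcChannelCache log lines.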
Source: Stack Overflow, licensed under CC BY-SA 3.0.