Is CsvDataset always shuffling the dataset?
I am working with multiple CSV files that can be shuffled by their file name, but the content of each file must not be shuffled before creating a batch. The CSV files contain time-series data ordered by the time of their creation (sensor measurements over time).
Does tf.data.experimental.CsvDataset shuffle the data automatically while loading the CSV files, or can I disable this feature and do the shuffling myself?
Code
def make_csv_dataset(dataset_path):
    """Return a ``map_func`` that builds a batched CSV dataset for one index.

    The returned callable is meant to be passed to
    ``tf.data.Dataset.interleave``: it receives an index tensor, globs the
    CSV files matching ``<dataset_path>/*_<labels[idx]>.csv``, shuffles the
    list of matching file paths, and reads them with
    ``tf.data.experimental.CsvDataset``.

    The only shuffling done here is (1) the order of the file paths and
    (2) the order of whole batches after ``batch``. No shuffle is applied to
    individual rows, so rows keep their in-file order inside each batch.

    NOTE(review): ``labels`` is not defined in this function; it must exist
    in the enclosing/module scope and be indexable by ``idx`` -- confirm.
    NOTE(review): all matched files are passed to a single ``CsvDataset``,
    so a batch of 64 rows may presumably span the end of one file and the
    start of the next -- confirm this is acceptable for the time series.

    Args:
        dataset_path: Directory containing the ``*_<label>.csv`` files.

    Returns:
        A function ``idx -> dataset`` yielding ``(features, y)`` batches,
        where ``features`` stacks the 11 float columns on the last axis and
        ``y`` is the int32 second column.
    """

    def get_random_path(path, labels, idx):
        # Called through tf.py_function, so the arguments arrive as eager
        # tensors and ``.numpy()`` is usable to format the label value.
        return path + "/*_" + str(labels[idx].numpy()) + ".csv"

    def _make_dataset(idx):
        pattern = tf.py_function(
            get_random_path, [dataset_path, labels, idx], Tout=[tf.string]
        )

        # Shuffle only the order of the files, never the rows inside them.
        paths = tf.io.matching_files(pattern)
        paths = tf.random.shuffle(paths)
        tf.print(paths)

        # Column layout: t (float), y (int), then temp, aX, aY, aZ, gX, gY,
        # gZ, mX, mY, mZ (all float) -- 12 columns in total.
        record_defaults = [tf.float32, tf.int32] + [tf.float32] * 10

        rows = tf.data.experimental.CsvDataset(
            filenames=paths,
            record_defaults=record_defaults,
            header=True,
            na_value="NaN",
        )

        def to_features_and_label(
            t, y, temp, aX, aY, aZ, gX, gY, gZ, mX, mY, mZ
        ):
            # Everything except the label column ``y`` becomes one vector.
            features = tf.stack(
                [t, temp, aX, aY, aZ, gX, gY, gZ, mX, mY, mZ], axis=-1
            )
            return features, y

        dataset = (
            rows.map(to_features_and_label)
            # Batch first so consecutive rows stay together in order.
            .batch(64, drop_remainder=True)
            # Then shuffle whole batches; rows inside a batch are untouched.
            # (The original had a stray trailing "." here: a syntax error.)
            .shuffle(64)
        )
        return dataset

    return _make_dataset
# One sub-dataset is built per index 0..2 and the results are interleaved.
_index_dataset = tf.data.Dataset.range(3)
_per_index_dataset_fn = make_csv_dataset("dataset/train")

# deterministic=False is passed, so (per the tf.data documentation) the
# order in which elements of the interleaved sub-datasets are produced is
# not guaranteed.
dataset = _index_dataset.interleave(
    map_func=_per_index_dataset_fn,
    cycle_length=tf.data.AUTOTUNE,
    num_parallel_calls=tf.data.AUTOTUNE,
    deterministic=False,
)

# Print the first 10 elements of the pipeline for inspection.
for features in dataset.take(10):
    print(features)
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
