Is CsvDataset always shuffling the dataset?

I am working with multiple CSV files whose file names can be shuffled, but the contents of each file must not be shuffled before creating a batch. The CSV files contain time-series data ordered by creation time (sensor measurements over time).

Does tf.data.experimental.CsvDataset shuffle the data automatically while loading the CSV, or can I disable this feature and do it myself?

Code

def make_csv_dataset(dataset_path):
    """Return a map_func for ``tf.data.Dataset.interleave`` that builds a
    batched CsvDataset of sensor time-series for a given label index.

    NOTE: ``CsvDataset`` reads each file's rows strictly in order -- it never
    shuffles rows by itself. The only randomness here is the shuffled *file*
    order and the explicit ``.shuffle()`` applied to whole batches.
    """

    def get_random_path(path, labels, idx):
        # Build a glob pattern like "<path>/*_<label>.csv" for the idx-th label.
        return path + "/*_" + str(labels[idx].numpy()) + ".csv"

    def _make_dataset(idx):
        # NOTE(review): `labels` is a free variable -- it must exist in an
        # enclosing/module scope or this raises NameError. TODO confirm.
        path = tf.py_function(
            get_random_path, [dataset_path, labels, idx], Tout=[tf.string]
        )

        # Randomize only the order in which matching files are read; the
        # row order *within* each file is preserved.
        paths = tf.io.matching_files(path)
        paths = tf.random.shuffle(paths)
        tf.print(paths)

        dataset = (
            tf.data.experimental.CsvDataset(
                filenames=paths,
                # Giving dtypes (instead of default values) makes every
                # column required; rows with missing fields raise an error.
                record_defaults=[
                    tf.float32,  # t
                    tf.int32,    # y (label)
                    tf.float32,  # temp
                    tf.float32,  # aX
                    tf.float32,  # aY
                    tf.float32,  # aZ
                    tf.float32,  # gX
                    tf.float32,  # gY
                    tf.float32,  # gZ
                    tf.float32,  # mX
                    tf.float32,  # mY
                    tf.float32,  # mZ
                ],
                header=True,
                na_value="NaN",
            )
            .map(
                lambda t, y, temp, aX, aY, aZ, gX, gY, gZ, mX, mY, mZ: (
                    tf.stack(
                        [t, temp, aX, aY, aZ, gX, gY, gZ, mX, mY, mZ], axis=-1
                    ),
                    y,
                )
            )
            # Batch first so each batch keeps 64 consecutive time steps...
            .batch(64, drop_remainder=True)
            # ...then shuffle whole batches only. (Fixed: the original line
            # ended with a stray "." after shuffle(64) -- a SyntaxError.)
            .shuffle(64)
        )

        return dataset

    return _make_dataset

# Interleave the per-label CSV datasets; tf.data chooses the parallelism
# and is free to reorder elements (deterministic=False) for throughput.
dataset = tf.data.Dataset.range(3).interleave(
    map_func=make_csv_dataset("dataset/train"),
    cycle_length=tf.data.AUTOTUNE,
    num_parallel_calls=tf.data.AUTOTUNE,
    deterministic=False,
)


# Peek at the first ten elements to sanity-check the pipeline output.
for features in dataset.take(10):
    print(features)


Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source