Polars: Parallelizing the creation of a new dataframe while using groupby on another dataframe

I have an artifacts dataframe that I need to group by several columns. Each resulting sub-dataframe is then passed to a function that creates a new dataframe from it.

# Columns whose distinct combinations define the groups.
group_keys = ['PRODUCT', 'PROGRAM', 'TESTSUITE', 'MODULE', 'BASECLASS', 'SUBCLASS']
# Group keys plus the per-version columns passed on to the event builder.
history_columns = group_keys + ['VERSION', 'RELEASE_DATE', 'Status']

# NOTE(review): depending on the polars version, iterating a group-by may
# yield (key, sub-frame) tuples rather than bare sub-frames -- confirm.
for sub_df in merged_dataset.groupby(group_keys):
    flow_summary_df = sub_df.select(history_columns)
    # NOTE(review): the function shown in this file is named
    # `add_test_events_new`; confirm which of the two names is intended.
    # The result is rebound on every iteration and not accumulated here.
    current_test_events_dataset = add_test_events(flow_summary_df)

Here is the add_test_events code (the version shown below is named add_test_events_new):

def add_test_events_new(flow_summary_df):
    """Build the table of status-change events for one group of rows.

    The rows of ``flow_summary_df`` are walked in their existing order and
    the ``Status`` of each row is compared with the last recorded status.
    One output row is produced per event:

    * first row: ``'Introduced'`` (plus ``'Binning Enabled'`` when its
      status is ``'BINNING'``);
    * status becomes ``'BYPASSED'``: ``'Removed'``;
    * status leaves ``'BYPASSED'``: ``'Re-Introduced'`` (plus
      ``'Binning Enabled'`` when the new status is ``'BINNING'``);
    * status becomes ``'BINNING'``: ``'Binning Enabled'``;
    * status goes from ``'BINNING'`` to ``'NON-BINNING'``:
      ``'Binning Disabled'``.

    Args:
        flow_summary_df: frame with the columns PRODUCT, PROGRAM, TESTSUITE,
            MODULE, BASECLASS, SUBCLASS, VERSION, RELEASE_DATE and Status.
            NOTE(review): rows are presumably sorted by version/release
            order -- this function does not sort them; confirm at the caller.

    Returns:
        A ``pl.DataFrame`` with the columns PRODUCT, PROGRAM, TESTSUITE,
        MODULE, BASECLASS, SUBCLASS, VERSION, RELEASE_DATE, TEST_EVENT.
        VERSION holds the version at which each event occurred; every other
        non-event column repeats the value found in the first input row
        (``None`` is replaced by ``''``).
    """
    # Pull both columns out once instead of indexing the frame per element.
    versions = flow_summary_df['VERSION'].to_list()
    statuses = flow_summary_df['Status'].to_list()

    # (TEST_EVENT, VERSION) pairs; storing them together guarantees the two
    # output columns always have the same length.
    events = []
    previous_status = None

    for index, (version, status) in enumerate(zip(versions, statuses)):
        if index == 0:
            events.append(('Introduced', version))
            if status == 'BINNING':
                events.append(('Binning Enabled', version))
            previous_status = status
            continue

        if status == 'BYPASSED':
            if previous_status != 'BYPASSED':
                events.append(('Removed', version))
                previous_status = 'BYPASSED'
        elif previous_status == 'BYPASSED':
            # Checked before the binning transitions so that a test coming
            # back as BINNING is reported as re-introduced as well.
            events.append(('Re-Introduced', version))
            if status == 'BINNING':
                events.append(('Binning Enabled', version))
            # Record the new status; otherwise every following version
            # would be reported as 'Re-Introduced' again.
            previous_status = status
        elif status == 'BINNING' and previous_status != 'BINNING':
            events.append(('Binning Enabled', version))
            previous_status = 'BINNING'
        elif status == 'NON-BINNING' and previous_status == 'BINNING':
            events.append(('Binning Disabled', version))
            previous_status = 'NON-BINNING'

    event_count = len(events)
    test_events_data = {}
    # The iteration order fixes the column order of the resulting frame.
    for column in ['PRODUCT', 'PROGRAM', 'TESTSUITE', 'MODULE', 'BASECLASS',
                   'SUBCLASS', 'VERSION', 'RELEASE_DATE', 'TEST_EVENT']:
        if column == 'TEST_EVENT':
            test_events_data[column] = [event for event, _ in events]
        elif column == 'VERSION':
            test_events_data[column] = [version for _, version in events]
        else:
            # NOTE(review): RELEASE_DATE is also taken from the first row, so
            # every event carries the first version's release date -- confirm
            # that this is intended.
            first_value = flow_summary_df[column][0]
            if first_value is None:
                first_value = ''
            test_events_data[column] = [first_value] * event_count

    return pl.DataFrame(test_events_data)

It just feels like this is not the polars way to do things.

Thanks.



Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source