Polars: Parallelizing the creation of a new dataframe while using groupby on another dataframe
I have an artifacts dataframe that I need to group by some columns. Each resulting sub-dataframe is then passed to a function that creates a new dataframe.
# Columns whose distinct combinations define one group.
flow_key_columns = ['PRODUCT', 'PROGRAM', 'TESTSUITE', 'MODULE', 'BASECLASS', 'SUBCLASS']
# The per-group summary keeps the key columns plus version, release date and status.
flow_summary_columns = flow_key_columns + ['VERSION', 'RELEASE_DATE', 'Status']

# NOTE(review): this relies on iterating the groupby object yielding each
# sub-frame directly; confirm that holds for the Polars version in use.
for sub_df in merged_dataset.groupby(flow_key_columns):
    flow_summary_df = sub_df.select(flow_summary_columns)
    # The result name is rebound on every pass; nothing in this snippet
    # collects the per-group frames.
    # NOTE(review): the listing defines `add_test_events_new`, while this call
    # uses `add_test_events` — confirm which one is meant.
    current_test_events_dataset = add_test_events(flow_summary_df)
Here is the add_test_events code (it appears in the listing below under the name add_test_events_new):
def _test_events_for_versions(versions, statuses):
    """Return the ``(version, event)`` pairs implied by a status history.

    ``versions`` and ``statuses`` are parallel sequences, one entry per row.
    The first row always yields 'Introduced' (plus 'Binning Enabled' when its
    status is 'BINNING'). Each later row is compared with the status recorded
    at the most recent event and yields at most one transition:
    'Removed', 'Re-Introduced' (optionally followed by 'Binning Enabled'),
    'Binning Enabled' or 'Binning Disabled'.
    """
    timeline = []
    previous_status = None
    for position, (version, status) in enumerate(zip(versions, statuses)):
        if position == 0:
            timeline.append((version, 'Introduced'))
            if status == 'BINNING':
                timeline.append((version, 'Binning Enabled'))
            previous_status = status
        elif status == 'BYPASSED':
            # Only the first bypassed row after an active one is a removal.
            if previous_status != 'BYPASSED':
                timeline.append((version, 'Removed'))
                previous_status = 'BYPASSED'
        elif previous_status == 'BYPASSED':
            # Checked before the plain BINNING transition so that a
            # BYPASSED -> BINNING step records both events, each paired with
            # its version so the two output columns stay the same length.
            timeline.append((version, 'Re-Introduced'))
            if status == 'BINNING':
                timeline.append((version, 'Binning Enabled'))
            # Remember the new status; otherwise every following active row
            # would be reported as 'Re-Introduced' again.
            previous_status = status
        elif status == 'BINNING' and previous_status != 'BINNING':
            timeline.append((version, 'Binning Enabled'))
            previous_status = 'BINNING'
        elif status == 'NON-BINNING' and previous_status == 'BINNING':
            timeline.append((version, 'Binning Disabled'))
            previous_status = 'NON-BINNING'
    return timeline


def add_test_events_new(flow_summary_df):
    """Build a frame of test events from one group's version history.

    Reads the 'VERSION' and 'Status' columns of ``flow_summary_df`` row by
    row, derives the events, and returns a new ``pl.DataFrame`` with one row
    per event and the columns PRODUCT, PROGRAM, TESTSUITE, MODULE, BASECLASS,
    SUBCLASS, VERSION, RELEASE_DATE and TEST_EVENT (in that order).

    VERSION holds the version at which each event occurred. Every other
    non-event column repeats the value from the first input row, with ``None``
    replaced by an empty string.

    The frame is expected to contain at least one row, because row 0 of each
    column is read unconditionally.
    """
    # Pull both columns out once instead of indexing the frame on every row.
    versions = flow_summary_df['VERSION'].to_list()
    statuses = flow_summary_df['Status'].to_list()
    timeline = _test_events_for_versions(versions, statuses)
    event_count = len(timeline)

    test_events_data = {}
    for column in ['PRODUCT', 'PROGRAM', 'TESTSUITE', 'MODULE', 'BASECLASS',
                   'SUBCLASS', 'VERSION', 'RELEASE_DATE', 'TEST_EVENT']:
        if column == 'TEST_EVENT':
            test_events_data[column] = [event for _, event in timeline]
        elif column == 'VERSION':
            test_events_data[column] = [version for version, _ in timeline]
        else:
            # NOTE(review): RELEASE_DATE is taken from the first row for every
            # event, not from the row whose version triggered the event —
            # confirm this is intended.
            first_value = flow_summary_df[column][0]
            if first_value is None:
                first_value = ''
            test_events_data[column] = [first_value] * event_count
    return pl.DataFrame(test_events_data)
It just feels like this is not the polars way to do things.
thx
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
