PYTHON - Fastest way of flattening/exploding multiple large JSON files with nested arrays (more than 100000 JSON files)
I have written an efficient JSON flattening routine that explodes and joins nested JSON arrays. It runs quickly on a single JSON document with hundreds of nested arrays and nested dicts, but the problem is that I now have 100000 JSON files to handle. Is there a way to either merge the multiple JSONs into one big one and run this code on it, or something else? Any help would be great.
I know there are some duplicate questions, but this one is specifically about efficiently handling a large number of large JSON files.
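One direction I am considering, instead of merging first, is to run the per-file flattening in parallel and only concatenate the results at the end. A rough sketch of what I mean (json_dir, the glob pattern, flattened.csv and flatten_one_file are placeholders I made up; process() is the function defined further down):

import glob
import json
from multiprocessing import Pool

import pandas as pd

def flatten_one_file(path):
    # Load a single JSON file and run the existing flattening logic on it
    with open(path) as f:
        return process(json_sample=json.load(f))

if __name__ == "__main__":
    files = glob.glob("json_dir/*.json")  # placeholder: wherever the 100000 files live
    with Pool() as pool:  # defaults to one worker per CPU core
        frames = pool.map(flatten_one_file, files, chunksize=100)
    pd.concat(frames).to_csv("flattened.csv", index=False)  # placeholder output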
# Let's say I have this JSON and the flattening/exploding code:
from collections import defaultdict
from collections.abc import MutableMapping  # MutableMapping moved to collections.abc in Python 3.10
from copy import deepcopy
import pandas as pd
sample = {
    "rss": {
        "overview": {
            "id": {
                "data": [
                    {
                        "stuff": [
                            {
                                "onetype": [
                                    {"id": '1', "name": "John Doe"},
                                    {"id": '2', "name": "Don Joeh"},
                                ]
                            },
                            {"othertype": [{"id": '2', "company": "ACME"}]},
                        ]
                    },
                    {"otherstuff": [{"thing": [['1', '42'], ['2', '2']]}]},
                ]
            }
        }
    }
}
# Flattening/exploding logic:
def cross_join(left, right):
    # Cartesian product of two lists of row-dicts; if right is empty, keep left as-is
    new_rows = [] if right else left
    for left_row in left:
        for right_row in right:
            temp_row = deepcopy(left_row)
            for key, value in right_row.items():
                temp_row[key] = value
            new_rows.append(deepcopy(temp_row))
    return new_rows
def dict_maker(dic_list):
    # Collect the values for each key across a list of dicts into lists
    dd = defaultdict(list)
    for d in dic_list:
        for key, value in d.items():
            dd[key].append(value)
    return dd
def flatten_list(data):
    # Recursively yield the leaf elements of arbitrarily nested lists
    for elem in data:
        if isinstance(elem, list):
            yield from flatten_list(elem)
        else:
            yield elem
def flatten_struct(data, prev_heading=""):
    # Turn a nested dict/list structure into a list of flat row-dicts,
    # building column names from the path of keys
    if isinstance(data, dict):
        rows = [{}]
        for key, value in data.items():
            rows = cross_join(rows, flatten_struct(value, prev_heading + "_" + key))
    elif isinstance(data, list):
        rows = []
        for item in data:
            rows.extend(flatten_list(flatten_struct(item, prev_heading)))
    else:
        rows = [{prev_heading[1:]: data}]
    return rows
def flatten(d, parent_key="", sep="_"):
    # Flatten nested dicts into a single dict with sep-joined keys;
    # lists are left in place as values (non-dict input yields an empty dict)
    items = []
    if isinstance(d, dict):
        for k, v in d.items():
            new_key = parent_key + sep + k if parent_key else k
            if isinstance(v, MutableMapping):
                items.extend(flatten(v, new_key, sep=sep).items())
            else:
                items.append((new_key, v))
    return dict(items)
def get_section_df(section, section_grp, id=None):
    # Build one DataFrame per element of the section, exploding any nested lists
    df_lst = []
    finalMap = {}
    for elem in section:
        d = flatten(elem)
        flat = [
            {k + "_" + key: val for key, val in dict_maker(flatten_struct(v)).items()}
            if isinstance(v, list)
            else {k: v}
            for k, v in d.items()
        ]
        for new_d in flat:
            finalMap.update(new_d)
        # finalMap.update({k:v for k,v in id})
        if len(finalMap) > 0:
            df = pd.concat(
                {
                    str(section_grp)
                    + "_"
                    + k.replace("@", "").replace("#", ""): pd.Series(v)
                    for k, v in finalMap.items()
                },
                axis=1,
            )
            df_lst.append(df)
    return df_lst
def process(json_sample):
    # Flatten the whole document, then build and stack a DataFrame per nested list
    df_list = []
    master_d = flatten(json_sample)
    master_keys = [k for k in master_d.keys() if isinstance(master_d.get(k), list)]
    grouped_path_dict = {x: x.split("_")[2] for x in master_keys}
    master_id = ''
    for flatted in master_keys:
        lst = master_d.get(flatted)
        path_group = grouped_path_dict.get(flatted)
        # if isinstance(lst, list):
        section_dfs = get_section_df(section=lst, id=master_id, section_grp=path_group)
        if len(section_dfs) > 0:
            pdf = pd.concat(section_dfs)
            df_list.append(pdf)
    df = pd.concat(df_list)
    return df
print(process(json_sample=sample))
  id_stuff_onetype_id  id_stuff_onetype_name  id_stuff_othertype_id  id_stuff_othertype_company  id_otherstuff_thing
0                   1               John Doe                      2                        ACME                  NaN
1                   2               Don Joeh                    NaN                         NaN                  NaN
0                   1               John Doe                      2                        ACME                    1
1                   2               Don Joeh                    NaN                         NaN                   42
2                 NaN                    NaN                    NaN                         NaN                    2
3                 NaN                    NaN                    NaN                         NaN                    2
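For the other option I mentioned (merging everything into one big file first), the only way I can picture it is appending each document as one record of a JSON Lines file and then flattening record by record with the same process() function. A rough sketch, again with made-up paths (merged.jsonl, json_dir):

import glob
import json

import pandas as pd

# Merge step: one JSON document per line of a single JSON Lines file
with open("merged.jsonl", "w") as out:           # placeholder merged file
    for path in glob.glob("json_dir/*.json"):    # placeholder input directory
        with open(path) as f:
            out.write(json.dumps(json.load(f)) + "\n")

# Flatten step: read the merged file back record by record
frames = []
with open("merged.jsonl") as merged:
    for line in merged:
        frames.append(process(json_sample=json.loads(line)))
print(pd.concat(frames))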
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow