PYTHON - Fastest way of flattening/exploding multiple large JSON files with nested arrays (more than 100000 JSON files)
I have written an efficient JSON flattening routine that explodes and joins nested JSON arrays. It runs quickly on a single JSON document with hundreds of nested arrays and nested dicts, but the problem is that I now have 100000 JSON files to handle. Is there a way to either merge the multiple JSONs into one big one and run this code on it, or something else? Any help would be great.
I know there are some duplicate questions, but this one is specifically about efficiently handling a large number of large JSON files.
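One direction I am considering, instead of merging first, is to run the per-file flattening in parallel and only concatenate the results at the end. A rough sketch of what I mean (json_dir, the glob pattern, flattened.csv and flatten_one_file are placeholders I made up; process() is the function defined further down):

import glob
import json
from multiprocessing import Pool

import pandas as pd

def flatten_one_file(path):
    # Load a single JSON file and run the existing flattening logic on it
    with open(path) as f:
        return process(json_sample=json.load(f))

if __name__ == "__main__":
    files = glob.glob("json_dir/*.json")  # placeholder: wherever the 100000 files live
    with Pool() as pool:  # defaults to one worker per CPU core
        frames = pool.map(flatten_one_file, files, chunksize=100)
    pd.concat(frames).to_csv("flattened.csv", index=False)  # placeholder output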
# Let's say I have this JSON and the flattening/exploding code:
from collections import defaultdict
from collections.abc import MutableMapping  # MutableMapping moved to collections.abc in Python 3.10
from copy import deepcopy
import pandas as pd
sample = {
    "rss": {
        "overview": {
            "id": {
                "data": [
                    {
                        "stuff": [
                            {
                                "onetype": [
                                    {"id": '1', "name": "John Doe"},
                                    {"id": '2', "name": "Don Joeh"},
                                ]
                            },
                            {"othertype": [{"id": '2', "company": "ACME"}]},
                        ]
                    },
                    {"otherstuff": [{"thing": [['1', '42'], ['2', '2']]}]},
                ]
            }
        }
    }
}
# Flattening/exploding logic:
def cross_join(left, right):
    # Cartesian product of two lists of row-dicts; if right is empty, keep left as-is
    new_rows = [] if right else left
    for left_row in left:
        for right_row in right:
            temp_row = deepcopy(left_row)
            for key, value in right_row.items():
                temp_row[key] = value
            new_rows.append(deepcopy(temp_row))
    return new_rows
def dict_maker(dic_list):
    # Collect the values for each key across a list of dicts into lists
    dd = defaultdict(list)
    for d in dic_list:
        for key, value in d.items():
            dd[key].append(value)
    return dd
def flatten_list(data):
    # Recursively yield the leaf elements of arbitrarily nested lists
    for elem in data:
        if isinstance(elem, list):
            yield from flatten_list(elem)
        else:
            yield elem
def flatten_struct(data, prev_heading=""):
    # Turn a nested dict/list structure into a list of flat row-dicts,
    # building column names from the path of keys
    if isinstance(data, dict):
        rows = [{}]
        for key, value in data.items():
            rows = cross_join(rows, flatten_struct(value, prev_heading + "_" + key))
    elif isinstance(data, list):
        rows = []
        for item in data:
            rows.extend(flatten_list(flatten_struct(item, prev_heading)))
    else:
        rows = [{prev_heading[1:]: data}]
    return rows
def flatten(d, parent_key="", sep="_"):
    # Flatten nested dicts into a single dict with sep-joined keys;
    # lists are left in place as values (non-dict input yields an empty dict)
    items = []
    if isinstance(d, dict):
        for k, v in d.items():
            new_key = parent_key + sep + k if parent_key else k
            if isinstance(v, MutableMapping):
                items.extend(flatten(v, new_key, sep=sep).items())
            else:
                items.append((new_key, v))
    return dict(items)
def get_section_df(section, section_grp, id=None):
    # Build one DataFrame per element of the section, exploding any nested lists
    df_lst = []
    finalMap = {}
    for elem in section:
        d = flatten(elem)
        flat = [
            {k + "_" + key: val for key, val in dict_maker(flatten_struct(v)).items()}
            if isinstance(v, list)
            else {k: v}
            for k, v in d.items()
        ]
        for new_d in flat:
            finalMap.update(new_d)
        # finalMap.update({k:v for k,v in id})
        if len(finalMap) > 0:
            df = pd.concat(
                {
                    str(section_grp)
                    + "_"
                    + k.replace("@", "").replace("#", ""): pd.Series(v)
                    for k, v in finalMap.items()
                },
                axis=1,
            )
            df_lst.append(df)
    return df_lst
def process(json_sample):
    # Flatten the whole document, then build and stack a DataFrame per nested list
    df_list = []
    master_d = flatten(json_sample)
    master_keys = [k for k in master_d.keys() if isinstance(master_d.get(k), list)]
    grouped_path_dict = {x: x.split("_")[2] for x in master_keys}
    master_id = ''
    for flatted in master_keys:
        lst = master_d.get(flatted)
        path_group = grouped_path_dict.get(flatted)
        # if isinstance(lst, list):
        section_dfs = get_section_df(section=lst, id=master_id, section_grp=path_group)
        if len(section_dfs) > 0:
            pdf = pd.concat(section_dfs)
            df_list.append(pdf)
    df = pd.concat(df_list)
    return df
print(process(json_sample=sample))
  id_stuff_onetype_id  id_stuff_onetype_name  id_stuff_othertype_id  id_stuff_othertype_company  id_otherstuff_thing
0                   1               John Doe                      2                        ACME                  NaN
1                   2               Don Joeh                    NaN                         NaN                  NaN
0                   1               John Doe                      2                        ACME                    1
1                   2               Don Joeh                    NaN                         NaN                   42
2                 NaN                    NaN                    NaN                         NaN                    2
3                 NaN                    NaN                    NaN                         NaN                    2
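For the other option I mentioned (merging everything into one big file first), the only way I can picture it is appending each document as one record of a JSON Lines file and then flattening record by record with the same process() function. A rough sketch, again with made-up paths (merged.jsonl, json_dir):

import glob
import json

import pandas as pd

# Merge step: one JSON document per line of a single JSON Lines file
with open("merged.jsonl", "w") as out:           # placeholder merged file
    for path in glob.glob("json_dir/*.json"):    # placeholder input directory
        with open(path) as f:
            out.write(json.dumps(json.load(f)) + "\n")

# Flatten step: read the merged file back record by record
frames = []
with open("merged.jsonl") as merged:
    for line in merged:
        frames.append(process(json_sample=json.loads(line)))
print(pd.concat(frames))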
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow