'3 CSV files into a single JSON

I have 3 csv files, loan, customer and security, each of these files are quite large (800k+ rows). Each file is linked by a single column uniqueid. My aim is to create a single JSON file, the code below achieves this, however, it is very slow.

My question is how can i achieve this result faster?

import csv
import json

def multicsvtojson():

loanscsvfile = open('C:\\***\\loan.csv', 'r')
custcsvfile = open('C:\\***\\customer.csv', 'r')
securcsvfile = open('C:\\***\\security.csv', 'r')

loansreader = csv.DictReader(loanscsvfile, delimiter=',')
custreader = csv.DictReader(custcsvfile, delimiter=',')
securreader = csv.DictReader(securcsvfile, delimiter=',')

jsonfile = open('test.json', 'w')
#ready json file

output = []
loanscount = 0
#total loan count
for loansrow in loansreader:
    loansrow['customers'] = []
    loansrow['securities'] = []

    output.append(loansrow)
    custcsvfile.seek(0)
    securcsvfile.seek(0)
    for custrow in custreader:
        if (loansrow["UniqueID"] == custrow["UniqueID"]):
            loansrow['customers'].append(custrow) 
    for securrow in securreader:
        if (loansrow["UniqueID"] == securrow["UniqueID"]):
            loansrow['securities'].append(securrow)

    loanscount = loanscount + 1 #increment the loan counter
    print(loanscount)

total = {}
total['total'] = loanscount
output.insert(0, total)

json.dump(output, jsonfile, indent=4)

The current output is as follow

[{
    "total": 2
},
{
    "uniqueID": "",
    "uniqueID2": "",
    "colA": "",
    "colB": "",
    "colC": "",
    "colD": "",
    "customers": [
        {
            "uniqueID": "",
            "custID": "",
            "colA": "",
            "colB": "",
        }
    ],
    "securities": [
        {
            "uniqueID": "",
            "secuID": "",
            "colA": "",
            "colB": ""
        }
    ]
},
{
    "uniqueID": "",
    "uniqueID2": "",
    "colA": "",
    "colB": "",
    "colC": "",
    "colD": "",
    "customers": [
        {
            "uniqueID": "",
            "custID": "",
            "colA": "",
            "colB": "",
        },
        {
            "uniqueID": "",
            "secuID": "",
            "colA": "",
            "colB": ""
        }
    ],
    "securities": [
        {
            "uniqueID": "",
            "secuID": "",
            "colA": "",
            "colB": ""
        },
        {
            "uniqueID": "",
            "secuID": "",
            "colA": "",
            "colB": ""
        }
    ]
}

}]



Solution 1:[1]

What probably costs you performance are the multiple reads of the customer and securities files. Because you reread the whole file at every loan row.

Maybe try to regroup customers and securties by ID before getting to the loans file, so you don't have to re-read every time.

import csv
import json

def multicsvtojson():

    loanscsvfile = open('C:\\***\\loan.csv', 'r')
    custcsvfile = open('C:\\***\\customer.csv', 'r')
    securcsvfile = open('C:\\***\\security.csv', 'r')

    loansreader = csv.DictReader(loanscsvfile, delimiter=',')
    custreader = csv.DictReader(custcsvfile, delimiter=',')
    securreader = csv.DictReader(securcsvfile, delimiter=',')

    jsonfile = open('test.json', 'w')
    #ready json file

    output = []
    loanscount = 0

    # regroup customers by ID
    customers = {}
    for custrow in custreader:
        id = custrow["UniqueID"]
        if id not in customers:
            customers[id] = []
        customers[id].append(custrow)

    # regroup securities by ID
    securities = {}
    for securrow in securreader:
        id = securrow["UniqueID"]
        if id not in securities:
            securities[id] = []
        securities[id].append(securrow)

    #total loan count
    for loansrow in loansreader:

        loansrow['customers'] = customers.get("UniqueID", [])
        loansrow['securities'] = securities.get("UniqueID", [])
        output.append(loansrow)

        loanscount = loanscount + 1 #increment the loan counter
        print(loanscount)

    total = {}
    total['total'] = loanscount
    output.insert(0, total)

    json.dump(output, jsonfile, indent=4)

If the CSV files are too big to handle in memory, you can also try to use a temporary database such as tinydb.

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1 marc_s