'3 CSV files into a single JSON
I have 3 csv files, loan, customer and security, each of these files are quite large (800k+ rows). Each file is linked by a single column uniqueid. My aim is to create a single JSON file, the code below achieves this, however, it is very slow.
My question is how can i achieve this result faster?
import csv
import json
def multicsvtojson():
loanscsvfile = open('C:\\***\\loan.csv', 'r')
custcsvfile = open('C:\\***\\customer.csv', 'r')
securcsvfile = open('C:\\***\\security.csv', 'r')
loansreader = csv.DictReader(loanscsvfile, delimiter=',')
custreader = csv.DictReader(custcsvfile, delimiter=',')
securreader = csv.DictReader(securcsvfile, delimiter=',')
jsonfile = open('test.json', 'w')
#ready json file
output = []
loanscount = 0
#total loan count
for loansrow in loansreader:
loansrow['customers'] = []
loansrow['securities'] = []
output.append(loansrow)
custcsvfile.seek(0)
securcsvfile.seek(0)
for custrow in custreader:
if (loansrow["UniqueID"] == custrow["UniqueID"]):
loansrow['customers'].append(custrow)
for securrow in securreader:
if (loansrow["UniqueID"] == securrow["UniqueID"]):
loansrow['securities'].append(securrow)
loanscount = loanscount + 1 #increment the loan counter
print(loanscount)
total = {}
total['total'] = loanscount
output.insert(0, total)
json.dump(output, jsonfile, indent=4)
The current output is as follow
[{
"total": 2
},
{
"uniqueID": "",
"uniqueID2": "",
"colA": "",
"colB": "",
"colC": "",
"colD": "",
"customers": [
{
"uniqueID": "",
"custID": "",
"colA": "",
"colB": "",
}
],
"securities": [
{
"uniqueID": "",
"secuID": "",
"colA": "",
"colB": ""
}
]
},
{
"uniqueID": "",
"uniqueID2": "",
"colA": "",
"colB": "",
"colC": "",
"colD": "",
"customers": [
{
"uniqueID": "",
"custID": "",
"colA": "",
"colB": "",
},
{
"uniqueID": "",
"secuID": "",
"colA": "",
"colB": ""
}
],
"securities": [
{
"uniqueID": "",
"secuID": "",
"colA": "",
"colB": ""
},
{
"uniqueID": "",
"secuID": "",
"colA": "",
"colB": ""
}
]
}
}]
Solution 1:[1]
What probably costs you performance are the multiple reads of the customer and securities files. Because you reread the whole file at every loan row.
Maybe try to regroup customers and securties by ID before getting to the loans file, so you don't have to re-read every time.
import csv
import json
def multicsvtojson():
loanscsvfile = open('C:\\***\\loan.csv', 'r')
custcsvfile = open('C:\\***\\customer.csv', 'r')
securcsvfile = open('C:\\***\\security.csv', 'r')
loansreader = csv.DictReader(loanscsvfile, delimiter=',')
custreader = csv.DictReader(custcsvfile, delimiter=',')
securreader = csv.DictReader(securcsvfile, delimiter=',')
jsonfile = open('test.json', 'w')
#ready json file
output = []
loanscount = 0
# regroup customers by ID
customers = {}
for custrow in custreader:
id = custrow["UniqueID"]
if id not in customers:
customers[id] = []
customers[id].append(custrow)
# regroup securities by ID
securities = {}
for securrow in securreader:
id = securrow["UniqueID"]
if id not in securities:
securities[id] = []
securities[id].append(securrow)
#total loan count
for loansrow in loansreader:
loansrow['customers'] = customers.get("UniqueID", [])
loansrow['securities'] = securities.get("UniqueID", [])
output.append(loansrow)
loanscount = loanscount + 1 #increment the loan counter
print(loanscount)
total = {}
total['total'] = loanscount
output.insert(0, total)
json.dump(output, jsonfile, indent=4)
If the CSV files are too big to handle in memory, you can also try to use a temporary database such as tinydb.
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | marc_s |
