'In python, function to run tukey test on multiple csv files and export output in pdf format

I have 20 csv files with each file having 4 columns state, dept, phase and budget.

I want to write a python function that does the following

  1. Run tukey tests on each csv

  2. Export the results as pdf files (2 per page if possible)

See below for 2 of the files (copy and paste the code to generate 2/20 of the csv files)

# The datasets generated below represents 2 out of the 20 files/datasets
# The 20 csv files have the same columns- state, dept, phase, budget
# 
import pandas as pd
import numpy as np
# genrating csv file 1 

df1={

'state':["PA","PA","PA","PA","PA","PA","PA","PA","PA","PA","PA","PA","PA","PA","PA","PA","PA","PA","PA","PA"],
    
'dept':["hlth",'hlth','hlth','hlth','edu','edu','edu','edu','fin','fin','fin','fin','parks','parks','parks',"parks",'trans','trans','trans','trans'],
'phase':["one","one","one","one","one","one","one","one","one","one","one","one","one","one","one","one","one","one","one","one"],
'budget':[112.9,123,124,523.5,112,334,55,449,221.6,332,235,239,235,223,235.6,204,315.5,614,512,514.2]}

df1 = pd.DataFrame(df1)
df1


#generating csv file 2

df2={

'state':["NY","NY","NY","NY","NY","NY","NY","NY","NY","NY","NY","NY","NY","NY","NY","NY","NY","NY","NY","NY"],
    
'dept':["hlth",'hlth','hlth','hlth','edu','edu','edu','edu','fin','fin','fin','fin','parks','parks','parks',"parks",'trans','trans','trans','trans'],
'phase':["two","two","two","two","two","two","two","two","two","two","two","two","two","two","two","two","two","two","two","two"],
'budget':[1212.9,1253,1244.4,5123.5,1312,3134,515.8,2449.9,3221.6,3132.5,2235.09,2239.01,3235.01,5223.01,4235.6,2204.5,2315.5,6114,4512,3514.2]}

df2 = pd.DataFrame(df2)
df2
# export the files to the folder

df1.to_csv(r'Path where you want to store the exported CSV file\PA_one.csv', index = False)
df2.to_csv(r'Path where you want to store the exported CSV file\NY_two.csv', index = False)

See below for my attempt

#libraries

from scipy import stats # for 1-way anova
from statsmodels.stats.multicomp import pairwise_tukeyhsd # for tukey 
#import seaborn as sns # for visualization
#import matplotlib.pyplot as plt # for visualization

import matplotlib.backends.backend_pdf
import glob # iterate through files in a folder
import os

path = "full\\path\\to_folder\\"  # 20 csv files
    
files = glob.glob(path+"\*.csv")

def tukey_reports(df,nFigPerPage):
    for file in files:    
        df = pd.read_csv(file)
        phase = df['phase'].drop_duplicates().sort_values()
        state = df['state'].drop_duplicates().sort_values()
      # set the pdf
        pdf = matplotlib.backends.backend_pdf.PdfPages("TukeyReports.pdf")
        nFig = 0
        iFigPerPage = 0
# 1-way anova 
        pval = stats.f_oneway(*[group["budget"].values for name, group in df.groupby("dept")])
        print(pval)
        if pval[1] < 0.05:
              print("*****samples are statistically different*****")
        else:
              print("*****samples are NOT statistically different*****")
# tukey test             
       tukey = pairwise_tukeyhsd(endog=df['budget'],groups=df['dept'],alpha=0.05)
       print(tukey)
       title = "state: {}, phase: {}".format(state, phase)
        
       iFigPerPage += 1
       if iFigPerPage % nFigPerPage == 0:
             iFigPerPage = 0

       nFig += 1
            
       if nFig % nFigPerPage == 0:
            fig.tight_layout()
            pdf.savefig()
               
                
    pdf.close()


# driver code
tukey_reports(df,nFigPerPage = 2)
    

I am looking for something like the figure below (2 figure per page)

enter image description here

Please share and comment your code for easier comprehension. Thanks in advance.



Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source