Pandas groupby with cumulative sum starting from an initial value based on group

I have the following code:

import pandas as pd
import numpy as np
import math

            
def update_cumulative_variables (df_previous, df_new, row_id = 'ID', row_dam = 'A', row_damcum = 'cumA', row_date = 'date', row_years_in_op = 'years_in_op' ):
    """Continue per-ID cumulative columns from df_previous into df_new.

    For every row of ``df_new`` this computes:
      * ``row_damcum``: running total of ``row_dam`` per ID, seeded with the
        last cumulative value recorded for that ID in ``df_previous``;
      * ``row_years_in_op``: the date gap to the ID's last date in
        ``df_previous`` divided by 365 (dates are plain day numbers), added
        to the previously recorded years_in_op.

    NOTE: ``df_new`` is modified in place (the two derived columns are
    written into it) before being concatenated below ``df_previous``.

    Parameters
    ----------
    df_previous : pd.DataFrame
        One row per ID holding the last known state; must contain the
        ``row_id``, ``row_damcum``, ``row_date`` and ``row_years_in_op`` columns.
    df_new : pd.DataFrame
        New observations; must contain ``row_id``, ``row_dam`` and ``row_date``.
    row_id, row_dam, row_damcum, row_date, row_years_in_op : str
        Column names; defaults match the example data.

    Returns
    -------
    pd.DataFrame
        ``df_previous`` with the augmented ``df_new`` appended.

    Raises
    ------
    KeyError
        If an ID present in ``df_new`` has no row in ``df_previous``.
    """
    # Build all three per-ID lookup tables in a single pass instead of
    # iterating df_previous three times; also drops the unused dict `d`
    # the original version created.
    d_damcum, d_yearop, d_date = {}, {}, {}
    for _, row in df_previous.iterrows():
        key = row[row_id]
        d_damcum[key] = row[row_damcum]
        d_yearop[key] = row[row_years_in_op]
        d_date[key] = row[row_date]

    for index, row in df_new.iterrows():
        key = row[row_id]
        new_damcum = row[row_dam] + d_damcum[key]
        # years_in_op is deliberately measured from the *previous snapshot's*
        # date/years (d_date/d_yearop are never updated inside this loop),
        # matching the expected output shown in the question.
        new_yearop = (row[row_date] - d_date[key]) / 365 + d_yearop[key]
        df_new.at[index, row_damcum] = new_damcum
        df_new.at[index, row_years_in_op] = new_yearop
        # Only the cumulative sum rolls forward within df_new itself.
        d_damcum[key] = new_damcum

    return pd.concat([df_previous, df_new], axis=0)

# --- df1: random data with ID/date labels plus derived cumulative columns ---
df1 = pd.DataFrame(
    np.random.randint(25, size=(6, 4)),
    index=["1", "2", "3", "4", "5", "6"],
    columns=["A", "B", "C", "D"],
)
ID_list = [1, 2, 3, 2, 1, 3]
date_list = [4, 4, 4, 5, 5, 5]
df1['ID'] = ID_list
df1['date'] = date_list
# cumA: per-ID running total of A.
# years_in_op: years since each ID's first date (dates are day numbers), offset by 2.
df1 = df1.assign(
    cumA=df1.groupby('ID')['A'].cumsum(),
    years_in_op=df1.groupby('ID')['date'].transform(lambda s: (s - s.min()) / 365 + 2),
)
print('df1')
print(df1)
print(' ')

# Keep only the last (most recent) row per ID as the state snapshot.
g = df1.groupby('ID')
df1_last = g.tail(1)
print('df1_last')
print(df1_last)
print(' ')

# --- df2: a fresh batch of observations without the derived columns ---
df2 = pd.DataFrame(
    np.random.randint(25, size=(6, 4)),
    index=["7", "8", "9", "10", "11", "12"],
    columns=["A", "B", "C", "D"],
)
ID_list = [1, 2, 3, 1, 3, 2]
date_list = [6, 6, 6, 7, 7, 7]
df2['ID'] = ID_list
df2['date'] = date_list

print('df2')
print(df2)
print(' ')

# Continue the cumulative columns into df2, seeded from the df1 snapshot.
df3 = update_cumulative_variables(df1_last, df2)
print('df3')
print(df3)

It returns the following dataframes:

df1
    A   B   C   D  ID  date  cumA  years_in_op
1   5   6  10  13   1     4     5      2.00000
2  23   8   4   0   2     4    23      2.00000
3   0   4  24   4   3     4     0      2.00000
4   7  21   6  24   2     5    30      2.00274
5   3  23   3  12   1     5     8      2.00274
6  12   0  15  10   3     5    12      2.00274
 
df1_last
    A   B   C   D  ID  date  cumA  years_in_op
4   7  21   6  24   2     5    30      2.00274
5   3  23   3  12   1     5     8      2.00274
6  12   0  15  10   3     5    12      2.00274
 
df2
     A   B   C   D  ID  date
7   18   8  12   8   1     6
8    8   7   1   8   2     6
9   19   4   1   0   3     6
10   8  18   4   3   1     7
11  14  20  23  18   3     7
12   4   1  12   1   2     7
 
df3
     A   B   C   D  ID  date  cumA  years_in_op
4    7  21   6  24   2     5  30.0     2.002740
5    3  23   3  12   1     5   8.0     2.002740
6   12   0  15  10   3     5  12.0     2.002740
7   18   8  12   8   1     6  26.0     2.005479
8    8   7   1   8   2     6  38.0     2.005479
9   19   4   1   0   3     6  31.0     2.005479
10   8  18   4   3   1     7  34.0     2.008219
11  14  20  23  18   3     7  45.0     2.008219
12   4   1  12   1   2     7  42.0     2.008219

in df1 cumA column is the cumulative sum of 'A' column grouped by column 'ID'. I then take the last row of df1 for each 'ID' values and I get df1_last. This can be easily written in a pandas one liner : 'df1.assign(cumA=df1.groupby('ID')['A'].cumsum())'

I then have a new dataframe df2, and I would also like to compute the cumulative sum of 'A' grouped by column 'ID' there, but starting at the initial value given in df1_last. I was wondering whether there is a similar pandas one-liner to do that, or whether I have to write a function with a for loop (see the update_cumulative_variables function) to handle it.

I also would like to know a similar approach to compute the years_in_op column: Use a pandas one liner instead of looping through the dataframe.

Thanks for your help!

Matt



Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source