I have given it some more thought and I think this is what you are looking for.
import pandas as pd
import numpy as np
def myfunc(x):
df = x
# Shift columns activity_start and performed_at_start
df['performed_at_end'] = df['performed_at_start'].shift(-1)
df['activity_end'] = df['activity_start'].shift(-1)
# Make combinations of activity start-end
df['activity_start_end'] = df['activity_start']+'-'+df['activity_end']
# Take only those that start with deactivated, end with activated
df = df[df['activity_start_end']=='deactivated-activated']
# Drop all that don't have performed_at_end date (does not exist)
df = df[~pd.isna(df['performed_at_end'])]
# Compute time difference in days, then return sum of all delta's
df['delta'] = (df['performed_at_end']-df['performed_at_start'])/np.timedelta64(1,'D')
return df['delta'].sum()
# Example dataframe
df = pd.DataFrame({'UserId': [1]*4+[2]*2+[3]*1,
'activity_start': ['activated', 'deactivated', 'activated', 'deactivated', 'deactivated', 'activated', 'activated'],
'performed_at_start': [pd.Timestamp(2020,1,1), pd.Timestamp(2020,1,2), pd.Timestamp(2020,1,6), pd.Timestamp(2020,1,8),
pd.Timestamp(2020,1,1), pd.Timestamp(2020,1,3), pd.Timestamp(2020,1,1)]})
# Show dataframe
print(df)
UserId activity_start performed_at_start
0 1 activated 2020-01-01
1 1 deactivated 2020-01-02
2 1 activated 2020-01-06
3 1 deactivated 2020-01-08
4 2 deactivated 2020-01-01
5 2 activated 2020-01-03
6 3 activated 2020-01-01
# Compute result
res = (
df.groupby(by='UserId')
.apply(lambda x: myfunc(x)).reset_index(drop=False)
)
res.columns = ['UserId', 'time_spent_deactivated']
# Show result
print(res)
UserId time_spent_deactivated
0 1 4.0
1 2 2.0
2 3 0.0
与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…