This document is to replicate the GP model from Dew et al. (2018). In particular, we examine how individual daily product usage evolves over time along the calendar time and individual-level predictors (e.g., recency, lifetime, and usage stock).
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(psych)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(cmdstanr)
## This is cmdstanr version 0.5.3
## - CmdStanR documentation and vignettes: mc-stan.org/cmdstanr
## - CmdStan path: /home/3276.cai/.cmdstan/cmdstan-2.32.0
## - CmdStan version: 2.32.0
library(rstan)
## Loading required package: StanHeaders
## rstan (Version 2.21.8, GitRev: 2e1f913d3ca3)
## For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores()).
## To avoid recompilation of unchanged Stan programs, we recommend calling
## rstan_options(auto_write = TRUE)
##
## Attaching package: 'rstan'
## The following object is masked from 'package:psych':
##
## lookup
setwd("~/Documents/pp/Smartick New Data/Smartick New Data/data_20221018")
load("~/Documents/pp/Smartick New Data/Smartick New Data/data_20221018/summ_230428_data.RData")
load("~/Documents/pp/Smartick New Data/Smartick New Data/data_20221018/summ_230428_trial_math.RData")
load("~/Documents/pp/Smartick New Data/Smartick New Data/data_20221018/summ_230428_contract1st.RData")
load("~/Documents/pp/Smartick New Data/Smartick New Data/data_20221018/summ_230428_contract.RData")
table(data_contract_1st$tarifa)
##
## 101 102 103 104 106 107 108 109 111 123 124 125 129 134 135 136
## 6375 6046 2026 758 536 7 354 5 2 16 260 374 2 1132 427 3586
## 140 144 145 146 147 148 149 301
## 11 25 18 23 9 1 1 283
describe(data_contract_1st)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## vars n mean sd median
## id 1 22277 299187.03 88351.76 295856.0
## estado* 2 22277 2.00 0.01 2.0
## alumno 3 22277 810984.38 309415.75 829143.0
## tutor 4 22277 447928.64 171851.41 446924.0
## tarifa 5 22277 113.16 25.71 102.0
## nombre* 6 22277 11.10 7.54 9.0
## pata_negra 7 22277 0.98 0.14 1.0
## fecha_inicio 8 22277 NaN NA NA
## fecha_fin 9 22277 NaN NA NA
## duracion 10 22166 97.54 102.71 89.0
## renovacion_automatica 11 22123 0.38 0.49 0.0
## renovacion_automatica_original 12 1408 0.81 0.39 1.0
## fecha_creacion 13 22277 NaN NA NA
## fecha_cancelacion 14 1 NaN NA NA
## importe 15 22250 87.13 79.44 69.0
## importe_sin_dto 16 22250 94.13 91.17 69.0
## primer_cargo 17 22117 NaN NA NA
## ultimo_cargo 18 22117 NaN NA NA
## num_cargos 19 22123 1.34 1.82 1.0
## moneda* 20 22194 3.01 0.22 3.0
## tipo_descuento* 21 2983 3.11 0.34 3.0
## prescripcion 22 2672 29357.90 7213.35 26778.5
## materia* 23 22277 3.00 0.08 3.0
## fecha_tarifa 24 20090 NaN NA NA
## index 25 22277 1.02 0.20 1.0
## mean_price 26 22264 89.75 79.40 69.0
## num_contract 27 22277 7.32 10.15 4.0
## num_effect_paid 28 22277 5.32 7.83 2.0
## eff_index 29 22277 1.00 0.00 1.0
## lt 30 22166 97.54 102.71 89.0
## trimmed mad min max range
## id 296137.79 74573.30 4 915726.00 915722.00
## estado* 2.00 0.00 1 2.00 1.00
## alumno 809761.82 359960.45 151277 1485604.00 1334327.00
## tutor 442607.01 199658.78 23287 864409.00 841122.00
## tarifa 109.16 1.48 101 301.00 200.00
## nombre* 10.67 2.97 1 24.00 23.00
## pata_negra 1.00 0.00 0 1.00 1.00
## fecha_inicio NaN NA Inf -Inf -Inf
## fecha_fin NaN NA Inf -Inf -Inf
## duracion 72.80 87.47 -1 933.00 934.00
## renovacion_automatica 0.35 0.00 0 1.00 1.00
## renovacion_automatica_original 0.89 0.00 0 1.00 1.00
## fecha_creacion NaN NA Inf -Inf -Inf
## fecha_cancelacion NaN NA Inf -Inf -Inf
## importe 69.29 50.41 0 435.50 435.50
## importe_sin_dto 72.50 51.89 0 477.60 477.60
## primer_cargo NaN NA Inf -Inf -Inf
## ultimo_cargo NaN NA Inf -Inf -Inf
## num_cargos 1.00 0.00 0 15.00 15.00
## moneda* 3.00 0.00 1 10.00 9.00
## tipo_descuento* 3.02 0.00 1 5.00 4.00
## prescripcion 28800.61 5472.28 1245 96623.00 95378.00
## materia* 3.00 0.00 1 3.00 2.00
## fecha_tarifa NaN NA Inf -Inf -Inf
## index 1.00 0.00 1 13.00 12.00
## mean_price 72.75 48.20 0 413.17 413.17
## num_contract 4.95 4.45 1 85.00 84.00
## num_effect_paid 3.58 1.48 0 79.00 79.00
## eff_index 1.00 0.00 1 1.00 0.00
## lt 72.80 87.47 -1 933.00 934.00
## skew kurtosis se
## id 1.04 4.92 591.95
## estado* -86.15 7420.00 0.00
## alumno 0.01 -0.85 2073.07
## tutor 0.16 -0.83 1151.40
## tarifa 5.05 33.34 0.17
## nombre* 0.75 -0.88 0.05
## pata_negra -6.84 44.85 0.00
## fecha_inicio NA NA NA
## fecha_fin NA NA NA
## duracion 2.00 2.73 0.69
## renovacion_automatica 0.49 -1.76 0.00
## renovacion_automatica_original -1.57 0.47 0.01
## fecha_creacion NA NA NA
## fecha_cancelacion NA NA NA
## importe 2.07 3.84 0.53
## importe_sin_dto 2.00 3.00 0.61
## primer_cargo NA NA NA
## ultimo_cargo NA NA NA
## num_cargos 5.55 29.46 0.01
## moneda* 20.86 491.08 0.00
## tipo_descuento* 1.86 4.95 0.01
## prescripcion 0.76 5.28 139.55
## materia* -19.21 409.53 0.00
## fecha_tarifa NA NA NA
## index 20.49 812.71 0.00
## mean_price 1.87 2.89 0.53
## num_contract 2.93 10.19 0.07
## num_effect_paid 3.29 13.81 0.05
## eff_index NaN NaN 0.00
## lt 2.00 2.73 0.69
data_contract_1st$fcon = with(data_contract_1st,ifelse(tarifa %in% c(101,102,103,104),tarifa,ifelse(tarifa %in% c(134,135,136),'summer','Other')))
data_contract_1st$ym = format(data_contract_1st$fecha_inicio,'%y-%m')
with(data_contract_1st,table(ym,fcon))
## fcon
## ym 101 102 103 104 Other summer
## 17-01 64 75 14 6 21 0
## 17-02 182 298 48 16 96 0
## 17-03 158 255 44 13 98 0
## 17-04 98 128 32 9 43 0
## 17-05 101 91 20 6 46 0
## 17-06 40 31 13 4 35 109
## 17-07 10 4 7 5 29 443
## 17-08 64 70 20 7 21 82
## 17-09 77 127 43 16 38 0
## 17-10 170 252 84 16 120 0
## 17-11 105 203 62 27 74 0
## 17-12 94 128 38 9 39 0
## 18-01 188 225 67 24 66 0
## 18-02 372 322 95 39 78 0
## 18-03 277 312 60 28 85 0
## 18-04 243 183 59 28 73 0
## 18-05 145 74 35 23 46 0
## 18-06 74 53 20 9 16 274
## 18-07 19 6 14 3 51 1196
## 18-08 181 50 25 9 38 79
## 18-09 280 290 122 27 51 0
## 18-10 368 404 173 72 110 0
## 18-11 349 288 124 34 47 0
## 18-12 189 163 85 16 59 0
## 19-01 293 294 111 30 69 2
## 19-02 322 365 90 59 48 0
## 19-03 250 270 95 32 58 0
## 19-04 245 154 65 30 47 0
## 19-05 190 97 44 20 29 0
## 19-06 52 63 21 11 32 517
## 19-07 30 3 15 5 67 1770
## 19-08 224 83 34 16 40 130
## 19-09 218 229 108 34 50 0
## 19-10 221 191 89 44 33 0
## 19-11 37 29 9 4 7 0
## 19-12 19 10 7 2 2 0
## 20-01 33 24 8 5 5 0
## 20-02 33 14 8 9 6 0
## 20-03 132 47 7 2 4 0
## 20-04 66 33 1 3 3 0
## 20-05 25 21 0 0 7 0
## 20-06 0 0 1 0 4 289
## 20-07 6 1 1 0 2 86
## 20-08 24 11 0 0 0 0
## 20-09 35 15 3 3 4 0
## 20-10 7 12 1 1 4 0
## 20-11 5 14 0 0 1 0
## 20-12 8 8 0 0 1 0
## 21-01 8 7 1 0 11 0
## 21-02 15 12 1 1 2 0
## 21-03 9 6 0 0 2 0
## 21-04 8 0 0 0 1 0
## 21-05 6 0 0 0 2 0
## 21-06 0 0 0 0 0 113
## 21-07 2 0 0 0 2 54
## 21-08 3 1 0 0 0 0
## 21-09 0 0 0 0 2 0
## 21-11 1 0 0 0 1 0
## 22-01 0 0 0 1 1 0
## 22-07 0 0 0 0 0 1
## 22-08 0 0 1 0 0 0
## 22-10 0 0 1 0 0 0
#data_contract_1st %>% group_by(ym,fcon) %>% summarise(n=n())
ggplot(subset(data_contract_1st,ym<'2019-10-01' & fcon!='Other'),aes(x=ym)) + geom_line(aes(group=fcon,col=fcon),stat = 'count') + theme_bw()+ theme(axis.text.x = element_text(angle=90)) + ggtitle('Number of New Subscriptions')
The average trial usage rates vary significantly across individuals who signup for the free trials at the time of the year and exhibit cyclical patterns.
trial_period = data %>% mutate(t_trialstart=as.numeric(fecha-fecha_MATEMATICAS),t_trialend = as.numeric(fecha - trial_math_end),trialdays=as.numeric(trial_math_end- fecha_MATEMATICAS)+1) %>% group_by(alumno) %>% mutate(cumusage=cumsum(floor(use)))%>% filter(t_trialend<=0 & t_trialstart>=0)
trial_period %>% group_by(alumno,singlekid=(num_kids==1),ym=format(fecha_MATEMATICAS,'%y-%m')) %>% summarise(usagerate=mean(use,na.rm=T),gamerate=mean(mundo_virtual,na.rm=T)) %>% ggplot(aes(x=ym,y=usagerate)) + geom_line(aes(group=singlekid,col=singlekid),stat='summary',fun=mean) + geom_line(aes(group=singlekid,col=singlekid,y=gamerate),stat='summary',fun=mean,linetype=c('dashed')) + theme_bw()+ theme(axis.text.x = element_text(angle=90)) + ggtitle('Average Trial Usage Rates & Gaming Rates Across Individuals')
## `summarise()` has grouped output by 'alumno', 'singlekid'. You can override
## using the `.groups` argument.
trial_period = trial_period %>% select(-'estado') %>% left_join(data_contract_1st[,c('alumno','tarifa','fecha_inicio','fecha_fin','fecha_creacion','fecha_cancelacion','importe','num_contract','num_effect_paid','mean_price','estado')],by='alumno')
trial_period$t = as.numeric(trial_period$fecha - min(trial_period$fecha))+1
trial_period = trial_period %>% group_by(alumno,cumusage) %>% mutate(recen = 1,recen=cumsum(recen)-1)
trial_period$recen[trial_period$recen==0 & trial_period$t_trialstart!=0] = 1
data_trialperiod = trial_period[trial_period$trialdays==15,] %>% select(alumno,fecha,t_trialstart,t_trialend,days,t,use,mundo_virtual,cumusage)
data_trialperiod$last = c(0,data_trialperiod$cumusage[0:(nrow(data_trialperiod)-1)])
data_trialperiod$last[data_trialperiod$t_trialstart==0] = 0
data_trialperiod = data_trialperiod %>% group_by(alumno,last) %>% mutate(recen=1,recen=cumsum(recen))
data_trialperiod$recen[data_trialperiod$t_trialstart==0] = 0
data_trialperiod = data_trialperiod %>% group_by(alumno) %>% mutate(trialdays=n()) %>% select(-last)
data_trialperiod = data_trialperiod[data_trialperiod$trialdays==15,]
length(unique(data_trialperiod$alumno))
## [1] 113668
names(trial_period)
## [1] "alumno" "nprogram_t1" "fecha_MATEMATICAS"
## [4] "trial_math_end" "num_kids" "fecha"
## [7] "id_sesion" "tipo" "efectividad"
## [10] "tmr" "mundo_virtual" "orden"
## [13] "orden_program" "pregunta_inicio" "pregunta_fin"
## [16] "calificacion" "velocidad" "correccion"
## [19] "num_problemas" "program" "use"
## [22] "days" "min_fecha" "t_trialstart"
## [25] "t_trialend" "trialdays" "cumusage"
## [28] "tarifa" "fecha_inicio" "fecha_fin"
## [31] "fecha_creacion" "fecha_cancelacion" "importe"
## [34] "num_contract" "num_effect_paid" "mean_price"
## [37] "estado" "t" "recen"
range(trial_period$t)
## [1] 1 1095
Accommodating parameter evolution is important in our context. and in many marketing contexts in which consumers’ preferences and responsiveness to marketing activities change over time.
An individual’s probability of using the product will change with the passage of time since subscribing a new contract and over the lifetime
use_cont = data_contract_1st[data_contract_1st$tarifa %in%c(101,102,103,104),c('alumno','tutor','tarifa','fecha_inicio','fecha_fin','fecha_creacion','index','tipo_descuento','importe','mean_price','num_contract','num_effect_paid')]
use_cont = data %>% right_join(use_cont,by='alumno')
use_cont = use_cont[use_cont$fecha_fin-use_cont$fecha>=0,]
use_cont_2 = use_cont[use_cont$fecha_inicio - use_cont$fecha <=0,]
gp = use_cont_2 %>% group_by(alumno,tarifa=factor(tarifa),ym=format(fecha_inicio,'%y-%m')) %>% summarise(n=n(),usagerate=mean(use,na.rm=T))
## `summarise()` has grouped output by 'alumno', 'tarifa'. You can override using
## the `.groups` argument.
table(gp$tarifa)
##
## 101 102 103 104
## 5937 5818 1992 733
gp %>% ggplot(aes(x=usagerate,fill=tarifa,col=tarifa)) + geom_histogram(alpha=0.4) + ggtitle('Distribution of Overall Usage Rates By Contract Type')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
gp %>% ggplot(aes(x=ym,col=tarifa,fill=tarifa)) + geom_bar(stat='count',alpha=0.4)+ theme(axis.text.x = element_text(angle=90)) + ggtitle('Number of New Contracts by Year-Month')
gp %>% group_by(ym,tarifa) %>% mutate(n=n()) %>% filter(n>30) %>% ggplot(aes(x=ym,y=usagerate)) + geom_boxplot()+ theme(axis.text.x = element_text(angle=90)) + facet_wrap(~tarifa) + ggtitle('Distribution of Usage Rates Under 1st Contract by Subscription Time')
use_cont_2 %>% mutate(t=as.numeric(fecha-fecha_inicio))%>% group_by(t,tarifa=factor(tarifa)) %>% summarise(n=n(),usagerate=mean(use,na.rm=T)) %>% group_by(tarifa) %>% mutate(max=max(n),surv=n/max) %>% filter(n>20) %>% ggplot(aes(x=t,group=tarifa,col=tarifa,y=usagerate)) + geom_line() + theme_bw() + ggtitle('Average Usage Rates By Tarifa')#+ geom_line(aes(y=surv))
## `summarise()` has grouped output by 't'. You can override using the `.groups`
## argument.
use_cont_2 = use_cont_2 %>% mutate(t=as.numeric(fecha-fecha_inicio))
use_cont_2 %>% mutate(dow=factor(weekdays(fecha,T),levels=c('Mon','Tue','Wed','Thu','Fri','Sat','Sun'))) %>% group_by(dow) %>% summarise(usagerate=mean(use,na.rm=T))
## # A tibble: 7 × 2
## dow usagerate
## <fct> <dbl>
## 1 Mon 0.682
## 2 Tue 0.674
## 3 Wed 0.666
## 4 Thu 0.647
## 5 Fri 0.532
## 6 Sat 0.560
## 7 Sun 0.637
use_cont_2 %>% mutate(dow=factor(weekdays(fecha,T),levels=c('Mon','Tue','Wed','Thu','Fri','Sat','Sun'))) %>% group_by(dow,alumno) %>% summarise(usagerate=mean(use,na.rm=T)) %>% ggplot(aes(x=dow,y=usagerate)) + geom_bar(stat='summary',fun=mean) + ggtitle('Usage Rates Within Indivduals by Day of Week')
## `summarise()` has grouped output by 'dow'. You can override using the `.groups`
## argument.
use_cont_2$index = c(0,as.integer(use_cont_2$use)[1:(nrow(use_cont_2)-1)])
use_cont_2$index[use_cont_2$t==0] = 0
use_cont_2 = use_cont_2 %>% group_by(alumno) %>% mutate(index= cumsum(index))
use_cont_2 = use_cont_2 %>% group_by(alumno,index) %>% mutate(recen=1,recen=cumsum(recen))
use_cont_2$t_calen = as.numeric(use_cont_2$fecha-min(use_cont_2$fecha))
data103 = subset(use_cont_2,tarifa==103 & fecha_inicio >='2018-01-01' & fecha_inicio <='2018-04-01')
data103$t_calen = as.numeric(data103$fecha - min(data103$fecha))
data103 %>% group_by(t_calen) %>% summarise(n=n()) %>% ggplot(aes(x=t_calen,y=n)) + geom_line(aes(group=1))
data103 %>% group_by(t_calen) %>% summarise(n=n(),usagerate=mean(use)) %>% ggplot(aes(x=t_calen,y=usagerate)) + geom_line(aes(group=1))
data103 %>% group_by(t) %>% summarise(n=n(),usagerate=mean(use)) %>% mutate(sur=n/max(n)) %>% filter(sur>0.7) %>% ggplot(aes(x=t,y=usagerate)) + geom_line(aes(group=1))
data103.id = unique(data103$alumno)
subset(data103, alumno %in% sample(data103.id,16)) %>% ggplot(aes(x=t,y=index)) + geom_line(aes(group=1)) + facet_wrap(~alumno)
data103 = data103 %>% group_by(alumno) %>% mutate(y=as.integer(use),usecum= cumsum(y))
data103$id = ifelse(duplicated(data103$alumno),0,1)
id = cumsum(data103$id)
data103$id = id
\(\alpha\)
data102 = subset(use_cont_2,tarifa==102 & fecha_inicio >='2018-01-01' & fecha_inicio <='2018-04-01')
data102$t_calen = as.numeric(data102$fecha - min(data102$fecha))
data102 %>% group_by(t_calen) %>% summarise(n=n()) %>% ggplot(aes(x=t_calen,y=n)) + geom_line(aes(group=1))
data102 %>% group_by(t_calen) %>% summarise(n=n(),usagerate=mean(use)) %>% ggplot(aes(x=t_calen,y=usagerate)) + geom_line(aes(group=1))
data102 %>% group_by(t) %>% summarise(n=n(),usagerate=mean(use)) %>% mutate(sur=n/max(n)) %>% filter(sur>0.7) %>% ggplot(aes(x=t,y=usagerate)) + geom_line(aes(group=1))
data102.id = unique(data102$alumno)
subset(data102, alumno %in% sample(data102.id,16)) %>% ggplot(aes(x=t,y=index)) + geom_line(aes(group=1)) + facet_wrap(~alumno)
data102 = data102 %>% group_by(alumno) %>% mutate(y=as.integer(use),usecum= cumsum(y))
data102$id = ifelse(duplicated(data102$alumno),0,1)
id = cumsum(data102$id)
data102$id = id
#save(fit_list,file='summ_230428_fit_list.RData')
load('~/Documents/pp/Smartick New Data/Smartick New Data/data_20221018/summ_230428_fit_list.RData')
dashdf <- with(fit_list, { rbind(data.frame(t=1:ncol(alpha_long),
plot="Calendar, Long-run",
med=apply(alpha_long,2,median),
lower=apply(alpha_long,2,quantile,probs=0.025),
upper=apply(alpha_long,2,quantile,probs=0.975)),
data.frame(t=1:ncol(alpha_short),
plot="Calendar, Short-run",
med=apply(alpha_short,2,median),
lower=apply(alpha_short,2,quantile,probs=0.025),
upper=apply(alpha_short,2,quantile,probs=0.975)),
data.frame(t=1:ncol(alpha_week),
plot="Calendar, Weekly",
med=apply(alpha_week,2,median),
lower=apply(alpha_week,2,quantile,probs=0.025),
upper=apply(alpha_week,2,quantile,probs=0.975)),
data.frame(t=1:ncol(alpha_rec),
plot="Recency",
med=apply(alpha_rec,2,median),
lower=apply(alpha_rec,2,quantile,probs=0.025),
upper=apply(alpha_rec,2,quantile,probs=0.975)),
data.frame(t=1:ncol(alpha_life),
plot="Lifetime",
med=apply(alpha_life,2,median),
lower=apply(alpha_life,2,quantile,probs=0.025),
upper=apply(alpha_life,2,quantile,probs=0.975)),
data.frame(t=1:ncol(alpha_pnum),
plot="Purchase Number",
med=apply(alpha_pnum,2,median),
lower=apply(alpha_pnum,2,quantile,probs=0.025),
upper=apply(alpha_pnum,2,quantile,probs=0.975))) })
colvec <- c(rep("blue2",3),rep("red2",3))
ggplot()+
geom_ribbon(data=dashdf,aes(x=t,ymin=lower,ymax=upper,fill=plot),alpha=0.15,color=NA)+
geom_line(data=dashdf,aes(x=t,y=med,color=plot))+
scale_color_manual(values=colvec)+
scale_fill_manual(values=colvec)+
facet_wrap(~plot,ncol=3,scales="free")+
ylab("Function Value")+
xlab("Input")+
theme_bw()+
theme(legend.position="none")