Init

library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## Loading required package: weights
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## Loading required package: assertthat
## 
## Attaching package: 'assertthat'
## The following object is masked from 'package:tibble':
## 
##     has_name
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
## Loading required package: psych
## 
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
## 
##     describe
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## Loading required package: metafor
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loading the 'metafor' package (version 3.0-2). For an
## introduction to the package please type: help(metafor)
## Loading required package: rlang
## 
## Attaching package: 'rlang'
## The following object is masked from 'package:magrittr':
## 
##     set_names
## The following object is masked from 'package:assertthat':
## 
##     has_name
## The following objects are masked from 'package:purrr':
## 
##     %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
##     flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
##     splice
## 
## Attaching package: 'kirkegaard'
## The following object is masked from 'package:rlang':
## 
##     is_logical
## The following object is masked from 'package:psych':
## 
##     rescale
## The following object is masked from 'package:assertthat':
## 
##     are_equal
## The following objects are masked from 'package:purrr':
## 
##     is_logical, is_numeric
## The following object is masked from 'package:base':
## 
##     +
load_packages(
  lubridate,
  patchwork,
  rms,
  ggeffects
)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
## 
## Attaching package: 'rms'
## The following object is masked from 'package:metafor':
## 
##     vif
theme_set(theme_classic())

Data

Need to manually download the data export.

d = read_csv("kvSQPM61RlWnyZnfIr5ivw.csv") %>% 
  df_legalize_names()
## Rows: 305 Columns: 33
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (7): Email, Name, Subscription type, Revenue, Subscription source (fre...
## dbl  (21): Activity, Comments (all time), Comments (last 7 days), Comments (...
## dttm  (5): Subscription date, Expires at, First payment at, Paid upgrade dat...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#simplify date, remove time
d %<>% mutate(
  date = as.Date(Subscription_date)
)

#get totals by date
#somewhat complicated method
d_totals = map_df(seq(min(d$date) - days(1), max(d$date), by = "1 day"), function(i_date) {
  #count by that date
  tibble(
    date = i_date,
    total_subs = d %>% filter(date <= i_date) %>% nrow()
  )
})

#add gains
d_totals %<>% mutate(
  sub_gains = diff(c(0, total_subs), lag = 1),
  
  date_num = as.numeric(date)
)

Analysis

#plot change over time
ggplot(d_totals, aes(date, total_subs)) +
  geom_line() +
  ylab("Total subs") +

  ggplot(d_totals, aes(date, sub_gains)) +
  geom_line() +
  ylab("New subs") +
  geom_hline(yintercept = 0, linetype = "dotted")

#fit a model to predict value for end of year
ols(total_subs ~ date_num, data = d_totals) -> model1
model1
## Linear Regression Model
##  
##  ols(formula = total_subs ~ date_num, data = d_totals)
##  
##                   Model Likelihood    Discrimination    
##                         Ratio Test           Indexes    
##  Obs       87    LR chi2    228.96    R2       0.928    
##  sigma18.4247    d.f.            1    R2 adj   0.927    
##  d.f.      85    Pr(> chi2) 0.0000    g       76.397    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -91.1473  -8.8041   0.9823  11.2244  27.0709 
##  
##  
##            Coef        S.E.      t      Pr(>|t|)
##  Intercept -49158.6704 1490.8002 -32.97 <0.0001 
##  date_num       2.6044    0.0787  33.11 <0.0001 
## 
ols(total_subs ~ rcs(date_num, 3), data = d_totals) -> model2
model2
## Linear Regression Model
##  
##  ols(formula = total_subs ~ rcs(date_num, 3), data = d_totals)
##  
##                   Model Likelihood    Discrimination    
##                         Ratio Test           Indexes    
##  Obs       87    LR chi2    290.71    R2       0.965    
##  sigma12.9975    d.f.            2    R2 adj   0.964    
##  d.f.      84    Pr(> chi2) 0.0000    g       76.397    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -65.984  -5.788   2.678   7.451  19.187 
##  
##  
##            Coef        S.E.      t      Pr(>|t|)
##  Intercept -75133.0821 2979.6320 -25.22 <0.0001 
##  date_num       3.9767    0.1574  25.27 <0.0001 
##  date_num'     -1.8297    0.1964  -9.32 <0.0001 
## 
#forecast to end of year
#try with ggpredict
#end of year as numeric value
as.Date("2023-01-01") %>% as.numeric() -> target_date_num

ggpredict(
  model2,
  terms = str_glue("date_num [18910:{target_date_num} by=1]")
) %>% as.data.frame() %>% mutate(
  date = as.Date(x, origin = origin),
  observed = c(d_totals$total_subs, rep(NA, n() - nrow(d_totals)))
) -> pred_values

pred_values %>% 
  ggplot(aes(date)) +
  geom_point(aes(y = observed)) +
  geom_line(aes(y = predicted), color = "blue") +
  geom_ribbon(aes(y = predicted, ymax = conf.high, ymin = conf.low), alpha = .1) +
  geom_text(data = pred_values[nrow(pred_values), ], aes(y = predicted, label = round(predicted)), nudge_y = 5) +
  geom_text(data = pred_values[nrow(pred_values), ], aes(y = conf.high, label = round(conf.high)), nudge_y = 5) +
    geom_text(data = pred_values[nrow(pred_values), ], aes(y = conf.low, label = round(conf.low)), nudge_y = 5)
## Warning: Removed 362 rows containing missing values (geom_point).