library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## Loading required package: weights
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
## Loading required package: assertthat
##
## Attaching package: 'assertthat'
## The following object is masked from 'package:tibble':
##
## has_name
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
## Loading required package: psych
##
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
##
## describe
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## Loading required package: metafor
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loading the 'metafor' package (version 3.0-2). For an
## introduction to the package please type: help(metafor)
## Loading required package: rlang
##
## Attaching package: 'rlang'
## The following object is masked from 'package:magrittr':
##
## set_names
## The following object is masked from 'package:assertthat':
##
## has_name
## The following objects are masked from 'package:purrr':
##
## %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
## flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
## splice
##
## Attaching package: 'kirkegaard'
## The following object is masked from 'package:rlang':
##
## is_logical
## The following object is masked from 'package:psych':
##
## rescale
## The following object is masked from 'package:assertthat':
##
## are_equal
## The following objects are masked from 'package:purrr':
##
## is_logical, is_numeric
## The following object is masked from 'package:base':
##
## +
load_packages(
lubridate,
patchwork,
rms,
ggeffects
)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
##
## Attaching package: 'rms'
## The following object is masked from 'package:metafor':
##
## vif
theme_set(theme_classic())
Need to manually download the data export.
d = read_csv("kvSQPM61RlWnyZnfIr5ivw.csv") %>%
df_legalize_names()
## Rows: 305 Columns: 33
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): Email, Name, Subscription type, Revenue, Subscription source (fre...
## dbl (21): Activity, Comments (all time), Comments (last 7 days), Comments (...
## dttm (5): Subscription date, Expires at, First payment at, Paid upgrade dat...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#simplify date, remove time
d %<>% mutate(
date = as.Date(Subscription_date)
)
#get totals by date
#somewhat complicated method
d_totals = map_df(seq(min(d$date) - days(1), max(d$date), by = "1 day"), function(i_date) {
#count by that date
tibble(
date = i_date,
total_subs = d %>% filter(date <= i_date) %>% nrow()
)
})
#add gains
d_totals %<>% mutate(
sub_gains = diff(c(0, total_subs), lag = 1),
date_num = as.numeric(date)
)
#plot change over time
ggplot(d_totals, aes(date, total_subs)) +
geom_line() +
ylab("Total subs") +
ggplot(d_totals, aes(date, sub_gains)) +
geom_line() +
ylab("New subs") +
geom_hline(yintercept = 0, linetype = "dotted")
#fit a model to predict value for end of year
ols(total_subs ~ date_num, data = d_totals) -> model1
model1
## Linear Regression Model
##
## ols(formula = total_subs ~ date_num, data = d_totals)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 87 LR chi2 228.96 R2 0.928
## sigma18.4247 d.f. 1 R2 adj 0.927
## d.f. 85 Pr(> chi2) 0.0000 g 76.397
##
## Residuals
##
## Min 1Q Median 3Q Max
## -91.1473 -8.8041 0.9823 11.2244 27.0709
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -49158.6704 1490.8002 -32.97 <0.0001
## date_num 2.6044 0.0787 33.11 <0.0001
##
ols(total_subs ~ rcs(date_num, 3), data = d_totals) -> model2
model2
## Linear Regression Model
##
## ols(formula = total_subs ~ rcs(date_num, 3), data = d_totals)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 87 LR chi2 290.71 R2 0.965
## sigma12.9975 d.f. 2 R2 adj 0.964
## d.f. 84 Pr(> chi2) 0.0000 g 76.397
##
## Residuals
##
## Min 1Q Median 3Q Max
## -65.984 -5.788 2.678 7.451 19.187
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -75133.0821 2979.6320 -25.22 <0.0001
## date_num 3.9767 0.1574 25.27 <0.0001
## date_num' -1.8297 0.1964 -9.32 <0.0001
##
#forecast to end of year
#try with ggpredict
#end of year as numeric value
as.Date("2023-01-01") %>% as.numeric() -> target_date_num
ggpredict(
model2,
terms = str_glue("date_num [18910:{target_date_num} by=1]")
) %>% as.data.frame() %>% mutate(
date = as.Date(x, origin = origin),
observed = c(d_totals$total_subs, rep(NA, n() - nrow(d_totals)))
) -> pred_values
pred_values %>%
ggplot(aes(date)) +
geom_point(aes(y = observed)) +
geom_line(aes(y = predicted), color = "blue") +
geom_ribbon(aes(y = predicted, ymax = conf.high, ymin = conf.low), alpha = .1) +
geom_text(data = pred_values[nrow(pred_values), ], aes(y = predicted, label = round(predicted)), nudge_y = 5) +
geom_text(data = pred_values[nrow(pred_values), ], aes(y = conf.high, label = round(conf.high)), nudge_y = 5) +
geom_text(data = pred_values[nrow(pred_values), ], aes(y = conf.low, label = round(conf.low)), nudge_y = 5)
## Warning: Removed 362 rows containing missing values (geom_point).