Harlan D. Harris, PhD
NY Open Statistical Programming Meetup, May 2017
* AKA, “Cancelling Classes for Fun and Profit”
years = 3; dpm = 28
mo <- function(t) 1 + (t-1) %/% dpm
dom <- function(t) 1 + (t-1) %% dpm
dat <- tibble(t=seq.int(years*12*dpm),
t_mo=mo(t),
t_dom=dom(t),
bts=t_mo%%12 == 8, # Back to School!
set=factor(ifelse(t > 33*dpm, "Test", "Train"),
levels=c("Train", "Test")),
sales = 100 + (t/5) + 3 *
arima.sim(list(order = c(1,1,0),
ar = .1),
n = years*12*dpm - 1))
# add bts effect
dat %<>% mutate(sales = sales * (1 + .25 * bts))
# add dom effect
dat %<>% mutate(sales = sales * (1 + .4 * (t_dom/28)^2))
# add both additive and multiplicitive noise
dat %<>% mutate(sales = sales *
rnorm(years*12*dpm, 1, .2) +
rnorm(years*12*dpm, 0, 20))
# force to be *nonnegative integer*
dat %<>% mutate(sales = pmax(0, round(sales)))
monthly <- dat %>%
group_by(t_mo, set, bts) %>%
summarise(sales=sum(sales))
glimpse(monthly)
Observations: 36
Variables: 4
$ t_mo <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1...
$ set <fctr> Train, Train, Train, Train, Train, Train, Train, Train,...
$ bts <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, F...
$ sales <dbl> 3405, 2924, 3474, 3875, 3788, 4060, 4671, 6059, 5039, 53...
daily <- dat %>%
group_by(t_mo) %>%
mutate(monthly_sales=sum(sales),
bod_sales=lag(sales, default = 0),
prop_sales = cumsum(bod_sales) /
monthly_sales,
prop_month=(t_dom-1) / dpm) %>%
ungroup()
glimpse(daily)
Observations: 1,008
Variables: 10
$ t <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...
$ t_mo <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
$ t_dom <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...
$ bts <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
$ set <fctr> Train, Train, Train, Train, Train, Train, Train...
$ sales <dbl> 115, 91, 109, 125, 137, 121, 110, 108, 93, 93, 1...
$ monthly_sales <dbl> 3405, 3405, 3405, 3405, 3405, 3405, 3405, 3405, ...
$ bod_sales <dbl> 0, 115, 91, 109, 125, 137, 121, 110, 108, 93, 93...
$ prop_sales <dbl> 0.00000000, 0.03377386, 0.06049927, 0.09251101, ...
$ prop_month <dbl> 0.00000000, 0.03571429, 0.07142857, 0.10714286, ...
\[ P(\text{forecast sales}=y \vert \text{sales to date}=x) \propto \\ P(\text{sales to date}=x | \text{frac of month}=z, \text{forecast sales}=y) \cdot P(\text{forecast sales}=y) \\ \equiv \\ P(\text{fraction sales to date}=\frac{x}{y}|\text{fraction of month}=z) \cdot P(\text{forecast sales} = y) \]
\[ \text{sales} = f(t,x) + N(0, \sigma) \\ \text{sales} = f(t,x) + N(0, g(t,x)) \\ \text{sales} = f(t,x) + ?(0, g(t,x), h(t,x)) \]
“Mathematically speaking, GAM is an additive modeling technique where the impact of the predictive variables is captured through smooth functions which—depending on the underlying patterns in the data—can be nonlinear:”
GAM: The Predictive Modeling Silver Bullet (Kim Larsen, Stichfix, 2015)
\[ log(y) = s(time) + seasonality + holiday + changepoints + error \]
e.g.:
monthly_training <- filter(monthly, set=="Train")
prior_fit <- gamlss(log(sales) ~ cs(t_mo) + bts,
sigma.formula = ~ t_mo, nu.formula = ~ t_mo,
family=TF, data=monthly_training,
control=gamlss.control(trace=FALSE))
Family: c("TF", "t Family")
Fitting method: RS()
Call:
gamlss(formula = log(sales) ~ cs(t_mo) + bts, sigma.formula = ~t_mo,
nu.formula = ~t_mo, family = TF, data = monthly_training,
control = gamlss.control(trace = FALSE))
Mu Coefficients:
(Intercept) cs(t_mo) btsTRUE
8.27662 0.01552 0.25989
Sigma Coefficients:
(Intercept) t_mo
-2.755428 0.007908
Nu Coefficients:
(Intercept) t_mo
2.1004 0.3959
Degrees of Freedom for the fit: 9.999 Residual Deg. of Freedom 23
Global Deviance: -78.6846
AIC: -58.6858
SBC: -43.7216
prior_dist <- function(t_mo, bts, sales_range=2000:15000) {
with(predictAll(prior_fit,
newdata=data.frame(t_mo=t_mo, bts=bts)),
data_frame(sales=sales_range,
d=dTF(log(sales_range), mu=mu,
sigma=sigma, nu=nu),
t_mo=t_mo, bts=bts) %>%
mutate(d=d/sum(d)))
}
samp_priors <- data_frame(t_mo=c(15,36),
bts=c(FALSE, TRUE)) %>%
by_row(~ prior_dist(.$t_mo, .$bts),
.collate='rows')
\[ P(\text{sales}=y \vert \text{sales to date}=x) \propto \\ P(\text{sales to date}=x | \text{frac of month}=z, \text{sales}=y) \cdot P(\text{sales}=y) \\ \text{or} \\ P(\text{fraction sales to date}=\frac{x}{y}|\text{fraction of month}=z) \cdot P(\text{sales} = y) \]
daily_training <- filter(daily,
set=="Train")
timecourse_fit <-
gamlss(prop_sales ~ cs(prop_month),
sigma.formula = ~ cs(prop_month),
nu.formula = ~ cs(prop_month),
data=daily_training,
family=BEINF0)
GAMLSS-RS iteration 1: Global Deviance = -2658.72
GAMLSS-RS iteration 2: Global Deviance = -3675.776
GAMLSS-RS iteration 3: Global Deviance = -3864.726
GAMLSS-RS iteration 4: Global Deviance = -3918.817
GAMLSS-RS iteration 5: Global Deviance = -3936.177
GAMLSS-RS iteration 6: Global Deviance = -3942.352
GAMLSS-RS iteration 7: Global Deviance = -3944.684
GAMLSS-RS iteration 8: Global Deviance = -3945.591
GAMLSS-RS iteration 9: Global Deviance = -3945.944
GAMLSS-RS iteration 10: Global Deviance = -3946.058
GAMLSS-RS iteration 11: Global Deviance = -3946.11
GAMLSS-RS iteration 12: Global Deviance = -3946.134
GAMLSS-RS iteration 13: Global Deviance = -3946.145
GAMLSS-RS iteration 14: Global Deviance = -3946.15
GAMLSS-RS iteration 15: Global Deviance = -3946.152
GAMLSS-RS iteration 16: Global Deviance = -3946.153
Family: c("BEINF0", "Beta Inflated zero")
Fitting method: RS()
Call:
gamlss(formula = prop_sales ~ cs(prop_month), sigma.formula = ~cs(prop_month),
nu.formula = ~cs(prop_month), family = BEINF0,
data = daily_training)
Mu Coefficients:
(Intercept) cs(prop_month)
-2.614 4.886
Sigma Coefficients:
(Intercept) cs(prop_month)
-2.57953 -0.01477
Nu Coefficients:
(Intercept) cs(prop_month)
-2.333 -71.074
Degrees of Freedom for the fit: 15 Residual Deg. of Freedom 909
Global Deviance: -3946.15
AIC: -3916.15
SBC: -3843.72
likelihood_dist <- function(prop_month, sales_to_date,
sales_range=2000:15000) {
with(predictAll(timecourse_fit,
newdata=data.frame(prop_month=prop_month)),
tibble(prop_sales=pmin(1, sales_to_date / sales_range),
sales=sales_range,
d=dBEINF0(prop_sales, mu=mu,
sigma=sigma, nu=nu)) %>%
mutate(d = ifelse(prop_sales >= 1, 0, d),
d = d/sum(d)))
}
samp_likelihoods <- tibble(prop_month=c(.03, .5, .90),
std=c(200, 3000, 5000)) %>%
by_row(~ likelihood_dist(.$prop_month, .$std),
.collate='rows')
\[ \forall y, P(\text{sales}=y \vert \text{sales to date}=x) \propto \\ P(\text{sales to date}=x | \text{frac of month}=z, \text{sales}=y) \cdot \\ P(\text{sales}=y) \]
Algorithm:
sample_prior <- prior_dist(36, FALSE)
sales_mode <-
function(dd) dd$sales[[which.max(dd$d)]]
sales_thresh <-
function(dd, th=10000) sum(dd$d[dd$sales >= th])
sample_llh <-
likelihood_dist(prop_month = 10/28,
sales_to_date = 4000)
sample_post <-
data_frame(sales=sample_prior$sales,
d=sample_prior$d * sample_llh$d) %>%
mutate(d = d/sum(d))
\[ X_t = c + \sum_{i=1}^p \phi_i X_{t-1} + \epsilon_t \]
monthly_ts <- ts(filter(monthly,
set=="Train")$sales,
start=c(2014,1), frequency=12)
monthly_fc <- forecast(auto.arima(monthly_ts,
max.q=0), 3)