SHAIKH_MARIUM_0082762

knitr::opts_chunk$set(
  echo    = TRUE,
  message = FALSE,
  warning = FALSE,
  fig.width  = 9,
  fig.height = 5,
  fig.align  = "center"
)

# ═════════════════════════════════════════════════════════════════════════════
# HOW TO USE THIS SCRIPT
# ───────────────────────
# Run this script section by section (Ctrl+Enter in RStudio).
# Each section produces output you will need to answer the exam questions.
# Do NOT just source the whole file at once — read each section first,
# understand what it does, then run it and study the output before moving on.
#
# The 20 exam questions follow the same order as the sections below.
# ═══════════════════════════════════════════════════════════════════════════════


# ── SETUP ────────────────────────────────────────────────────────────────────
# Install fpp3 the first time only (remove the # to run):
# install.packages("fpp3")

library(fpp3)   # loads tsibble, fable, feasts, ggplot2, dplyr, lubridate

## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr

## Registered S3 methods overwritten by 'ggtime':
##   method                        from      
##   +.gg_tsensemble               feasts    
##   autolayer.fbl_ts              fabletools
##   autolayer.tbl_ts              fabletools
##   autoplot.dcmp_ts              fabletools
##   autoplot.fbl_ts               fabletools
##   autoplot.tbl_cf               feasts    
##   autoplot.tbl_ts               fabletools
##   chooseOpsMethod.gg_tsensemble feasts    
##   fortify.fbl_ts                fabletools
##   grid.draw.gg_tsensemble       feasts    
##   print.gg_tsensemble           feasts    
##   scale_type.cf_lag             feasts

## Warning: replacing previous import 'feasts::scale_x_cf_lag' by
## 'ggtime::scale_x_cf_lag' when loading 'fpp3'

## Warning: replacing previous import 'feasts::gg_season' by 'ggtime::gg_season'
## when loading 'fpp3'

## Warning: replacing previous import 'feasts::gg_tsresiduals' by
## 'ggtime::gg_tsresiduals' when loading 'fpp3'

## Warning: replacing previous import 'feasts::gg_irf' by 'ggtime::gg_irf' when
## loading 'fpp3'

## Warning: replacing previous import 'feasts::gg_arma' by 'ggtime::gg_arma' when
## loading 'fpp3'

## Warning: replacing previous import 'feasts::gg_tsdisplay' by
## 'ggtime::gg_tsdisplay' when loading 'fpp3'

## Warning: replacing previous import 'feasts::gg_subseries' by
## 'ggtime::gg_subseries' when loading 'fpp3'

## Warning: replacing previous import 'feasts::gg_lag' by 'ggtime::gg_lag' when
## loading 'fpp3'

## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.3 ──

## ✔ tibble      3.2.1     ✔ tsibble     1.1.6
## ✔ dplyr       1.1.4     ✔ tsibbledata 0.4.1
## ✔ tidyr       1.3.1     ✔ ggtime      0.2.0
## ✔ lubridate   1.9.4     ✔ feasts      0.4.2
## ✔ ggplot2     4.0.0     ✔ fable       0.5.0

## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()        masks base::date()
## ✖ dplyr::filter()          masks stats::filter()
## ✖ feasts::gg_arma()        masks ggtime::gg_arma()
## ✖ feasts::gg_irf()         masks ggtime::gg_irf()
## ✖ feasts::gg_lag()         masks ggtime::gg_lag()
## ✖ feasts::gg_season()      masks ggtime::gg_season()
## ✖ feasts::gg_subseries()   masks ggtime::gg_subseries()
## ✖ feasts::gg_tsdisplay()   masks ggtime::gg_tsdisplay()
## ✖ feasts::gg_tsresiduals() masks ggtime::gg_tsresiduals()
## ✖ tsibble::intersect()     masks base::intersect()
## ✖ tsibble::interval()      masks lubridate::interval()
## ✖ dplyr::lag()             masks stats::lag()
## ✖ feasts::scale_x_cf_lag() masks ggtime::scale_x_cf_lag()
## ✖ tsibble::setdiff()       masks base::setdiff()
## ✖ tsibble::union()         masks base::union()

# ═══════════════════════════════════════════════════════════════════════════════
# SECTION 1 — LOAD DATA & CREATE TSIBBLE
# ═══════════════════════════════════════════════════════════════════════════════

# Read CSV. The Month column is formatted as "Jan-07", "Feb-07", ...
# my() from lubridate parses this format into a Date.
# yearmonth() converts that Date into a tsibble-compatible monthly index.

novabrew <- read.csv("NovaBrew_Revenue.csv") |>
  mutate(Month = yearmonth(my(Month))) |>   # "Jan-07" → 2007 Jan
  as_tsibble(index = Month)                 # declare the time index

# ── Checks ─────────────────────────────────────────────────────────────
glimpse(novabrew)          # confirm 200 rows, correct column types

## Rows: 200
## Columns: 2
## $ Month   <mth> 2007 Jan, 2007 Feb, 2007 Mar, 2007 Apr, 2007 May, 2007 Jun, 20…
## $ Revenue <dbl> 17.8, 25.0, 32.3, 27.2, 54.2, 34.3, 14.8, 28.7, 42.6, 26.8, 32…

summary(novabrew$Revenue)  # Q3: mean, median, min, max

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    14.8   113.0   329.3   413.6   669.8  1141.3

# ═══════════════════════════════════════════════════════════════════════════════
# SECTION 2 — EXPLORATORY DATA ANALYSIS
# Q1: time series plot | Q2: month-over-month change | Q3: summary statistics
# ═══════════════════════════════════════════════════════════════════════════════

# ── Q1: Time series plot ──────────────────────────────────────────────────────
# Look for: level, trend direction, whether growth accelerates or is constant
autoplot(novabrew, Revenue) +
  labs(
    title    = "NovaBrew Coffee Roasters: Monthly Revenue",
    subtitle = "Jan 2007 – Aug 2023  |  200 monthly observations",
    y        = "Revenue ($thousands)",
    x        = "Month"
  ) +
  theme_minimal()

#QI ANS: The time series exhibits three clearly identifiable features:

#Level: The series begins at a very low level revenues in early 2007 ranged from roughly $15k to $55k per month and closes near $1,141k in August 2023. This 64-fold increase over 16 years is the dominant feature of the data.

#Trend direction: The trend is consistently and unambiguously upward throughout the entire period. There are no extended reversals, plateaus, or structural breaks visible in the plot.

#Growth pattern: Growth is clearly accelerating. In the early years the series is nearly flat; the slope visibly steepens as time progresses, producing a curve that bends upward rather than following a straight path. This is the signature of a quadratic (parabolic) trend rather than a linear one. No seasonal pattern is apparent the month-to-month noise appears largely random around the upward-curving trend.

# ── Q2: Month-over-month change ───────────────────────────────────────────────
# If the CHANGES grow over time → growth is accelerating → quadratic trend likely needed
# If the CHANGES are roughly flat → constant growth → linear trend may suffice
novabrew |>
  mutate(MoM_Change = Revenue - lag(Revenue, 1)) |>
  autoplot(MoM_Change) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
  labs(
    title = "Month-over-Month Revenue Change",
    y     = "Change in Revenue ($thousands)",
    x     = "Month"
  ) +
  theme_minimal()

#Q2 ANS: The month-over-month changes are decidedly not constant. In the early years (2007–2011) the changes hover near zero, oscillating between roughly −$20k and +$20k. By 2015–2023 the swings have grown substantially, frequently exceeding ±$40k and reaching as high as ±$70k.

#This pattern is highly informative for model selection: if the changes themselves were roughly flat (i.e., white noise around a constant), a linear trend would suffice. Instead, the growing amplitude of the changes mirrors the acceleration visible in the level plot revenue is not merely growing, but growing faster each year. This directly implies that a quadratic trend model is more appropriate than a linear one, since the quadratic term (t²) is precisely designed to capture a rate of growth that itself changes over time.

# ── Q3: Summary statistics ────────────────────────────────────────────────────
# Run both lines and note Mean, Median, and compute SD separately
summary(novabrew$Revenue)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    14.8   113.0   329.3   413.6   669.8  1141.3

sd(novabrew$Revenue)

## [1] 329.0079

# Coefficient of variation = SD / Mean — tells you variability relative to average
cat("Coefficient of Variation:", round(sd(novabrew$Revenue) / mean(novabrew$Revenue) * 100, 1), "%\n")

## Coefficient of Variation: 79.6 %

#Q3ANS: The coefficient of variation (CV) of 79.6% nearly 80 cents of standard deviation for every dollar of mean revenue indicates exceptionally high variability relative to the average. This is not a sign of noisiness per se; rather, it reflects the fact that the data span a very wide range ($14.8k to $1,141.3k) driven by a strong upward trend. The mean ($413.6k) substantially exceeds the median ($329.3k), confirming right-skew driven by the cluster of high values in the final years of the series. A model that ignores this trend will produce a high CV in its errors the trend itself is the primary source of spread.

# ═══════════════════════════════════════════════════════════════════════════════
# SECTION 3 — FIT THE LINEAR TREND MODEL
# Q4: write the equation | Q5: interpret b₁ | Q6: assess R²
# ═══════════════════════════════════════════════════════════════════════════════

# TSLM = Time Series Linear Model (from fable)
# trend() automatically creates a counter variable t = 1, 2, 3, ..., 200
# The model: Revenue_t = b0 + b1 * t + error_t

fit_linear <- novabrew |>
  model(linear = TSLM(Revenue ~ trend()))

# ── View coefficients, R², and F-statistic ────────────────────────────────────
report(fit_linear)

## Series: Revenue 
## Model: TSLM 
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -110.01  -63.53  -25.80   56.27  177.67 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -142.02980   10.89674  -13.03   <2e-16 ***
## trend()        5.52831    0.09402   58.80   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 76.76 on 198 degrees of freedom
## Multiple R-squared: 0.9458,  Adjusted R-squared: 0.9456
## F-statistic:  3458 on 1 and 198 DF, p-value: < 2.22e-16

# KEY OUTPUT TO RECORD FOR Q4:
#   (Intercept) = b₀  ← starting level at t = 0
#   trend()     = b₁  ← average revenue added each month
#   R-squared        ← proportion of variation explained by time alone

# Plain English interpretation template for Q5:
# "On average, NovaBrew's monthly revenue increases by $[b₁ × 1000] each month."

#Q4ANS Ŷt = −142.030 + 5.528 · t #where t = 1 corresponds to January 2007, t = 2 to February 2007, and t = 200 to August 2023. The intercept b₀ = −142.030 is the model’s estimated revenue at t = 0 (a theoretical reference point before the observed data); it has no direct business interpretation. The slope b₁ = 5.528 represents the estimated average monthly revenue increment.

#Q5ANS Business interpretation: Based on its 16-year revenue history, NovaBrew adds approximately $5,528 in monthly revenue on average for every month that passes — equivalent to an estimated annual revenue run-rate increase of roughly $66,000 per year under the linear trend assumption.

#Q6 The linear model’s R² = 0.9458, meaning the linear trend explains 94.6% of the total variance in monthly revenue. This is a high value by conventional standards. However, a high R² alone does not confirm the model is well-specified. R² measures how much variance is explained, but says nothing about how the residuals behave. If the true relationship is curved and the model is straight, the residuals will not be random they will exhibit a systematic pattern (such as a U-shape). A model with patterned residuals violates the regression assumption of white-noise errors, making its standard errors, confidence intervals, and forecasts unreliable, regardless of how high R² appears. Residual diagnostics are essential before trusting any model for forecasting.

# ═══════════════════════════════════════════════════════════════════════════════
# SECTION 4 — RESIDUAL DIAGNOSTICS: LINEAR MODEL
# Q7: time plot | Q8: ACF | Q9: histogram | Q10: overall judgment
# ═══════════════════════════════════════════════════════════════════════════════

# gg_tsresiduals() produces THREE panels automatically:
#   Top:          Residuals over time  → look for patterns (U-shape = missed curvature)
#   Bottom-left:  ACF of residuals     → bars outside blue bands = autocorrelation
#   Bottom-right: Histogram            → should be roughly bell-shaped and centred at 0

fit_linear |>
  select(linear) |>
  gg_tsresiduals() +
  labs(title = "Linear Model: Residual Diagnostics")

# ── What to look for ──────────────────────────────────────────────────────────
# Q7 — TIME PLOT: does it form a U-shape (positive → negative → positive)?
#      A U-shape means the data curves but the model is straight → need quadratic
#
# Q8 — ACF: are bars outside the blue dashed 95% bands?
#      Many significant spikes → residuals are NOT white noise → model is incomplete
#
# Q9 — HISTOGRAM: is it centred at zero? Symmetric?
#      A left-shifted histogram means systematic under-prediction (negative bias)
#
# Q10 — OVERALL: if residuals show a U-shape AND significant ACF spikes,
#       the linear model is misspecified and should NOT be trusted for forecasting

#Q7ANS: The residuals form a pronounced U-shape (or inverted arch): they begin large and positive in the early years (2007–2009), decline steadily to large negative values in the middle period (roughly 2010–2018), and then rise back to large positive values toward the end of the series (2019–2023). The maximum negative residual is −110.0 and the maximum positive residual is +177.7. This pattern is called systematic curvature in the residuals, and it is the diagnostic fingerprint of a misspecified functional form. The linear model fits a straight line through data that actually follows a curve: in the early and late periods the curve is above the line (positive residuals), while in the middle the curve is below the line (negative residuals). The U-shape makes clear that the linear model systematically under-predicts in the middle years and over-predicts at the extremes a problem no amount of better data can fix without changing the model structure. The remedy is to add a quadratic (t²) term.

#Q8 ANS: Virtually all ACF bars at every lag from 1 through at least 20 — fall far outside the blue 95% significance bands. The bars are not just marginally significant; the ACF remains extremely large across many lags, decaying very slowly. This slow decay is the ACF signature of a non-stationary or strongly trended residual series.

#The implication is clear: the residuals are definitively not white noise. Each month’s residual is strongly predictable from prior months’ residuals, meaning the model has left substantial, structured information in its errors. The Ljung-Box Q-statistic of 1,396 at lag 10 confirms this with a p-value of essentially zero (p < 2.2×10⁻¹⁶), strongly rejecting the null hypothesis of white noise.

#Q9 ANS:The histogram is not centered at zero and is not bell-shaped. Consistent with the U-shaped time plot, the residuals are spread across a wide range (roughly −110 to +178) and display a multimodal distribution: a left cluster (the large negative residuals from the middle years) and a right cluster (the large positive residuals from the early and late years). There is a gap near zero, reflecting the systematic arch pattern. Both heavy tails and bimodality indicate severe departures from normality a consequence of the model’s misspecification, not of random noise.

#Q10 ANS: No the linear model should not be used for forecasting. All three diagnostic panels fail simultaneously and severely. The time plot reveals a textbook U-shaped residual arch, proving the model is functionally misspecified. The ACF confirms that residuals are strongly autocorrelated across all tested lags, with the Ljung-Box test returning p ≈ 0,a decisive rejection of white noise. A model with structured, predictable residuals violates the core assumptions of regression forecasting: its prediction intervals are too narrow to be trusted, and its point forecasts will be systematically biased wherever the curve diverges from the straight line.

# ═══════════════════════════════════════════════════════════════════════════════
# SECTION 5 — FIT THE QUADRATIC TREND MODEL
# Q11: write quadratic equation | Q12: interpret b₂ | Q13: significance of b₂
# ═══════════════════════════════════════════════════════════════════════════════

# The quadratic model adds a t² term to capture accelerating (or decelerating) growth
# Model: Revenue_t = b0 + b1*t + b2*t² + error_t
#
# ⚠ IMPORTANT: I() is REQUIRED around trend()^2
#   In R formulas, ^ means "interaction" not "power".
#   I(trend()^2) tells R: "treat this as a literal squared value."

fit_quad <- novabrew |>
  model(quadratic = TSLM(Revenue ~ trend() + I(trend()^2)))

# ── View coefficients ─────────────────────────────────────────────────────────
report(fit_quad)

## Series: Revenue 
## Model: TSLM 
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -61.2796 -10.5152   0.2241  10.4967  46.9041 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.700e+01  3.667e+00   7.363 4.78e-12 ***
## trend()      5.077e-01  8.423e-02   6.027 8.06e-09 ***
## I(trend()^2) 2.498e-02  4.059e-04  61.543  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.11 on 197 degrees of freedom
## Multiple R-squared: 0.9973,  Adjusted R-squared: 0.9973
## F-statistic: 3.668e+04 on 2 and 197 DF, p-value: < 2.22e-16

# KEY OUTPUT TO RECORD FOR Q11:
#   (Intercept)   = b₀
#   trend()       = b₁
#   I(trend()^2)  = b₂  ← positive = growth is accelerating

# Q12 — Plain English interpretation of b₂:
#   b₂ > 0 means each additional month adds slightly MORE revenue than the last
#   Example: "NovaBrew's growth rate is itself increasing over time — the business
#   is not just growing, it is growing faster each year."

# Q13 — Is b₂ significant?
#   Look at the p-value next to I(trend()^2) in the report() output
#   If p < 0.05, the quadratic term is statistically significant

#Q11 ANS: Ŷt = 26.998 + 0.5077 · t + 0.024978 · t² # b₀ = 26.998.The estimated revenue at t = 0 (the theoretical origin, before data begin). b₁ = 0.5077 — the linear component of the growth rate. b₂ = 0.024978.The curvature coefficient, governing how quickly the growth rate itself accelerates. All three coefficients are highly statistically significant (p < 2×10⁻⁸ or smaller)

#Q12 ANS: The positive value of b₂ tells us that NovaBrew is not merely growing it is growing faster each year. Each additional month contributes slightly more revenue than the month before it, so the company’s upward momentum is compounding over time. In practical terms rather than adding a flat $5,528 every month (as the linear model assumes), the quadratic model shows that NovaBrew’s monthly revenue increment grows by approximately $50 per month² (2 × b₂ ≈ 0.050 × t, evaluated at the midpoint). This accelerating trajectory is what makes the curve bend upward, rather than extending as a straight line.

#Q13 ANS: Yes b₂ is highly statistically significant. The t-statistic of 61.54 is enormous, and the p-value is effectively zero (reported as < 2×10⁻¹⁶ in R output). This means we have overwhelming evidence — at any conventional significance level — that the quadratic coefficient is not zero. The curvature in NovaBrew’s revenue growth is a real, stable feature of the data, not a statistical artifact. Including the t² term is not optional window-dressing; it is empirically required to correctly characterize the growth trajectory.

# ═══════════════════════════════════════════════════════════════════════════════
# SECTION 6 — COMPARE BOTH MODELS
# Q14: comparison table | Q15: quadratic residuals | Q16: Ljung-Box | Q17: R² always goes up
# ═══════════════════════════════════════════════════════════════════════════════

# Fit both models together for side-by-side comparison
fit_both <- novabrew |>
  model(
    linear    = TSLM(Revenue ~ trend()),
    quadratic = TSLM(Revenue ~ trend() + I(trend()^2))
  )

# ── Q14: Model comparison table ───────────────────────────────────────────────
# R²:      proportion of variance explained (higher = better, but can mislead)
# adj. R²: penalises for extra parameters (more honest comparison)
# AICc:    balances fit vs complexity — LOWER AICc = better model

glance(fit_both) |>
  select(.model, r_squared, adj_r_squared, AICc) |>
  arrange(AICc)

## # A tibble: 2 × 4
##   .model    r_squared adj_r_squared  AICc
##   <chr>         <dbl>         <dbl> <dbl>
## 1 quadratic     0.997         0.997 1141.
## 2 linear        0.946         0.946 1740.

# ── Q15: Quadratic residual diagnostics ───────────────────────────────────────
# The U-shape should be GONE if the quadratic term absorbed the curvature
fit_both |>
  select(quadratic) |>
  gg_tsresiduals() +
  labs(title = "Quadratic Model: Residual Diagnostics")

# ── Q16: Ljung-Box test for BOTH models ───────────────────────────────────────
# H₀: residuals are white noise (no leftover structure)
# p > 0.05 → fail to reject H₀ → residuals consistent with white noise → model is adequate
# p < 0.05 → reject H₀ → significant autocorrelation remains → model is incomplete

augment(fit_both) |>
  features(.innov, ljung_box, lag = 10)

## # A tibble: 2 × 3
##   .model    lb_stat lb_pvalue
##   <chr>       <dbl>     <dbl>
## 1 linear     1396.      0    
## 2 quadratic    11.3     0.334

# ── Q17 conceptual note ───────────────────────────────────────────────────────
# R² ALWAYS increases (or stays the same) when you add variables.
# This is why adj. R² and AICc exist — they penalise complexity.
# Residual diagnostics (U-shape, ACF, Ljung-Box) are equally important:
# a model with higher R² but patterned residuals is WORSE than one with
# slightly lower R² and clean (random) residuals.

#Q14ANS:
Metric Linear Quadratic Better model R² 0.9458 0.9973 Quadratic
Adjusted R² 0.9456 0.9973 Quadratic AICc 1,740 1,141 Quadratic (lower = better) Ljung-BoxQ(lag 10) 1,396 11.3 Quadratic (smaller Q) Ljung-Box p-value ≈ 0.000(fail) 0.334(pass) Quadratic

#The quadratic model outperforms the linear model on every metric without exception. The AICc gap of 599 points is enormous any difference above 10 is considered decisive in model selection. The improvement in adjusted R² (0.9456 → 0.9973) confirms the quadratic term adds genuine explanatory power, not just additional parameters.

#Q15ANS: Yes, the U-shape is effectively corrected. The quadratic residuals oscillate randomly around zero without any visible systematic arch or drift. The range has narrowed dramatically: the minimum quadratic residual is approximately −61.3 and the maximum is +46.9, compared to −110.0 and +177.7 for the linear model. The histogram is approximately bell-shaped and centered near zero. Some residual variability remains particularly a few larger negative observations in the 2010–2012 window but there is no systematic pattern. The ACF bars are now mostly within the blue significance bands, indicating that the quadratic term successfully absorbed the curvature that the linear model could not capture.

#Q16 ANS: #Linear — p-value ≈ 0.000 FAIL #Quadratic — p-value 0.334 PASS

#Only the quadratic model passes the white-noise test. With p = 0.334 > 0.05, we fail to reject the null hypothesis that the quadratic residuals are white noise — meaning no statistically detectable autocorrelation remains at the first 10 lags. This is a necessary condition for valid forecasting if residuals were predictable, the model would be demonstrably incomplete, and its confidence intervals would be too narrow.

#Q17 ANS: The classmate’s reasoning contains a fundamental flaw. R² is mathematically guaranteed to increase (or remain unchanged) whenever any predictor is added to a model even a predictor consisting of pure random noise. This makes raw R² a poor basis for model selection, because it rewards complexity indiscriminately rather than rewarding genuine explanatory power. The appropriate tools for comparing models of different complexity are adjusted R² and AICc, both of which penalise for the number of estimated parameters. A model with a higher R² but a worse AICc is actually less efficient it uses more model complexity than the data warrant. More critically, residual diagnostics and the Ljung-Box test address the most important question: has the model actually captured the data-generating process? A model with moderately lower R² but clean, white-noise residuals is preferable to one with higher R² and structured, autocorrelated residuals. In this analysis, the quadratic model happens to win on all criteria simultaneously — but that is not always the case, and R² alone would be insufficient evidence even when it is. The linear model’s p ≈ 0 is a decisive failure. It means the Ljung-Box test rejects white noise with near-infinite certainty, confirming the linear model should not be deployed.

# ═══════════════════════════════════════════════════════════════════════════════
# SECTION 7 — GENERATE FORECASTS
# Q18: March 2024 values | Q19: $1.5M threshold | Q20: deployment decision
# ═══════════════════════════════════════════════════════════════════════════════

# ── Q18: 12-month forecast (Sep 2023 – Aug 2024) ─────────────────────────────
fc_12 <- fit_both |>
  forecast(h = 12)

# Overlay plot: both forecasts against full observed series
fc_12 |>
  autoplot(novabrew, level = 95) +
  labs(
    title    = "NovaBrew Revenue: Linear vs Quadratic Forecasts",
    subtitle = "Forecast horizon: Sep 2023 – Aug 2024  |  95% prediction intervals",
    y        = "Revenue ($thousands)",
    x        = "Month"
  ) +
  theme_minimal()

# Extract the March 2024 predictions for both models
fc_12 |>
  as_tibble() |>
  filter(Month == yearmonth("Mar 2024")) |>
  select(.model, Month, .mean)

## # A tibble: 2 × 3
##   .model       Month .mean
##   <chr>        <mth> <dbl>
## 1 linear    2024 Mar 1002.
## 2 quadratic 2024 Mar 1202.

# Q18: Why does the gap between models widen?
# The linear model extrapolates a straight line; the quadratic curves upward.
# The further you forecast, the more t² dominates and the wider the divergence.

# ── Q19: Extend to December 2026 (CFO threshold question) ────────────────────
# Sep 2023 is t = 201, Dec 2026 is t = 240 → horizon = 40 months
fc_40 <- fit_both |>
  select(quadratic) |>
  forecast(h = 40)

# Plot extended forecast
fc_40 |>
  autoplot(novabrew, level = 95) +
  geom_hline(yintercept = 1500, linetype = "dashed", color = "firebrick", linewidth = 1) +
  annotate("text", x = yearmonth("Jan 2020"), y = 1550,
           label = "$1,500,000 threshold", color = "firebrick", size = 3.5) +
  labs(
    title    = "NovaBrew: Quadratic Forecast Extended to Dec 2026",
    subtitle = "Dashed line = $1.5M facility opening threshold",
    y        = "Revenue ($thousands)",
    x        = "Month"
  ) +
  theme_minimal()

# Find the first month the quadratic forecast exceeds $1,500,000
fc_40 |>
  as_tibble() |>
  filter(.mean > 1500) |>
  select(Month, .mean) |>
  slice_head(n = 3)   # show first few months above threshold

## # A tibble: 3 × 2
##      Month .mean
##      <mth> <dbl>
## 1 2026 May 1501.
## 2 2026 Jun 1514.
## 3 2026 Jul 1526.

# ── Q20: Deployment decision checklist ───────────────────────────────────────
# Use the evidence you have collected to answer Q20:
#
# Step 1: Does the quadratic model beat the linear?
#   → Check AICc (lower = better) and adj. R² (higher = better)
#
# Step 2: Are residuals white noise?
#   → Check Ljung-Box p-value. If p > 0.05, residuals are adequate.
#
# Step 3: Is there meaningful bias?
#   → Look at the sign and magnitude of ME (mean error) in the accuracy table below
#
# Step 4: What is the forecast risk for a capital decision?
#   → Quadratic curves can "explode" far into the future.
#   → A model fit on 2007-2023 data may not anticipate structural breaks,
#     economic downturns, or market saturation after 2023.

# Accuracy summary (in-sample, for reference)
accuracy(fit_both)

## # A tibble: 2 × 10
##   .model    .type          ME  RMSE   MAE   MPE  MAPE  MASE RMSSE   ACF1
##   <chr>     <chr>       <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 linear    Training 1.87e-15  76.4  65.3 34.7  59.7  0.955 0.979  0.920
## 2 quadratic Training 1.95e-15  17.0  13.2 -2.78  9.36 0.193 0.218 -0.120

#Q18 ANS: The quadratic model forecasts March 2024 revenue of $1,202k**,compared to $1,002k from the linear model a gap of approximately $200k (20%).The gap widens further into the future because the linear model extrapolates a constant slope (the same $5,528/month increment forever), while the quadratic model’s t² term grows without bound. The further one projects, the larger the contribution of the t² term becomes relative to the linear component, causing the quadratic curve to bend progressively higher above the straight line. Two models that appear nearly equivalent in-sample can produce drastically different and practically consequential out-of-sample forecasts.

#Q19 ANS: The quadratic trend forecast first crosses $1,500,000 in May 2026, with a predicted revenue of $1,501k. June 2026 ($1,514k) and July 2026 ($1,526k) follow in subsequent months, confirming the thresholdcrossing is sustained rather than a one-month artifact.

#Q20 ANS: Recommendation: Deploy with active monitoring but not as the sole basis for a capital decision.

#Evidence supporting deployment: The quadratic model clears every benchmark for a deployable forecasting model. On fit statistics, it achieves R² = adjusted R² = 0.9973 and an AICc of 1,141 — a 599-point improvement over the linear model’s 1,740, a difference that is decisive by any standard (models within 2 AICc points are considered equivalent; differences above 10 are decisive). On the white-noise test, the Ljung-Box p-value is 0.334, comfortably above the 0.05 threshold we cannot detect any residual autocorrelation at lags 1 through 10.The residual time plot shows random oscillation around zero with no systematic arch, and the histogram is approximately bell-shaped. These results collectively establish that the quadratic model has successfully captured the dominant structure in NovaBrew’s revenue data.

#Concrete limitation — extrapolation risk for a capital decision: The quadratic term t² grows without bound, meaning the forecast will accelerate indefinitely into the future. A model calibrated on 2007–2023 data has no mechanism to anticipate market saturation, competitive disruption, commodity price shocks, or macroeconomic contractions after August 2023. The $1.5M threshold forecast for May 2026 is produced by extrapolating a 16-year mathematical trend forward by 33 months more than two and a half years beyond the last observation. Even a modest slowdown in NovaBrew’s growth rate, not captured by the model, could push the actual threshold crossing months or years later than predicted. Committing $5M in capital to a facility opening based on a single quadratic extrapolation without scenario analysis, confidence intervals reviewed at each new month of data, and business intelligence about market conditions introduces substantial financial risk. The model is suitable for operational forecasting (staffing, inventory) with frequent re-estimation; it is insufficient as the sole analytical basis for an irreversible capital allocation decision.

#Conclusion

#This analysis evaluated two trend regression models against NovaBrew Coffee Roasters’ 200-month revenue series. The linear model, while explaining 94.6% of variance (R² = 0.9458), fails all residual diagnostic tests: its residuals display a pronounced U-shape, the ACF reveals pervasive autocorrelation, and the Ljung-Box test returns p ≈ 0 at lag 10 (Q = 1,396). It should not be used for forecasting. The quadratic model corrects these deficiencies across the board. Its R² rises to 0.9973, AICc falls by 599 points (1,740 → 1,141), residuals are visually random, and the Ljung-Box test produces p = 0.334 — a clean pass. The quadratic term b₂ = 0.024978 is statistically indistinguishable from non-zero (t = 61.5, p < 10⁻¹⁵⁰), confirming that NovaBrew’s accelerating growth is a genuine, stable feature of its business trajectory over this period. Applied to NovaBrew’s business question: the quadratic model forecasts March 2024 revenue of approximately $1,202k, and projects the $1.5M monthly threshold — the CFO’s trigger for a second roasting facility will first be crossed in May 2026. The quadratic model is the appropriate tool for short-to-medium-horizon forecasting, and should be re-estimated quarterly as new data accumulate. For the facility investment decision specifically, the model forecast should be supplemented with scenario analysis, external market intelligence, and formal uncertainty quantification before a $5M commitment is made.