library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(readxl) 
# Assume your dataset is called df with variables: period (1:150) and price
library(readxl)
df <- read_excel("C:/Users/ajasa/Downloads/Coca_Cola Information  SalesBoxes 2015-2018.xlsx", sheet = "Summary", skip = 3)

# Renombrar columnas
colnames(df) <- c("Month_Year", "Sales_Unit_Boxes")

# Convertir tipos
df$Month_Year <- as.Date(df$Month_Year)
df$Sales_Unit_Boxes <- as.numeric(df$Sales_Unit_Boxes)
## Warning: NAs introduced by coercion
df <- head(df, -2)
y_ts <- ts(df$Sales_Unit_Boxes, start = c(2015,1), frequency = 12)
head(df)
## # A tibble: 6 Ă— 2
##   Month_Year Sales_Unit_Boxes
##   <date>                <dbl>
## 1 2015-01-01         5516689.
## 2 2015-02-01         5387496.
## 3 2015-03-01         5886747.
## 4 2015-04-01         6389182.
## 5 2015-05-01         6448275.
## 6 2015-06-01         6697947.
# Compute mean and sd
price_mean <- mean(df$Sales_Unit_Boxes)
price_sd   <- sd(df$Sales_Unit_Boxes)

# Plot time series with mean, sd lines, and a trend line
ggplot(df, aes(x = Month_Year, y = Sales_Unit_Boxes)) +
  geom_line(color = "steelblue", linewidth = 1) +
  geom_hline(yintercept = price_mean, color = "purple", linetype = "dashed", linewidth = 1) +
  geom_hline(yintercept = price_mean + price_sd, color = "lightgreen", linetype = "dotted", linewidth = 1) +
  geom_hline(yintercept = price_mean - price_sd, color = "lightgreen", linetype = "dotted", linewidth = 1) +
  geom_smooth(method = "lm", se = FALSE, color = "lightblue", linewidth = 1) +
  labs(title = "Coca cola sales time series",
       x = "Date", y = "Sales")
## `geom_smooth()` using formula = 'y ~ x'

The Coca-Cola sales time series shows a slight upward trend with irregular fluctuations, including sharp peaks around 2016 and 2018 and sudden drops. There is no clear seasonal pattern.

# 0) Diagnóstico rápido
length(df$Sales_Unit_Boxes)                  
## [1] 48
ts_sales <- ts(df$Sales_Unit_Boxes, start = c(2015, 1), frequency = 12)
length(ts_sales)                     
## [1] 48
frequency(ts_sales)                 
## [1] 12
floor(length(ts_sales) / frequency(ts_sales)) 
## [1] 4
ts_price <- ts(df$Sales_Unit_Boxes)

ts_sales <- ts(df$Sales_Unit_Boxes, start = c(2015, 1), frequency = 12)

# DescomposiciĂłn aditiva
decomp_add <- decompose(ts_sales, type = "additive")

# VisualizaciĂłn
plot(decomp_add)

Observed: The raw time series shows fluctuations between ~5.5M and ~8M.

Trend: There is a clear upward trend from 2005 to 2008, showing long-term growth.

Seasonal: The seasonal component oscillates, but the magnitude of seasonality remains fairly stable, supporting the additive assumption.

Random: The residuals are irregular, without strong remaining structure, but some spikes suggest noise still present.

acf(df$Sales_Unit_Boxes, main = "Autocorrelation Function (ACF)")  #MA

pacf(df$Sales_Unit_Boxes, main = "Parcial Autocorrelation Function (PAC)") #AR

ACF: Strong autocorrelation at lag 1 and 2, gradually decaying afterwards,there are some spikes around lag 10–12, suggesting possible seasonal correlation., the slow decay indicates the series is non-stationary because it has a trend..

PACF shows a clear spike at lag 1, then quickly drops within the confidence bands.

We will be comparing ARIMA vs SARIMA, since its a time series problem bit SARIMA extends ARIMA by adding seasonal components.

library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
model_lm <- lm(Sales_Unit_Boxes ~ Month_Year, data = df)
dwtest(model_lm)
## 
##  Durbin-Watson test
## 
## data:  model_lm
## DW = 0.97665, p-value = 3.089e-05
## alternative hypothesis: true autocorrelation is greater than 0
#4
# Augmented Dickey-Fuller Test. 
# p-value < 0.05 → reject H0 → data is stationary.
#si beta<1 es estacionaria, 
library(tseries)
adf_test <- adf.test(df$Sales_Unit_Boxes)
## Warning in adf.test(df$Sales_Unit_Boxes): p-value smaller than printed p-value
adf_test
## 
##  Augmented Dickey-Fuller Test
## 
## data:  df$Sales_Unit_Boxes
## Dickey-Fuller = -4.4282, Lag order = 3, p-value = 0.01
## alternative hypothesis: stationary

DW = 0.97665 → The statistic is well below 2, indicating strong positive autocorrelation in the residuals.

p-value is small, so we reject the null hypothesis of no autocorrelation.

The residuals of the linear model are not independent, meaning the model suffers from serial correlation.

The regression model shows autocorrelated residuals, making it inadequate on its own. However, the sales series itself is stationary.

fit_ar1 <- Arima(y_ts, order = c(1,1,0))   # AR(1)
fit_ma1 <- Arima(y_ts, order = c(0,1,1))   # MA(1)

# --- Compare models ---
summary(fit_ar1)
## Series: y_ts 
## ARIMA(1,1,0) 
## 
## Coefficients:
##           ar1
##       -0.1779
## s.e.   0.1428
## 
## sigma^2 = 3.139e+11:  log likelihood = -688.3
## AIC=1380.59   AICc=1380.87   BIC=1384.29
## 
## Training set error measures:
##                    ME     RMSE      MAE         MPE     MAPE     MASE
## Training set 24138.94 548430.8 437845.3 -0.01946129 6.809614 1.329443
##                     ACF1
## Training set -0.01694275
summary(fit_ma1)
## Series: y_ts 
## ARIMA(0,1,1) 
## 
## Coefficients:
##           ma1
##       -0.2106
## s.e.   0.1614
## 
## sigma^2 = 3.122e+11:  log likelihood = -688.18
## AIC=1380.36   AICc=1380.64   BIC=1384.06
## 
## Training set error measures:
##                    ME     RMSE      MAE         MPE     MAPE     MASE
## Training set 25605.26 547014.4 439205.5 -0.01227034 6.829469 1.333573
##                    ACF1
## Training set 0.01009479
# Compare AIC and BIC
AIC(fit_ar1, fit_ma1)
##         df      AIC
## fit_ar1  2 1380.595
## fit_ma1  2 1380.365
BIC(fit_ar1, fit_ma1)
##         df      BIC
## fit_ar1  2 1384.295
## fit_ma1  2 1384.065
# --- Residual diagnostics ---
# Check autocorrelation of residuals
checkresiduals(fit_ar1)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,0)
## Q* = 7.7725, df = 9, p-value = 0.5572
## 
## Model df: 1.   Total lags used: 10
checkresiduals(fit_ma1)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,1)
## Q* = 8.1026, df = 9, p-value = 0.5238
## 
## Model df: 1.   Total lags used: 10
# --- Forecasting ---
fc_ar1 <- forecast(fit_ar1, h = 12)
fc_ma1 <- forecast(fit_ma1, h = 12)

autoplot(fc_ar1) + ggtitle("Forecast ARIMA(1,1,0)")

autoplot(fc_ma1) + ggtitle("Forecast ARIMA(0,1,1)")

checkresiduals(fit_ar1, lag = 12)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,0)
## Q* = 29.94, df = 11, p-value = 0.00162
## 
## Model df: 1.   Total lags used: 12
checkresiduals(fit_ma1, lag = 12)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,1)
## Q* = 30.676, df = 11, p-value = 0.00124
## 
## Model df: 1.   Total lags used: 12
nsdiffs(y_ts)  # test if seasonal differencing is needed
## [1] 1

The models pass Ljung–Box at 10 lags , nad AIC/BIC are nearly identical. That only establishes that short-lag residual autocorrelation is small. It does not test for seasonal correlation because the test stopped at lag 10. For monthly data, the first seasonal correlation is at lag 12.

Both ARIMA models leave systematic correlation in the residuals at seasonal lags (12, 24). That means important structure in the data is not being captured. The low p-values confirm this is statistically significant.

A SARIMA with D=1 is required.

fit_sarima <- auto.arima(ts_sales, seasonal = TRUE)

# Resumen del modelo
summary(fit_sarima)
## Series: ts_sales 
## ARIMA(1,0,0)(1,1,0)[12] with drift 
## 
## Coefficients:
##          ar1     sar1     drift
##       0.3351  -0.4162  9706.378
## s.e.  0.1935   0.2142  5119.602
## 
## sigma^2 = 1.037e+11:  log likelihood = -507.28
## AIC=1022.57   AICc=1023.86   BIC=1028.9
## 
## Training set error measures:
##                    ME     RMSE      MAE        MPE     MAPE      MASE
## Training set 12204.66 267035.2 193928.8 0.02922099 2.948075 0.5888319
##                     ACF1
## Training set -0.07376709
# Revisar residuos para ver si el modelo captura bien la dinámica
checkresiduals(fit_sarima)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,0)(1,1,0)[12] with drift
## Q* = 10.982, df = 8, p-value = 0.2027
## 
## Model df: 2.   Total lags used: 10
# PronĂłstico de 12 meses
forecast_sarima <- forecast(fit_sarima, h = 12)

# Gráfico
autoplot(forecast_sarima) +
  ggtitle("PronĂłstico SARIMA de Ventas Coca-Cola") +
  xlab("Año") + ylab("Ventas (Unit Boxes)")

Ljung–Box (lag 10): p = 0.20 → Fail to reject H₀. Residuals behave like white noise. This means the model successfully captured both the trend and seasonality.

Time series plot: residuals fluctuate around zero with no clear structure.

ACF of residuals: all autocorrelations within confidence bounds theres no leftover autocorrelation.

Histogram + density: residuals look roughly normal, with slight skewness, but acceptable.

Peaks and troughs are reproduced: unlike non-seasonal ARIMA, this SARIMA model preserves the yearly cycle.

fit_arima <- auto.arima(ts_sales, seasonal = FALSE)
summary(fit_arima)
## Series: ts_sales 
## ARIMA(1,0,0) with non-zero mean 
## 
## Coefficients:
##          ar1     mean
##       0.5487  6452606
## s.e.  0.1220   155338
## 
## sigma^2 = 2.575e+11:  log likelihood = -697.85
## AIC=1401.7   AICc=1402.24   BIC=1407.31
## 
## Training set error measures:
##                    ME     RMSE      MAE        MPE     MAPE     MASE       ACF1
## Training set 13902.88 496742.9 394341.4 -0.3828106 6.198876 1.197351 0.02645002
fit_sarima <- auto.arima(ts_sales, seasonal = TRUE)
summary(fit_sarima)
## Series: ts_sales 
## ARIMA(1,0,0)(1,1,0)[12] with drift 
## 
## Coefficients:
##          ar1     sar1     drift
##       0.3351  -0.4162  9706.378
## s.e.  0.1935   0.2142  5119.602
## 
## sigma^2 = 1.037e+11:  log likelihood = -507.28
## AIC=1022.57   AICc=1023.86   BIC=1028.9
## 
## Training set error measures:
##                    ME     RMSE      MAE        MPE     MAPE      MASE
## Training set 12204.66 267035.2 193928.8 0.02922099 2.948075 0.5888319
##                     ACF1
## Training set -0.07376709
# Medidas de error en el ajuste
accuracy(fit_arima)
##                    ME     RMSE      MAE        MPE     MAPE     MASE       ACF1
## Training set 13902.88 496742.9 394341.4 -0.3828106 6.198876 1.197351 0.02645002
accuracy(fit_sarima)
##                    ME     RMSE      MAE        MPE     MAPE      MASE
## Training set 12204.66 267035.2 193928.8 0.02922099 2.948075 0.5888319
##                     ACF1
## Training set -0.07376709
# PronĂłsticos de 12 meses
forecast_arima <- forecast(fit_arima, h = 12)
forecast_sarima <- forecast(fit_sarima, h = 12)

# Graficar comparativo
autoplot(ts_sales) +
  autolayer(forecast_arima$mean, series="ARIMA", color="red") +
  autolayer(forecast_sarima$mean, series="SARIMA", color="blue") +
  ggtitle("Comparativo ARIMA vs SARIMA") +
  ylab("Ventas (Unit Boxes)") + xlab("Año") +
  guides(colour=guide_legend(title="Modelo"))

fit_arima   <- auto.arima(ts_sales, seasonal = FALSE)
fit_sarima  <- auto.arima(ts_sales, seasonal = TRUE)

Statistically, SARIMA is clearly superior. Lower AIC/BIC and error measures confirm it models the data-generating process more effectively.

Why SARIMA succeeds: SARIMA includes both an AR(1) term (short-term memory) and a seasonal AR(1) with D=1 seasonal differencing. This allows it to handle both the general trend and repeating 12-month cycles, aligning with real consumption patterns.

acc_arima  <- accuracy(fit_arima)
acc_sarima <- accuracy(fit_sarima)

# 4) Crear tabla comparativa con fila 1
results <- data.frame(
  Modelo = c("ARIMA", "SARIMA"),
  AIC  = c(fit_arima$aic, fit_sarima$aic),
  BIC  = c(fit_arima$bic, fit_sarima$bic),
  RMSE = c(acc_arima[1,"RMSE"], acc_sarima[1,"RMSE"]),
  MAE  = c(acc_arima[1,"MAE"], acc_sarima[1,"MAE"]),
  MAPE = c(acc_arima[1,"MAPE"], acc_sarima[1,"MAPE"])
)

print(results)
##   Modelo      AIC      BIC     RMSE      MAE     MAPE
## 1  ARIMA 1401.696 1407.309 496742.9 394341.4 6.198876
## 2 SARIMA 1022.567 1028.901 267035.2 193928.8 2.948075
start(ts_sales)   # beginning (year, period)
## [1] 2015    1
end(ts_sales)     # ending (year, period)
## [1] 2018   12
frequency(ts_sales)  # seasonal frequency (12 = monthly)
## [1] 12
train <- window(ts_sales, end   = c(2018,1))
test  <- window(ts_sales, start = c(2017,12))

fit_arima  <- auto.arima(train,  seasonal = FALSE)
fit_sarima <- auto.arima(train,  seasonal = TRUE)

# === Forecasts para el periodo TEST ===
fc_arima  <- forecast(fit_arima,  h = length(test))
fc_sarima <- forecast(fit_sarima, h = length(test))

# === Métricas: in-sample (train) y out-of-sample (test) ===
acc_arima_train  <- accuracy(fit_arima)[1, c("RMSE","MAE","MAPE")]
acc_sarima_train <- accuracy(fit_sarima)[1, c("RMSE","MAE","MAPE")]

acc_arima_test  <- accuracy(fc_arima,  test)
acc_sarima_test <- accuracy(fc_sarima, test)
# Tomamos SIEMPRE la Ăşltima fila por seguridad
acc_arima_test  <- acc_arima_test[nrow(acc_arima_test), c("RMSE","MAE","MAPE")]
acc_sarima_test <- acc_sarima_test[nrow(acc_sarima_test), c("RMSE","MAE","MAPE")]

# === Tabla comparativa ===
results <- bind_rows(
  tibble(Modelo = "ARIMA",  Conjunto = "Train",
         AIC = fit_arima$aic,  BIC = fit_arima$bic,
         RMSE = acc_arima_train["RMSE"], MAE = acc_arima_train["MAE"], MAPE = acc_arima_train["MAPE"]),
  tibble(Modelo = "ARIMA",  Conjunto = "Test",
         AIC = NA_real_, BIC = NA_real_,
         RMSE = acc_arima_test["RMSE"], MAE = acc_arima_test["MAE"], MAPE = acc_arima_test["MAPE"]),
  tibble(Modelo = "SARIMA", Conjunto = "Train",
         AIC = fit_sarima$aic, BIC = fit_sarima$bic,
         RMSE = acc_sarima_train["RMSE"], MAE = acc_sarima_train["MAE"], MAPE = acc_sarima_train["MAPE"]),
  tibble(Modelo = "SARIMA", Conjunto = "Test",
         AIC = NA_real_, BIC = NA_real_,
         RMSE = acc_sarima_test["RMSE"], MAE = acc_sarima_test["MAE"], MAPE = acc_sarima_test["MAPE"])
) %>% mutate(across(c(RMSE,MAE,MAPE,AIC,BIC), as.numeric))

print(results)
## # A tibble: 4 Ă— 7
##   Modelo Conjunto   AIC   BIC    RMSE     MAE  MAPE
##   <chr>  <chr>    <dbl> <dbl>   <dbl>   <dbl> <dbl>
## 1 ARIMA  Train    1077. 1082. 465989. 378039.  6.05
## 2 ARIMA  Test       NA    NA  812557. 648130.  9.04
## 3 SARIMA Train     681.  684. 259334. 167037.  2.60
## 4 SARIMA Test       NA    NA  509797. 418410.  5.99
# === Residuos en TRAIN ===
res_arima  <- residuals(fit_arima)
res_sarima <- residuals(fit_sarima)

# (a) Serie temporal de residuos lado a lado
autoplot(cbind(ARIMA = res_arima, SARIMA = res_sarima)) +
  ggtitle("Residuos en train: ARIMA vs SARIMA") +
  xlab("Tiempo") + ylab("Residuo")

# (b) ACF de residuos
ggAcf(res_arima)  + ggtitle("ACF Residuos ARIMA (Train)")

ggAcf(res_sarima) + ggtitle("ACF Residuos SARIMA (Train)")

# (c) DiagnĂłstico completo (incluye Ljung-Box por defecto)
checkresiduals(fit_arima)   # mirar si quedan autocorrelaciones o patrones

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,0) with non-zero mean
## Q* = 5.2957, df = 6, p-value = 0.5065
## 
## Model df: 1.   Total lags used: 7
checkresiduals(fit_sarima)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,1)(0,1,0)[12]
## Q* = 3.0387, df = 6, p-value = 0.804
## 
## Model df: 1.   Total lags used: 7
# (d) Ljung-Box explĂ­cito (p-valor): si p > 0.05 => residuos ~ ruido blanco (mejor)
lb_arima  <- Box.test(res_arima,  lag = 12, type = "Ljung-Box",
                      fitdf = length(fit_arima$coef))
lb_sarima <- Box.test(res_sarima, lag = 12, type = "Ljung-Box",
                      fitdf = length(fit_sarima$coef))

cat("\nLjung-Box p-values:\n",
    "ARIMA  :", signif(lb_arima$p.value, 4), "\n",
    "SARIMA :", signif(lb_sarima$p.value, 4), "\n")
## 
## Ljung-Box p-values:
##  ARIMA  : 0.007793 
##  SARIMA : 0.891

SARIMA reduces error by nearly 50% and cuts AIC by almost 400 points, a very strong indication of a superior model.On unseen data, SARIMA continues to outperform ARIMA, showing better generalization and more reliable forecasting ability.Diagnostics prove that ARIMA leaves systematic structure unmodeled, while SARIMA residuals resemble white noise.

For Coca-Cola sales forecasting, SARIMA is clearly the superior model. It not only reduces forecast error substantially but also preserves the true seasonal dynamics of sales, making it a far more robust tool for decision-making in inventory planning, marketing strategies, and supply chain management.