library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(readxl)
# Load the monthly Coca-Cola sales data (unit boxes), Jan 2015 - Dec 2018
df <- read_excel("C:/Users/ajasa/Downloads/Coca_Cola Information SalesBoxes 2015-2018.xlsx", sheet = "Summary", skip = 3)
# Rename columns
colnames(df) <- c("Month_Year", "Sales_Unit_Boxes")
# Convert types
df$Month_Year <- as.Date(df$Month_Year)
df$Sales_Unit_Boxes <- as.numeric(df$Sales_Unit_Boxes)
## Warning: NAs introduced by coercion
df <- head(df, -2)
y_ts <- ts(df$Sales_Unit_Boxes, start = c(2015,1), frequency = 12)
head(df)
## # A tibble: 6 × 2
## Month_Year Sales_Unit_Boxes
## <date> <dbl>
## 1 2015-01-01 5516689.
## 2 2015-02-01 5387496.
## 3 2015-03-01 5886747.
## 4 2015-04-01 6389182.
## 5 2015-05-01 6448275.
## 6 2015-06-01 6697947.
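Before computing summary statistics, a quick sanity check on the cleaned data is useful; this is a small sketch that only uses the df object created above.
# Confirm the coercion NAs were removed by the head(df, -2) trim and that the
# series covers 48 monthly observations from 2015 to 2018
sum(is.na(df$Sales_Unit_Boxes))
range(df$Month_Year)
nrow(df)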
# Compute the mean and standard deviation of monthly sales
sales_mean <- mean(df$Sales_Unit_Boxes)
sales_sd <- sd(df$Sales_Unit_Boxes)
# Plot the series with the mean, +/- 1 sd bands, and a linear trend line
ggplot(df, aes(x = Month_Year, y = Sales_Unit_Boxes)) +
geom_line(color = "steelblue", linewidth = 1) +
geom_hline(yintercept = sales_mean, color = "purple", linetype = "dashed", linewidth = 1) +
geom_hline(yintercept = sales_mean + sales_sd, color = "lightgreen", linetype = "dotted", linewidth = 1) +
geom_hline(yintercept = sales_mean - sales_sd, color = "lightgreen", linetype = "dotted", linewidth = 1) +
geom_smooth(method = "lm", se = FALSE, color = "lightblue", linewidth = 1) +
labs(title = "Coca-Cola sales time series",
x = "Date", y = "Sales (Unit Boxes)")
## `geom_smooth()` using formula = 'y ~ x'
The Coca-Cola sales time series shows a slight upward trend with
irregular fluctuations, including sharp peaks around 2016 and 2018 and
sudden drops. There is no clear seasonal pattern.
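To probe the "no clear seasonal pattern" impression visually, a season plot and a subseries plot from the forecast package can be drawn; this is a quick sketch using the y_ts object defined above.
ggseasonplot(y_ts, year.labels = TRUE) + ggtitle("Seasonal plot: sales by month and year")
ggsubseriesplot(y_ts) + ggtitle("Subseries plot: monthly averages across years")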
# 0) Quick diagnostics
length(df$Sales_Unit_Boxes)
## [1] 48
ts_sales <- ts(df$Sales_Unit_Boxes, start = c(2015, 1), frequency = 12)
length(ts_sales)
## [1] 48
frequency(ts_sales)
## [1] 12
floor(length(ts_sales) / frequency(ts_sales))
## [1] 4
# Additive decomposition
decomp_add <- decompose(ts_sales, type = "additive")
# Plot the decomposition components
plot(decomp_add)
Observed: the raw series fluctuates between roughly 5.5M and 8M unit boxes.
Trend: there is a clear upward trend from 2015 to 2018, showing long-term growth.
Seasonal: the seasonal component oscillates, and the magnitude of the seasonality remains fairly stable, supporting the additive assumption.
Random: the residuals are irregular, without strong remaining structure, although some spikes suggest noise is still present.
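As a robustness check on the additive decomposition (not part of the original output), an STL decomposition can be compared against decompose(); a short sketch:
# STL with a periodic seasonal window; the trend and seasonal terms are
# estimated by loess, and autoplot() draws the components
decomp_stl <- stl(ts_sales, s.window = "periodic")
autoplot(decomp_stl)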
acf(df$Sales_Unit_Boxes, main = "Autocorrelation Function (ACF)") #MA
pacf(df$Sales_Unit_Boxes, main = "Partial Autocorrelation Function (PACF)") #AR
ACF: strong autocorrelation at lags 1 and 2 that decays gradually afterwards; spikes around lags 10-12 suggest possible seasonal correlation. The slow decay indicates the series is non-stationary because it has a trend.
PACF: a clear spike at lag 1, after which the values drop quickly within the confidence bands.
We will compare ARIMA vs. SARIMA: this is a time series problem, and SARIMA extends ARIMA by adding seasonal components.
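Since the slow ACF decay points to a trend, a quick sketch of the ACF/PACF of the first-differenced series shows whether a single regular difference removes most of that decay:
ggAcf(diff(y_ts)) + ggtitle("ACF of first-differenced sales")
ggPacf(diff(y_ts)) + ggtitle("PACF of first-differenced sales")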
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
model_lm <- lm(Sales_Unit_Boxes ~ Month_Year, data = df)
dwtest(model_lm)
##
## Durbin-Watson test
##
## data: model_lm
## DW = 0.97665, p-value = 3.089e-05
## alternative hypothesis: true autocorrelation is greater than 0
#4
# Augmented Dickey-Fuller Test.
# p-value < 0.05 → reject H0 → data is stationary.
# if beta < 1, the series is stationary
library(tseries)
adf_test <- adf.test(df$Sales_Unit_Boxes)
## Warning in adf.test(df$Sales_Unit_Boxes): p-value smaller than printed p-value
adf_test
##
## Augmented Dickey-Fuller Test
##
## data: df$Sales_Unit_Boxes
## Dickey-Fuller = -4.4282, Lag order = 3, p-value = 0.01
## alternative hypothesis: stationary
DW = 0.97665 → the statistic is well below 2, indicating strong positive autocorrelation in the residuals.
The p-value is small, so we reject the null hypothesis of no autocorrelation: the residuals of the linear model are not independent, meaning the model suffers from serial correlation.
The regression model therefore shows autocorrelated residuals and is inadequate on its own. However, the ADF test indicates the sales series itself is stationary.
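As a complementary check (not run in the original analysis), a KPSS test can be added; it reverses the hypotheses relative to the ADF test, so its null is stationarity.
# H0: the series is level-stationary. Failing to reject here, together with the
# ADF rejection of a unit root, strengthens the stationarity conclusion.
kpss_test <- kpss.test(df$Sales_Unit_Boxes, null = "Level")
kpss_test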
fit_ar1 <- Arima(y_ts, order = c(1,1,0)) # AR(1)
fit_ma1 <- Arima(y_ts, order = c(0,1,1)) # MA(1)
# --- Compare models ---
summary(fit_ar1)
## Series: y_ts
## ARIMA(1,1,0)
##
## Coefficients:
## ar1
## -0.1779
## s.e. 0.1428
##
## sigma^2 = 3.139e+11: log likelihood = -688.3
## AIC=1380.59 AICc=1380.87 BIC=1384.29
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 24138.94 548430.8 437845.3 -0.01946129 6.809614 1.329443
## ACF1
## Training set -0.01694275
summary(fit_ma1)
## Series: y_ts
## ARIMA(0,1,1)
##
## Coefficients:
## ma1
## -0.2106
## s.e. 0.1614
##
## sigma^2 = 3.122e+11: log likelihood = -688.18
## AIC=1380.36 AICc=1380.64 BIC=1384.06
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 25605.26 547014.4 439205.5 -0.01227034 6.829469 1.333573
## ACF1
## Training set 0.01009479
# Compare AIC and BIC
AIC(fit_ar1, fit_ma1)
## df AIC
## fit_ar1 2 1380.595
## fit_ma1 2 1380.365
BIC(fit_ar1, fit_ma1)
## df BIC
## fit_ar1 2 1384.295
## fit_ma1 2 1384.065
# --- Residual diagnostics ---
# Check autocorrelation of residuals
checkresiduals(fit_ar1)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,0)
## Q* = 7.7725, df = 9, p-value = 0.5572
##
## Model df: 1. Total lags used: 10
checkresiduals(fit_ma1)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,1)
## Q* = 8.1026, df = 9, p-value = 0.5238
##
## Model df: 1. Total lags used: 10
# --- Forecasting ---
fc_ar1 <- forecast(fit_ar1, h = 12)
fc_ma1 <- forecast(fit_ma1, h = 12)
autoplot(fc_ar1) + ggtitle("Forecast ARIMA(1,1,0)")
autoplot(fc_ma1) + ggtitle("Forecast ARIMA(0,1,1)")
checkresiduals(fit_ar1, lag = 12)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,0)
## Q* = 29.94, df = 11, p-value = 0.00162
##
## Model df: 1. Total lags used: 12
checkresiduals(fit_ma1, lag = 12)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,1)
## Q* = 30.676, df = 11, p-value = 0.00124
##
## Model df: 1. Total lags used: 12
nsdiffs(y_ts) # test if seasonal differencing is needed
## [1] 1
Both models pass the Ljung–Box test at 10 lags, and their AIC/BIC values are nearly identical. That only establishes that short-lag residual autocorrelation is small; it does not test for seasonal correlation, because the test stopped at lag 10. For monthly data, the first seasonal correlation appears at lag 12.
When the test is extended to lag 12, both ARIMA models leave systematic correlation in the residuals at the seasonal lags, meaning important structure in the data is not being captured. The low p-values confirm this is statistically significant, and nsdiffs() indicates one seasonal difference is needed.
A SARIMA with D = 1 is therefore required.
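Before letting auto.arima() search, the seasonal specification can also be written out by hand with Arima(); this sketch uses the same (1,0,0)(1,1,0)[12]-with-drift structure that auto.arima() selects below.
# One seasonal difference (D = 1), AR(1) and seasonal AR(1) terms, plus drift
fit_manual <- Arima(y_ts, order = c(1, 0, 0), seasonal = c(1, 1, 0),
include.drift = TRUE)
summary(fit_manual)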
fit_sarima <- auto.arima(ts_sales, seasonal = TRUE)
# Model summary
summary(fit_sarima)
## Series: ts_sales
## ARIMA(1,0,0)(1,1,0)[12] with drift
##
## Coefficients:
## ar1 sar1 drift
## 0.3351 -0.4162 9706.378
## s.e. 0.1935 0.2142 5119.602
##
## sigma^2 = 1.037e+11: log likelihood = -507.28
## AIC=1022.57 AICc=1023.86 BIC=1028.9
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 12204.66 267035.2 193928.8 0.02922099 2.948075 0.5888319
## ACF1
## Training set -0.07376709
# Check residuals to see whether the model captures the dynamics well
checkresiduals(fit_sarima)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,0)(1,1,0)[12] with drift
## Q* = 10.982, df = 8, p-value = 0.2027
##
## Model df: 2. Total lags used: 10
# 12-month forecast
forecast_sarima <- forecast(fit_sarima, h = 12)
# Plot
autoplot(forecast_sarima) +
ggtitle("SARIMA Forecast of Coca-Cola Sales") +
xlab("Year") + ylab("Sales (Unit Boxes)")
Ljung–Box (lag 10): p = 0.20 → fail to reject H₀; the residuals behave like white noise. This means the model successfully captured both the trend and the seasonality.
Time series plot: the residuals fluctuate around zero with no clear structure.
ACF of residuals: all autocorrelations fall within the confidence bounds, so there is no leftover autocorrelation.
Histogram + density: the residuals look roughly normal, with slight skewness, but acceptable.
Peaks and troughs are reproduced: unlike the non-seasonal ARIMA, this SARIMA model preserves the yearly cycle.
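To back up the "roughly normal" observation, a Q-Q plot and a Shapiro-Wilk test on the SARIMA residuals can be added; a small sketch beyond what checkresiduals() already reports:
res_sar <- residuals(fit_sarima)
qqnorm(res_sar); qqline(res_sar, col = "red")  # points near the line suggest normality
shapiro.test(res_sar)                          # H0: residuals are normally distributed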
fit_arima <- auto.arima(ts_sales, seasonal = FALSE)
summary(fit_arima)
## Series: ts_sales
## ARIMA(1,0,0) with non-zero mean
##
## Coefficients:
## ar1 mean
## 0.5487 6452606
## s.e. 0.1220 155338
##
## sigma^2 = 2.575e+11: log likelihood = -697.85
## AIC=1401.7 AICc=1402.24 BIC=1407.31
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 13902.88 496742.9 394341.4 -0.3828106 6.198876 1.197351 0.02645002
fit_sarima <- auto.arima(ts_sales, seasonal = TRUE)
summary(fit_sarima)
## Series: ts_sales
## ARIMA(1,0,0)(1,1,0)[12] with drift
##
## Coefficients:
## ar1 sar1 drift
## 0.3351 -0.4162 9706.378
## s.e. 0.1935 0.2142 5119.602
##
## sigma^2 = 1.037e+11: log likelihood = -507.28
## AIC=1022.57 AICc=1023.86 BIC=1028.9
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 12204.66 267035.2 193928.8 0.02922099 2.948075 0.5888319
## ACF1
## Training set -0.07376709
# In-sample error measures
accuracy(fit_arima)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 13902.88 496742.9 394341.4 -0.3828106 6.198876 1.197351 0.02645002
accuracy(fit_sarima)
## ME RMSE MAE MPE MAPE MASE
## Training set 12204.66 267035.2 193928.8 0.02922099 2.948075 0.5888319
## ACF1
## Training set -0.07376709
# 12-month forecasts
forecast_arima <- forecast(fit_arima, h = 12)
forecast_sarima <- forecast(fit_sarima, h = 12)
# Comparison plot (series = ... maps each forecast to the legend; explicit
# color arguments would override that mapping and suppress the legend)
autoplot(ts_sales) +
autolayer(forecast_arima$mean, series = "ARIMA") +
autolayer(forecast_sarima$mean, series = "SARIMA") +
ggtitle("ARIMA vs SARIMA comparison") +
ylab("Sales (Unit Boxes)") + xlab("Year") +
guides(colour = guide_legend(title = "Model"))
Statistically, SARIMA is clearly superior. Lower AIC/BIC and error measures confirm it models the data-generating process more effectively.
Why SARIMA succeeds: SARIMA includes both an AR(1) term (short-term memory) and a seasonal AR(1) with D=1 seasonal differencing. This allows it to handle both the general trend and repeating 12-month cycles, aligning with real consumption patterns.
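The significance of the individual SARIMA coefficients can be checked with z-tests via lmtest::coeftest(), which accepts Arima fits because they expose coef() and vcov(); a brief sketch:
coeftest(fit_sarima)  # z-tests for the ar1, sar1 and drift terms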
acc_arima <- accuracy(fit_arima)
acc_sarima <- accuracy(fit_sarima)
# 4) Build comparison table from row 1 of the accuracy output
results <- data.frame(
Modelo = c("ARIMA", "SARIMA"),
AIC = c(fit_arima$aic, fit_sarima$aic),
BIC = c(fit_arima$bic, fit_sarima$bic),
RMSE = c(acc_arima[1,"RMSE"], acc_sarima[1,"RMSE"]),
MAE = c(acc_arima[1,"MAE"], acc_sarima[1,"MAE"]),
MAPE = c(acc_arima[1,"MAPE"], acc_sarima[1,"MAPE"])
)
print(results)
## Modelo AIC BIC RMSE MAE MAPE
## 1 ARIMA 1401.696 1407.309 496742.9 394341.4 6.198876
## 2 SARIMA 1022.567 1028.901 267035.2 193928.8 2.948075
start(ts_sales) # beginning (year, period)
## [1] 2015 1
end(ts_sales) # ending (year, period)
## [1] 2018 12
frequency(ts_sales) # seasonal frequency (12 = monthly)
## [1] 12
train <- window(ts_sales, end = c(2018,1))
test <- window(ts_sales, start = c(2017,12))
fit_arima <- auto.arima(train, seasonal = FALSE)
fit_sarima <- auto.arima(train, seasonal = TRUE)
# === Forecasts for the TEST period ===
fc_arima <- forecast(fit_arima, h = length(test))
fc_sarima <- forecast(fit_sarima, h = length(test))
# === Metrics: in-sample (train) and out-of-sample (test) ===
acc_arima_train <- accuracy(fit_arima)[1, c("RMSE","MAE","MAPE")]
acc_sarima_train <- accuracy(fit_sarima)[1, c("RMSE","MAE","MAPE")]
acc_arima_test <- accuracy(fc_arima, test)
acc_sarima_test <- accuracy(fc_sarima, test)
# Always take the last row (the test set) to be safe
acc_arima_test <- acc_arima_test[nrow(acc_arima_test), c("RMSE","MAE","MAPE")]
acc_sarima_test <- acc_sarima_test[nrow(acc_sarima_test), c("RMSE","MAE","MAPE")]
# === Comparison table ===
results <- bind_rows(
tibble(Modelo = "ARIMA", Conjunto = "Train",
AIC = fit_arima$aic, BIC = fit_arima$bic,
RMSE = acc_arima_train["RMSE"], MAE = acc_arima_train["MAE"], MAPE = acc_arima_train["MAPE"]),
tibble(Modelo = "ARIMA", Conjunto = "Test",
AIC = NA_real_, BIC = NA_real_,
RMSE = acc_arima_test["RMSE"], MAE = acc_arima_test["MAE"], MAPE = acc_arima_test["MAPE"]),
tibble(Modelo = "SARIMA", Conjunto = "Train",
AIC = fit_sarima$aic, BIC = fit_sarima$bic,
RMSE = acc_sarima_train["RMSE"], MAE = acc_sarima_train["MAE"], MAPE = acc_sarima_train["MAPE"]),
tibble(Modelo = "SARIMA", Conjunto = "Test",
AIC = NA_real_, BIC = NA_real_,
RMSE = acc_sarima_test["RMSE"], MAE = acc_sarima_test["MAE"], MAPE = acc_sarima_test["MAPE"])
) %>% mutate(across(c(RMSE,MAE,MAPE,AIC,BIC), as.numeric))
print(results)
## # A tibble: 4 × 7
## Modelo Conjunto AIC BIC RMSE MAE MAPE
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ARIMA Train 1077. 1082. 465989. 378039. 6.05
## 2 ARIMA Test NA NA 812557. 648130. 9.04
## 3 SARIMA Train 681. 684. 259334. 167037. 2.60
## 4 SARIMA Test NA NA 509797. 418410. 5.99
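A single train/test split can be sensitive to where the cut falls. Rolling-origin (time series) cross-validation with tsCV() is a complementary check not run in the original analysis; the helper functions f_arima and f_sarima below are defined only for this sketch, and refitting auto.arima() at each origin takes a little time.
f_arima <- function(x, h) forecast(auto.arima(x, seasonal = FALSE), h = h)
f_sarima <- function(x, h) forecast(auto.arima(x, seasonal = TRUE), h = h)
e_arima <- tsCV(ts_sales, f_arima, h = 1)
e_sarima <- tsCV(ts_sales, f_sarima, h = 1)
sqrt(mean(e_arima^2, na.rm = TRUE))  # one-step-ahead RMSE, ARIMA
sqrt(mean(e_sarima^2, na.rm = TRUE)) # one-step-ahead RMSE, SARIMA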
# === Residuals on TRAIN ===
res_arima <- residuals(fit_arima)
res_sarima <- residuals(fit_sarima)
# (a) Residual time series side by side
autoplot(cbind(ARIMA = res_arima, SARIMA = res_sarima)) +
ggtitle("Train residuals: ARIMA vs SARIMA") +
xlab("Time") + ylab("Residual")
# (b) ACF of residuals
ggAcf(res_arima) + ggtitle("ACF of ARIMA residuals (Train)")
ggAcf(res_sarima) + ggtitle("ACF of SARIMA residuals (Train)")
# (c) Full diagnostics (includes the Ljung-Box test by default)
checkresiduals(fit_arima) # check for remaining autocorrelation or patterns
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,0) with non-zero mean
## Q* = 5.2957, df = 6, p-value = 0.5065
##
## Model df: 1. Total lags used: 7
checkresiduals(fit_sarima)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,1)(0,1,0)[12]
## Q* = 3.0387, df = 6, p-value = 0.804
##
## Model df: 1. Total lags used: 7
# (d) Explicit Ljung-Box (p-value): if p > 0.05 => residuals ~ white noise (better)
lb_arima <- Box.test(res_arima, lag = 12, type = "Ljung-Box",
fitdf = length(fit_arima$coef))
lb_sarima <- Box.test(res_sarima, lag = 12, type = "Ljung-Box",
fitdf = length(fit_sarima$coef))
cat("\nLjung-Box p-values:\n",
"ARIMA :", signif(lb_arima$p.value, 4), "\n",
"SARIMA :", signif(lb_sarima$p.value, 4), "\n")
##
## Ljung-Box p-values:
## ARIMA : 0.007793
## SARIMA : 0.891
SARIMA reduces forecast error by nearly 50% and cuts AIC by almost 400 points, a very strong indication of a superior model. On unseen data, SARIMA continues to outperform ARIMA, showing better generalization and more reliable forecasting ability. The diagnostics show that ARIMA leaves systematic structure unmodeled, while the SARIMA residuals resemble white noise.
For Coca-Cola sales forecasting, SARIMA is clearly the superior model. It not only reduces forecast error substantially but also preserves the true seasonal dynamics of sales, making it a far more robust tool for decision-making in inventory planning, marketing strategies, and supply chain management.
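For operational use, the 12-month SARIMA forecast can be exported as a table of point forecasts and 80/95% prediction intervals; a small sketch, with the CSV path purely illustrative.
fc_final <- forecast(fit_sarima, h = 12)
fc_table <- as.data.frame(fc_final) # Point Forecast, Lo 80, Hi 80, Lo 95, Hi 95
head(fc_table)
# write.csv(fc_table, "sarima_forecast_12m.csv", row.names = FALSE)  # optional export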