tidy data

df <- read.csv("https://raw.githubusercontent.com/yli1048/yli1048/refs/heads/607/sample.csv")
glimpse(df)
## Rows: 230,090
## Columns: 4
## $ Date        <chr> "2010-01-01", "2010-01-02", "2010-01-03", "2010-01-04", "2…
## $ store       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ product     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ number_sold <int> 801, 810, 818, 796, 808, 812, 830, 812, 817, 832, 834, 826…
df$Date <- ymd(df$Date)
glimpse(df)
## Rows: 230,090
## Columns: 4
## $ Date        <date> 2010-01-01, 2010-01-02, 2010-01-03, 2010-01-04, 2010-01-0…
## $ store       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ product     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ number_sold <int> 801, 810, 818, 796, 808, 812, 830, 812, 817, 832, 834, 826…
sub_df <- df %>%
  filter(store == 0 & product == 0)
nrow(sub_df)
## [1] 3287
sales <- select(sub_df, Date, number_sold)
glimpse(sales)
## Rows: 3,287
## Columns: 2
## $ Date        <date> 2010-01-01, 2010-01-02, 2010-01-03, 2010-01-04, 2010-01-0…
## $ number_sold <int> 801, 810, 818, 796, 808, 812, 830, 812, 817, 832, 834, 826…
test_set_start_date <- as.Date("2016-12-31")

train_set <- subset(sales, Date < test_set_start_date)
test_set <- subset(sales, Date >= test_set_start_date)

dim(train_set)
## [1] 2556    2
dim(test_set)
## [1] 731   2

basic visualisation

ggplot() +
  geom_line(data = train_set, aes(x = Date, y = number_sold, color = "Training"), size = 1) +
  geom_line(data = test_set, aes(x = Date, y = number_sold, color = "Testing"), size = 1) +
  labs(
    title = "Sales - Training and Testing Sets",
    x = "Date",
    y = "Sales"
  ) +
  scale_color_manual(values = c("Training" = "#12355B", "Testing" = "#D72638"), name = "Sales") +
  theme_minimal() +
  theme(plot.title = element_text(size = 20))

averaging forecasting

train_set_avg <- mean(train_set$number_sold)
simple_avg_predictions <- data.frame(
  Date = test_set$Date,
  number_sold = rep(train_set_avg, nrow(test_set))
)

glimpse(train_set_avg)
##  num 827
series_ts <- ts(train_set$number_sold, frequency = 12)
glimpse(series_ts)
##  Time-Series [1:2556] from 1 to 214: 801 810 818 796 808 812 830 812 817 832 ...
ma3 <- ma(series_ts, order = 3, centre = FALSE)
ma6 <- ma(series_ts, order = 6, centre = FALSE)
ma12 <- ma(series_ts, order = 12, centre = FALSE)

ma3_forecast <- forecast(ma3, h = nrow(test_set))
ma6_forecast <- forecast(ma6, h = nrow(test_set))
ma12_forecast <- forecast(ma12, h = nrow(test_set))

ma_forecast_df <- data.frame(
  Date = test_set$Date,
  MA3 = ma3_forecast$mean,
  MA6 = ma6_forecast$mean,
  MA12 = ma12_forecast$mean
)
glimpse(ma_forecast_df)
## Rows: 731
## Columns: 4
## $ Date <date> 2016-12-31, 2017-01-01, 2017-01-02, 2017-01-03, 2017-01-04, 2017…
## $ MA3  <dbl> 826.3337, 826.3337, 826.3337, 826.3337, 826.3337, 826.3337, 826.3…
## $ MA6  <dbl> 827.0614, 827.4481, 827.8271, 828.1985, 828.5625, 828.9192, 829.2…
## $ MA12 <dbl> 824.6271, 825.1669, 825.6959, 826.2143, 826.7224, 827.2203, 827.7…
ses <- HoltWinters(series_ts, beta = FALSE, gamma = FALSE)
des <- HoltWinters(series_ts, gamma = FALSE)
tes <- HoltWinters(series_ts)

ses_forecast <- forecast(ses, h = nrow(test_set))
des_forecast <- forecast(des, h = nrow(test_set))
tes_forecast <- forecast(tes, h = nrow(test_set))

exsm_forecast_df <- data.frame(
  Date = test_set$Date,
  SES = ses_forecast$mean,
  DES = des_forecast$mean,
  TES = tes_forecast$mean
)
glimpse(exsm_forecast_df)
## Rows: 731
## Columns: 4
## $ Date <date> 2016-12-31, 2017-01-01, 2017-01-02, 2017-01-03, 2017-01-04, 2017…
## $ SES  <dbl> 825.4735, 825.4735, 825.4735, 825.4735, 825.4735, 825.4735, 825.4…
## $ DES  <dbl> 829.2283, 829.9168, 830.6053, 831.2938, 831.9824, 832.6709, 833.3…
## $ TES  <dbl> 830.5500, 831.8447, 832.4659, 833.8433, 833.5984, 832.8094, 831.4…
tes_seasonal_add <- HoltWinters(series_ts, seasonal = "additive")
tes_seasonal_mul <- HoltWinters(series_ts, seasonal = "multiplicative")

tes_seasonal_add_forecast <- forecast(tes_seasonal_add, h = nrow(test_set))
tes_seasonal_mul_forecast <- forecast(tes_seasonal_mul, h = nrow(test_set))

exsm_tes_forecast_df <- data.frame(
  Date = test_set$Date,
  TES = tes_forecast$mean,
  TESAdd = tes_seasonal_add_forecast$mean,
  TESMul = tes_seasonal_mul_forecast$mean
)

glimpse(exsm_tes_forecast_df)
## Rows: 731
## Columns: 4
## $ Date   <date> 2016-12-31, 2017-01-01, 2017-01-02, 2017-01-03, 2017-01-04, 20…
## $ TES    <dbl> 830.5500, 831.8447, 832.4659, 833.8433, 833.5984, 832.8094, 831…
## $ TESAdd <dbl> 830.5500, 831.8447, 832.4659, 833.8433, 833.5984, 832.8094, 831…
## $ TESMul <dbl> 830.5082, 831.8705, 832.4786, 833.9290, 833.6180, 832.6958, 831…
ar_model <- Arima(series_ts, order = c(1, 0, 0))
ma_model <- Arima(series_ts, order = c(0, 0, 1))
arma_model <- Arima(series_ts, order = c(1, 0, 1))
arima_model <- Arima(series_ts, order = c(1, 1, 1))
auto_arima_no_season_model <- auto.arima(series_ts, seasonal = FALSE)
auto_arima_season_model <- auto.arima(series_ts, seasonal = TRUE)

ar_forecasts <- forecast(ar_model, h = nrow(test_set))
ma_forecasts <- forecast(ma_model, h = nrow(test_set))
arma_forecasts <- forecast(arma_model, h = nrow(test_set))
arima_forecasts <- forecast(arima_model, h = nrow(test_set))
auto_arima_no_season_forecasts <- forecast(auto_arima_no_season_model, h = nrow(test_set))
auto_arima_season_forecasts <- forecast(auto_arima_season_model, h = nrow(test_set))

arima_forcast_df <- data.frame(
  Date = test_set$Date,
  AR = ar_forecasts$mean,
  MA = ma_forecasts$mean,
  ARMA = arma_forecasts$mean,
  ARIMA = arima_forecasts$mean,
  AutoARIMANoSeason = auto_arima_no_season_forecasts$mean,
  AutoARIMASeason = auto_arima_season_forecasts$mean
)

glimpse(arima_forcast_df)
## Rows: 731
## Columns: 7
## $ Date              <date> 2016-12-31, 2017-01-01, 2017-01-02, 2017-01-03, 201…
## $ AR                <dbl> 823.2499, 823.4835, 823.7017, 823.9056, 824.0961, 82…
## $ MA                <dbl> 828.9101, 826.8067, 826.8067, 826.8067, 826.8067, 82…
## $ ARMA              <dbl> 825.4921, 825.4956, 825.4991, 825.5026, 825.5061, 82…
## $ ARIMA             <dbl> 825.8523, 825.6359, 825.6523, 825.6510, 825.6511, 82…
## $ AutoARIMANoSeason <dbl> 826.5948, 823.2160, 824.2842, 824.7978, 824.1417, 82…
## $ AutoARIMASeason   <dbl> 826.5948, 823.2160, 824.2842, 824.7978, 824.1417, 82…
all_model_data <- data.frame(
  Date = test_set$Date,
  number_sold = test_set$number_sold,
  AVG = simple_avg_predictions$number_sold,
  MA = ma_forecasts$mean,
  MA3 = ma3_forecast$mean,
  MA6 = ma6_forecast$mean,
  MA12 = ma12_forecast$mean,
  SES = ses_forecast$mean,
  DES = des_forecast$mean,
  TES = tes_forecast$mean,
  TESAdd = tes_seasonal_add_forecast$mean,
  TESMul = tes_seasonal_mul_forecast$mean,
  AR = ar_forecasts$mean,
  ARMA = arma_forecasts$mean,
  ARIMA = arima_forecasts$mean,
  AutoARIMANoSeason = auto_arima_no_season_forecasts$mean,
  AutoARIMASeason = auto_arima_season_forecasts$mean
)

glimpse(all_model_data)
## Rows: 731
## Columns: 17
## $ Date              <date> 2016-12-31, 2017-01-01, 2017-01-02, 2017-01-03, 201…
## $ number_sold       <int> 842, 850, 839, 838, 835, 835, 822, 832, 824, 849, 83…
## $ AVG               <dbl> 826.8083, 826.8083, 826.8083, 826.8083, 826.8083, 82…
## $ MA                <dbl> 828.9101, 826.8067, 826.8067, 826.8067, 826.8067, 82…
## $ MA3               <dbl> 826.3337, 826.3337, 826.3337, 826.3337, 826.3337, 82…
## $ MA6               <dbl> 827.0614, 827.4481, 827.8271, 828.1985, 828.5625, 82…
## $ MA12              <dbl> 824.6271, 825.1669, 825.6959, 826.2143, 826.7224, 82…
## $ SES               <dbl> 825.4735, 825.4735, 825.4735, 825.4735, 825.4735, 82…
## $ DES               <dbl> 829.2283, 829.9168, 830.6053, 831.2938, 831.9824, 83…
## $ TES               <dbl> 830.5500, 831.8447, 832.4659, 833.8433, 833.5984, 83…
## $ TESAdd            <dbl> 830.5500, 831.8447, 832.4659, 833.8433, 833.5984, 83…
## $ TESMul            <dbl> 830.5082, 831.8705, 832.4786, 833.9290, 833.6180, 83…
## $ AR                <dbl> 823.2499, 823.4835, 823.7017, 823.9056, 824.0961, 82…
## $ ARMA              <dbl> 825.4921, 825.4956, 825.4991, 825.5026, 825.5061, 82…
## $ ARIMA             <dbl> 825.8523, 825.6359, 825.6523, 825.6510, 825.6511, 82…
## $ AutoARIMANoSeason <dbl> 826.5948, 823.2160, 824.2842, 824.7978, 824.1417, 82…
## $ AutoARIMASeason   <dbl> 826.5948, 823.2160, 824.2842, 824.7978, 824.1417, 82…
mae_values <- c()
mape_values <- c()
rmse_values <- c()

for (col in names(all_model_data)[3:ncol(all_model_data)]) {
  mae_values <- c(mae_values, mae(all_model_data$number_sold, all_model_data[[col]]))
  mape_values <- c(mape_values, mape(all_model_data$number_sold, all_model_data[[col]]))
  rmse_values <- c(rmse_values, rmse(all_model_data$number_sold, all_model_data[[col]]))
}

model_test_set_metrics <- data.frame(
  Model = names(all_model_data)[3:ncol(all_model_data)],
  MAE = mae_values,
  MAPE = mape_values,
  RMSE = rmse_values
)
glimpse(model_test_set_metrics)
## Rows: 15
## Columns: 4
## $ Model <chr> "AVG", "MA", "MA3", "MA6", "MA12", "SES", "DES", "TES", "TESAdd"…
## $ MAE   <dbl> 33.39954, 33.39686, 33.45732, 34.61466, 35.74166, 33.56493, 247.…
## $ MAPE  <dbl> 0.03969737, 0.03969411, 0.03974447, 0.04202932, 0.04360139, 0.03…
## $ RMSE  <dbl> 38.41682, 38.41612, 38.52340, 39.63951, 41.46537, 38.73067, 293.…
result <- arrange(model_test_set_metrics, MAE)
print(result)
##                Model       MAE       MAPE      RMSE
## 1                 MA  33.39686 0.03969411  38.41612
## 2                AVG  33.39954 0.03969737  38.41682
## 3                MA3  33.45732 0.03974447  38.52340
## 4                 AR  33.46154 0.03977048  38.45220
## 5              ARIMA  33.54208 0.03981422  38.68629
## 6                SES  33.56493 0.03983333  38.73067
## 7               ARMA  33.57911 0.03988560  38.64412
## 8  AutoARIMANoSeason  33.62708 0.03995766  38.64806
## 9    AutoARIMASeason  33.62708 0.03995766  38.64806
## 10               MA6  34.61466 0.04202932  39.63951
## 11              MA12  35.74166 0.04360139  41.46537
## 12               DES 247.00190 0.30061958 293.23411
## 13            TESMul 291.46369 0.35442087 344.80077
## 14               TES 291.51791 0.35448658 344.86567
## 15            TESAdd 291.51791 0.35448658 344.86567