HANDS ON SARIMA

Coding SARIMA

library(forecast)
library(tseries)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(TSA)

## Registered S3 methods overwritten by 'TSA':
##   method       from    
##   fitted.Arima forecast
##   plot.Arima   forecast

## 
## Attaching package: 'TSA'

## The following objects are masked from 'package:stats':
## 
##     acf, arima

## The following object is masked from 'package:utils':
## 
##     tar

library(ggplot2)
library(Metrics)

## 
## Attaching package: 'Metrics'

## The following object is masked from 'package:forecast':
## 
##     accuracy

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Membaca data
climate_data <- read.csv("C:/Users/user/Downloads/Walmart.csv")

str(climate_data)

## 'data.frame':    6435 obs. of  8 variables:
##  $ Store       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Date        : chr  "05-02-2010" "12-02-2010" "19-02-2010" "26-02-2010" ...
##  $ Weekly_Sales: num  1643691 1641957 1611968 1409728 1554807 ...
##  $ Holiday_Flag: int  0 1 0 0 0 0 0 0 0 0 ...
##  $ Temperature : num  42.3 38.5 39.9 46.6 46.5 ...
##  $ Fuel_Price  : num  2.57 2.55 2.51 2.56 2.62 ...
##  $ CPI         : num  211 211 211 211 211 ...
##  $ Unemployment: num  8.11 8.11 8.11 8.11 8.11 ...

# Mengubah format tanggal
climate_data$Tanggal <- as.Date(climate_data$Date,
                                format = "%d-%m-%Y")

# Mengurutkan data berdasarkan tanggal
climate_data <- climate_data[order(climate_data$Tanggal), ]

head(climate_data)

##     Store       Date Weekly_Sales Holiday_Flag Temperature Fuel_Price      CPI
## 1       1 05-02-2010    1643690.9            0       42.31      2.572 211.0964
## 144     2 05-02-2010    2136989.5            0       40.19      2.572 210.7526
## 287     3 05-02-2010     461622.2            0       45.71      2.572 214.4249
## 430     4 05-02-2010    2135143.9            0       43.76      2.598 126.4421
## 573     5 05-02-2010     317173.1            0       39.70      2.572 211.6540
## 716     6 05-02-2010    1652635.1            0       40.43      2.572 212.6224
##     Unemployment    Tanggal
## 1          8.106 2010-02-05
## 144        8.324 2010-02-05
## 287        7.368 2010-02-05
## 430        8.623 2010-02-05
## 573        6.566 2010-02-05
## 716        7.259 2010-02-05

# Mengambil data penjualan mingguan
weekly_data <- climate_data %>%
  group_by(Tanggal) %>%
  summarise(Total_Penjualan = sum(Weekly_Sales))

head(weekly_data)

## # A tibble: 6 × 2
##   Tanggal    Total_Penjualan
##   <date>               <dbl>
## 1 2010-02-05       49750740.
## 2 2010-02-12       48336678.
## 3 2010-02-19       48276994.
## 4 2010-02-26       43968571.
## 5 2010-03-05       46871470.
## 6 2010-03-12       45925397.

# Membentuk time series
series_penjualan <- ts(weekly_data$Total_Penjualan,
                       frequency = 52,
                       start = c(2010,1))

series_penjualan

## Time Series:
## Start = c(2010, 1) 
## End = c(2012, 39) 
## Frequency = 52 
##   [1] 49750741 48336678 48276994 43968571 46871470 45925397 44988975 44133961
##   [9] 50423831 47365290 45183667 44734453 43705127 48503244 45330080 45120108
##  [17] 47757503 50188543 47826547 47622046 46609036 48917485 47899529 46243900
##  [25] 44888849 44630363 48204999 46464418 47060953 45909740 47194258 45634398
##  [33] 43080727 41358514 42239876 45102974 43149473 43066670 43602831 45781982
##  [41] 46124801 45125584 65821003 49909028 55666770 61820800 80931416 40432519
##  [49] 42775788 40673678 40654648 39599853 46153111 47336193 48716164 44125860
##  [57] 46980604 44627319 44872326 42876199 43458991 45887467 44973328 48676692
##  [65] 43530033 46861958 45446145 44046598 45293457 48771994 47669735 47447562
##  [73] 45884095 47578520 47859264 45515930 45274411 43683274 48015467 46249569
##  [81] 46917348 47416948 45376623 46763228 43793960 42718097 42195831 47211688
##  [89] 44374820 45818953 45855821 48655544 48474225 46438981 66593605 49390556
##  [97] 55561148 60085696 76998241 46042461 44955422 42023078 42080997 39834975
## [105] 46085608 50009408 50197057 45771507 46861035 47480454 46901505 44993794
## [113] 45272862 53502316 46629261 45072530 43716799 47124198 46925879 46823939
## [121] 47892463 48281650 49651172 48412111 47668285 46597112 51253022 46099732
## [129] 46059543 44097155 47485900 47403451 47354452 47447324 47159639 48330059
## [137] 44226039 44354547 43734899 47566639 46128514 45122411 45544116

plot(series_penjualan,
     main = "Plot Weekly Sales",
     ylab = "Penjualan",
     xlab = "Waktu",
     col = "blue")

# Split data training dan testing
train_data <- subset(series_penjualan,
                     start = 1,
                     end = 114)

test_data <- subset(series_penjualan,
                    start = 115,
                    end = 143)

# Plot training data
ts.plot(train_data,
        col = "blue",
        ylab = "Weekly Sales",
        xlab = "Time")

title(main = "Train Time Series Plot",
      cex.sub = 0.8)

points(train_data,
       pch = 20,
       col = "blue")

log_train_data <- log(train_data)

Uji stasioneritas

adf.test(train_data)

## Warning in adf.test(train_data): p-value smaller than printed p-value

## 
##  Augmented Dickey-Fuller Test
## 
## data:  train_data
## Dickey-Fuller = -5.3396, Lag order = 4, p-value = 0.01
## alternative hypothesis: stationary

Differencing non-musiman

diff_nonmusiman <- diff(train_data)

# Plot differencing
ts.plot(diff_nonmusiman,
        main = "Differencing Non Seasonal",
        ylab = "Differenced Data",
        xlab = "Time",
        col = "red")

points(diff_nonmusiman,
       pch = 20,
       col = "yellow")

# Uji ADF
adf.test(diff_nonmusiman)

## Warning in adf.test(diff_nonmusiman): p-value smaller than printed p-value

## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff_nonmusiman
## Dickey-Fuller = -5.9312, Lag order = 4, p-value = 0.01
## alternative hypothesis: stationary

Differencing musiman

diff_musiman <- diff(diff_nonmusiman,
                     lag = 52)

# Plot seasonal differencing
ts.plot(diff_musiman,
        main = "Seasonal Differencing",
        ylab = "Seasonal Difference",
        xlab = "Time",
        col = "green")

points(diff_musiman,
       pch = 20,
       col = "black")

# Uji ADF
adf.test(na.omit(diff_musiman))

## Warning in adf.test(na.omit(diff_musiman)): p-value smaller than printed
## p-value

## 
##  Augmented Dickey-Fuller Test
## 
## data:  na.omit(diff_musiman)
## Dickey-Fuller = -5.5458, Lag order = 3, p-value = 0.01
## alternative hypothesis: stationary

Plot ACF dan PACF

# Plot ACF dan PACF
par(mfrow = c(1,2))

Acf(diff_musiman,
    main = "ACF")

Pacf(diff_musiman,
     main = "PACF")

# Membentuk model SARIMA

# Membentuk model SARIMA
sarima_model1 <- Arima(train_data,
                       order = c(1,1,1),
                       seasonal = list(order = c(0,1,1),
                                       period = 52))

summary(sarima_model1)

## Series: train_data 
## ARIMA(1,1,1)(0,1,1)[52] 
## 
## Coefficients:
##          ar1      ma1    sma1
##       0.1869  -0.8662  0.1532
## s.e.  0.1645   0.0718  0.2818
## 
## sigma^2 = 3.931e+12:  log likelihood = -970.64
## AIC=1949.27   AICc=1949.99   BIC=1957.72
## 
## Training set error measures:
##                    ME    RMSE      MAE       MPE     MAPE      MASE        ACF1
## Training set 234645.4 1414217 720978.7 0.5177475 1.500458 0.4743825 -0.01323472

sarima_model2 <- Arima(train_data,
                       order = c(0,1,1),
                       seasonal = list(order = c(0,1,1),
                                       period = 52))

summary(sarima_model2)

## Series: train_data 
## ARIMA(0,1,1)(0,1,1)[52] 
## 
## Coefficients:
##           ma1    sma1
##       -0.8302  0.0549
## s.e.   0.0891  0.2621
## 
## sigma^2 = 4.009e+12:  log likelihood = -971.28
## AIC=1948.56   AICc=1948.98   BIC=1954.89
## 
## Training set error measures:
##                    ME    RMSE      MAE       MPE     MAPE      MASE      ACF1
## Training set 232083.5 1440455 743524.9 0.5210133 1.541746 0.4892173 0.1031782

sarima_model3 <- Arima(train_data,
                       order = c(1,1,0),
                       seasonal = list(order = c(0,1,1),
                                       period = 52))

summary(sarima_model3)

## Series: train_data 
## ARIMA(1,1,0)(0,1,1)[52] 
## 
## Coefficients:
##           ar1    sma1
##       -0.3595  0.4328
## s.e.   0.1236  0.4026
## 
## sigma^2 = 4.075e+12:  log likelihood = -975.78
## AIC=1957.56   AICc=1957.98   BIC=1963.89
## 
## Training set error measures:
##                    ME    RMSE      MAE       MPE     MAPE      MASE        ACF1
## Training set 107021.8 1452225 732927.7 0.2213365 1.553227 0.4822446 -0.07248425

sarima_model4 <- Arima(train_data,
                       order = c(2,1,1),
                       seasonal = list(order = c(0,1,1),
                                       period = 52))

summary(sarima_model4)

## Series: train_data 
## ARIMA(2,1,1)(0,1,1)[52] 
## 
## Coefficients:
##          ar1      ar2      ma1    sma1
##       0.1847  -0.0284  -0.8612  0.1501
## s.e.  0.1652   0.1489   0.0789  0.2812
## 
## sigma^2 = 3.999e+12:  log likelihood = -970.62
## AIC=1951.24   AICc=1952.33   BIC=1961.79
## 
## Training set error measures:
##                    ME    RMSE      MAE     MPE     MAPE      MASE        ACF1
## Training set 234666.3 1414111 720943.3 0.51859 1.499942 0.4743592 -0.01947318

Membandingkan nilai AIC

# Membandingkan nilai AIC
AIC(sarima_model1)

## [1] 1949.273

AIC(sarima_model2)

## [1] 1948.562

AIC(sarima_model3)

## [1] 1957.558

AIC(sarima_model4)

## [1] 1951.237

# Model otomatis SARIMA
log_train_data <- log(train_data)

model_otomatis <- auto.arima(log_train_data,
                             seasonal = TRUE,
                             stepwise = FALSE,
                             approximation = FALSE)

summary(model_otomatis)

## Series: log_train_data 
## ARIMA(0,1,1)(0,1,0)[52] 
## 
## Coefficients:
##           ma1
##       -0.8319
## s.e.   0.0800
## 
## sigma^2 = 0.00195:  log likelihood = 108.37
## AIC=-212.74   AICc=-212.53   BIC=-208.52
## 
## Training set error measures:
##                       ME       RMSE        MAE        MPE       MAPE      MASE
## Training set 0.004080985 0.03203452 0.01695113 0.02316051 0.09585153 0.5264379
##                    ACF1
## Training set 0.07301659

# Diagnostik residual dan forecasting
checkresiduals(model_otomatis)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,1)(0,1,0)[52]
## Q* = 24.87, df = 22, p-value = 0.3034
## 
## Model df: 1.   Total lags used: 23

hasil_forecast <- forecast(model_otomatis,
                           h = length(test_data))

hasil_forecast$mean <- exp(hasil_forecast$mean)
hasil_forecast$lower <- exp(hasil_forecast$lower)
hasil_forecast$upper <- exp(hasil_forecast$upper)

# Menampilkan hasil forecast
hasil_forecast

##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 2012.192       47552028 44935834 50320539 43609683 51850764
## 2012.212       51467737 48597501 54507494 47143463 56188659
## 2012.231       46025977 43425185 48782533 42108438 50307981
## 2012.250       49548950 46712966 52557108 45277997 54222769
## 2012.269       48051956 45267106 51008130 43858836 52645958
## 2012.288       46572161 43840037 49474552 42459228 51083504
## 2012.308       47890513 45047520 50912930 43611500 52589368
## 2012.327       51568504 48471526 54863356 46908099 56691928
## 2012.346       50403043 47341653 53662399 45797055 55472272
## 2012.365       50168131 47087183 53450667 45533574 55274408
## 2012.385       48515017 45503268 51726105 43985383 53511114
## 2012.404       50306597 47150504 53673948 45560729 55546822
## 2012.423       50603439 47395783 54028183 45780897 55933986
## 2012.442       48125742 45044165 51418137 43493573 53251248
## 2012.462       47870375 44774674 51180111 43217790 53023831
## 2012.481       46188005 43172012 49414694 41655999 51213074
## 2012.500       50768599 47421865 54351523 45740466 56349461
## 2012.519       48901447 45647642 52387187 44013759 54331909
## 2012.538       49607515 46276447 53178360 44604608 55171554
## 2012.558       50135762 46738923 53779472 45034924 55814342
## 2012.577       47978448 44699061 51498429 43054793 53465161
## 2012.596       49444558 46035656 53105885 44327286 55152582
## 2012.615       46305037 43085402 49765265 41472665 51700474
## 2012.635       45167485 42000676 48573069 40415162 50478624
## 2012.654       44615274 41461470 48008975 39883220 49908775
## 2012.673       49918733 46361524 53748877 44582241 55894002
## 2012.692       46919203 43549182 50550010 41864319 52584436
## 2012.712       48446140 44939262 52226682 43186791 54345981
## 2012.731       48485122 44948449 52300070 43181905 54439633

plot(hasil_forecast,
     main = "Forecast SARIMA",
     xlab = "Time",
     ylab = "Sales",
     col = "blue")

# Menambahkan data aktual
lines(test_data,
      col = "red",
      lwd = 2)

legend("topleft",
       legend = c("Forecast", "Data Aktual"),
       col = c("blue", "red"),
       lty = 1,
       lwd = 2)

# Menghitung akurasi model
rmse_value <- rmse(test_data,
                   hasil_forecast$mean)

mae_value <- mae(test_data,
                 hasil_forecast$mean)

mape_value <- mape(test_data,
                   hasil_forecast$mean)

cat("RMSE :", rmse_value, "\n")

## RMSE : 2306284

cat("MAE  :", mae_value, "\n")

## MAE  : 1903285

cat("MAPE :", mape_value, "\n")

## MAPE : 0.04107816