library(readxl)
library(tseries)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(urca)
library(tinytex)
library(TSA)
## 
## Attaching package: 'TSA'
## The following objects are masked from 'package:stats':
## 
##     acf, arima
## The following object is masked from 'package:utils':
## 
##     tar
library(AICcmodavg)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(forecast)
## Registered S3 methods overwritten by 'forecast':
##   method       from
##   fitted.Arima TSA 
##   plot.Arima   TSA

Import Data

data.sales = read_excel("C:\\Users\\MUTHI'AH IFFA\\Downloads\\Semester 5\\MPDW\\Praktikum\\Tugas Mandiri Iffa_MPDW_(store sales) 301_400.xlsx")
data.sales = data.sales[1:100,]
data.sales.ts = ts(data.sales)

Check Data Structure

data.sales
## # A tibble: 100 Ɨ 2
##    Date                Total_Sales
##    <dttm>                    <dbl>
##  1 2022-10-28 00:00:00        238.
##  2 2022-10-29 00:00:00        204.
##  3 2022-10-30 00:00:00        207.
##  4 2022-10-31 00:00:00        250.
##  5 2022-11-01 00:00:00        236.
##  6 2022-11-02 00:00:00        286.
##  7 2022-11-03 00:00:00        226.
##  8 2022-11-04 00:00:00        214.
##  9 2022-11-05 00:00:00        230.
## 10 2022-11-06 00:00:00        204.
## # ℹ 90 more rows
str(data.sales)
## tibble [100 Ɨ 2] (S3: tbl_df/tbl/data.frame)
##  $ Date       : POSIXct[1:100], format: "2022-10-28" "2022-10-29" ...
##  $ Total_Sales: num [1:100] 238 204 207 250 236 ...
head(data.sales)
## # A tibble: 6 Ɨ 2
##   Date                Total_Sales
##   <dttm>                    <dbl>
## 1 2022-10-28 00:00:00        238.
## 2 2022-10-29 00:00:00        204.
## 3 2022-10-30 00:00:00        207.
## 4 2022-10-31 00:00:00        250.
## 5 2022-11-01 00:00:00        236.
## 6 2022-11-02 00:00:00        286.

Create Time Series Data & Check Stationarity

# Extract the sales column
y = data.sales$Total_Sales
head(y)
## [1] 238.49 204.26 207.48 250.01 236.39 286.10
# Create the time series object 
ts.data = ts(y, frequency = 1, start = 1) # frequency = 1 because the 100 observations carry no assumed seasonality
head(ts.data)
## Time Series:
## Start = 1 
## End = 6 
## Frequency = 1 
## [1] 238.49 204.26 207.48 250.01 236.39 286.10

Monthly data –> frequency = 12 (12 months per year)

Quarterly data –> frequency = 4

Daily data covering a full year –> frequency = 365

Unknown seasonality or a short series –> frequency = 1 (a short sketch of the alternatives follows)
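For illustration, a minimal sketch of how the same vector could be wrapped with a different frequency (hypothetical settings, not used in this analysis):

# Hypothetical alternatives, not used here:
ts(y, frequency = 7)                        # daily data assumed to have a weekly cycle
ts(y, frequency = 12, start = c(2022, 10))  # if y held monthly values starting October 2022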

# Plot deret waktu 
plot(ts.data, 
     main = "Total Sales (Time Series)",
     ylab = "Total Sales", 
     xlab = "Observation (1-100)")

# Plot ACF dan PACF 
acf(ts.data, main = "ACF - Total Sales") # plot ACF

pacf(ts.data, main = "PACF - Total Sales") # plot PACF

Stationarity Tests

ADF Test

The ADF test detects a unit root.

\(H_0\) = the data are not stationary

\(H_1\) = the data are stationary

Reject \(H_0\) when the p-value < 0.05

# ADF test 
adf.test(ts.data) 
## Warning in adf.test(ts.data): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts.data
## Dickey-Fuller = -6.0915, Lag order = 4, p-value = 0.01
## alternative hypothesis: stationary

KPSS (Kwiatkowski-Phillips-Schmidt-Shin) Test

The KPSS test checks stationarity around the mean or a trend.

\(H_0\) = the data are stationary

\(H_1\) = the data are not stationary

Reject \(H_0\) when the p-value < 0.05

# KPSS test 
kpss.test(ts.data)
## Warning in kpss.test(ts.data): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  ts.data
## KPSS Level = 0.21043, Truncation lag parameter = 4, p-value = 0.1

Modeling

The modeling uses data.sales with 100 observations whose stationarity has been checked above.

Data Exploration

plot.ts(ts.data, lty = 1, xlab = "waktu", ylab = "Total Sales", main = "Plot Total Sales")

Based on the time series plot of data.sales, the observations appear to already be stationary.

The data will be split into training and test sets with an 80:20 proportion.

# Training data 
sales.train = data.sales$Total_Sales[1:80]
train.ts = ts(sales.train)
plot.ts(sales.train, 
        lty = 1, 
        xlab = "Waktu", 
        ylab = "Total Sales", 
        main = "Plot Sales Train")

# Test data  
sales.test = data.sales$Total_Sales[81:100]
test.ts = ts(sales.test)
plot.ts(sales.test, 
        lty = 1, 
        xlab = "Waktu",
        ylab = "Total Sales",
        main = "Plot Sales Test")

Test Data Plot

Stationarity Test of the Training Data

# Training data 
acf(sales.train)

Based on the ACF plot, the autocorrelations decay slowly (tail off), which suggests the data may not be stationary in the mean.

ADF Test

\(H_0\) = the data are not stationary in the mean

\(H_1\) = the data are stationary in the mean

Reject \(H_0\) when the \(p-value\) < 0.05

tseries::adf.test(sales.train)
## Warning in tseries::adf.test(sales.train): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  sales.train
## Dickey-Fuller = -5.2203, Lag order = 4, p-value = 0.01
## alternative hypothesis: stationary

The ADF test gives a \(p-value\) of 0.01, which is smaller than the 5% significance level (\(p-value\) < 0.05). Hence \(H_0\) is rejected, and the data can be concluded to be stationary in the mean.

Therefore a Box-Cox transformation is not required.
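As a quick cross-check (not part of the original workflow), the Box-Cox parameter can be estimated with forecast::BoxCox.lambda; a value close to 1 supports skipping the transformation. A minimal sketch:

# Estimate the Box-Cox lambda for the training series; lambda near 1 suggests no transformation is needed
forecast::BoxCox.lambda(train.ts)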

Plot ACF

# Plot ACF 
acf(sales.train, lag.max = 10)

Based on the ACF plot, the autocorrelations tend to cut off around lags 3 and 4, so the tentative ARIMA model is ARIMA(0,0,2).

Plot PACF

pacf(sales.train, lag.max = 15)

Based on the PACF plot, the partial autocorrelations tend to cut off around lags 3 and 6, so the tentative ARIMA model is ARIMA(2,0,0).

If the ACF and PACF plots are both regarded as tailing off, the resulting model is ARIMA(2,0,2).
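ARIMA(2,0,2) is named as a candidate here but is not estimated in the model-fitting step below; a minimal sketch of how it could be fitted for comparison (the object name model.202 is hypothetical):

# Hypothetical fit of the ARIMA(2,0,2) candidate
model.202 = Arima(sales.train, order = c(2, 0, 2), method = "ML")
summary(model.202)
lmtest::coeftest(model.202)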

Plot EACF

eacf(sales.train)
## AR/MA
##   0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 0 o o x x o x x x o x x  o  o  x 
## 1 x o x o o x o x o o o  o  o  o 
## 2 o x x o o x o x o o o  o  o  o 
## 3 x x x o o o o o o o o  o  o  o 
## 4 x x x o o o o o o o o  o  o  o 
## 5 x x o o x o o o o o o  o  o  o 
## 6 x x o x o o o o o o o  o  o  o 
## 7 x x o o o o o o o o o  o  o  o

Model identification with the EACF plot: the vertex of the triangle of zeros gives a tentative model of ARIMA(3,0,3).

(A question for the teaching assistant: I am confused because the ACF, PACF, and EACF plots each suggest a different model. Could I get some feedback on this?)
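Getting different tentative orders from the ACF, PACF, and EACF is common; they only generate candidate models, and the choice is settled by comparing the fitted models (AIC and parameter significance) in the next step. As an additional automated cross-check (not part of the assignment workflow), forecast::auto.arima could be run on the training series; a minimal sketch:

# Automated order selection as a sanity check on the manually identified candidates
forecast::auto.arima(sales.train, d = 0, seasonal = FALSE,
                     stepwise = FALSE, approximation = FALSE)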

Parameter Estimation for the Tentative Models

# ARIMA (0,0,2)
model1.da = Arima(sales.train, order = c(0,0,2), method = "ML")
summary(model1.da)
## Series: sales.train 
## ARIMA(0,0,2) with non-zero mean 
## 
## Coefficients:
##         ma1     ma2      mean
##       0.298  0.4893  230.3692
## s.e.  0.093  0.1109    3.9161
## 
## sigma^2 = 406.1:  log likelihood = -352.54
## AIC=713.09   AICc=713.62   BIC=722.61
## 
## Training set error measures:
##                       ME     RMSE      MAE        MPE     MAPE      MASE
## Training set -0.06034901 19.77007 16.52346 -0.6855156 7.121838 0.7076046
##                     ACF1
## Training set -0.04155337
lmtest::coeftest(model1.da)
## 
## z test of coefficients:
## 
##             Estimate Std. Error z value  Pr(>|z|)    
## ma1         0.298030   0.093049  3.2029   0.00136 ** 
## ma2         0.489251   0.110856  4.4134 1.018e-05 ***
## intercept 230.369166   3.916060 58.8268 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ARIMA (2,0,0)
model2.da = Arima(sales.train, order = c(2,0,0), method = "ML")
summary(model2.da)
## Series: sales.train 
## ARIMA(2,0,0) with non-zero mean 
## 
## Coefficients:
##          ar1     ar2      mean
##       0.1470  0.0411  230.5048
## s.e.  0.1119  0.1140    2.8822
## 
## sigma^2 = 457.2:  log likelihood = -357.01
## AIC=722.01   AICc=722.54   BIC=731.54
## 
## Training set error measures:
##                       ME     RMSE      MAE        MPE     MAPE      MASE
## Training set -0.00670896 20.97758 17.04422 -0.8094011 7.387632 0.7299056
##                   ACF1
## Training set 0.0213556
lmtest::coeftest(model2.da)
## 
## z test of coefficients:
## 
##             Estimate Std. Error z value Pr(>|z|)    
## ar1         0.146983   0.111909  1.3134   0.1890    
## ar2         0.041114   0.113956  0.3608   0.7183    
## intercept 230.504791   2.882153 79.9766   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ARIMA (3,0,3)
model3.da = Arima(sales.train, order = c(3,0,3), method = "ML")
summary(model3.da)
## Series: sales.train 
## ARIMA(3,0,3) with non-zero mean 
## 
## Coefficients:
##          ar1     ar2      ar3      ma1     ma2     ma3      mean
##       0.4000  0.0549  -0.8459  -0.5712  0.1190  0.6930  231.1197
## s.e.  0.1488  0.1853   0.1483   0.2618  0.3062  0.1917    1.5470
## 
## sigma^2 = 262.7:  log likelihood = -336.33
## AIC=688.67   AICc=690.69   BIC=707.72
## 
## Training set error measures:
##                      ME     RMSE      MAE        MPE     MAPE     MASE
## Training set -0.2880938 15.48224 12.62435 -0.5462405 5.376372 0.540628
##                    ACF1
## Training set 0.04875799
lmtest::coeftest(model3.da)
## 
## z test of coefficients:
## 
##             Estimate Std. Error  z value  Pr(>|z|)    
## ar1         0.400004   0.148793   2.6883 0.0071812 ** 
## ar2         0.054914   0.185337   0.2963 0.7670074    
## ar3        -0.845888   0.148286  -5.7045 1.167e-08 ***
## ma1        -0.571155   0.261801  -2.1816 0.0291363 *  
## ma2         0.119048   0.306208   0.3888 0.6974369    
## ma3         0.693047   0.191707   3.6151 0.0003002 ***
## intercept 231.119720   1.547021 149.3966 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Check the significance of each model's parameters
models <- list(
  "ARIMA(0,0,2)" = model1.da,
  "ARIMA(2,0,0)" = model2.da,
  "ARIMA(3,0,3)" = model3.da
)

lapply(models, coeftest)
## $`ARIMA(0,0,2)`
## 
## z test of coefficients:
## 
##             Estimate Std. Error z value  Pr(>|z|)    
## ma1         0.298030   0.093049  3.2029   0.00136 ** 
## ma2         0.489251   0.110856  4.4134 1.018e-05 ***
## intercept 230.369166   3.916060 58.8268 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## $`ARIMA(2,0,0)`
## 
## z test of coefficients:
## 
##             Estimate Std. Error z value Pr(>|z|)    
## ar1         0.146983   0.111909  1.3134   0.1890    
## ar2         0.041114   0.113956  0.3608   0.7183    
## intercept 230.504791   2.882153 79.9766   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## $`ARIMA(3,0,3)`
## 
## z test of coefficients:
## 
##             Estimate Std. Error  z value  Pr(>|z|)    
## ar1         0.400004   0.148793   2.6883 0.0071812 ** 
## ar2         0.054914   0.185337   0.2963 0.7670074    
## ar3        -0.845888   0.148286  -5.7045 1.167e-08 ***
## ma1        -0.571155   0.261801  -2.1816 0.0291363 *  
## ma2         0.119048   0.306208   0.3888 0.6974369    
## ma3         0.693047   0.191707   3.6151 0.0003002 ***
## intercept 231.119720   1.547021 149.3966 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Before moving on to the overfitting check, identify the candidate with the smallest AIC whose parameters are all significant.

From the results above, the smallest AIC values belong to ARIMA(0,0,2) and ARIMA(3,0,3). However, the chosen model should combine a small AIC, parameters that are all significant, and simplicity, so ARIMA(0,0,2) is selected.
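A compact way to place the candidates side by side, using the models list defined in the previous chunk (a sketch; the fitted Arima objects store aic, aicc, and bic directly):

# Information criteria for the three tentative models, side by side
data.frame(
  AIC  = sapply(models, function(m) m$aic),
  AICc = sapply(models, function(m) m$aicc),
  BIC  = sapply(models, function(m) m$bic)
)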

Residual Analysis

The residuals of the selected model are then checked against the model assumptions. ARIMA residuals must satisfy normality, independence, and homogeneity of variance. The diagnostics are carried out both graphically and with formal tests.
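forecast::checkresiduals bundles most of these diagnostics (residual time plot, ACF, histogram, and a Ljung-Box test) into a single call; a minimal sketch using the same model object as below:

# One-call residual diagnostics: time plot, ACF, histogram, and a Ljung-Box test
forecast::checkresiduals(model2.da)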

Exploration

# Exploration 
sisaan.da = model2.da$residuals
par(mfrow = c(2, 2)) # arrange the four diagnostic plots in a 2x2 grid
qqnorm(sisaan.da)
qqline(sisaan.da,
       col = "blue",
       lwd = 1)

plot(c(1:length(sisaan.da)),
     sisaan.da)

acf(sisaan.da, lag.max = 20)

pacf(sisaan.da, lag.max = 20)

Residual Exploration

par(mfrow = c(1,1))

Formal Tests

# Residuals are normally distributed 
ks.test(sisaan.da, "pnorm") # fail to reject H0 -> residuals are normally distributed
## 
##  Exact one-sample Kolmogorov-Smirnov test
## 
## data:  sisaan.da
## D = 0.46133, p-value < 2.2e-16
## alternative hypothesis: two-sided

\(H_0\) = the residuals are normally distributed

\(H_1\) = the residuals are not normally distributed

The KS test gives a \(p-value\) < 2.2e-16, which is below the 5% significance level (\(p-value\) < 0.05). Hence \(H_0\) is rejected: the residuals are not normally distributed.
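Note that ks.test(sisaan.da, "pnorm") compares the residuals against a standard normal N(0, 1); with a residual standard deviation of roughly 21 (sigma^2 = 457.2 above), this alone produces an extreme test statistic. A sketch of two alternative checks (approximate, since the distribution parameters are estimated from the same data):

# KS test against a normal with the residuals' own mean and standard deviation (approximate)
ks.test(sisaan.da, "pnorm", mean = mean(sisaan.da), sd = sd(sisaan.da))

# Shapiro-Wilk test as another common normality check
shapiro.test(as.numeric(sisaan.da))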

# Residuals are mutually independent (no autocorrelation)
Box.test(sisaan.da, type = "Ljung") # fail to reject H0 -> residuals are independent
## 
##  Box-Ljung test
## 
## data:  sisaan.da
## X-squared = 0.03787, df = 1, p-value = 0.8457

\(H_0\) = the residuals are mutually independent

\(H_1\) = the residuals are not mutually independent

The Ljung-Box test gives a \(p-value\) of 0.8457, which is greater than the 5% significance level (\(p-value\) > 0.05). Hence \(H_0\) is not rejected: the residuals are mutually independent.
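Box.test uses lag = 1 by default, so only the first residual autocorrelation is tested. A sketch testing the first 10 lags jointly, with fitdf set to the two estimated AR coefficients of ARIMA(2,0,0) (the lag choice is an assumption):

# Joint Ljung-Box test over 10 lags, adjusting the degrees of freedom for the 2 AR parameters
Box.test(sisaan.da, lag = 10, type = "Ljung", fitdf = 2)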

# Homogeneous residual variance
Box.test((sisaan.da)^2, type = "Ljung") # fail to reject H0 -> residual variance is homogeneous
## 
##  Box-Ljung test
## 
## data:  (sisaan.da)^2
## X-squared = 2.9936, df = 1, p-value = 0.0836

\(H_0\) = the residual variance is homogeneous

\(H_1\) = the residual variance is not homogeneous

The Ljung-Box test on the squared residuals gives a \(p-value\) of 0.0836, which is greater than the 5% significance level (\(p-value\) > 0.05). Hence \(H_0\) is not rejected: the residual variance is homogeneous.

# Mean of the residuals equals 0
t.test(sisaan.da,
       mu = 0,
       conf.level = 0.95) # fail to reject H0 -> the mean of the residuals is 0
## 
##  One Sample t-test
## 
## data:  sisaan.da
## t = -0.0028426, df = 79, p-value = 0.9977
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -4.704494  4.691077
## sample estimates:
##   mean of x 
## -0.00670896

\(H_0\) = the mean of the residuals equals 0

\(H_1\) = the mean of the residuals does not equal 0

The one-sample t-test gives a \(p-value\) of 0.9977, which is greater than the 5% significance level (\(p-value\) > 0.05). Hence \(H_0\) is not rejected: the mean of the residuals is equal to 0.

Overfitting
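No overfitting code appears in this section; a minimal sketch of the usual check, assuming the selected model is ARIMA(0,0,2) as stated in the model-selection step: fit the neighbouring orders ARIMA(1,0,2) and ARIMA(0,0,3), then verify that the added parameter is not significant and the AIC does not improve (the object names model.102 and model.003 are hypothetical):

# Hypothetical overfitting check around ARIMA(0,0,2)
model.102 = Arima(sales.train, order = c(1, 0, 2), method = "ML")
model.003 = Arima(sales.train, order = c(0, 0, 3), method = "ML")

lmtest::coeftest(model.102)  # significance of the added AR(1) term
lmtest::coeftest(model.003)  # significance of the added MA(3) term
c(ARIMA002 = model1.da$aic, ARIMA102 = model.102$aic, ARIMA003 = model.003$aic)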

Forecasting

# Forecast 
ramalan = forecast::forecast(model2.da, h = 20)

# Extract the point forecasts
hasil = as.numeric(ramalan$mean)

# Build a comparison with the actual values
perbandingan <- data.frame(
  Aktual = as.numeric(head(test.ts, n = 20)),
  Forecast = hasil
)

perbandingan
##    Aktual Forecast
## 1  220.97 225.3802
## 2  229.03 228.6651
## 3  270.94 230.0237
## 4  269.00 230.3584
## 5  246.24 230.4635
## 6  206.43 230.4927
## 7  218.01 230.5013
## 8  229.07 230.5038
## 9  237.12 230.5045
## 10 239.42 230.5047
## 11 235.60 230.5048
## 12 209.24 230.5048
## 13 214.23 230.5048
## 14 244.88 230.5048
## 15 252.32 230.5048
## 16 231.79 230.5048
## 17 251.04 230.5048
## 18 232.92 230.5048
## 19 222.38 230.5048
## 20 197.45 230.5048
# Forecast over the length of the test set
horizon <- length(test.ts)
ramalan <- forecast::forecast(model2.da, h = horizon)

# Plot train + test + forecast
ts.plot(train.ts, xlim = c(1, length(train.ts) + horizon), col = "black", lwd = 2)
lines((length(train.ts)+1):(length(train.ts)+horizon), test.ts, col = "blue", lwd = 2)
lines((length(train.ts)+1):(length(train.ts)+horizon), ramalan$mean, col = "red", lwd = 2, lty = 2)

legend("topleft", legend = c("Train", "Test", "Forecast"),
       col = c("black", "blue", "red"), lty = c(1,1,2), lwd = 2, bty = "n")

(This looks odd: the forecasts flatten out at about 230.5 after a few steps because the fitted ARIMA(2,0,0) has small, non-significant AR coefficients, so the forecasts quickly revert to the series mean.)

# Alternative presentation of the same comparison as a matrix
perbandingan = matrix(data = c(head(test.ts, n = 20), hasil),
                      nrow = 20,
                      ncol = 2)
colnames(perbandingan) = c("Aktual", "Hasil Forecast")
perbandingan
##         Aktual Hasil Forecast
##  [1,] 220.9700       225.3802
##  [2,] 229.0300       228.6651
##  [3,] 270.9400       230.0237
##  [4,] 269.0000       230.3584
##  [5,] 246.2400       230.4635
##  [6,] 206.4300       230.4927
##  [7,] 218.0100       230.5013
##  [8,] 229.0700       230.5038
##  [9,] 237.1200       230.5045
## [10,] 239.4200       230.5047
## [11,] 235.6000       230.5048
## [12,] 209.2400       230.5048
## [13,] 214.2300       230.5048
## [14,] 244.8800       230.5048
## [15,] 252.3200       230.5048
## [16,] 231.7900       230.5048
## [17,] 251.0400       230.5048
## [18,] 232.9200       230.5048
## [19,] 222.3800       230.5048
## [20,] 197.4500       230.5048
horizon <- length(test.ts)
ramalan <- forecast(model2.da, h = horizon)
accuracy(ramalan, sales.test)
##                       ME     RMSE      MAE        MPE     MAPE      MASE
## Training set -0.00670896 20.97758 17.04422 -0.8094011 7.387632 0.7299056
## Test set      2.78170478 19.11743 14.89342  0.5417239 6.357944 0.6377993
##                   ACF1
## Training set 0.0213556
## Test set            NA
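For reference, the test-set MAPE reported above can be reproduced directly from its definition; a minimal sketch using objects already in the workspace:

# Manual test-set MAPE: mean absolute percentage error of the point forecasts
mean(abs((sales.test - as.numeric(ramalan$mean)) / sales.test)) * 100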