library(readxl)
library(tseries)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(urca)
library(tseries)
library(tinytex)
library(TSA)
##
## Attaching package: 'TSA'
## The following objects are masked from 'package:stats':
##
## acf, arima
## The following object is masked from 'package:utils':
##
## tar
library(AICcmodavg)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(forecast)
## Registered S3 methods overwritten by 'forecast':
## method from
## fitted.Arima TSA
## plot.Arima TSA
data.sales = read_excel("C:\\Users\\MUTHI'AH IFFA\\Downloads\\Semester 5\\MPDW\\Praktikum\\Tugas Mandiri Iffa_MPDW_(store sales) 301_400.xlsx")
data.sales = data.sales[1:100,]
data.sales.ts = ts(data.sales)
data.sales
## # A tibble: 100 Ć 2
## Date Total_Sales
## <dttm> <dbl>
## 1 2022-10-28 00:00:00 238.
## 2 2022-10-29 00:00:00 204.
## 3 2022-10-30 00:00:00 207.
## 4 2022-10-31 00:00:00 250.
## 5 2022-11-01 00:00:00 236.
## 6 2022-11-02 00:00:00 286.
## 7 2022-11-03 00:00:00 226.
## 8 2022-11-04 00:00:00 214.
## 9 2022-11-05 00:00:00 230.
## 10 2022-11-06 00:00:00 204.
## # ā¹ 90 more rows
str(data.sales)
## tibble [100 Ć 2] (S3: tbl_df/tbl/data.frame)
## $ Date : POSIXct[1:100], format: "2022-10-28" "2022-10-29" ...
## $ Total_Sales: num [1:100] 238 204 207 250 236 ...
head(data.sales)
## # A tibble: 6 Ć 2
## Date Total_Sales
## <dttm> <dbl>
## 1 2022-10-28 00:00:00 238.
## 2 2022-10-29 00:00:00 204.
## 3 2022-10-30 00:00:00 207.
## 4 2022-10-31 00:00:00 250.
## 5 2022-11-01 00:00:00 236.
## 6 2022-11-02 00:00:00 286.
# Ambil kolom nilai penjualan
y = data.sales$Total_Sales
head(y)
## [1] 238.49 204.26 207.48 250.01 236.39 286.10
# Buat Objek Time series
ts.data = ts(y, frequency = 1, start = 1) # frequency 1 karena data berjumlah 100
head(ts.data)
## Time Series:
## Start = 1
## End = 6
## Frequency = 1
## [1] 238.49 204.26 207.48 250.01 236.39 286.10
data bulanan ā> frquency = 12 (karena 12 bulan/tahun)
data kuartal ā> frequency = 4
data harian satu tahun penuh ā> frequency = 365
tidak tahu ada musiman/data pendek ā> frequency = 1
# Plot deret waktu
plot(ts.data,
main = "Total Sales (Time Series)",
ylab = "Total Sales",
xlab = "Observation (1-100)")
# Plot ACF dan PACF
acf(ts.data, main = "ACF - Total Sales") # plot ACF
pacf(ts.data, main = "PACF - Total Sales") # plot PACF
ADF test mendeteksi unit root
\(H_0\) = data tidak stationer
\(H_1\) = data stationer
tolak \(H_0\) ketika < 0.05
# Uji ADF
adf.test(ts.data)
## Warning in adf.test(ts.data): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: ts.data
## Dickey-Fuller = -6.0915, Lag order = 4, p-value = 0.01
## alternative hypothesis: stationary
KPSS test mendeteksi kestationeran di sekitar mean atau tren
\(H_0\) = data stationer
\(H_1\) = data tidak stationer
tolak \(H_0\) ketika p-value < 0.05
# Uji KPSS
kpss.test(ts.data)
## Warning in kpss.test(ts.data): p-value greater than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: ts.data
## KPSS Level = 0.21043, Truncation lag parameter = 4, p-value = 0.1
Menggunakan data.sales dengan 100 amatan yang sudah dicek stationeritasnya.
plot.ts(ts.data, lty = 1, xlab = "waktu", ylab = "Total Sales", main = "Plot Total Sales")
berdasarkan plot deret waktu dari data.sales terlihat bahwa data amatan cenderung sudah staitioner.
pembagian data akan dilakukan dengan proporsi 80:20
# Data latih
sales.train = data.sales$Total_Sales[1:80]
train.ts = ts(sales.train)
plot.ts(sales.train,
lty = 1,
xlab = "Waktu",
ylab = "Total Sales",
main = "Plot Sales Train")
# Data test
sales.test = data.sales$Total_Sales[81:100]
test.ts = ts(sales.test)
plot.ts(sales.test,
lty = 1,
xlab = "Waktu",
ylab = "Total Sales",
main = "Plot Sales Train")
# Data latih
acf(sales.test)
Berdasarkan plot ACF dapat terlihat bahwa data menurun secara perlahan (tails off), ini mengindikasikan bahwa data tidak stationer dalam rataan.
\(H_0\) = Data tidak stationer dalam rataan
\(H_1\) = Data stationer dalam rataan
Tolak \(H_0\) ketika \(p-value\) < 0.05
tseries::adf.test(sales.train)
## Warning in tseries::adf.test(sales.train): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: sales.train
## Dickey-Fuller = -5.2203, Lag order = 4, p-value = 0.01
## alternative hypothesis: stationary
Hasil ADF menunjukkan bahwa didapat \(p-value\) = 0.01 yang lebih kecil dari taraf nyata 5% (\(p-value\) < 0.05). Artinya tolak \(H_0\) dapat disimpulkan bahwa data stationer dalam rataan.
Maka tidak diperlukan transformasi box-cox
# Plot ACF
acf(sales.train, lag.max = 10)
Berdasarkan plot PACF data cenderung cut off pada lag ke-3 dan ke-4, maka model ARIMA nya adalah (0, 0, 2)
pacf(sales.train, lag.max = 15)
Berdasarkan plot PACF data cenderung cut off pada lag ke-3 dan ke-6, maka model ARIMA nya adalah (2, 0, 0)
Jika plot ACF dan PACF dianggap tails off, maka model yang terbentuk adalah ARIMA (2, 0, 2)
eacf(sales.train)
## AR/MA
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 0 o o x x o x x x o x x o o x
## 1 x o x o o x o x o o o o o o
## 2 o x x o o x o x o o o o o o
## 3 x x x o o o o o o o o o o o
## 4 x x x o o o o o o o o o o o
## 5 x x o o x o o o o o o o o o
## 6 x x o x o o o o o o o o o o
## 7 x x o o o o o o o o o o o o
Identifikasi model dengan plot EACF, terbentuk pola ujung segitiga nol. Model tentatif yang terbentuk adalah ARIMA (3, 0, 3)
(kak ini kyk gimana yah aku bingung di plot ACF, PACF, sama EACF kok modelnya beda beda yah⦠minta feedbacknya kak boleh tak:)?)
# ARIMA (0,0,2)
model1.da = Arima(sales.train, order = c(0,0,2), method = "ML")
summary(model1.da)
## Series: sales.train
## ARIMA(0,0,2) with non-zero mean
##
## Coefficients:
## ma1 ma2 mean
## 0.298 0.4893 230.3692
## s.e. 0.093 0.1109 3.9161
##
## sigma^2 = 406.1: log likelihood = -352.54
## AIC=713.09 AICc=713.62 BIC=722.61
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.06034901 19.77007 16.52346 -0.6855156 7.121838 0.7076046
## ACF1
## Training set -0.04155337
lmtest::coeftest(model1.da)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ma1 0.298030 0.093049 3.2029 0.00136 **
## ma2 0.489251 0.110856 4.4134 1.018e-05 ***
## intercept 230.369166 3.916060 58.8268 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ARIMA (2,0,0)
model2.da = Arima(sales.train, order = c(2,0,0), method = "ML")
summary(model2.da)
## Series: sales.train
## ARIMA(2,0,0) with non-zero mean
##
## Coefficients:
## ar1 ar2 mean
## 0.1470 0.0411 230.5048
## s.e. 0.1119 0.1140 2.8822
##
## sigma^2 = 457.2: log likelihood = -357.01
## AIC=722.01 AICc=722.54 BIC=731.54
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.00670896 20.97758 17.04422 -0.8094011 7.387632 0.7299056
## ACF1
## Training set 0.0213556
lmtest::coeftest(model2.da)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 0.146983 0.111909 1.3134 0.1890
## ar2 0.041114 0.113956 0.3608 0.7183
## intercept 230.504791 2.882153 79.9766 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ARIMA (3,0,3)
model3.da = Arima(sales.train, order = c(3,0,3), method = "ML")
summary(model3.da)
## Series: sales.train
## ARIMA(3,0,3) with non-zero mean
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 ma3 mean
## 0.4000 0.0549 -0.8459 -0.5712 0.1190 0.6930 231.1197
## s.e. 0.1488 0.1853 0.1483 0.2618 0.3062 0.1917 1.5470
##
## sigma^2 = 262.7: log likelihood = -336.33
## AIC=688.67 AICc=690.69 BIC=707.72
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.2880938 15.48224 12.62435 -0.5462405 5.376372 0.540628
## ACF1
## Training set 0.04875799
lmtest::coeftest(model3.da)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 0.400004 0.148793 2.6883 0.0071812 **
## ar2 0.054914 0.185337 0.2963 0.7670074
## ar3 -0.845888 0.148286 -5.7045 1.167e-08 ***
## ma1 -0.571155 0.261801 -2.1816 0.0291363 *
## ma2 0.119048 0.306208 0.3888 0.6974369
## ma3 0.693047 0.191707 3.6151 0.0003002 ***
## intercept 231.119720 1.547021 149.3966 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Cek signifikansi parameter tiap model
models <- list(
"ARIMA(0,0,2)" = model1.da,
"ARIMA(2,0,0)" = model2.da,
"ARIMA(3,0,3)" = model3.da
)
lapply(models, coeftest)
## $`ARIMA(0,0,2)`
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ma1 0.298030 0.093049 3.2029 0.00136 **
## ma2 0.489251 0.110856 4.4134 1.018e-05 ***
## intercept 230.369166 3.916060 58.8268 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##
## $`ARIMA(2,0,0)`
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 0.146983 0.111909 1.3134 0.1890
## ar2 0.041114 0.113956 0.3608 0.7183
## intercept 230.504791 2.882153 79.9766 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##
## $`ARIMA(3,0,3)`
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 0.400004 0.148793 2.6883 0.0071812 **
## ar2 0.054914 0.185337 0.2963 0.7670074
## ar3 -0.845888 0.148286 -5.7045 1.167e-08 ***
## ma1 -0.571155 0.261801 -2.1816 0.0291363 *
## ma2 0.119048 0.306208 0.3888 0.6974369
## ma3 0.693047 0.191707 3.6151 0.0003002 ***
## intercept 231.119720 1.547021 149.3966 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Untuk cari model overfitting, identifikasi AIC terkecil dan seluruh parameternya signifikan.
Pada hasil analisis tersebut didapatkan nilai AIC terkecil yaitu model ARIMA(0,0,2) dan ARIMA(3,0,3). Namun, pada kasus ini dipilih model dengan AIC terkecil, semua parameternya signifikan, dan model yang paling sederhana. Artinya model ARIMA(0,0,2) yang akan dipilih.
Model terbaik hasil identifikasi kemudian dicek asumsi sisaannya. Sisaan model ARIMA harus memenuhi asumsi normalitas, kebebasan sisaan, dan kehomogenan ragam. Diagnostik model dilakukan secara eksplorasi dan uji formal.
# Eksplorasi
sisaan.da = model2.da$residuals
par(mforw = c(2,2))
## Warning in par(mforw = c(2, 2)): "mforw" is not a graphical parameter
qqnorm(sisaan.da)
qqline(sisaan.da,
col = "blue",
lwd = 1)
plot(c(1:length(sisaan.da)),
sisaan.da)
acf(sisaan.da, lag.max = 20)
pacf(sisaan.da, lag.max = 20)
par(mfrow = c(1,1))
# Sisaan menyebar normal
ks.test(sisaan.da, "pnorm") # tak tolak H0 > sisaan menyebar normal
##
## Exact one-sample Kolmogorov-Smirnov test
##
## data: sisaan.da
## D = 0.46133, p-value < 2.2e-16
## alternative hypothesis: two-sided
\(H_0\) = Sisaan menyebar normal
\(H_1\) = Sisaan tidak menyebar normal
Berdasarkan hasil uji KS didapatkan nilai \(p-value\) < 2.2e-16 yang kurang dari taraf nyata 5% ( \(p-value\) < 0.05). Sehingga dapat disimpulkan tolak \(H_0\), artinya sisaan tidak menyebar normal.
# Sisaan saling bebas (tidak ada autokorelasi)
Box.test(sisaan.da, type = "Ljung") # tak tollak H0 > sisaan saling bebas
##
## Box-Ljung test
##
## data: sisaan.da
## X-squared = 0.03787, df = 1, p-value = 0.8457
\(H _0\) = Sisaan saling bebas
\(H_1\) = Sisaan tidak saling bebas
Berdasarkan hasil Ljung-Box didapatkan hasil \(p-value\) = 0.8457 yang lebih besar dari taraf nyata 5% (\(p-value\) > 0.05). Sehingga tak tolak \(H_0\), artinya sisaan saling bebas.
# Sisaan Homogen
Box.test((sisaan.da)^2, type = "Ljung") # tak tolak H0 > sisaan homogen
##
## Box-Ljung test
##
## data: (sisaan.da)^2
## X-squared = 2.9936, df = 1, p-value = 0.0836
\(H_0\) = Ragam sisaan homogen
\(H_1\) = Ragam sisaan tidak homogen
Berdasarkan hasil Ljung-Box terhadap sisaan kuadrat tersebut didapatkan hasil \(p-value\) = 0.0836 yang lebih besar dari taraf nyata 5% (\(p-value\) > 0.05). Sehingga tak tolak \(H_0\), artinya ragam sisaan homogen.
# Nilai tengah sisaan = 0
t.test(sisaan.da,
mu = 0,
conf.level = 0.95) # tak tolak Ho > nilai tengah sisaan = 0
##
## One Sample t-test
##
## data: sisaan.da
## t = -0.0028426, df = 79, p-value = 0.9977
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -4.704494 4.691077
## sample estimates:
## mean of x
## -0.00670896
\(H_0\) = Nilai tengah sisaan sama dengan 0
\(H_1\) = Nilai tengah sisaan tidak sama dengan 0
Berdasarkan hasil Ljung-Box terhadap sisaan kuadrat tersebut didapatkan hasil \(p-value\) = 0.9977 yang lebih besar dari taraf nyata 5% (\(p-value\) > 0.05). Sehingga tak tolak \(H_0\), artinya nilai tengah sisaan sama dengan 0.
# Forecast
ramalan = forecast::forecast(model2.da, h = 20)
# Ambil hasil prediksi
hasil = as.numeric(ramalan$mean)
# Buat perbandingan dengan data aktual
perbandingan <- data.frame(
Aktual = as.numeric(head(test.ts, n = 20)),
Forecast = hasil
)
perbandingan
## Aktual Forecast
## 1 220.97 225.3802
## 2 229.03 228.6651
## 3 270.94 230.0237
## 4 269.00 230.3584
## 5 246.24 230.4635
## 6 206.43 230.4927
## 7 218.01 230.5013
## 8 229.07 230.5038
## 9 237.12 230.5045
## 10 239.42 230.5047
## 11 235.60 230.5048
## 12 209.24 230.5048
## 13 214.23 230.5048
## 14 244.88 230.5048
## 15 252.32 230.5048
## 16 231.79 230.5048
## 17 251.04 230.5048
## 18 232.92 230.5048
## 19 222.38 230.5048
## 20 197.45 230.5048
# Forecast sesuai panjang test
horizon <- length(test.ts)
ramalan <- forecast::forecast(model2.da, h = horizon)
# Plot train + test + forecast
ts.plot(train.ts, xlim = c(1, length(train.ts) + horizon), col = "black", lwd = 2)
lines((length(train.ts)+1):(length(train.ts)+horizon), test.ts, col = "blue", lwd = 2)
lines((length(train.ts)+1):(length(train.ts)+horizon), ramalan$mean, col = "red", lwd = 2, lty = 2)
legend("topleft", legend = c("Train", "Test", "Forecast"),
col = c("black", "blue", "red"), lty = c(1,1,2), lwd = 2, bty = "n")
(aneh banget)
perbandingan = matrix(data = c(head(test.ts,
n = 30),
hasil[-1]),
nrow = 30,
ncol = 2)
## Warning in matrix(data = c(head(test.ts, n = 30), hasil[-1]), nrow = 30, : data
## length [39] is not a sub-multiple or multiple of the number of rows [30]
colnames(perbandingan) = c("Aktual", "Hasil Forecast")
perbandingan
## Aktual Hasil Forecast
## [1,] 220.9700 230.5048
## [2,] 229.0300 230.5048
## [3,] 270.9400 230.5048
## [4,] 269.0000 230.5048
## [5,] 246.2400 230.5048
## [6,] 206.4300 230.5048
## [7,] 218.0100 230.5048
## [8,] 229.0700 230.5048
## [9,] 237.1200 230.5048
## [10,] 239.4200 220.9700
## [11,] 235.6000 229.0300
## [12,] 209.2400 270.9400
## [13,] 214.2300 269.0000
## [14,] 244.8800 246.2400
## [15,] 252.3200 206.4300
## [16,] 231.7900 218.0100
## [17,] 251.0400 229.0700
## [18,] 232.9200 237.1200
## [19,] 222.3800 239.4200
## [20,] 197.4500 235.6000
## [21,] 228.6651 209.2400
## [22,] 230.0237 214.2300
## [23,] 230.3584 244.8800
## [24,] 230.4635 252.3200
## [25,] 230.4927 231.7900
## [26,] 230.5013 251.0400
## [27,] 230.5038 232.9200
## [28,] 230.5045 222.3800
## [29,] 230.5047 197.4500
## [30,] 230.5048 228.6651
horizon <- length(test.ts)
ramalan <- forecast(model2.da, h = horizon)
accuracy(ramalan, sales.test)
## ME RMSE MAE MPE MAPE MASE
## Training set -0.00670896 20.97758 17.04422 -0.8094011 7.387632 0.7299056
## Test set 2.78170478 19.11743 14.89342 0.5417239 6.357944 0.6377993
## ACF1
## Training set 0.0213556
## Test set NA