library(googlesheets4)
library(tseries)
library(forecast)
library(TTR)
library(TSA)
library(imputeTS)
library(ggplot2)
library(lmtest)
library(ggpubr)

Data untuk Analisis

Data yang digunakan adalah data Indeks Harga Saham Gabungan 2 tahun terakhir yang bersumber dari yahoo finance. Data tersebut telah diunduh dan dirangkum pada google sheets ini..

Import Data

Data yang digunakan hanyalah data Close, didapatkan dengan syntax berikut:

library(googlesheets4)
gs4_deauth()
daily <- read_sheet("https://docs.google.com/spreadsheets/d/1qLvTOVejNRXFKpu57O_vxJ55cc3qFc_aKDAS_xzU2Go/edit?usp=sharing")
daily <- daily[c("Date","Close")]
daily
## # A tibble: 365 × 2
##    Date                Close
##    <dttm>              <dbl>
##  1 2021-12-01 00:00:00 6508.
##  2 2021-12-02 00:00:00 6584.
##  3 2021-12-03 00:00:00 6539.
##  4 2021-12-04 00:00:00   NA 
##  5 2021-12-05 00:00:00   NA 
##  6 2021-12-06 00:00:00 6547.
##  7 2021-12-07 00:00:00 6603.
##  8 2021-12-08 00:00:00 6604.
##  9 2021-12-09 00:00:00 6644.
## 10 2021-12-10 00:00:00 6653.
## # … with 355 more rows
# Cek apakah ada missing value:
daily[which(is.na(daily$Close)),]
## # A tibble: 119 × 2
##    Date                Close
##    <dttm>              <dbl>
##  1 2021-12-04 00:00:00    NA
##  2 2021-12-05 00:00:00    NA
##  3 2021-12-11 00:00:00    NA
##  4 2021-12-12 00:00:00    NA
##  5 2021-12-18 00:00:00    NA
##  6 2021-12-19 00:00:00    NA
##  7 2021-12-25 00:00:00    NA
##  8 2021-12-26 00:00:00    NA
##  9 2021-12-31 00:00:00    NA
## 10 2022-01-01 00:00:00    NA
## # … with 109 more rows

Dari output di atas, dapat terlihat bahwa terdapat missing value pada baris ke-87 atau pada tanggal 2 mei 2022. Hal ini melanggar salah satu syarat data time series, yaitu rentang waktu yang tidak sama. Oleh karena itu, akan dilakukan penanganan missing value.

Menduga Missing Value

library(ggplot2)
ggplot_na_distribution(daily$Close)

Pendugaan dilakukan dengan teknik interpolasi “spline”. Nilai dugaan dari teknik ini didapatkan dengan memperhatikan pola nilai sebelum dan setelah nilai missing value. Hal ini lebih baik daripada metode lain yang hanya mengambil nilai rataan 2 nilai sebelum dan setelah, atau bahkan hanya menduplikasi nilai sebelum/setelahnya. Oleh karena itu, teknik ini dianggap paling akurat untuk menduga nilai missing value. Fungsi yang digunakan untuk mengatasi missing value adalah fungsi na_interpolation yang tersedia pada package imputeTS, yang syntax-nya adalah sebagai berikut:

daily$Date <- as.Date(daily$Date)
daily_spline <- na_interpolation(daily, option = "spline")
daily_linear <- na_interpolation(daily, option = "linear")
daily_stine <- na_interpolation(daily, option = "stine")

Setelah dilakukan penanganan, dapat terlihat bahwa nilai IHSG pada tanggal 2 mei 2022 telah terisi dengan nilai \(6840.217\).

Eksplorasi Data

Eksplorasi data diperlukan untuk melihat pola dari data time series. Selain itu, eksplorasi juga berguna dalam pembagian data training dan data testing. Data testing yang baik adalah data testing yang memiliki pola seperti data trainingnya. Maka dari itu, diperlukan eksplorasi data untuk menentukan proporsi terbaik data training vs data testing.

asl <- ggplot(daily, aes(x=Date, y=Close)) +
  geom_line() +
  scale_x_date(date_breaks = "3 month", date_labels = "%Y %b") +
  labs(title = "Plot Time Series Data IHSG Harian",
       subtitle = "(November 2021 sd November 2022)",
       y="Harga IHSG") +
  theme(plot.title =  element_text(face = "bold", hjust=.5),
        plot.subtitle = element_text(hjust=.5))

#linear
lin <- ggplot(daily_linear, aes(x=Date, y=Close)) +
  geom_line() +
  scale_x_date(date_breaks = "3 month", date_labels = "%Y %b") +
  labs(title = "Plot Time Series Data IHSG Harian Linear",
       subtitle = "(November 2021 sd November 2022)",
       y="Harga IHSG") +
  theme(plot.title =  element_text(face = "bold", hjust=.5),
        plot.subtitle = element_text(hjust=.5))

spl <- ggplot(daily_spline, aes(x=Date, y=Close)) +
  geom_line() +
  scale_x_date(date_breaks = "3 month", date_labels = "%Y %b") +
  labs(title = "Plot Time Series Data IHSG Harian Spline",
       subtitle = "(November 2021 sd November 2022)",
       y="Harga IHSG") +
  theme(plot.title =  element_text(face = "bold", hjust=.5),
        plot.subtitle = element_text(hjust=.5))

sti <- ggplot(daily_stine, aes(x=Date, y=Close)) +
  geom_line() +
  scale_x_date(date_breaks = "3 month", date_labels = "%Y %b") +
  labs(title = "Plot Time Series Data IHSG Harian Stine",
       subtitle = "(November 2021 sd November 2022)",
       y="Harga IHSG") +
  theme(plot.title =  element_text(face = "bold", hjust=.5),
        plot.subtitle = element_text(hjust=.5))

ggarrange(asl, lin, spl, sti,
          ncol = 2, nrow = 2)

Dari plot di atas, terlihat bahwa data cenderung memiliki pola trend yang naik. Terlihat pula bahwa pembagian data training-testing dapat dilakukan antara bulan april 2022 hingga juli 2022. Hal ini karena pola setelah bulan-bulan tersebut cenderung memiliki pola trend naik pula, sama seperti data trainingnya. Untuk lebih jelasnya, perhatikan penjelasan di bawah ini:

p = 0.67 #setelah dilakukan pencarian nilai proporsi data training terbaik
freq_train=as.integer(p*nrow(daily))
ggplot(daily_linear, aes(x=Date, y=Close)) +
  geom_line() + 
  scale_x_date(date_breaks = "3 month", date_labels = "%Y %b") +
  labs(title = "Plot Time Series Data IHSG mingguan",
       subtitle = "(November 2021 sd November 2022)") +
  geom_vline(aes(xintercept = Date[freq_train], 
                 col="Frequency_Train_Data"), lty=2, lwd=.7) +
  theme(plot.title =  element_text(face = "bold", hjust=.5),
        plot.subtitle = element_text(hjust=.5),
        legend.position = "bottom") +
  scale_color_manual(name = "", values = c(Frequency_Train_Data="red"))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.

ggplot(daily_spline, aes(x=Date, y=Close)) +
  geom_line() + 
  scale_x_date(date_breaks = "3 month", date_labels = "%Y %b") +
  labs(title = "Plot Time Series Data IHSG mingguan",
       subtitle = "(November 2021 sd November 2022)") +
  geom_vline(aes(xintercept = Date[freq_train], 
                 col="Frequency_Train_Data"), lty=2, lwd=.7) +
  theme(plot.title =  element_text(face = "bold", hjust=.5),
        plot.subtitle = element_text(hjust=.5),
        legend.position = "bottom") +
  scale_color_manual(name = "", values = c(Frequency_Train_Data="red"))

ggplot(daily_stine, aes(x=Date, y=Close)) +
  geom_line() + 
  scale_x_date(date_breaks = "3 month", date_labels = "%Y %b") +
  labs(title = "Plot Time Series Data IHSG mingguan",
       subtitle = "(November 2021 sd November 2022)") +
  geom_vline(aes(xintercept = Date[freq_train], 
                 col="Frequency_Train_Data"), lty=2, lwd=.7) +
  theme(plot.title =  element_text(face = "bold", hjust=.5),
        plot.subtitle = element_text(hjust=.5),
        legend.position = "bottom") +
  scale_color_manual(name = "", values = c(Frequency_Train_Data="red"))

daily_linear[freq_train,1]
## # A tibble: 1 × 1
##   Date      
##   <date>    
## 1 2022-08-01

Splitting Data

train <- 1:freq_train
y <- daily_linear$Close
y_train <- ts(y[train])
y_test <- ts(y[-train], start = 88)
ggplot(daily_linear[train,], aes(x=Date, y=Close)) +
  geom_line() +
  scale_x_date(date_breaks = "3 month", date_labels = "%Y %b") +
  labs(title = "Plot Time Series Data Training",
       subtitle = "(September 2020 sd Mei 2022)",
       y="Harga IHSG") +
  theme(plot.title =  element_text(face = "bold", hjust=.5),
        plot.subtitle = element_text(hjust=.5))

ggplot(daily_linear[-train,], aes(x=Date, y=Close)) +
  geom_line() +
  scale_x_date(date_breaks = "1 month", date_labels = "%Y %b") +
  labs(title = "Plot Time Series Data Testing",
       subtitle = "(Mei 2022 sd September 2022)",
       y="Harga IHSG") +
  theme(plot.title =  element_text(face = "bold", hjust=.5),
        plot.subtitle = element_text(hjust=.5))

# Cek Kestasioneran Data ## Prosedur Eksploratif

plot.ts(y_train, ylab=expression(Y[t]), main = "Plot Time Series Data IHSG mingguan")

acf(y_train)

## Uji Formal

adf.test(y_train)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  y_train
## Dickey-Fuller = -2.2698, Lag order = 6, p-value = 0.4623
## alternative hypothesis: stationary

Differencing

y_diff <- diff(y_train, differences = 1)
y_diff
## Time Series:
## Start = 2 
## End = 244 
## Frequency = 1 
##   [1]   76.14306600  -45.31396500    2.87011733    2.87011733    2.87011733
##   [6]   55.45263700    1.22900400   40.13427700    8.98974600    3.31640633
##  [11]    3.31640633    3.31640633  -47.23388700   10.61962900  -31.45898400
##  [16]    7.13427700  -18.27343767  -18.27343767  -18.27343767    7.19726600
##  [21]  -24.71630900   25.95800800    7.34912100    4.18131533    4.18131533
##  [26]    4.18131533   22.89892500    2.33398500  -19.19482400   20.95654275
##  [31]   20.95654275   20.95654275   20.95654275   30.06494200  -33.07421900
##  [36]   -8.94775400   47.96484400   -3.39729833   -3.39729833   -3.39729833
##  [41]  -43.15380800   -0.90527400   11.29101600   35.04492200  -16.11767567
##  [46]  -16.11767567  -16.11767567  -30.98877000  -22.07812500   34.89013700
##  [51]   99.50195300  -23.73567700  -23.73567700  -23.73567700  -86.99316400
##  [56]   32.64599600   10.34228500   34.35009700   -4.78678367   -4.78678367
##  [61]   -4.78678367   38.25048800   38.25048800  -23.80078100   47.54003900
##  [66]   24.51529967   24.51529967   24.51529967  -15.41503900   45.08398400
##  [71]  -10.96386700   -8.03515600  -27.04036467  -27.04036467  -27.04036467
##  [76]   73.01123000   42.70117200  -15.08105400   57.70068300    3.38232433
##  [81]    3.38232433    3.38232433  -40.97070300   58.06201100 -102.23632800
##  [86]   70.35107400    8.31750500    8.31750500    8.31750500    8.31750500
##  [91]  -53.03906300   29.96313500   29.96313500  -19.75406900  -19.75406900
##  [96]  -19.75406900  -54.88281300   50.25781300   59.56689500   -1.40576200
## [101]    9.86735033    9.86735033    9.86735033  -34.01904300   74.20996100
## [106]  -28.01025400   -9.41992200    0.07210267    0.07210267    0.07210267
## [111]   45.64062500   -4.70654300   53.56982500  -47.15283200   15.69026667
## [116]   15.69026667   15.69026667  -37.91406200   41.50097600   18.25195400
## [121]    7.31787100   12.48600233   12.48600233   12.48600233   32.08105500
## [126]  -44.08300800   23.15136800   83.46777300   -2.34700533   -2.34700533
## [131]   -2.34700533   10.98681700   47.99609300  -27.24462800    9.93920900
## [136]    9.93920900    9.93920900    9.93920900  -76.05712900   28.12988200
## [141]   48.83105500  -50.58691400   -3.20898433   -3.20898433   -3.20898433
## [146]   16.17382800  -35.38867200   32.14990300  -29.01482600  -29.01482600
## [151]  -29.01482600  -29.01482600  -29.01482600  -29.01482600  -29.01482600
## [156]  -29.01482600  -29.01482600  -29.01482600  -29.01482600  -89.95703200
## [161]   -3.59082000 -216.36328100   -1.84668000   11.61840825   11.61840825
## [166]   11.61840825   11.61840825  148.94726600   29.92089800   94.80908200
## [171]  -25.78971367  -25.78971367  -25.78971367   73.36621100  -30.63720700
## [176]   71.37597650   71.37597650    3.76969400    3.76969400    3.76969400
## [181]  111.40527400   -0.12353500   -0.12353500   34.23779300  -28.79296900
## [186]  -28.79296900  -28.79296900   44.46289100   52.26904300  -10.48388700
## [191]  -96.18212900  -30.40201800  -30.40201800  -30.40201800   54.43994100
## [196]  -42.83203100   43.27636700 -113.35937500   13.13671867   13.13671867
## [201]   13.13671867   67.69384800  -59.75976600   13.95605500   44.66992200
## [206]   -8.96061200   -8.96061200   -8.96061200  -19.59912100  -54.10498100
## [211]  -30.76904300 -117.25390600  -51.71875000  -51.71875000  -51.71875000
## [216]   64.09423800  -56.85595700    6.17675800   87.63232400   -6.02473933
## [221]   -6.02473933   -6.02473933   -3.85107500  -77.30371100   49.09668000
## [226]  -38.18212900    2.44938167    2.44938167    2.44938167   76.83886700
## [231]  138.65136700  -10.61230500   22.83105500   -9.51822900   -9.51822900
## [236]   -9.51822900   13.12988200   26.67871100   58.60107500   -5.69384800
## [241]    5.88671867    5.88671867    5.88671867
adf.test(y_diff)
## Warning in adf.test(y_diff): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  y_diff
## Dickey-Fuller = -5.056, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary

Identifikasi Model ARIMA

ACF

acf(y_diff)

ARIMA(0,1,2)

PACF

pacf(y_diff)

ARIMA(2,1,0)

EACF

eacf(y_diff)
## AR/MA
##   0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 0 o x o o o o o o o o o  o  o  o 
## 1 o x o o o o o o o o o  o  o  o 
## 2 x x o o o o o o o o o  o  o  o 
## 3 x o x o o o o o o o o  o  o  o 
## 4 x o o x o o o o o o o  o  o  o 
## 5 x x x o o o o o o o o  o  o  o 
## 6 x x x x o o o o o o o  o  o  o 
## 7 x x x x o o o o o o o  o  o  o

ARIMA(0,1,2), ARIMA(1,1,2), ARIMA(0,1,3)

Pemilihan Model Terbaik

Signifikansi Parameter

ARIMA(0,1,2)

arima012 <- Arima(y_train, order=c(0,1,2), method="ML")
coeftest(arima012)
## 
## z test of coefficients:
## 
##      Estimate Std. Error z value Pr(>|z|)  
## ma1 -0.019243   0.063429 -0.3034  0.76160  
## ma2  0.165620   0.065732  2.5196  0.01175 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

ARIMA(1,1,2)

arima112 <- Arima(y_train, order=c(1,1,2), method="ML")
coeftest(arima112)
## 
## z test of coefficients:
## 
##      Estimate Std. Error z value Pr(>|z|)  
## ar1  0.173281   0.344427  0.5031  0.61489  
## ma1 -0.186199   0.337848 -0.5511  0.58154  
## ma2  0.166383   0.066327  2.5085  0.01212 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

ARIMA(0,1,3)

arima013 <- Arima(y_train, order=c(0,1,3), method="ML")
coeftest(arima013)
## 
## z test of coefficients:
## 
##      Estimate Std. Error z value Pr(>|z|)  
## ma1 -0.010284   0.065077 -0.1580  0.87444  
## ma2  0.161738   0.066474  2.4331  0.01497 *
## ma3  0.038634   0.065145  0.5931  0.55315  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

ARIMA(2,1,0)

arima210 <- Arima(y_train, order=c(2,1,0), method="ML")
coeftest(arima210)
## 
## z test of coefficients:
## 
##       Estimate Std. Error z value Pr(>|z|)  
## ar1 -0.0060659  0.0636912 -0.0952  0.92412  
## ar2  0.1499402  0.0637433  2.3522  0.01866 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1