Intro

knitr::include_graphics("ihsg.jpg")

Rmd kali ini akan memprediksi harga penutupan ihsg menggunakan algoritma time series yaitu dengan model arima dengan menggunakan variable variable tambahan seperti harga emas, minyak dan lain lain

Import Library and Dataset

library(tidyverse)
library(tidyquant)
library(priceR)
library(Quandl)
library(zoo)
library(forecast)
library(lubridate)
library(padr)
library(tseries)
library(randomForest)
gold <- Quandl("LBMA/GOLD", collapse = "daily")
currency <- historical_exchange_rates("USD", to = "IDR", start_date = "2020-01-01", end_date = "2021-02-28")
## For full currency exchange rate API documentation visit:
##  https://exchangerate.host/#/#docs
##  (this message will only appear once per session)
ihsg <- tq_get("^JKSE")
dowjones <- tq_get("^DJI")
oil <- Quandl("CHRIS/CME_CL1", collapse = "daily")
shanghai <- tq_get("000001.SS")

Data Pre Processing

ihsg <- ihsg %>% 
  select(c(date, open, close)) %>% 
  mutate(date = ymd(date)) %>% 
  filter(date >= "2020-01-01",
         date <= "2021-02-26")
currency <- currency %>% 
  rename(USDIDR = one_USD_equivalent_to_x_IDR)
gold <- gold %>% 
  select(c(Date, `USD (AM)`)) %>% 
  rename(Goldprices = `USD (AM)`,
         date = Date) %>% 
  mutate(date = ymd(date)) %>% 
  filter(date >= "2020-01-01",
         date <= "2021-02-28")
dowjones <- dowjones %>% 
  select(date, adjusted) %>% 
  rename(dowjones = adjusted) %>% 
  mutate(date = ymd(date)) %>% 
  filter(date >= "2020-01-01",
         date <= "2021-02-28")
oil <- oil %>% 
  select(Date, Settle) %>% 
  rename(date = Date,
         OilPrices = Settle) %>% 
  mutate(date = ymd(date)) %>% 
  filter(date >= "2020-01-01",
         date <= "2021-02-28")
shanghai <- shanghai %>% 
  select(date, adjusted) %>% 
  rename(shanghai = adjusted) %>% 
  mutate(date = ymd(date)) %>% 
  filter(date >= "2020-01-01",
         date <= "2021-02-28")
a <- left_join(ihsg, currency, "date")
b <- left_join(a, gold, "date")
c <- left_join(b, dowjones, "date")
d <- left_join(c, shanghai, "date")
dataset <- left_join(d, oil, "date")

Add variable Inflasi and BI Rate

dataset$Inflasi <- month(dataset$date)
dataset[1:22, ]$Inflasi <- 2.68
dataset[23:42, ]$Inflasi <- 2.98
dataset[43:63, ]$Inflasi <- 2.96
dataset[64:84, ]$Inflasi <- 2.67
dataset[85:100, ]$Inflasi <- 2.19
dataset[101:121, ]$Inflasi <- 1.96
dataset[122:143, ]$Inflasi <- 1.54
dataset[144:161, ]$Inflasi <- 1.32
dataset[162:183, ]$Inflasi <- 1.42
dataset[184:202, ]$Inflasi <- 1.44
dataset[203:223, ]$Inflasi <- 1.59
dataset[224:242, ]$Inflasi <- 1.68
dataset[243:262, ]$Inflasi <- 1.55
dataset[263:281, ]$Inflasi <- 1.38
dataset[1:22, "BiRate"] <- 5
dataset[23:42, "BiRate"] <- 4.75
dataset[43:63, "BiRate"] <- 4.50
dataset[64:84, "BiRate"] <- 4.50
dataset[85:100, "BiRate"] <- 4.50
dataset[101:121, "BiRate"] <- 4.25
dataset[122:143, "BiRate"] <- 4
dataset[144:161, "BiRate"] <- 4
dataset[162:183, "BiRate"] <- 4
dataset[184:202, "BiRate"] <- 4
dataset[203:223, "BiRate"] <- 3.75
dataset[224:242, "BiRate"] <- 3.75
dataset[243:262, "BiRate"] <- 3.75
dataset[263:281, "BiRate"] <- 3.50

Dataset Full

rmarkdown::paged_table(dataset)

Check If There is Variable that didnt influence

model1 <- lm(close~USDIDR + Goldprices + dowjones + OilPrices + shanghai+Inflasi+BiRate, data = dataset)
summary(model1)
## 
## Call:
## lm(formula = close ~ USDIDR + Goldprices + dowjones + OilPrices + 
##     shanghai + Inflasi + BiRate, data = dataset)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -375.69  -95.49    9.89   88.33  406.13 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.546e+03  5.385e+02  -4.727 3.86e-06 ***
## USDIDR      -3.765e-03  9.948e-03  -0.378    0.705    
## Goldprices  -8.230e-01  1.711e-01  -4.809 2.67e-06 ***
## dowjones     1.425e-01  7.228e-03  19.717  < 2e-16 ***
## OilPrices    8.453e+00  1.544e+00   5.475 1.09e-07 ***
## shanghai     1.271e+00  1.024e-01  12.408  < 2e-16 ***
## Inflasi      4.906e+02  4.532e+01  10.825  < 2e-16 ***
## BiRate       3.821e+01  5.170e+01   0.739    0.461    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 138.1 on 242 degrees of freedom
##   (31 observations deleted due to missingness)
## Multiple R-squared:  0.9501, Adjusted R-squared:  0.9486 
## F-statistic: 657.9 on 7 and 242 DF,  p-value: < 2.2e-16

Convert to Time Series

ts_dataset <- dataset %>% 
  select(-date) %>% 
  na.approx() %>% 
  msts(seasonal.periods = c(7, 7*4))
ts_dataset [ , "close"] %>% 
  autoplot()

Cross Validation

test <- tail(ts_dataset, 30)

train <- head(ts_dataset, length(ts_dataset) - 30)
dataset dibagi menjadi 2 kedalam data train dan data test. data train untuk melatih model, dan menggunakan data dari 1 januari 2020 sampai akhr februari dikurangi 30 data terakhir. data test menggunakan data 30 hari terakhir

Modeling

modelarima2 <- stlm(train[ , "close"],
                    method = "arima",
                    xreg = train[ , c("open", "USDIDR", "Goldprices", "dowjones", "OilPrices", "BiRate", "Inflasi", "shanghai")])
model menggunakan model arima dengan fungsi stlm dan agar hasilnya lebih kuat, disini menggunakan beberapa variable tambahan

Forecasting and Evaluating

forecastarima2 <- forecast(modelarima2, h = 30, xreg = test[ , c("open", "USDIDR", "Goldprices", "dowjones", "OilPrices", "BiRate", "Inflasi", "shanghai")])
accuracy(forecastarima2$mean, as.numeric(test[, "close"]))
##                ME     RMSE      MAE       MPE     MAPE
## Test set 15.06163 80.28048 66.05978 0.2383274 1.063973

Plot Model arima

z <- as.data.frame(forecastarima2)
y <- as.data.frame(test)
compare <- cbind(forecast = z$`Point Forecast`, real = y$close)
compare <- as.data.frame(compare)
compare$date <- dataset$date %>% 
  tail(30)
plot <- compare %>% 
  ggplot(aes(x = date)) + 
  geom_line(aes(y = forecast), colour = "red", size = 2) +
  geom_line(aes(y = real), colour = "green", size = 2) +
  labs(x = "Date",
       y = NULL,
       title = "Comparison actual vs prediction") +
  theme_dark()
plot

Red = Forecast
Green = Aktual

Conclusion

model menggunakan model arima dengan prediktor tambahan menjadi model dengan error paling sedikit, dibanding model tbats dan model autoarima tanpa prediktor tambahan, model arima juga berhasil mengalahkan model dengan random forest dalam hal akurasi, namun tetap model ini masih belum sangat akurat karena nilai mapenya masih diatas satu, dan rmse serta mae nya masih cukup tinggi. mungkin perlu prediktor tambahan agar akurasinya semakin meningkat