Forecasting Gold Price Prediction

# Alternative data source (disabled): Gold_Price <- Quandl('LBMA/GOLD')
# Read the gold price CSV, parsing Date as month/day/four-digit-year.
Gold_Price <- read_csv(
  file = "/Users/datascience/Desktop/Time Series Data Science/Time Series Project/Gold_Nasdaq_Max.csv",
  col_types = cols(
    Date = col_date(format = "%m/%d/%Y")
  )
)

# Time series of the closing price.
# ts() indexes observations by row position, so the prices are ordered by Date
# (oldest first) before conversion. With a newest-first table the series would
# otherwise be plotted with time running backwards.
Close.TS <- ts(Gold_Price$`Close/Last`[order(Gold_Price$Date)])

autoplot(Close.TS, ylab = 'Close Price', main = 'Time Series Plot of Gold Prices') +
  theme_classic()

# Count the missing values in every column of Gold_Price and show the counts
# as a one-column matrix with one row per column.
cbind(
  lapply(Gold_Price, function(column) sum(is.na(column)))
)

##            [,1]
## Date       0   
## Close/Last 0   
## Volume     0   
## Open       0   
## High       0   
## Low        0

# Candlestick chart of the gold price with range-selector buttons on the x axis.
# Built inside local() so the helper objects do not leak into the workspace.
Price_plot <- local({
  # One range-selector button that looks `count` units of `step` backward.
  lookback_button <- function(count, label, step) {
    list(count = count, label = label, step = step, stepmode = "backward")
  }

  selector_buttons <- list(
    lookback_button(3, "3 mo", "month"),
    lookback_button(6, "6 mo", "month"),
    lookback_button(1, "1 yr", "year"),
    lookback_button(2, "2 yr", "year"),
    lookback_button(3, "3 yr", "year"),
    lookback_button(5, "5 yr", "year"),
    list(step = "all")
  )

  x_axis <- list(
    rangeselector = list(buttons = selector_buttons),
    rangeslider = list(visible = FALSE)
  )
  y_axis <- list(
    title = "Price ($)",
    showgrid = TRUE,
    showticklabels = TRUE
  )

  candles <- plot_ly(
    Gold_Price,
    x = ~Date,
    type = "candlestick",
    open = ~Open,
    close = ~`Close/Last`,
    high = ~High,
    low = ~Low,
    name = "price"
  )

  layout(candles, xaxis = x_axis, yaxis = y_axis)
})


# Daily trading volume, with an extra column scaled to thousands for plotting.
# Date is carried over unchanged: it is already parsed as a Date when the CSV
# is read, so re-parsing it is unnecessary (and a two-digit-year format such as
# "%m/%d/%y" would mis-read four-digit years if Date were ever character).
Volume <- select(Gold_Price, Date, Volume)
Volume$Vol <- as.numeric(as.character(Volume$Volume)) / 1000


# Bar chart of trading volume (in thousands) by date.
Volume_plot <- layout(
  plot_ly(Volume, x = ~Date, y = ~Vol, type = "bar", name = "Volume"),
  yaxis = list(title = "Volume (Units of Thousand)")
)

# Stack the price chart (top 70%) over the volume chart (bottom 30%) on a
# shared x axis. Note: this object shares its name with the plot() function;
# calls such as plot(...) still resolve to the function.
plot <- layout(
  subplot(
    Price_plot, Volume_plot,
    nrows = 2,
    heights = c(0.7, 0.3),
    shareX = TRUE,
    titleY = TRUE
  ),
  title = "GC:CMX"
)

plot

# Restrict the data to observations after 2019-01-01.
Gold_Price_Subset <- subset(Gold_Price, Date > as.Date("2019-01-01"))
# ts() indexes by row position, so order the prices by Date (oldest first);
# otherwise a newest-first table yields a series that runs backwards in time.
Close.TS_1 <- ts(Gold_Price_Subset$`Close/Last`[order(Gold_Price_Subset$Date)])
autoplot(Close.TS_1, ylab = 'Close Price', main = 'Time Series Plot of Gold Prices') +
  theme_classic()

# Random walk check: fit an AR(1) model to the full closing-price series.
# An estimated ar1 coefficient close to 1 is consistent with a random walk.
random_walk_test <- arima(x = Close.TS, order = c(1, 0, 0))
summary(random_walk_test)

## 
## Call:
## arima(x = Close.TS, order = c(1, 0, 0))
## 
## Coefficients:
##          ar1  intercept
##       0.9985  1439.0070
## s.e.  0.0011   177.2876
## 
## sigma^2 estimated as 218.5:  log likelihood = -10481.13,  aic = 20968.25
## 
## Training set error measures:
##                       ME     RMSE      MAE         MPE      MAPE      MASE
## Training set -0.00646364 14.78174 9.986105 -0.00971381 0.6854392 0.9984929
##                     ACF1
## Training set -0.04134238

# Autocorrelation of the first-differenced post-2019 series.
Acf(
  x = diff(Close.TS_1),
  main = "Autocorrelation Plot of differenced series"
)

### Several different time periods were tested (max, 5 years, 3 years, 2 years). The goal was to find the largest sample size that does not result in a random walk series.


# Traditionally, some people who believe the efficient market hypothesis theory [1] argue that the future stock price is predictable based on the historical stock data. Others who trust the random walk theory believe that the future stock price does not depend on the historical stock data, and hence no useful patterns could be found in the historical data to reflect the pattern of the upcoming stock sequences (https://www.mdpi.com/2227-7390/8/9/1441)


# Based on the experiment, data science is very useful for visualization data and our proposed method using Long Short-Term Memory (LSTM) can be used as predictor in short term data with accuracy 94.57% comes from the short term (1 year) with high epoch in training phase rather than using 3 years training data. (https://journalofbigdata.springeropen.com/articles/10.1186/s40537-021-00430-0)

Economic Indicators
Google Search Interest

# Google search interest for the keyword "Gold".
# US-only alternative (disabled):
# trends <- gtrends(keyword = "Gold", geo = "US", onlyInterest = TRUE)
trends <- gtrends(keyword = "Gold", onlyInterest = TRUE, time = '2016-01-01 2022-11-17') # World Search
trends <- trends$interest_over_time %>%
  as_tibble() %>%
  select(c(date, hits, keyword))

# KNOWN ISSUE: the data come back weekly instead of daily because of the time
# range requested. The options are daily data for the past 30-90 days, monthly
# data with a specific date, or weekly data for the past 5 years.

# Round each timestamp up to a day boundary and store it in `Date`, the key
# used to join against Gold_Price. The vector is assigned directly; wrapping it
# in as_tibble() would nest a one-column data frame inside `trends`.
trends$Date <- ceiling_date(
  trends$date,
  unit = "day",
  change_on_boundary = NULL,
  week_start = getOption("lubridate.week.start", 1)
)
# Line chart of search interest (hits) over time.
plot_ly(
  trends,
  x = ~Date,
  y = ~hits,
  type = "scatter",
  mode = "lines",
  name = "Google Search Trends"
) %>%
  layout(
    title = paste0("Interest over Time: ", "Gold"),
    yaxis = list(title = "hits")
  )

# Candlestick chart of gold prices after 2016-01-01.
Gold_Price_Subset_2 <- subset(Gold_Price, Date > as.Date("2016-01-01"))
# DDate is a plain copy of Date. Date is already of class Date, so calling
# as.Date() on it with a format string returns the column unchanged.
Gold_Price_Subset_2$DDate <- Gold_Price_Subset_2$Date

Gold_Subset_Plot <- Gold_Price_Subset_2 %>%
  plot_ly(x = ~Date,
          type = "candlestick",
          open = ~Open,
          close = ~`Close/Last`,
          high = ~High,
          low = ~Low,
          name = "price") %>%
  # FALSE rather than F: F is an ordinary variable that can be reassigned.
  layout(xaxis = list(rangeslider = list(visible = FALSE)),
         yaxis = list(title = "Gold Price"))

Gold_Subset_Plot

Hits (Interest) versus Price (Gold)

# Relationship between search interest (hits) and the gold closing price.
# Close is a syntactic alias for `Close/Last`; later sections also rely on it.
Gold_Price$Close <- Gold_Price$`Close/Last`
trends %>%
  left_join(Gold_Price, by = "Date") %>%
  # Columns are named directly; one_of() is superseded in tidyselect.
  select(Date, hits, Close) %>%
  drop_na() %>%
  ggplot(aes(hits, Close)) +
  geom_point(color = "red") +
  geom_smooth(method = 'loess') +
  labs(title = paste0("Gold", ": Relationship between World Interest (Hits) and Close Price (Gold)"))

## `geom_smooth()` using formula = 'y ~ x'

Text Sentiment Analysis on Stock/Gold News?

# Test/train split of the closing price after 2017-11-17: observations before
# 2021-11-17 (about four years) are training data, the remaining year is the
# test set.
Gold_Price_Subset_3 <- subset(Gold_Price, Date > as.Date("2017-11-17"))
# ts() and window() work on row positions, so order the prices oldest-first;
# with a newest-first table the first positions would be the latest prices and
# the "training" window would contain the most recent data.
Gold.ts <- ts(Gold_Price_Subset_3$Close[order(Gold_Price_Subset_3$Date)])
# Derive the split position from the cutoff date instead of hard-coding it.
n_train <- sum(Gold_Price_Subset_3$Date < as.Date("2021-11-17"))
Gold.Train.ts <- window(Gold.ts, end = n_train)
Gold.Test.ts <- window(Gold.ts, start = n_train + 1)

Time Series with Prophet (No External Regressors)

# Rows after 2017-11-17, split at 2021-11-17 into training and test sets.
Gold_Price_Subset_3 <- subset(Gold_Price, Date > as.Date("2017-11-17"))

Gold.Train <- subset(Gold_Price_Subset_3, Date < as.Date("2021-11-17"))
Gold.Test <- subset(Gold_Price_Subset_3, Date >= as.Date("2021-11-17"))

# Training frame for prophet(): the date column is named `ds`, the value `y`.
Gold_Train_df <- select(Gold.Train, ds = Date, y = Close)

# Test frame keeps the original column names.
Gold_Test_df <- select(Gold.Test, Date, Close)

# Fit Prophet on the training data with an interval width of 0.95.
# NOTE(review): daily.seasonality = TRUE adds a within-day seasonal component;
# with one observation per day this may not be meaningful. Confirm it is wanted.
Prophet <- prophet(Gold_Train_df, interval.width = 0.95, daily.seasonality = TRUE)
# Extend 365 calendar days past the training data, then drop weekends to
# account for the regular gaps in trading days. week_start = 7 pins wday() to
# 1 = Sunday and 7 = Saturday even if options(lubridate.week.start) is set.
future <- make_future_dataframe(Prophet, periods = 365) %>%
  filter(!wday(ds, week_start = 7) %in% c(1, 7))
Prophet_Forecast <- predict(Prophet, future)

# Forecast table with presentation-friendly column names.
Forecast_subset <- select(
  Prophet_Forecast,
  Date = ds,
  ClosePrice = yhat,
  ClosePrice_lower = yhat_lower,
  ClosePrice_upper = yhat_upper
)

# Display the table; the four columns are already in display order.
datatable(Forecast_subset)

Forecast Result Plot

# Plot the Prophet forecast with custom axis labels and a title.
plot(
  Prophet,
  Prophet_Forecast,
  xlabel = "Date",
  ylabel = "Gold Close Price ($)"
) +
  labs(title = paste0("Gold", ": Price Prediction"))

# Plot the individual components of the fitted model.
prophet_plot_components(Prophet, Prophet_Forecast)

#autoplot(apple, series = "actual") +
 # autolayer(apple.pred, series = "predicted", alpha = .4) +
 # theme_classic() +
 # coord_cartesian(xlim = c(210, 251))

Forecast Evaluation

# Keep only the forecast date and point prediction, renamed for the comparison
# against actual prices.
Prophet_Results <- select(Prophet_Forecast, Date = ds, Close = yhat)
tail(Prophet_Results)

##            Date    Close
## 1262 2022-11-09 1723.936
## 1263 2022-11-10 1722.952
## 1264 2022-11-11 1720.154
## 1265 2022-11-14 1718.374
## 1266 2022-11-15 1716.655
## 1267 2022-11-16 1715.051

# Predicted closes for the test period (on or after 2021-11-17).
# as.Date() is called without a format string: the inputs are date or
# date-time objects (presumably POSIXct from prophet; Date from the CSV
# import), for which a format argument is ignored.
Prophet_Results$Date <- as.Date(Prophet_Results$Date)
Prophet_Results <- subset(Prophet_Results, Date >= as.Date("2021-11-17"))

# Actual closes for the same period.
Gold_Results <- Gold_Price %>%
  select(c("Date", "Close")) %>%
  rename(Close_Actual = Close)
Gold_Results$Date <- as.Date(Gold_Results$Date)
Gold_Results <- subset(Gold_Results, Date >= as.Date("2021-11-17"))



# Pair each actual close with the prediction for the same date.
Results <- Gold_Results %>%
  inner_join(Prophet_Results, by = "Date")

# Forecast error measures: predictions first, actual values second.
with(Results, accuracy(Close, Close_Actual))

##                ME     RMSE      MAE      MPE     MAPE
## Test set 62.18735 113.3095 85.21852 3.197662 4.572932

#Predicted.ts <- ts(Results$Close)
#Actual.ts <- ts(Results$Close_Actual)

#autoplot(Predicted.ts, series ='predicted') + 
 # autolayer(Actual.ts, series = 'actual') 

# Print the joined table of actual (Close_Actual) and predicted (Close) prices.
Results

## # A tibble: 252 × 3
##    Date       Close_Actual Close
##    <date>            <dbl> <dbl>
##  1 2022-11-16        1776. 1715.
##  2 2022-11-15        1775. 1717.
##  3 2022-11-14        1774. 1718.
##  4 2022-11-11        1774. 1720.
##  5 2022-11-10        1754. 1723.
##  6 2022-11-09        1716. 1724.
##  7 2022-11-08        1716  1725.
##  8 2022-11-07        1680. 1726.
##  9 2022-11-04        1686. 1727.
## 10 2022-11-03        1638. 1729.
## # … with 242 more rows

# Line chart of the actual closing price over the test year.
p <- ggplot(Results, aes(x = Date, y = Close_Actual, group = 1)) +
  geom_line() +
  theme_light() +
  labs(title = "Actual 1 Year Gold Price")
p

# Line chart of the predicted closing price over the test year.
# The title previously read "Predctivedl"; corrected to "Predicted".
p2 <- ggplot(Results, aes(Date, Close, group = 1)) +
  geom_line() +
  theme_light() +
  ggtitle("Predicted 1 Year Gold Price")
p2

Notes for myself: * Add external regressors (price of silver, interest rates, dollar index, and so on); see https://rpubs.com/mpleo/timeseries_prophet for an example.

Change the timeline (training size and forecast horizon) - Originally I trained on 2017-11-17 to 2021-11-17 and forecast the following year, 2021-11-17 to 2022-11-17.

*Restart the prophet method or try a different method

Fix Result Graph

Forecasting Gold Price Prediction

Amin Fesharaki

2022-11-18