# Download symbol data
TSX_Composite_Data_gs <- getSymbols("^GSPTSE", auto.assign=FALSE, from="2016-01-01", to="2019-12-31")
TSX_Composite_Data <- as.data.frame(TSX_Composite_Data_gs)
TSX_Composite_Data$Date <- time(TSX_Composite_Data_gs)
# Chart series bar graph
chartSeries(TSX_Composite_Data_gs, type = "bars", name = "TSX Composite Price 2016-2019")

describe(TSX_Composite_Data_gs)
## vars n mean sd median trimmed
## GSPTSE.Open 1 1002 1.541275e+04 1.038090e+03 1.554420e+04 1.553025e+04
## GSPTSE.High 2 1002 1.546375e+04 1.027160e+03 1.560610e+04 1.557977e+04
## GSPTSE.Low 3 1002 1.535533e+04 1.051660e+03 1.548495e+04 1.547418e+04
## GSPTSE.Close 4 1002 1.541131e+04 1.037260e+03 1.553965e+04 1.552738e+04
## GSPTSE.Volume 5 1002 2.154855e+10 6.896609e+09 2.080217e+10 2.095710e+10
## GSPTSE.Adjusted 6 1002 1.541131e+04 1.037260e+03 1.553965e+04 1.552738e+04
## mad min max range skew kurtosis
## GSPTSE.Open 9.828200e+02 11834.9 1.721620e+04 5.381300e+03 -1.01 0.88
## GSPTSE.High 9.582000e+02 11932.4 1.723060e+04 5.298200e+03 -1.01 0.86
## GSPTSE.Low 9.875600e+02 11531.2 1.714500e+04 5.613800e+03 -1.03 0.97
## GSPTSE.Close 9.827400e+02 11843.1 1.718020e+04 5.337100e+03 -1.01 0.89
## GSPTSE.Volume 4.410097e+09 0.0 8.588881e+10 8.588881e+10 2.50 15.41
## GSPTSE.Adjusted 9.827400e+02 11843.1 1.718020e+04 5.337100e+03 -1.01 0.89
## se
## GSPTSE.Open 32.79
## GSPTSE.High 32.45
## GSPTSE.Low 33.22
## GSPTSE.Close 32.77
## GSPTSE.Volume 217872160.77
## GSPTSE.Adjusted 32.77
# Close price of stock with day number
plot(~TSX_Composite_Data$Date + TSX_Composite_Data$GSPTSE.Close , main = "Closing price of stock with day number" , pch =1)

# Correlation to check linearity component
cor(TSX_Composite_Data_gs[,c(1:5)])
## GSPTSE.Open GSPTSE.High GSPTSE.Low GSPTSE.Close GSPTSE.Volume
## GSPTSE.Open 1.0000000 0.9990997 0.9985580 0.9974352 -0.1952186
## GSPTSE.High 0.9990997 1.0000000 0.9988088 0.9987932 -0.1914979
## GSPTSE.Low 0.9985580 0.9988088 1.0000000 0.9990300 -0.2029998
## GSPTSE.Close 0.9974352 0.9987932 0.9990300 1.0000000 -0.2000290
## GSPTSE.Volume -0.1952186 -0.1914979 -0.2029998 -0.2000290 1.0000000
# Welch's t-test on Open Price and Close Price
t.test(TSX_Composite_Data_gs$GSPTSE.Open, TSX_Composite_Data_gs$GSPTSE.Close)
## Warning in tstat + c(-cint, cint): Recycling array of length 1 in array-vector arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in cint * stderr: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
##
## Welch Two Sample t-test
##
## data: TSX_Composite_Data_gs$GSPTSE.Open and TSX_Composite_Data_gs$GSPTSE.Close
## t = 0.030952, df = 2002, p-value = 0.9753
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -89.48356 92.35340
## sample estimates:
## mean of x mean of y
## 15412.75 15411.31
# Welch's t-test on Volumne and Close Price
t.test(TSX_Composite_Data_gs$GSPTSE.Volume, TSX_Composite_Data_gs$GSPTSE.Close)
## Warning in tstat + c(-cint, cint): Recycling array of length 1 in array-vector arithmetic is deprecated.
## Use c() or as.vector() instead.
## Warning in tstat + c(-cint, cint): Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
##
## Welch Two Sample t-test
##
## data: TSX_Composite_Data_gs$GSPTSE.Volume and TSX_Composite_Data_gs$GSPTSE.Close
## t = 98.904, df = 1001, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 21120998037 21976075113
## sample estimates:
## mean of x mean of y
## 2.154855e+10 1.541131e+04
#Creating linear model of Date and Volume on Close Price
model <- lm(formula = TSX_Composite_Data$GSPTSE.Close ~ TSX_Composite_Data$Date + TSX_Composite_Data$GSPTSE.Volume)
summary(model)
##
## Call:
## lm(formula = TSX_Composite_Data$GSPTSE.Close ~ TSX_Composite_Data$Date +
## TSX_Composite_Data$GSPTSE.Volume)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2445.55 -293.32 40.61 430.30 1774.06
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.835e+04 7.904e+02 -23.21 <2e-16 ***
## TSX_Composite_Data$Date 1.957e+00 4.481e-02 43.67 <2e-16 ***
## TSX_Composite_Data$GSPTSE.Volume -2.498e-08 2.736e-09 -9.13 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 596.5 on 999 degrees of freedom
## Multiple R-squared: 0.67, Adjusted R-squared: 0.6693
## F-statistic: 1014 on 2 and 999 DF, p-value: < 2.2e-16
# Plot of linear model
plot(model)




# Plot Date and Close Price
plot(TSX_Composite_Data$Date,fitted(model), xlab = "Date", ylab = "Close Price", main = "Close Price VS Time Scatterplot")

# Calculate RSS
RSS <- c(crossprod(model$residuals))
# Claculate MSE
MSE <- RSS / length(model$residuals)
# Calculate RMSE
RMSE <- sqrt(MSE)
sprintf("MAPE : %s", MSE)
## [1] "MAPE : 354745.497643236"
sprintf("RMSE : %s", RMSE)
## [1] "RMSE : 595.605152465319"
# Check residuals of linear model
checkresiduals(model)

##
## Breusch-Godfrey test for serial correlation of order up to 10
##
## data: Residuals
## LM test = 930.15, df = 10, p-value < 2.2e-16
# Accuracy level of linear model
accuracy(model)
## ME RMSE MAE MPE MAPE MASE
## Training set -1.936137e-14 595.6052 451.6092 -0.1697713 3.027041 0.5664064
# Take only the closing price
closing_pr <- Cl(to.monthly(TSX_Composite_Data_gs))
# Decompose it
dc <- decompose(as.ts(closing_pr, start=c(2016,1)))
plot(dc)

# Seasonal component
dc$seasonal
## Jan Feb Mar Apr May Jun
## 2016 8.263687 -55.147215 -48.836478 153.266319 -6.072155 72.657093
## 2017 8.263687 -55.147215 -48.836478 153.266319 -6.072155 72.657093
## 2018 8.263687 -55.147215 -48.836478 153.266319 -6.072155 72.657093
## 2019 8.263687 -55.147215 -48.836478 153.266319 -6.072155 72.657093
## Jul Aug Sep Oct Nov Dec
## 2016 183.922230 73.307023 113.863853 -156.771238 -51.338960 -287.114161
## 2017 183.922230 73.307023 113.863853 -156.771238 -51.338960 -287.114161
## 2018 183.922230 73.307023 113.863853 -156.771238 -51.338960 -287.114161
## 2019 183.922230 73.307023 113.863853 -156.771238 -51.338960 -287.114161
# Number of period we want to forecast
n <- 100
# Splitting the data
train <- head(Cl(TSX_Composite_Data_gs), length(Cl(TSX_Composite_Data_gs))-n)
test <- tail(Cl(TSX_Composite_Data_gs), n)
# Creating Seasonal ARIMA model
# Create the Model
model_s <- auto.arima(train)
# Forecast n periods of the data
fc_s <- forecast(model_s, h=n)
# Plot the result
autoplot(fc_s)+
autolayer(ts(test, start= length(train)), series="Test Data")

# Residual of ARIMA Model
checkresiduals(fc_s)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(4,1,2) with drift
## Q* = 0.57172, df = 3, p-value = 0.9029
##
## Model df: 7. Total lags used: 10
#Accuracy Metrics of ARIMA
accuracy(fc_s)
## ME RMSE MAE MPE MAPE MASE
## Training set -0.001054656 90.67734 67.60009 -0.000724578 0.452702 0.9933713
## ACF1
## Training set 0.0001861074
# Summary of ARIMA model
summary(model_s)
## Series: train
## ARIMA(4,1,2) with drift
##
## Coefficients:
## ar1 ar2 ar3 ar4 ma1 ma2 drift
## -0.5946 -0.4116 0.0220 -0.0362 0.6426 0.4415 3.5905
## s.e. 0.6415 0.5019 0.0449 0.0367 0.6417 0.5026 3.1180
##
## sigma^2 estimated as 8296: log likelihood=-5340.04
## AIC=10696.09 AICc=10696.25 BIC=10734.51
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.001054656 90.67734 67.60009 -0.000724578 0.452702 0.9933713
## ACF1
## Training set 0.0001861074