# Download symbol data
TSX_Composite_Data_gs <- getSymbols("^GSPTSE", auto.assign=FALSE, from="2016-01-01", to="2019-12-31")

TSX_Composite_Data <- as.data.frame(TSX_Composite_Data_gs)
TSX_Composite_Data$Date <- time(TSX_Composite_Data_gs)

# Chart series bar graph
chartSeries(TSX_Composite_Data_gs, type = "bars", name = "TSX Composite Price 2016-2019")

describe(TSX_Composite_Data_gs)
##                 vars    n         mean           sd       median      trimmed
## GSPTSE.Open        1 1002 1.541275e+04 1.038090e+03 1.554420e+04 1.553025e+04
## GSPTSE.High        2 1002 1.546375e+04 1.027160e+03 1.560610e+04 1.557977e+04
## GSPTSE.Low         3 1002 1.535533e+04 1.051660e+03 1.548495e+04 1.547418e+04
## GSPTSE.Close       4 1002 1.541131e+04 1.037260e+03 1.553965e+04 1.552738e+04
## GSPTSE.Volume      5 1002 2.154855e+10 6.896609e+09 2.080217e+10 2.095710e+10
## GSPTSE.Adjusted    6 1002 1.541131e+04 1.037260e+03 1.553965e+04 1.552738e+04
##                          mad     min          max        range  skew kurtosis
## GSPTSE.Open     9.828200e+02 11834.9 1.721620e+04 5.381300e+03 -1.01     0.88
## GSPTSE.High     9.582000e+02 11932.4 1.723060e+04 5.298200e+03 -1.01     0.86
## GSPTSE.Low      9.875600e+02 11531.2 1.714500e+04 5.613800e+03 -1.03     0.97
## GSPTSE.Close    9.827400e+02 11843.1 1.718020e+04 5.337100e+03 -1.01     0.89
## GSPTSE.Volume   4.410097e+09     0.0 8.588881e+10 8.588881e+10  2.50    15.41
## GSPTSE.Adjusted 9.827400e+02 11843.1 1.718020e+04 5.337100e+03 -1.01     0.89
##                           se
## GSPTSE.Open            32.79
## GSPTSE.High            32.45
## GSPTSE.Low             33.22
## GSPTSE.Close           32.77
## GSPTSE.Volume   217872160.77
## GSPTSE.Adjusted        32.77
# Close price of stock with day number
plot(~TSX_Composite_Data$Date + TSX_Composite_Data$GSPTSE.Close , main = "Closing price of stock with day number" , pch =1)

# Correlation to check linearity component
cor(TSX_Composite_Data_gs[,c(1:5)])
##               GSPTSE.Open GSPTSE.High GSPTSE.Low GSPTSE.Close GSPTSE.Volume
## GSPTSE.Open     1.0000000   0.9990997  0.9985580    0.9974352    -0.1952186
## GSPTSE.High     0.9990997   1.0000000  0.9988088    0.9987932    -0.1914979
## GSPTSE.Low      0.9985580   0.9988088  1.0000000    0.9990300    -0.2029998
## GSPTSE.Close    0.9974352   0.9987932  0.9990300    1.0000000    -0.2000290
## GSPTSE.Volume  -0.1952186  -0.1914979 -0.2029998   -0.2000290     1.0000000
# Welch's t-test on Open Price and Close Price
t.test(TSX_Composite_Data_gs$GSPTSE.Open, TSX_Composite_Data_gs$GSPTSE.Close)
## Warning in tstat + c(-cint, cint): Recycling array of length 1 in array-vector arithmetic is deprecated.
##   Use c() or as.vector() instead.
## Warning in cint * stderr: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.
## 
##  Welch Two Sample t-test
## 
## data:  TSX_Composite_Data_gs$GSPTSE.Open and TSX_Composite_Data_gs$GSPTSE.Close
## t = 0.030952, df = 2002, p-value = 0.9753
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -89.48356  92.35340
## sample estimates:
## mean of x mean of y 
##  15412.75  15411.31
# Welch's t-test on Volumne and Close Price
t.test(TSX_Composite_Data_gs$GSPTSE.Volume, TSX_Composite_Data_gs$GSPTSE.Close)
## Warning in tstat + c(-cint, cint): Recycling array of length 1 in array-vector arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in tstat + c(-cint, cint): Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.
## 
##  Welch Two Sample t-test
## 
## data:  TSX_Composite_Data_gs$GSPTSE.Volume and TSX_Composite_Data_gs$GSPTSE.Close
## t = 98.904, df = 1001, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  21120998037 21976075113
## sample estimates:
##    mean of x    mean of y 
## 2.154855e+10 1.541131e+04
#Creating linear model of Date and Volume on Close Price
model <- lm(formula = TSX_Composite_Data$GSPTSE.Close ~ TSX_Composite_Data$Date + TSX_Composite_Data$GSPTSE.Volume)
summary(model)
## 
## Call:
## lm(formula = TSX_Composite_Data$GSPTSE.Close ~ TSX_Composite_Data$Date + 
##     TSX_Composite_Data$GSPTSE.Volume)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2445.55  -293.32    40.61   430.30  1774.06 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      -1.835e+04  7.904e+02  -23.21   <2e-16 ***
## TSX_Composite_Data$Date           1.957e+00  4.481e-02   43.67   <2e-16 ***
## TSX_Composite_Data$GSPTSE.Volume -2.498e-08  2.736e-09   -9.13   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 596.5 on 999 degrees of freedom
## Multiple R-squared:   0.67,  Adjusted R-squared:  0.6693 
## F-statistic:  1014 on 2 and 999 DF,  p-value: < 2.2e-16
# Plot of linear model
plot(model)

# Plot Date and Close Price
plot(TSX_Composite_Data$Date,fitted(model), xlab = "Date", ylab = "Close Price", main = "Close Price VS Time Scatterplot")

# Calculate RSS
RSS <- c(crossprod(model$residuals))
# Claculate MSE
MSE <- RSS / length(model$residuals)
# Calculate RMSE
RMSE <- sqrt(MSE)
sprintf("MAPE : %s", MSE)
## [1] "MAPE : 354745.497643236"
sprintf("RMSE : %s", RMSE)
## [1] "RMSE : 595.605152465319"
# Check residuals of linear model
checkresiduals(model)

## 
##  Breusch-Godfrey test for serial correlation of order up to 10
## 
## data:  Residuals
## LM test = 930.15, df = 10, p-value < 2.2e-16
# Accuracy level of linear model
accuracy(model)
##                         ME     RMSE      MAE        MPE     MAPE      MASE
## Training set -1.936137e-14 595.6052 451.6092 -0.1697713 3.027041 0.5664064
# Take only the closing price
closing_pr <- Cl(to.monthly(TSX_Composite_Data_gs))

# Decompose it
dc <- decompose(as.ts(closing_pr, start=c(2016,1)))
plot(dc)

# Seasonal component 
dc$seasonal
##              Jan         Feb         Mar         Apr         May         Jun
## 2016    8.263687  -55.147215  -48.836478  153.266319   -6.072155   72.657093
## 2017    8.263687  -55.147215  -48.836478  153.266319   -6.072155   72.657093
## 2018    8.263687  -55.147215  -48.836478  153.266319   -6.072155   72.657093
## 2019    8.263687  -55.147215  -48.836478  153.266319   -6.072155   72.657093
##              Jul         Aug         Sep         Oct         Nov         Dec
## 2016  183.922230   73.307023  113.863853 -156.771238  -51.338960 -287.114161
## 2017  183.922230   73.307023  113.863853 -156.771238  -51.338960 -287.114161
## 2018  183.922230   73.307023  113.863853 -156.771238  -51.338960 -287.114161
## 2019  183.922230   73.307023  113.863853 -156.771238  -51.338960 -287.114161
# Number of period we want to forecast
n <- 100

# Splitting the data
train <- head(Cl(TSX_Composite_Data_gs), length(Cl(TSX_Composite_Data_gs))-n)
test <- tail(Cl(TSX_Composite_Data_gs), n)
# Creating Seasonal ARIMA model

# Create the Model
model_s <- auto.arima(train)

# Forecast n periods of the data
fc_s <- forecast(model_s, h=n)

# Plot the result
autoplot(fc_s)+
  autolayer(ts(test, start= length(train)), series="Test Data")

# Residual of ARIMA Model
checkresiduals(fc_s)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(4,1,2) with drift
## Q* = 0.57172, df = 3, p-value = 0.9029
## 
## Model df: 7.   Total lags used: 10
#Accuracy Metrics of ARIMA
accuracy(fc_s)
##                        ME     RMSE      MAE          MPE     MAPE      MASE
## Training set -0.001054656 90.67734 67.60009 -0.000724578 0.452702 0.9933713
##                      ACF1
## Training set 0.0001861074
# Summary of ARIMA model
summary(model_s)
## Series: train 
## ARIMA(4,1,2) with drift 
## 
## Coefficients:
##           ar1      ar2     ar3      ar4     ma1     ma2   drift
##       -0.5946  -0.4116  0.0220  -0.0362  0.6426  0.4415  3.5905
## s.e.   0.6415   0.5019  0.0449   0.0367  0.6417  0.5026  3.1180
## 
## sigma^2 estimated as 8296:  log likelihood=-5340.04
## AIC=10696.09   AICc=10696.25   BIC=10734.51
## 
## Training set error measures:
##                        ME     RMSE      MAE          MPE     MAPE      MASE
## Training set -0.001054656 90.67734 67.60009 -0.000724578 0.452702 0.9933713
##                      ACF1
## Training set 0.0001861074