1. Introduction

# Import libraries
library(DT)
library(tidyquant)
library(magrittr)
library(readr)
library(tseries)
library(tidyverse)

# Set the theme for the visualizations

theme_bare <- function(base_size = 8, base_family = "sans") {
  theme_bw(base_size = base_size, base_family = base_family) +
    theme(
      axis.line = element_blank(),
      axis.text.x = element_blank(),
      axis.text.y = element_blank(),
      axis.ticks = element_blank(),
      axis.title.x = element_blank(),
      axis.title.y = element_blank(),
      legend.position = "bottom",
      panel.background = element_rect(fill = NA),
      panel.border = element_blank(),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      plot.margin = unit(c(0, 0, 0, 0), "lines")
    )
}



my_theme <- function(base_size = 9, base_family = "sans") {
  theme_bw(base_size = base_size, base_family = base_family) +
    theme(
      panel.grid.major = element_line(color = "gray"),
      panel.grid.minor = element_blank(),
      panel.background = element_rect(fill = NA),
      strip.background = element_rect(fill = "#001d60", color = "#00113a", size = 1.2),
      strip.text = element_text(face = "bold", size = 9, color = "white"),
      legend.position = "bottom",
      legend.justification = "center",
      legend.background = element_blank(),
      legend.margin = margin(0.5, 0.5, 0.5, 0.5)
    )
}

theme_set(my_theme())

mycolors <- c("#ff003f", "#0094ff", "#ae00ff", "#94ff00", "#ffc700", "#fc1814")

2. Data Exploration

data <- read_csv("~/Final Project/Final_Project_Data.csv") #Change the file path according to where the file is located
knitr::kable(head(data))
| Date       |   CPI |   RDPI | Dollar Index | Treasury | Industrial Production |     M2 | Spot Crude Oil | Case-Shiller HPI |
|------------|-------|--------|--------------|----------|-----------------------|--------|----------------|------------------|
| 1987-01-01 | 111.4 | 6159.5 |        99.86 |     7.18 |                 56.13 | 2743.9 |          18.66 |           63.735 |
| 1987-02-01 | 111.8 | 6192.1 |        99.26 |     7.19 |                 56.88 | 2747.5 |          17.73 |           64.134 |
| 1987-03-01 | 112.2 | 6200.0 |        97.11 |     7.51 |                 56.95 | 2753.7 |          18.31 |           64.469 |
| 1987-04-01 | 112.7 | 5967.2 |        95.94 |     8.21 |                 57.32 | 2767.7 |          18.64 |           64.973 |
| 1987-05-01 | 113.0 | 6209.1 |        97.77 |     8.49 |                 57.68 | 2772.9 |          19.42 |           65.547 |
| 1987-06-01 | 113.5 | 6200.8 |        98.13 |     8.38 |                 57.99 | 2774.6 |          20.03 |           66.218 |
# Set the start and end dates
start_date <- "1987-01-01"
end_date <- "2022-12-01"

# Get the S&P 500 monthly price data
sp500_data <- tq_get("^GSPC", 
                     from = start_date,
                     to = end_date,
                     get = "stock.prices",
                     periodicity = "monthly")

knitr::kable(tail(sp500_data))
| symbol | date       |    open |    high |     low |   close |       volume | adjusted |
|--------|------------|---------|---------|---------|---------|--------------|----------|
| ^GSPC  | 2022-06-01 | 4149.78 | 4177.51 | 3636.87 | 3785.38 | 106116710000 |  3785.38 |
| ^GSPC  | 2022-07-01 | 3781.00 | 4140.15 | 3721.56 | 4130.29 |  81688320000 |  4130.29 |
| ^GSPC  | 2022-08-01 | 4112.38 | 4325.28 | 3954.53 | 3955.00 |  92252350000 |  3955.00 |
| ^GSPC  | 2022-09-01 | 3936.73 | 4119.28 | 3584.13 | 3585.62 |  94241020000 |  3585.62 |
| ^GSPC  | 2022-10-01 | 3609.78 | 3905.42 | 3491.58 | 3871.98 |  95823760000 |  3871.98 |
| ^GSPC  | 2022-11-01 | 3901.79 | 4080.11 | 3698.15 | 4080.11 |  92671910000 |  4080.11 |
#Extract adjusted price for S&P 500
sp500 <- sp500_data$adjusted

# Merge the macro data with the S&P 500 series
df <- cbind(data,sp500)

#Change the column names
colnames(df) <- c('Date','CPI','RDPI','DI','TNX','IP','M2','SCO','HPI','sp500')

knitr::kable(head(df))
| Date       |   CPI |   RDPI |    DI |  TNX |    IP |     M2 |   SCO |    HPI |  sp500 |
|------------|-------|--------|-------|------|-------|--------|-------|--------|--------|
| 1987-01-01 | 111.4 | 6159.5 | 99.86 | 7.18 | 56.13 | 2743.9 | 18.66 | 63.735 | 274.08 |
| 1987-02-01 | 111.8 | 6192.1 | 99.26 | 7.19 | 56.88 | 2747.5 | 17.73 | 64.134 | 284.20 |
| 1987-03-01 | 112.2 | 6200.0 | 97.11 | 7.51 | 56.95 | 2753.7 | 18.31 | 64.469 | 291.70 |
| 1987-04-01 | 112.7 | 5967.2 | 95.94 | 8.21 | 57.32 | 2767.7 | 18.64 | 64.973 | 288.36 |
| 1987-05-01 | 113.0 | 6209.1 | 97.77 | 8.49 | 57.68 | 2772.9 | 19.42 | 65.547 | 290.10 |
| 1987-06-01 | 113.5 | 6200.8 | 98.13 | 8.38 | 57.99 | 2774.6 | 20.03 | 66.218 | 304.00 |
# explore the dataset
Hmisc::describe(df)
## df 
## 
##  10  Variables      431  Observations
## --------------------------------------------------------------------------------
## Date 
##          n    missing   distinct       Info       Mean        Gmd        .05 
##        431          0        431          1 2004-11-30       4383 1988-10-16 
##        .10        .25        .50        .75        .90        .95 
## 1990-08-01 1995-12-16 2004-12-01 2013-11-16 2019-04-01 2021-01-16 
## 
## lowest : 1987-01-01 1987-02-01 1987-03-01 1987-04-01 1987-05-01
## highest: 2022-07-01 2022-08-01 2022-09-01 2022-10-01 2022-11-01
## --------------------------------------------------------------------------------
## CPI 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      431        0      410        1    193.8    54.03    120.1    131.6 
##      .25      .50      .75      .90      .95 
##    154.3    191.7    234.4    255.2    263.1 
## 
## lowest : 111.4  111.8  112.2  112.7  113   , highest: 294.73 295.32 296.54 297.99 298.6 
## --------------------------------------------------------------------------------
## RDPI 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      431        0      430        1    10623     3310     6592     6824 
##      .25      .50      .75      .90      .95 
##     7828    10701    12559    14835    15207 
## 
## lowest : 5967.2  6159.5  6192.1  6200    6200.8 
## highest: 16372.2 16423.4 17099.2 17246.2 19213.9
## --------------------------------------------------------------------------------
## DI 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      431        0      405        1    91.88    10.95    77.32    79.78 
##      .25      .50      .75      .90      .95 
##    84.52    92.17    97.61   102.85   111.03 
## 
## lowest : 71.8   72.46  72.51  72.88  72.93 , highest: 118.62 119.07 119.16 119.43 120.24
## --------------------------------------------------------------------------------
## TNX 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      431        0      406        1    4.593    2.628    1.516    1.782 
##      .25      .50      .75      .90      .95 
##    2.575    4.454    6.285    8.060    8.650 
## 
## lowest : 0.536 0.622 0.648 0.653 0.677, highest: 9.2   9.25  9.3   9.32  9.63 
## --------------------------------------------------------------------------------
## IP 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      431        0      396        1    87.05    16.32    61.24    62.04 
##      .25      .50      .75      .90      .95 
##    73.02    92.25    99.13   102.10   102.94 
## 
## lowest : 56.13  56.88  56.95  57.32  57.68 , highest: 104.27 104.37 104.48 104.57 104.59
## --------------------------------------------------------------------------------
## M2 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      431        0      431        1     7957     5384     2973     3242 
##      .25      .50      .75      .90      .95 
##     3639     6418    10999    14541    19494 
## 
## lowest : 2743.9  2747.5  2753.7  2767.7  2772.9 
## highest: 21644.2 21649   21649.7 21708.4 21739.7
## --------------------------------------------------------------------------------
## SCO 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      431        0      411        1    46.86    32.63    15.79    17.75 
##      .25      .50      .75      .90      .95 
##    20.19    38.02    67.97    94.62   101.92 
## 
## lowest : 11.28  12.01  12.47  12.85  13.36 , highest: 114.84 116.61 125.39 133.44 133.93
## --------------------------------------------------------------------------------
## HPI 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      431        0      431        1    138.2       63    72.94    76.25 
##      .25      .50      .75      .90      .95 
##    81.66   140.17   175.61   207.70   237.87 
## 
## lowest : 63.735  64.134  64.469  64.973  65.547 
## highest: 301.805 303.676 306.581 307.164 308.365
## --------------------------------------------------------------------------------
## sp500 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      431        0      428        1     1399     1060    296.2    349.1 
##      .25      .50      .75      .90      .95 
##    626.0   1191.3   1794.2   2901.5   3735.2 
## 
## lowest : 230.3   247.08  251.79  257.07  258.89 
## highest: 4522.68 4530.41 4567    4605.38 4766.18
## --------------------------------------------------------------------------------
# Reshape the data to long format for visualization
df_long <- tidyr::pivot_longer(df,
                               cols = -Date,
                               names_to = 'Variable',
                               values_to = 'Value')
# Use facet_wrap() to plot all the series in one grid
library(wesanderson)
library(ggplot2)
ggplot(df_long, aes(x = Date, y = Value)) +
  geom_line(color = mycolors[2]) +
  facet_wrap(~ Variable, scales = "free_y") +
  theme(legend.position = "none")

library(ggplot2)
library(gridExtra)
# Min-max normalization: rescale a column to the [0, 1] range
normalize_column <- function(data, column_name) {
  min_val <- min(data[[column_name]])
  max_val <- max(data[[column_name]])
  
  normalized_column <- (data[[column_name]] - min_val) / (max_val - min_val)
  
  return(normalized_column)
}

# List of columns to normalize
columns_to_normalize <- c("sp500", "CPI", "RDPI", "DI", "TNX", "IP", "M2", "SCO", "HPI")

norm_sp500 <- normalize_column(df,columns_to_normalize[1])
norm_cpi <- normalize_column(df,columns_to_normalize[2])
norm_rdpi <-  normalize_column(df,columns_to_normalize[3])
norm_di <- normalize_column(df,columns_to_normalize[4])
norm_tnx <- normalize_column(df,columns_to_normalize[5])
norm_ip <- normalize_column(df,columns_to_normalize[6])
norm_m2 <- normalize_column(df,columns_to_normalize[7])
norm_sco <- normalize_column(df,columns_to_normalize[8])
norm_hpi <- normalize_column(df,columns_to_normalize[9])
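
  • For reference, the same min-max scaling can be applied to every numeric column in one step; a minimal sketch using dplyr::across (df_norm is a hypothetical name not used elsewhere):

# Sketch: min-max scale all columns except Date in one call
df_norm <- df %>%
  dplyr::mutate(dplyr::across(-Date, ~ (.x - min(.x)) / (max(.x) - min(.x))))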


# Create a plot for each normalized variable, overlaid on the normalized
# S&P 500 (blue = S&P 500, red = macro variable)
plot_vs_sp500 <- function(norm_series, title) {
  df %>%
    ggplot(aes(x = Date, y = norm_sp500)) +
    geom_line(color = mycolors[2]) +
    geom_line(aes(y = norm_series), color = mycolors[1]) +
    labs(x = 'Date', y = 'Value', title = title)
}

aa1 <- plot_vs_sp500(norm_cpi, 'CPI')
aa2 <- plot_vs_sp500(norm_rdpi, 'RDPI')
aa3 <- plot_vs_sp500(norm_di, 'DI')
aa4 <- plot_vs_sp500(norm_tnx, 'TNX')
aa5 <- plot_vs_sp500(norm_ip, 'IP')
aa6 <- plot_vs_sp500(norm_m2, 'M2')
aa7 <- plot_vs_sp500(norm_sco, 'SCO')
aa8 <- plot_vs_sp500(norm_hpi, 'HPI')
grid.arrange(aa1, aa2, aa3, aa4, aa5, aa6, aa7, aa8)

library(forecast)
library(TSstudio)
library(timetk)
library(tidyverse)
library(stats)
library(dplyr)
# Convert data to time series object
ts_data <- ts(df, start = c(1987, 1), frequency = 12)
ts_sp500 <- ts(df$sp500, start=c(1987,1),frequency = 12)
#Dependent Variable Decomposition
fit_sp500 <- stl(ts_sp500,s.window = 'periodic')

#Plot the Decomposition
library(ggfortify)
autoplot(fit_sp500,ts.colour = mycolors[2],size=0.7)+labs(title='Trend Decomposition of S&P 500')
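
  • The stl() object stores the decomposed series directly; a quick sketch for inspecting them numerically:

# Sketch: the decomposition components live in fit_sp500$time.series
head(fit_sp500$time.series)  # columns: seasonal, trend, remainder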

3. Modeling

Stationarity

  • Before implementing a time-series regression, we first have to check the stationarity of the data. OLS regression is only valid between variables of the same order of integration: stationary with stationary, or non-stationary with non-stationary (in which case the series should be cointegrated). Regressing unrelated non-stationary series on one another yields spurious results, as the sketch below illustrates.
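
  • A minimal sketch with simulated (hypothetical) data shows the spurious-regression risk: regressing one random walk on an independent random walk routinely produces a "significant" slope even though the series are unrelated.

# Sketch: spurious regression between two independent random walks
set.seed(42)
rw1 <- cumsum(rnorm(400))
rw2 <- cumsum(rnorm(400))
summary(lm(rw1 ~ rw2))$coefficients  # slope frequently looks highly significant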
#Create acf plot
a1 <- ggAcf(df$sp500,color=mycolors[2])+ggtitle('S&P 500')
b1 <- ggAcf(df$CPI,color=mycolors[2])+ggtitle("CPI") 
c1 <- ggAcf(df$RDPI,color=mycolors[2])+ggtitle('Real Disposable Income')
d1 <- ggAcf(df$DI,color=mycolors[2])+ggtitle("Dollar Index") 
e1 <- ggAcf(df$TNX,color=mycolors[2])+ggtitle("10Yr Treasury Yield") 
f1 <- ggAcf(df$IP,color=mycolors[2])+ggtitle("Industrial Production") 
g1 <- ggAcf(df$M2, color=mycolors[2])+ggtitle("Money Supply (M2)") 
h1 <- ggAcf(df$SCO,color=mycolors[2])+ggtitle("Spot Crude Oil") 
i1 <- ggAcf(df$HPI,color=mycolors[2])+ggtitle("Home Price Index")
#Create pacf plot  
a2 <- ggPacf(df$sp500,color=mycolors[2])+ggtitle('S&P 500')
b2 <- ggPacf(df$CPI,color=mycolors[2])+ggtitle("CPI") 
c2 <- ggPacf(df$RDPI,color=mycolors[2])+ggtitle('Real Disposable Income')
d2 <- ggPacf(df$DI,color=mycolors[2])+ggtitle("Dollar Index") 
e2 <- ggPacf(df$TNX,color=mycolors[2])+ggtitle("10Yr Treasury Yield") 
f2 <- ggPacf(df$IP,color=mycolors[2])+ggtitle("Industrial Production") 
g2 <- ggPacf(df$M2, color=mycolors[2])+ggtitle("Money Supply (M2)") 
h2 <- ggPacf(df$SCO,color=mycolors[2])+ggtitle("Spot Crude Oil") 
i2 <- ggPacf(df$HPI,color=mycolors[2])+ggtitle("Home Price Index")
library(patchwork)
library(gridExtra)

#Plot ACF
grid.arrange(a1,b1,c1,d1,e1,f1,g1,h1,i1, ncol=3)

#Plot PACF
grid.arrange(a2,b2,c2,d2,e2,f2,g2,h2,i2, ncol=3)

  • From the ACF & PACF plots, we can see that none of the series is stationary, which means there is a chance that the independent variables are cointegrated with the dependent variable (S&P 500).

  • Now we will also conduct an Augmented Dickey-Fuller (ADF) test to double-check the stationarity of the data.

#CPI
adf.test(df$CPI)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  df$CPI
## Dickey-Fuller = -0.55153, Lag order = 7, p-value = 0.9793
## alternative hypothesis: stationary
#Real Disposable Income
adf.test(df$RDPI)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  df$RDPI
## Dickey-Fuller = -2.5523, Lag order = 7, p-value = 0.3442
## alternative hypothesis: stationary
#Dollar Index
adf.test(df$DI)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  df$DI
## Dickey-Fuller = -1.916, Lag order = 7, p-value = 0.6131
## alternative hypothesis: stationary
#10-yr treasury yield
adf.test(df$TNX)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  df$TNX
## Dickey-Fuller = -2.8651, Lag order = 7, p-value = 0.2119
## alternative hypothesis: stationary
#Industrial Production
adf.test(df$IP)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  df$IP
## Dickey-Fuller = -1.9042, Lag order = 7, p-value = 0.6181
## alternative hypothesis: stationary
#M2
adf.test(df$M2)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  df$M2
## Dickey-Fuller = -0.16108, Lag order = 7, p-value = 0.99
## alternative hypothesis: stationary
#Spot Crude Oil
adf.test(df$SCO)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  df$SCO
## Dickey-Fuller = -2.7648, Lag order = 7, p-value = 0.2544
## alternative hypothesis: stationary
#Home Price Index
adf.test(df$HPI)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  df$HPI
## Dickey-Fuller = -2.2559, Lag order = 7, p-value = 0.4694
## alternative hypothesis: stationary
#S&P500 
adf.test(df$sp500)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  df$sp500
## Dickey-Fuller = -1.2949, Lag order = 7, p-value = 0.8756
## alternative hypothesis: stationary
  • Since no variable has a p-value < 0.05, we fail to reject the null hypothesis of a unit root, so we conclude that all the variables are non-stationary.
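
  • For reference, the individual tests above can be collected into a single table; a compact sketch over the same columns (same tests, just assembled with sapply):

# Sketch: run the ADF test on every variable at once and tabulate the p-values
adf_pvals <- sapply(df[, -1], function(x) tseries::adf.test(x)$p.value)
knitr::kable(data.frame(variable = names(adf_pvals), p_value = round(adf_pvals, 4)))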

Multicollinearity

  • Because we are dealing with several macroeconomic variables, we can expect a severe multicollinearity issue. Let's see the result.
# Create correlation plot

plotfuncLow <- function(data,mapping){
  p <- ggplot(data = data,mapping=mapping)+geom_point(color=mycolors[1],size=0.5,alpha=0.7)+
    geom_smooth(method="lm",alpha=0.5)
  p
}

library(GGally)
ggpairs(df[,c(-1,-10)],lower=list(continuous=plotfuncLow),diag=NULL)+ggtitle('Correlation Analysis')

  • We see that many of the variables are highly correlated with one another. Multicollinearity inflates the variance of the coefficient estimates and makes them unstable and hard to interpret. Let's see the regression results.
reg <- lm(data=df,sp500~CPI+RDPI+DI+TNX+IP+M2+SCO+HPI)
summary(reg)
## 
## Call:
## lm(formula = sp500 ~ CPI + RDPI + DI + TNX + IP + M2 + SCO + 
##     HPI, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -647.73 -114.92   12.05  120.17  542.01 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.438e+03  2.375e+02  -6.057 3.07e-09 ***
## CPI         -1.394e+01  1.666e+00  -8.369 8.64e-16 ***
## RDPI        -6.970e-02  2.047e-02  -3.404 0.000727 ***
## DI          -8.359e-01  1.323e+00  -0.632 0.527995    
## TNX          7.454e+01  1.318e+01   5.657 2.85e-08 ***
## IP           4.505e+01  2.434e+00  18.509  < 2e-16 ***
## M2           3.163e-01  1.105e-02  28.624  < 2e-16 ***
## SCO         -4.366e+00  6.408e-01  -6.813 3.31e-11 ***
## HPI         -1.595e+00  5.967e-01  -2.673 0.007814 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 170.3 on 422 degrees of freedom
## Multiple R-squared:  0.9719, Adjusted R-squared:  0.9714 
## F-statistic:  1828 on 8 and 422 DF,  p-value: < 2.2e-16
  • From the regression, we see that TNX has a positive coefficient, implying that an increase in the Treasury yield raises the S&P 500 index. Similarly, the RDPI coefficient is negative, indicating that an increase in real disposable income lowers the S&P 500 index. Neither result makes economic sense; this is a symptom of multicollinearity, which the VIF sketch below quantifies. After that, let's check the residuals.
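
  • Variance inflation factors give a direct measure of the multicollinearity suspected here; a minimal sketch (assuming the car package is installed, as it is not loaded elsewhere in this report):

# Sketch: VIF > 10 is a common rule of thumb for severe multicollinearity
library(car)
vif(reg)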
#Plot residuals
autoplot(reg)

#Plot acf & pacf of residuals from regression
checkresiduals(reg,color=mycolors[2])

## 
##  Breusch-Godfrey test for serial correlation of order up to 12
## 
## data:  Residuals
## LM test = 324.17, df = 12, p-value < 2.2e-16
  • Moreover, we also see a serial correlation problem in the Breusch-Godfrey test. Thus, we cannot simply analyze the impact of the variables on the S&P 500 with an ordinary linear regression; we have to revise the model to see how the variables affect the S&P 500.

LASSO

  • One way to deal with multicollinearity is the LASSO method. LASSO adds an L1 penalty to the least-squares objective (it minimizes RSS + lambda * sum(|beta_j|)), which can shrink irrelevant coefficients exactly to 0 and thus identify which variables significantly affect the S&P 500 index. However, this shrinkage introduces a bias-variance trade-off, so the resulting coefficients are not exact estimates of each variable's impact on the dependent variable. Now, let's fit the LASSO.
library(glmnet)
library(Matrix)
library(gamlr)

x <-  model.matrix(sp500~.,df[,-1])[,-1]
y <- df$sp500

fit_lasso_cv <- cv.glmnet(x,y,alpha=1)
plot(fit_lasso_cv)

coef(fit_lasso_cv,s='lambda.1se')
## 9 x 1 sparse Matrix of class "dgCMatrix"
##                        s1
## (Intercept) -1.944784e+03
## CPI         -8.446089e+00
## RDPI        -2.243345e-02
## DI           2.371691e+00
## TNX          8.146335e+01
## IP           3.209019e+01
## M2           2.548590e-01
## SCO         -4.148577e+00
## HPI          .
coef(fit_lasso_cv,s='lambda.min')
## 9 x 1 sparse Matrix of class "dgCMatrix"
##                        s1
## (Intercept) -1.568698e+03
## CPI         -1.280045e+01
## RDPI        -5.865782e-02
## DI          -9.505555e-02
## TNX          7.715954e+01
## IP           4.242546e+01
## M2           3.037784e-01
## SCO         -4.305910e+00
## HPI         -1.321559e+00
  • Depending on which lambda we choose, the coefficients differ: the stronger the penalty, the more the estimates shrink. However, even at lambda.1se the LASSO only drops HPI and barely penalizes the remaining coefficients, so it does not resolve our problem. The full regularization path is sketched below; after that, let's use a different method for the analysis.
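
  • To see how each coefficient shrinks as the penalty grows, the whole regularization path can be plotted; a sketch reusing the x and y defined above:

# Sketch: coefficient paths as a function of log(lambda)
fit_path <- glmnet(x, y, alpha = 1)
plot(fit_path, xvar = "lambda", label = TRUE)
abline(v = log(fit_lasso_cv$lambda.1se), lty = 2)  # penalty chosen by lambda.1se above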

Principal Component Analysis

#Exclude date and dependent variable for PCA and create new dataframe
prin_df <- df[,c(-1,-10)]

#Import tools for PCA visualization
library(FactoMineR)
library(factoextra)
my_pca <- PCA(prin_df,scale.unit = TRUE,graph=FALSE)
knitr::kable(get_eigenvalue(my_pca))
|       | eigenvalue | variance.percent | cumulative.variance.percent |
|-------|------------|------------------|-----------------------------|
| Dim.1 |  6.0199792 |       75.2497404 |                    75.24974 |
| Dim.2 |  1.2397999 |       15.4974985 |                    90.74724 |
| Dim.3 |  0.3255643 |        4.0695543 |                    94.81679 |
| Dim.4 |  0.2457808 |        3.0722599 |                    97.88905 |
| Dim.5 |  0.1167135 |        1.4589190 |                    99.34797 |
| Dim.6 |  0.0300415 |        0.3755184 |                    99.72349 |
| Dim.7 |  0.0140942 |        0.1761776 |                    99.89967 |
| Dim.8 |  0.0080265 |        0.1003318 |                   100.00000 |
#Using prcomp function to extract the variance of principal components
pca <- prcomp(prin_df,scale=TRUE)

#Calculate variance contribution
pca.var <- pca$sdev^2
pca.var.per <- round(pca.var/sum(pca.var),3)

#Calculate Cumulative Variance 
pca.cumvar.per <- cumsum(pca.var.per)
#Create PCA data frame for visualization
pca.var.per.df <- data.frame(x=c('PC1','PC2','PC3','PC4','PC5','PC6','PC7','PC8'),
                            y=pca.var.per,
                            z=pca.cumvar.per)

#Visualization
pca1 <- pca.var.per.df %>%
  ggplot(aes(x=x,y=y,group=1))+
  geom_bar(fill=mycolors[2],stat='identity',alpha=0.7)+
  labs(x='PCA',y='Variance %',title='Proportion of Variance')+
   scale_y_continuous(labels = scales::percent,
                      limits=c(0,0.8))+
  geom_text(aes(label=scales::percent(y)), hjust=1, size=4) +
  coord_flip()

pca2 <- pca.var.per.df %>%
  ggplot(aes(x=x,y=z,group=1))+
  geom_point(color='black',size=3)+
  geom_line(color=mycolors[2],size=1,alpha=0.7)+
  labs(x='PCA',y='Variance %',title='Cumulative Variance ')+
  scale_y_continuous(labels = scales::percent,
                     limits = c(0.73,1.005))+
  geom_text(aes(label=scales::percent(z)),
            vjust=1.1,hjust=-0.1, size=4) 

grid.arrange(pca1,pca2,ncol=1)

  • For the analysis, we will use the first 2 principal components, which together capture about 90% of the variance of the original dataset (90.75% per the eigenvalue table above).
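
  • The cutoff can also be verified programmatically from the cumulative variance vector computed above; a one-line sketch:

# Sketch: smallest number of PCs whose cumulative variance reaches 90%
which(pca.cumvar.per >= 0.90)[1]  # returns 2 for this dataset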
#Correlation Circle
set.seed(123)
var <- get_pca_var(my_pca)

#Use k-means clustering to group the components to similar feature for visualization
res.km <- kmeans(var$coord,centers = 3,nstart=25)
grp <- as.factor(res.km$cluster)

#Visualize Correlation Circle
fviz_pca_var(my_pca,
             axes = c(1,2),
             col.var=grp,
             ggtheme=theme_bw(),
             palette = c(mycolors[1],mycolors[2], mycolors[3]),
             legend.title = "Cluster",
             title='Correlation Circle by Cluster (PC1 & PC2)',
             col.circle = 'grey40')

# Contributions of variables to PC1
# Note that the total contribution of a given variable to two principal
# components, say PC1 and PC2, is calculated as
# contrib = [(C1 * Eig1) + (C2 * Eig2)] / (Eig1 + Eig2)

# The red dashed line on the graph indicates the expected average contribution:
# if the contributions were uniform, each variable would contribute
# 1/length(variables) = 1/8 = 12.5%.

cont1 <- fviz_contrib(my_pca, choice = "var", axes = 1, top = 8,
             fill=mycolors[2],alpha=0.7,
               ylim=c(0,17))+coord_flip()

cont2 <- fviz_contrib(my_pca, choice = "var", axes = 2, top = 8,
             fill=mycolors[2],alpha=0.7)+coord_flip()

grid.arrange(cont1,cont2,ncol=1)

  • In PCA, the contribution of each principal component (PC) refers to the amount of variation in the data explained by that component. The contribution of a PC is measured by its eigenvalue, which represents the variance of the data along that component. The higher the eigenvalue, the more variation the PC explains.

  • On the other hand, the quality of representation of a variable in a PC refers to how well that variable is represented by that PC. The quality of representation of a variable is measured by its correlation with the PC. The higher the absolute value of the correlation, the better the variable is represented by the PC.

  • Therefore, while a PC may have a high contribution (e.g., high eigenvalue) to the total variation of the data, it may not necessarily have a high quality of representation for a particular variable. Conversely, a PC may have a low contribution to the total variation of the data, but may have a high quality of representation for a particular variable.
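
  • Both quantities are easy to compute by hand from the variable loadings; a short sketch using the var object defined above:

# Sketch: cos2 is the squared coordinate (correlation) of a variable on a PC
head(var$coord[, 1:2]^2)  # manual cos2 for PC1 & PC2
head(var$cos2[, 1:2])     # matches the values stored by FactoMineR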

#Square Cosine - indicator of quality of representation
#The cos2 values are used to estimate the quality of the representation

cos1 <- fviz_cos2(my_pca,choice='var',axes=1,fill=mycolors[2],alpha=0.7)+
  scale_y_continuous(labels = scales::percent)+
  coord_flip()

cos2 <- fviz_cos2(my_pca,choice='var',axes=2,fill=mycolors[2],alpha=0.7)+
  scale_y_continuous(labels = scales::percent)+
  coord_flip()

grid.arrange(cos1,cos2,ncol=2)

  • A high cos2 indicates a good representation of the variable on the principal component; a low cos2 indicates that the variable is not perfectly represented by the PCs.

#Biplot
fviz_pca_biplot(my_pca, repel = TRUE,
                col.var = mycolors[1], # Variables color
                col.ind = mycolors[2]  # Individuals color
                )

  • Now we will run the regression using PC1 and PC2. Although the result reflects only about 90% of the variance of the original dataset, it is still meaningful that we have reduced eight predictors to two components, which makes the modeling easier.
#Extract the principal components
PC1 <- pca$x[,1]
PC2 <- pca$x[,2]
PC3 <- pca$x[,3]
PC4 <- pca$x[,4]
# Create a new dataframe consisting of the principal components
pca_df <- tibble(Date=df$Date,
                 sp500 = df$sp500,
                 PC1 = PC1,
                 PC2 = PC2,
                 PC3= PC3)
knitr::kable(head(pca_df))
| Date       |  sp500 |       PC1 |        PC2 |        PC3 |
|------------|--------|-----------|------------|------------|
| 1987-01-01 | 274.08 | -3.775086 | -0.4417784 | -0.4432361 |
| 1987-02-01 | 284.20 | -3.753670 | -0.4078368 | -0.4348487 |
| 1987-03-01 | 291.70 | -3.778779 | -0.2088164 | -0.5103323 |
| 1987-04-01 | 288.36 | -3.898793 | -0.0915033 | -0.5835575 |
| 1987-05-01 | 290.10 | -3.898220 | -0.2512658 | -0.5586852 |
| 1987-06-01 | 304.00 | -3.859963 | -0.2774099 | -0.5259170 |
# Normalize each PC to [0, 1] to plot the graph
norm_PC1 <- (PC1-min(PC1))/(max(PC1)-min(PC1))
norm_PC2 <- (PC2-min(PC2))/(max(PC2)-min(PC2))
#Plot normalized principal components

apca1 <- df %>%
ggplot(aes(x=Date,y=norm_sp500))+
  geom_line(color=mycolors[2])+
  geom_line(aes(y=norm_PC1),color=mycolors[1])+
  theme_bw()+
  labs(x='Date',y='Value',title='PC1')

apca2 <- df %>%
ggplot(aes(x=Date,y=norm_sp500))+
  geom_line(color=mycolors[2])+
  geom_line(aes(y=norm_PC2),color=mycolors[1])+
  theme_bw()+
  labs(x='Date',y='Value',title='PC2')
#Visualization
# Blue is the S&P 500; red is the principal component
grid.arrange(apca1,apca2,ncol=1)

  • Since we will run a linear regression on the principal components, let's test their stationarity using ACF and PACF plots as well as the ADF test.
#Check the acf & pacf of principal components
#PC1
pc1acf <- ggAcf(PC1,color=mycolors[2])+ggtitle('PC1 ACF')
pc1pacf <- ggPacf(PC1,color=mycolors[2])+ggtitle('PC1 PACF')
#PC2
pc2acf <- ggAcf(PC2,color=mycolors[2])+ggtitle('PC2 ACF')
pc2pacf <- ggPacf(PC2,color=mycolors[2])+ggtitle('PC2 PACF')

grid.arrange(pc1acf,pc1pacf,pc2acf,pc2pacf,nrow=2)

#adf test for PC1 & PC2
adf.test(PC1)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  PC1
## Dickey-Fuller = -2.668, Lag order = 7, p-value = 0.2953
## alternative hypothesis: stationary
adf.test(PC2)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  PC2
## Dickey-Fuller = -1.7716, Lag order = 7, p-value = 0.6741
## alternative hypothesis: stationary
  • Since the PCs are non-stationary, we can conduct a cointegration test to see whether the model is valid or not (a sketch of one such test follows).
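
  • A direct way to run such a test is the Phillips-Ouliaris cointegration test from tseries (already loaded above); a minimal sketch:

# Sketch: Phillips-Ouliaris test for cointegration between sp500 and the PCs
po.test(cbind(sp500, PC1, PC2))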

  • Let’s see whether the PCs have any correlation.

#Correlation between all the 8 PC
ggpairs(pca$x,lower=list(continuous=plotfuncLow),diag=NULL)+
  ggtitle('Principal Component Correlation Analysis')

  • We see that there is no correlation between the principal components, which is expected, since PCs are orthogonal by construction.

Dynamic Regression

  • Let’s conduct a linear regression and see whether the model is valid or not.
#Implement Regression using PC1 & PC2 on S&P 500
regPCA <- lm(sp500~PC1+PC2)
summary(regPCA)
## 
## Call:
## lm(formula = sp500 ~ PC1 + PC2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -845.05 -245.73   31.71  204.18 1445.73 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1399.158     17.480   80.04   <2e-16 ***
## PC1          360.089      7.133   50.48   <2e-16 ***
## PC2         -288.689     15.717  -18.37   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 362.9 on 428 degrees of freedom
## Multiple R-squared:  0.8709, Adjusted R-squared:  0.8703 
## F-statistic:  1443 on 2 and 428 DF,  p-value: < 2.2e-16
#Plot the Residuals
autoplot(regPCA) 

#Plot acf & pacf of residuals from regression
checkresiduals(regPCA,color=mycolors[2])

## 
##  Breusch-Godfrey test for serial correlation of order up to 10
## 
## data:  Residuals
## LM test = 396.53, df = 10, p-value < 2.2e-16
  • The Q-Q plot shows that the residuals are not normally distributed, and the Breusch-Godfrey test tells us there is a serial correlation issue. Moreover, a Ramsey RESET test (sketched below) suggests the model specification is wrong. Therefore, we cannot estimate the impact of the macroeconomic variables on the S&P 500 using OLS. Let's utilize a dynamic regression model (regression with ARIMA errors), which is estimated by maximum likelihood (MLE).
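
  • The RESET test output is not reproduced above; a minimal sketch using the lmtest package (an assumption: lmtest is installed but not loaded elsewhere in this report):

# Sketch: Ramsey RESET test for functional-form misspecification
library(lmtest)
resettest(regPCA, power = 2:3, type = "fitted")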

4. Results

# Dynamic regression: ARIMA errors with PC1 & PC2 as external regressors
cb <- cbind(PC1,PC2)
dreg <- auto.arima(sp500,xreg = cb)
summary(dreg)
## Series: sp500 
## Regression with ARIMA(2,1,2) errors 
## 
## Coefficients:
##           ar1      ar2      ma1     ma2   drift      PC1      PC2
##       -0.0678  -0.9598  -0.0302  0.8903  7.9838  45.5646  73.2396
## s.e.   0.0477   0.0287   0.0623  0.0373  3.4270  50.1625  18.2813
## 
## sigma^2 = 5589:  log likelihood = -2462.36
## AIC=4940.72   AICc=4941.06   BIC=4973.23
## 
## Training set error measures:
##                        ME     RMSE      MAE        MPE     MAPE      MASE
## Training set -0.009934146 74.06231 48.22322 -0.6350183 3.947053 0.9833254
##                     ACF1
## Training set -0.03814311
checkresiduals(dreg,color=mycolors[2])

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(2,1,2) errors
## Q* = 31.08, df = 6, p-value = 2.447e-05
## 
## Model df: 4.   Total lags used: 10
# ADF test for the residuals: stationary residuals indicate the regression is valid
adf.test(dreg$residuals)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  dreg$residuals
## Dickey-Fuller = -6.3719, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
ggplot(data = df[300:431,], aes(x = Date, y = sp500)) +
  scale_color_manual(values = c('Actual' = mycolors[2], 'Fitted' = mycolors[1])) +
  geom_line(aes(color = 'Actual'), size = 1) +
  geom_line(aes(y = dreg$fitted[300:431], color = 'Fitted'), size = 1) +
  labs(x = 'Month', y = 'Index', title = 'Actual vs Fitted Value', colour = 'Series') +
  theme_bw() +
  theme(legend.position = 'top')

knitr::kable(accuracy(dreg))
|              |         ME |     RMSE |      MAE |        MPE |     MAPE |      MASE |       ACF1 |
|--------------|------------|----------|----------|------------|----------|-----------|------------|
| Training set | -0.0099341 | 74.06231 | 48.22322 | -0.6350183 | 3.947053 | 0.9833254 | -0.0381431 |

5. Conclusion