# Import libraries
library(DT)
library(tidyquant)
library(magrittr)
library(readr)
library(tseries)
library(tidyverse)
# Set the theme for the visualizations
theme_bare <- function(base_size = 8, base_family = "sans") {
  theme_bw(base_size = base_size, base_family = base_family) +
    theme(
      axis.line = element_blank(),
      axis.text.x = element_blank(),
      axis.text.y = element_blank(),
      axis.ticks = element_blank(),
      axis.title.x = element_blank(),
      axis.title.y = element_blank(),
      legend.position = "bottom",
      panel.background = element_rect(fill = NA),
      panel.border = element_blank(),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      plot.margin = unit(c(0, 0, 0, 0), "lines")
    )
}
my_theme <- function(base_size = 9, base_family = "sans") {
  theme_bw(base_size = base_size, base_family = base_family) +
    theme(
      panel.grid.major = element_line(color = "gray"),
      panel.grid.minor = element_blank(),
      panel.background = element_rect(fill = NA),
      strip.background = element_rect(fill = "#001d60", color = "#00113a", size = 1.2),
      strip.text = element_text(face = "bold", size = 9, color = "white"),
      legend.position = "bottom",
      legend.justification = "center",
      legend.background = element_blank(),
      legend.margin = margin(0.5, 0.5, 0.5, 0.5)
    )
}
theme_set(my_theme())
mycolors <- c("#ff003f", "#0094ff", "#ae00ff", "#94ff00", "#ffc700", "#fc1814")
data <- read_csv("~/Final Project/Final_Project_Data.csv") #Change the file path to wherever the file is located
knitr::kable(head(data))
| Date | CPI | RDPI | Dollar Index | Treasury | Industrial Production | M2 | Spot Crude Oil | Case-Shiller HPI |
|---|---|---|---|---|---|---|---|---|
| 1987-01-01 | 111.4 | 6159.5 | 99.86 | 7.18 | 56.13 | 2743.9 | 18.66 | 63.735 |
| 1987-02-01 | 111.8 | 6192.1 | 99.26 | 7.19 | 56.88 | 2747.5 | 17.73 | 64.134 |
| 1987-03-01 | 112.2 | 6200.0 | 97.11 | 7.51 | 56.95 | 2753.7 | 18.31 | 64.469 |
| 1987-04-01 | 112.7 | 5967.2 | 95.94 | 8.21 | 57.32 | 2767.7 | 18.64 | 64.973 |
| 1987-05-01 | 113.0 | 6209.1 | 97.77 | 8.49 | 57.68 | 2772.9 | 19.42 | 65.547 |
| 1987-06-01 | 113.5 | 6200.8 | 98.13 | 8.38 | 57.99 | 2774.6 | 20.03 | 66.218 |
# Set the start and end dates
start_date <- "1987-01-01"
end_date <- "2022-12-01"
# Get the S&P 500 monthly price data
sp500_data <- tq_get("^GSPC",
from = start_date,
to = end_date,
get = "stock.prices",
periodicity = "monthly")
knitr::kable(tail(sp500_data))
| symbol | date | open | high | low | close | volume | adjusted |
|---|---|---|---|---|---|---|---|
| ^GSPC | 2022-06-01 | 4149.78 | 4177.51 | 3636.87 | 3785.38 | 106116710000 | 3785.38 |
| ^GSPC | 2022-07-01 | 3781.00 | 4140.15 | 3721.56 | 4130.29 | 81688320000 | 4130.29 |
| ^GSPC | 2022-08-01 | 4112.38 | 4325.28 | 3954.53 | 3955.00 | 92252350000 | 3955.00 |
| ^GSPC | 2022-09-01 | 3936.73 | 4119.28 | 3584.13 | 3585.62 | 94241020000 | 3585.62 |
| ^GSPC | 2022-10-01 | 3609.78 | 3905.42 | 3491.58 | 3871.98 | 95823760000 | 3871.98 |
| ^GSPC | 2022-11-01 | 3901.79 | 4080.11 | 3698.15 | 4080.11 | 92671910000 | 4080.11 |
#Extract adjusted price for S&P 500
sp500 <- sp500_data$adjusted
#Merge Dataframe
df <- cbind(data,sp500)
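#Sanity check (added as a sketch): cbind is only valid if both inputs cover the same 431 months in the same order
stopifnot(nrow(data) == length(sp500))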
#Change the column names
colnames(df) <- c('Date','CPI','RDPI','DI','TNX','IP','M2','SCO','HPI','sp500')
knitr::kable(head(df))
| Date | CPI | RDPI | DI | TNX | IP | M2 | SCO | HPI | sp500 |
|---|---|---|---|---|---|---|---|---|---|
| 1987-01-01 | 111.4 | 6159.5 | 99.86 | 7.18 | 56.13 | 2743.9 | 18.66 | 63.735 | 274.08 |
| 1987-02-01 | 111.8 | 6192.1 | 99.26 | 7.19 | 56.88 | 2747.5 | 17.73 | 64.134 | 284.20 |
| 1987-03-01 | 112.2 | 6200.0 | 97.11 | 7.51 | 56.95 | 2753.7 | 18.31 | 64.469 | 291.70 |
| 1987-04-01 | 112.7 | 5967.2 | 95.94 | 8.21 | 57.32 | 2767.7 | 18.64 | 64.973 | 288.36 |
| 1987-05-01 | 113.0 | 6209.1 | 97.77 | 8.49 | 57.68 | 2772.9 | 19.42 | 65.547 | 290.10 |
| 1987-06-01 | 113.5 | 6200.8 | 98.13 | 8.38 | 57.99 | 2774.6 | 20.03 | 66.218 | 304.00 |
# explore the dataset
Hmisc::describe(df)
## df
##
## 10 Variables 431 Observations
## --------------------------------------------------------------------------------
## Date
## n missing distinct Info Mean Gmd .05
## 431 0 431 1 2004-11-30 4383 1988-10-16
## .10 .25 .50 .75 .90 .95
## 1990-08-01 1995-12-16 2004-12-01 2013-11-16 2019-04-01 2021-01-16
##
## lowest : 1987-01-01 1987-02-01 1987-03-01 1987-04-01 1987-05-01
## highest: 2022-07-01 2022-08-01 2022-09-01 2022-10-01 2022-11-01
## --------------------------------------------------------------------------------
## CPI
## n missing distinct Info Mean Gmd .05 .10
## 431 0 410 1 193.8 54.03 120.1 131.6
## .25 .50 .75 .90 .95
## 154.3 191.7 234.4 255.2 263.1
##
## lowest : 111.4 111.8 112.2 112.7 113 , highest: 294.73 295.32 296.54 297.99 298.6
## --------------------------------------------------------------------------------
## RDPI
## n missing distinct Info Mean Gmd .05 .10
## 431 0 430 1 10623 3310 6592 6824
## .25 .50 .75 .90 .95
## 7828 10701 12559 14835 15207
##
## lowest : 5967.2 6159.5 6192.1 6200 6200.8
## highest: 16372.2 16423.4 17099.2 17246.2 19213.9
## --------------------------------------------------------------------------------
## DI
## n missing distinct Info Mean Gmd .05 .10
## 431 0 405 1 91.88 10.95 77.32 79.78
## .25 .50 .75 .90 .95
## 84.52 92.17 97.61 102.85 111.03
##
## lowest : 71.8 72.46 72.51 72.88 72.93 , highest: 118.62 119.07 119.16 119.43 120.24
## --------------------------------------------------------------------------------
## TNX
## n missing distinct Info Mean Gmd .05 .10
## 431 0 406 1 4.593 2.628 1.516 1.782
## .25 .50 .75 .90 .95
## 2.575 4.454 6.285 8.060 8.650
##
## lowest : 0.536 0.622 0.648 0.653 0.677, highest: 9.2 9.25 9.3 9.32 9.63
## --------------------------------------------------------------------------------
## IP
## n missing distinct Info Mean Gmd .05 .10
## 431 0 396 1 87.05 16.32 61.24 62.04
## .25 .50 .75 .90 .95
## 73.02 92.25 99.13 102.10 102.94
##
## lowest : 56.13 56.88 56.95 57.32 57.68 , highest: 104.27 104.37 104.48 104.57 104.59
## --------------------------------------------------------------------------------
## M2
## n missing distinct Info Mean Gmd .05 .10
## 431 0 431 1 7957 5384 2973 3242
## .25 .50 .75 .90 .95
## 3639 6418 10999 14541 19494
##
## lowest : 2743.9 2747.5 2753.7 2767.7 2772.9
## highest: 21644.2 21649 21649.7 21708.4 21739.7
## --------------------------------------------------------------------------------
## SCO
## n missing distinct Info Mean Gmd .05 .10
## 431 0 411 1 46.86 32.63 15.79 17.75
## .25 .50 .75 .90 .95
## 20.19 38.02 67.97 94.62 101.92
##
## lowest : 11.28 12.01 12.47 12.85 13.36 , highest: 114.84 116.61 125.39 133.44 133.93
## --------------------------------------------------------------------------------
## HPI
## n missing distinct Info Mean Gmd .05 .10
## 431 0 431 1 138.2 63 72.94 76.25
## .25 .50 .75 .90 .95
## 81.66 140.17 175.61 207.70 237.87
##
## lowest : 63.735 64.134 64.469 64.973 65.547
## highest: 301.805 303.676 306.581 307.164 308.365
## --------------------------------------------------------------------------------
## sp500
## n missing distinct Info Mean Gmd .05 .10
## 431 0 428 1 1399 1060 296.2 349.1
## .25 .50 .75 .90 .95
## 626.0 1191.3 1794.2 2901.5 3735.2
##
## lowest : 230.3 247.08 251.79 257.07 258.89
## highest: 4522.68 4530.41 4567 4605.38 4766.18
## --------------------------------------------------------------------------------
#Reshape the data to long format for visualization
df_long <- tidyr::pivot_longer(df,
cols=-Date,
names_to = 'Variable',
values_to = 'Value')
#Use facet wrap function to plot all the graphs in one grid
library(wesanderson)
library(ggplot2)
ggplot(df_long,aes(x = Date,y = Value, color = Variable)) +
geom_line(color=mycolors[2])+
facet_wrap(~ Variable,scales = "free_y")+
theme(legend.position="none")
library(ggplot2)
library(gridExtra)
# Normalize the data using min-max normalization
# function for min-max normalization
normalize_column <- function(data, column_name) {
min_val <- min(data[[column_name]])
max_val <- max(data[[column_name]])
normalized_column <- (data[[column_name]] - min_val) / (max_val - min_val)
return(normalized_column)
}
# List of columns to normalize
columns_to_normalize <- c("sp500", "CPI", "RDPI", "DI", "TNX", "IP", "M2", "SCO", "HPI")
norm_sp500 <- normalize_column(df,columns_to_normalize[1])
norm_cpi <- normalize_column(df,columns_to_normalize[2])
norm_rdpi <- normalize_column(df,columns_to_normalize[3])
norm_di <- normalize_column(df,columns_to_normalize[4])
norm_tnx <- normalize_column(df,columns_to_normalize[5])
norm_ip <- normalize_column(df,columns_to_normalize[6])
norm_m2 <- normalize_column(df,columns_to_normalize[7])
norm_sco <- normalize_column(df,columns_to_normalize[8])
norm_hpi <- normalize_column(df,columns_to_normalize[9])
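The nine calls above can be collapsed into a single step. Below is a minimal sketch using dplyr::across; the norm_-prefixed columns it creates are illustrative and not part of the original analysis.
#Min-max normalize all listed columns at once (sketch)
norm_df <- df %>%
  mutate(across(all_of(columns_to_normalize),
                ~ (.x - min(.x)) / (max(.x) - min(.x)),
                .names = "norm_{.col}"))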
#Create Visualization Plot for each Normalized Value
aa1 <- df %>%
ggplot(aes(x=Date,y=norm_sp500))+
geom_line(color=mycolors[2])+
geom_line(aes(y=norm_cpi),color=mycolors[1])+
labs(x='Date',y='Value',title='CPI')
aa2 <- df %>%
ggplot(aes(x=Date,y=norm_sp500))+
geom_line(color=mycolors[2])+
geom_line(aes(y=norm_rdpi),color=mycolors[1])+
labs(x='Date',y='Value',title='RDPI')
aa3 <- df %>%
ggplot(aes(x=Date,y=norm_sp500))+
geom_line(color=mycolors[2])+
geom_line(aes(y=norm_di),color=mycolors[1])+
labs(x='Date',y='Value',title='DI')
aa4 <- df %>%
ggplot(aes(x=Date,y=norm_sp500))+
geom_line(color=mycolors[2])+
geom_line(aes(y=norm_tnx),color=mycolors[1])+
labs(x='Date',y='Value',title='TNX')
aa5 <- df %>%
ggplot(aes(x=Date,y=norm_sp500))+
geom_line(color=mycolors[2])+
geom_line(aes(y=norm_ip),color=mycolors[1])+
labs(x='Date',y='Value',title='IP')
aa6 <- df %>%
ggplot(aes(x=Date,y=norm_sp500))+
geom_line(color=mycolors[2])+
geom_line(aes(y=norm_m2),color=mycolors[1])+
labs(x='Date',y='Value',title='M2')
aa7 <- df %>%
ggplot(aes(x=Date,y=norm_sp500))+
geom_line(color=mycolors[2])+
geom_line(aes(y=norm_sco),color=mycolors[1])+
labs(x='Date',y='Value',title='SCO')
aa8 <- df %>%
ggplot(aes(x=Date,y=norm_sp500))+
geom_line(color=mycolors[2])+
geom_line(aes(y=norm_hpi),color=mycolors[1])+
labs(x='Date',y='Value',title='HPI')
grid.arrange(aa1,aa2,aa3,aa4,aa5,aa6,aa7,aa8)
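The eight nearly identical blocks above could also be generated with a small helper function. A sketch, assuming the norm_* vectors from the previous chunk are in scope:
#Helper that overlays a normalized series on the normalized S&P 500 (sketch)
overlay_plot <- function(series, title) {
  ggplot(df, aes(x = Date, y = norm_sp500)) +
    geom_line(color = mycolors[2]) +
    geom_line(aes(y = series), color = mycolors[1]) +
    labs(x = 'Date', y = 'Value', title = title)
}
panels <- Map(overlay_plot,
              list(norm_cpi, norm_rdpi, norm_di, norm_tnx, norm_ip, norm_m2, norm_sco, norm_hpi),
              c('CPI', 'RDPI', 'DI', 'TNX', 'IP', 'M2', 'SCO', 'HPI'))
grid.arrange(grobs = panels)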
library(forecast)
library(TSstudio)
library(timetk)
library(tidyverse)
library(stats)
library(dplyr)
# Convert data to time series object
ts_data <- ts(df, start = c(1987, 1), frequency = 12)
ts_sp500 <- ts(df$sp500, start=c(1987,1),frequency = 12)
#Dependent Variable Decomposition
fit_sp500 <- stl(ts_sp500,s.window = 'periodic')
#Plot the Decomposition
library(ggfortify)
autoplot(fit_sp500,ts.colour = mycolors[2],size=0.7)+labs(title='Trend Decomposition of S&P 500')
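If the individual components are needed numerically, stl() stores them in the time.series slot; a quick sketch:
#Inspect the decomposed components directly (sketch)
head(fit_sp500$time.series)  #columns: seasonal, trend, remainder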
#Create acf plot
a1 <- ggAcf(df$sp500,color=mycolors[2])+ggtitle('S&P 500')
b1 <- ggAcf(df$CPI,color=mycolors[2])+ggtitle("CPI")
c1 <- ggAcf(df$RDPI,color=mycolors[2])+ggtitle('Real Disposable Income')
d1 <- ggAcf(df$DI,color=mycolors[2])+ggtitle("Dollar Index")
e1 <- ggAcf(df$TNX,color=mycolors[2])+ggtitle("10Yr Treasury Yield")
f1 <- ggAcf(df$IP,color=mycolors[2])+ggtitle("Industrial Production")
g1 <- ggAcf(df$M2, color=mycolors[2])+ggtitle("Money Supply (M2)")
h1 <- ggAcf(df$SCO,color=mycolors[2])+ggtitle("Spot Crude Oil")
i1 <- ggAcf(df$HPI,color=mycolors[2])+ggtitle("Home Price Index")
#Create pacf plot
a2 <- ggPacf(df$sp500,color=mycolors[2])+ggtitle('S&P 500')
b2 <- ggPacf(df$CPI,color=mycolors[2])+ggtitle("CPI")
c2 <- ggPacf(df$RDPI,color=mycolors[2])+ggtitle('Real Disposable Income')
d2 <- ggPacf(df$DI,color=mycolors[2])+ggtitle("Dollar Index")
e2 <- ggPacf(df$TNX,color=mycolors[2])+ggtitle("10Yr Treasury Yield")
f2 <- ggPacf(df$IP,color=mycolors[2])+ggtitle("Industrial Production")
g2 <- ggPacf(df$M2, color=mycolors[2])+ggtitle("Money Supply (M2)")
h2 <- ggPacf(df$SCO,color=mycolors[2])+ggtitle("Spot Crude Oil")
i2 <- ggPacf(df$HPI,color=mycolors[2])+ggtitle("Home Price Index")
library(patchwork)
library(gridExtra)
#Plot ACF
grid.arrange(a1,b1,c1,d1,e1,f1,g1,h1,i1, ncol=3)
#Plot PACF
grid.arrange(a2,b2,c2,d2,e2,f2,g2,h2,i2, ncol=3)
From the ACF & PACF plots, we can see that none of the series is stationary, which raises the possibility that the regressors are cointegrated with the dependent variable (S&P 500).
Now we will also conduct an ADF test to double-check the stationarity of the data.
#CPI
adf.test(df$CPI)
##
## Augmented Dickey-Fuller Test
##
## data: df$CPI
## Dickey-Fuller = -0.55153, Lag order = 7, p-value = 0.9793
## alternative hypothesis: stationary
#Real Disposable Income
adf.test(df$RDPI)
##
## Augmented Dickey-Fuller Test
##
## data: df$RDPI
## Dickey-Fuller = -2.5523, Lag order = 7, p-value = 0.3442
## alternative hypothesis: stationary
#Dollar Index
adf.test(df$DI)
##
## Augmented Dickey-Fuller Test
##
## data: df$DI
## Dickey-Fuller = -1.916, Lag order = 7, p-value = 0.6131
## alternative hypothesis: stationary
#10-yr treasury yield
adf.test(df$TNX)
##
## Augmented Dickey-Fuller Test
##
## data: df$TNX
## Dickey-Fuller = -2.8651, Lag order = 7, p-value = 0.2119
## alternative hypothesis: stationary
#Industrial Production
adf.test(df$IP)
##
## Augmented Dickey-Fuller Test
##
## data: df$IP
## Dickey-Fuller = -1.9042, Lag order = 7, p-value = 0.6181
## alternative hypothesis: stationary
#M2
adf.test(df$M2)
##
## Augmented Dickey-Fuller Test
##
## data: df$M2
## Dickey-Fuller = -0.16108, Lag order = 7, p-value = 0.99
## alternative hypothesis: stationary
#Spot Crude Oil
adf.test(df$SCO)
##
## Augmented Dickey-Fuller Test
##
## data: df$SCO
## Dickey-Fuller = -2.7648, Lag order = 7, p-value = 0.2544
## alternative hypothesis: stationary
#Home Price Index
adf.test(df$HPI)
##
## Augmented Dickey-Fuller Test
##
## data: df$HPI
## Dickey-Fuller = -2.2559, Lag order = 7, p-value = 0.4694
## alternative hypothesis: stationary
#S&P500
adf.test(df$sp500)
##
## Augmented Dickey-Fuller Test
##
## data: df$sp500
## Dickey-Fuller = -1.2949, Lag order = 7, p-value = 0.8756
## alternative hypothesis: stationary
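As a more compact alternative (a sketch, not part of the original output), the ADF test can be run over every series in one pass and the p-values collected into a single vector:
#Run the ADF test on all series at once (sketch)
adf_pvals <- sapply(df[,-1], function(x) tseries::adf.test(x)$p.value)
round(adf_pvals, 3)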
# Create correlation plot
plotfuncLow <- function(data, mapping){
  ggplot(data = data, mapping = mapping) +
    geom_point(color = mycolors[1], size = 0.5, alpha = 0.7) +
    geom_smooth(method = "lm", alpha = 0.5)
}
library(GGally)
ggpairs(df[,c(-1,-10)],lower=list(continuous=plotfuncLow),diag=NULL)+ggtitle('Correlation Analysis')
reg <- lm(data=df,sp500~CPI+RDPI+DI+TNX+IP+M2+SCO+HPI)
summary(reg)
##
## Call:
## lm(formula = sp500 ~ CPI + RDPI + DI + TNX + IP + M2 + SCO +
## HPI, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -647.73 -114.92 12.05 120.17 542.01
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.438e+03 2.375e+02 -6.057 3.07e-09 ***
## CPI -1.394e+01 1.666e+00 -8.369 8.64e-16 ***
## RDPI -6.970e-02 2.047e-02 -3.404 0.000727 ***
## DI -8.359e-01 1.323e+00 -0.632 0.527995
## TNX 7.454e+01 1.318e+01 5.657 2.85e-08 ***
## IP 4.505e+01 2.434e+00 18.509 < 2e-16 ***
## M2 3.163e-01 1.105e-02 28.624 < 2e-16 ***
## SCO -4.366e+00 6.408e-01 -6.813 3.31e-11 ***
## HPI -1.595e+00 5.967e-01 -2.673 0.007814 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 170.3 on 422 degrees of freedom
## Multiple R-squared: 0.9719, Adjusted R-squared: 0.9714
## F-statistic: 1828 on 8 and 422 DF, p-value: < 2.2e-16
#Plot residuals
autoplot(reg)
#Plot acf & pacf of residuals from regression
checkresiduals(reg,color=mycolors[2])
##
## Breusch-Godfrey test for serial correlation of order up to 12
##
## data: Residuals
## LM test = 324.17, df = 12, p-value < 2.2e-16
library(glmnet)
library(Matrix)
library(gamlr)
x <- model.matrix(sp500~.,df[,-1])[,-1]
y <- df$sp500
fit_lasso_cv <- cv.glmnet(x,y,alpha=1)
plot(fit_lasso_cv)
coef(fit_lasso_cv,s='lambda.1se')
## 9 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -1.944784e+03
## CPI -8.446089e+00
## RDPI -2.243345e-02
## DI 2.371691e+00
## TNX 8.146335e+01
## IP 3.209019e+01
## M2 2.548590e-01
## SCO -4.148577e+00
## HPI .
coef(fit_lasso_cv,s='lambda.min')
## 9 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -1.568698e+03
## CPI -1.280045e+01
## RDPI -5.865782e-02
## DI -9.505555e-02
## TNX 7.715954e+01
## IP 4.242546e+01
## M2 3.037784e-01
## SCO -4.305910e+00
## HPI -1.321559e+00
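For reference, the penalty values behind the two coefficient sets above can be read directly off the cv.glmnet object:
#Inspect the selected penalty values (sketch)
fit_lasso_cv$lambda.min
fit_lasso_cv$lambda.1se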
We will implement Principal Component Analysis (PCA), an unsupervised learning method, to extract features that are uncorrelated with one another. PCA projects the variables onto orthogonal directions in n-dimensional space that retain the maximum amount of variance. Because the resulting components are orthogonal by construction, no correlation exists between them, which makes PCA an effective way of dealing with multicollinearity.
Code Reference: http://www.sthda.com/english/articles/31-principal-component-methods-in-r-practical-guide/112-pca-principal-component-analysis-essentials/
#Exclude date and dependent variable for PCA and create new dataframe
prin_df <- df[,c(-1,-10)]
#Import tools for PCA visualization
library(FactoMineR)
library(factoextra)
my_pca <- PCA(prin_df,scale.unit = TRUE,graph=FALSE)
knitr::kable(get_eigenvalue(my_pca))
| | eigenvalue | variance.percent | cumulative.variance.percent |
|---|---|---|---|
| Dim.1 | 6.0199792 | 75.2497404 | 75.24974 |
| Dim.2 | 1.2397999 | 15.4974985 | 90.74724 |
| Dim.3 | 0.3255643 | 4.0695543 | 94.81679 |
| Dim.4 | 0.2457808 | 3.0722599 | 97.88905 |
| Dim.5 | 0.1167135 | 1.4589190 | 99.34797 |
| Dim.6 | 0.0300415 | 0.3755184 | 99.72349 |
| Dim.7 | 0.0140942 | 0.1761776 | 99.89967 |
| Dim.8 | 0.0080265 | 0.1003318 | 100.00000 |
#Using prcomp function to extract the variance of principal components
pca <- prcomp(prin_df,scale=TRUE)
#Calculate variance contribution
pca.var <- pca$sdev^2
pca.var.per <- round(pca.var/sum(pca.var),3)
#Calculate Cumulative Variance
pca.cumvar.per <- cumsum(pca.var.per)
#Create PCA data frame for visualization
pca.var.per.df <- data.frame(x=c('PC1','PC2','PC3','PC4','PC5','PC6','PC7','PC8'),
y=pca.var.per,
z=pca.cumvar.per)
#Visualization
pca1 <- pca.var.per.df %>%
ggplot(aes(x=x,y=y,group=1))+
geom_bar(fill=mycolors[2],stat='identity',alpha=0.7)+
labs(x='PCA',y='Variance %',title='Proportion of Variance')+
scale_y_continuous(labels = scales::percent,
limits=c(0,0.8))+
geom_text(aes(label=scales::percent(y)), hjust=1, size=4) +
coord_flip()
pca2 <- pca.var.per.df %>%
ggplot(aes(x=x,y=z,group=1))+
geom_point(color='black',size=3)+
geom_line(color=mycolors[2],size=1,alpha=0.7)+
labs(x='PCA',y='Variance %',title='Cumulative Variance')+
scale_y_continuous(labels = scales::percent,
limits = c(0.73,1.005))+
geom_text(aes(label=scales::percent(z)),
vjust=1.1,hjust=-0.1, size=4)
grid.arrange(pca1,pca2,ncol=1)
#Correlation Circle
set.seed(123)
var <- get_pca_var(my_pca)
#Use k-means clustering to group the components to similar feature for visualization
res.km <- kmeans(var$coord,centers = 3,nstart=25)
grp <- as.factor(res.km$cluster)
#Visualize Correlation Circle
fviz_pca_var(my_pca,
axes = c(1,2),
col.var=grp,
ggtheme=theme_bw(),
palette = c(mycolors[1],mycolors[2], mycolors[3]),
legend.title = "Cluster",
title='Correlation Circle by Cluster (PC1 & PC2)',
col.circle = 'grey40')
# Contributions of variables to PC1 & PC2
# Note that, the total contribution of a given variable, on explaining the variations retained by two principal components, say PC1 and PC2, is calculated as contrib = [(C1 * Eig1) + (C2 * Eig2)]/(Eig1 + Eig2)
#The red dashed line on the graph above indicates the expected average contribution
#If the contribution of the variables were uniform, the expected value would be 1/length(variables) = 1/8 = 12.5%.
cont1 <- fviz_contrib(my_pca, choice = "var", axes = 1, top = 8,
fill=mycolors[2],alpha=0.7,
ylim=c(0,17))+coord_flip()
cont2 <- fviz_contrib(my_pca, choice = "var", axes = 2, top = 8,
fill=mycolors[2],alpha=0.7)+coord_flip()
grid.arrange(cont1,cont2,ncol=1)
In PCA, the contribution of each principal component (PC) refers to the amount of variation in the data explained by that component. The contribution of a PC is measured by its eigenvalue, which represents the variance of the data along that component. The higher the eigenvalue, the more variation the PC explains.
On the other hand, the quality of representation of a variable in a PC refers to how well that variable is represented by that PC. The quality of representation of a variable is measured by its correlation with the PC. The higher the absolute value of the correlation, the better the variable is represented by the PC.
Therefore, while a PC may have a high contribution (e.g., high eigenvalue) to the total variation of the data, it may not necessarily have a high quality of representation for a particular variable. Conversely, a PC may have a low contribution to the total variation of the data, but may have a high quality of representation for a particular variable.
#Square Cosine - indicator of quality of representation
#The cos2 values are used to estimate the quality of the representation
cos1 <- fviz_cos2(my_pca,choice='var',axes=1,fill=mycolors[2],alpha=0.7)+
scale_y_continuous(labels = scales::percent)+
coord_flip()
cos2 <- fviz_cos2(my_pca,choice='var',axes=2,fill=mycolors[2],alpha=0.7)+
scale_y_continuous(labels = scales::percent)+
coord_flip()
grid.arrange(cos1,cos2,ncol=2)
- A high cos2 indicates a good representation of the variable on the principal component. A low cos2 indicates that the variable is not perfectly represented by the PCs.
#Biplot
fviz_pca_biplot(my_pca, repel = TRUE,
col.var = mycolors[1], # Variables color
col.ind = mycolors[2] # Individuals color
)
#Extract the principal components
PC1 <- pca$x[,1]
PC2 <- pca$x[,2]
PC3 <- pca$x[,3]
PC4 <- pca$x[,4]
#Create a new dataframe consisting of the principal components
pca_df <- tibble(Date=df$Date,
sp500 = df$sp500,
PC1 = PC1,
PC2 = PC2,
PC3= PC3)
knitr::kable(head(pca_df))
| Date | sp500 | PC1 | PC2 | PC3 |
|---|---|---|---|---|
| 1987-01-01 | 274.08 | -3.775086 | -0.4417784 | -0.4432361 |
| 1987-02-01 | 284.20 | -3.753670 | -0.4078368 | -0.4348487 |
| 1987-03-01 | 291.70 | -3.778779 | -0.2088164 | -0.5103323 |
| 1987-04-01 | 288.36 | -3.898793 | -0.0915033 | -0.5835575 |
| 1987-05-01 | 290.10 | -3.898220 | -0.2512658 | -0.5586852 |
| 1987-06-01 | 304.00 | -3.859963 | -0.2774099 | -0.5259170 |
#Normalize value of each PC to plot the graph
norm_PC1 <- (PC1-min(PC1))/(max(PC1)-min(PC1))
norm_PC2 <- (PC2-min(PC2))/(max(PC2)-min(PC2))
#Plot normalized principal components
apca1 <- df %>%
ggplot(aes(x=Date,y=norm_sp500))+
geom_line(color=mycolors[2])+
geom_line(aes(y=norm_PC1),color=mycolors[1])+
theme_bw()+
labs(x='Date',y='Value',title='PC1')
apca2 <- df %>%
ggplot(aes(x=Date,y=norm_sp500))+
geom_line(color=mycolors[2])+
geom_line(aes(y=norm_PC2),color=mycolors[1])+
theme_bw()+
labs(x='Date',y='Value',title='PC2')
#Visualization
#Blue is the S&P 500, red is each PC
grid.arrange(apca1,apca2,ncol=1)
#Check the acf & pacf of principal components
#PC1
pc1acf <- ggAcf(PC1,color=mycolors[2])+ggtitle('PC1 ACF')
pc1pacf <- ggPacf(PC1,color=mycolors[2])+ggtitle('PC1 PACF')
#PC2
pc2acf <- ggAcf(PC2,color=mycolors[2])+ggtitle('PC2 ACF')
pc2pacf <- ggPacf(PC2,color=mycolors[2])+ggtitle('PC2 PACF')
grid.arrange(pc1acf,pc1pacf,pc2acf,pc2pacf,nrow=2)
#adf test for PC1 & PC2
adf.test(PC1)
##
## Augmented Dickey-Fuller Test
##
## data: PC1
## Dickey-Fuller = -2.668, Lag order = 7, p-value = 0.2953
## alternative hypothesis: stationary
adf.test(PC2)
##
## Augmented Dickey-Fuller Test
##
## data: PC2
## Dickey-Fuller = -1.7716, Lag order = 7, p-value = 0.6741
## alternative hypothesis: stationary
Since the PCs are non-stationary, we can conduct a cointegration test to check whether a regression of the S&P 500 on them is valid; a sketch of such a check follows.
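A minimal sketch of that check, using the Phillips-Ouliaris test from the tseries package loaded earlier (treating the S&P 500 as the variable to be tested against PC1 and PC2 is my assumption, not output from the original analysis):
#Phillips-Ouliaris cointegration test (sketch)
po.test(cbind(df$sp500, PC1, PC2))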
Let’s see whether the PCs have any correlation.
#Correlation between all 8 PCs
ggpairs(pca$x,lower=list(continuous=plotfuncLow),diag=NULL)+
ggtitle('Principal Component Correlation Analysis')
- We see that there is no correlation between the principal components.
#Implement Regression using PC1 & PC2 on S&P 500
regPCA <- lm(sp500~PC1+PC2)
summary(regPCA)
##
## Call:
## lm(formula = sp500 ~ PC1 + PC2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -845.05 -245.73 31.71 204.18 1445.73
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1399.158 17.480 80.04 <2e-16 ***
## PC1 360.089 7.133 50.48 <2e-16 ***
## PC2 -288.689 15.717 -18.37 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 362.9 on 428 degrees of freedom
## Multiple R-squared: 0.8709, Adjusted R-squared: 0.8703
## F-statistic: 1443 on 2 and 428 DF, p-value: < 2.2e-16
#Plot the Residuals
autoplot(regPCA)
#Plot acf & pacf of residuals from regression
checkresiduals(regPCA,color=mycolors[2])
##
## Breusch-Godfrey test for serial correlation of order up to 10
##
## data: Residuals
## LM test = 396.53, df = 10, p-value < 2.2e-16
#Regression using ARIMA model
cb <- cbind(PC1,PC2)
dreg <- auto.arima(sp500,xreg = cb)
summary(dreg)
## Series: sp500
## Regression with ARIMA(2,1,2) errors
##
## Coefficients:
## ar1 ar2 ma1 ma2 drift PC1 PC2
## -0.0678 -0.9598 -0.0302 0.8903 7.9838 45.5646 73.2396
## s.e. 0.0477 0.0287 0.0623 0.0373 3.4270 50.1625 18.2813
##
## sigma^2 = 5589: log likelihood = -2462.36
## AIC=4940.72 AICc=4941.06 BIC=4973.23
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.009934146 74.06231 48.22322 -0.6350183 3.947053 0.9833254
## ACF1
## Training set -0.03814311
checkresiduals(dreg,color=mycolors[2])
##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(2,1,2) errors
## Q* = 31.08, df = 6, p-value = 2.447e-05
##
## Model df: 4. Total lags used: 10
#adf test for residuals
#stationary
adf.test(dreg$residuals)
##
## Augmented Dickey-Fuller Test
##
## data: dreg$residuals
## Dickey-Fuller = -6.3719, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
The Ljung-Box test tells us that there is autocorrelation in the residuals, which means the prediction intervals may not provide accurate coverage. However, the residuals are stationary, so it seems we will have to bear this bias.
Let’s see how well the model fits the actual data.
#Compare actual vs fitted values over the most recent observations (the data has 431 rows)
ggplot(data=df[300:431,],aes(x=Date,y=sp500))+
  scale_color_manual(values=c('Actual'=mycolors[2],'Fitted'=mycolors[1]))+
  geom_line(aes(color='Actual'),size=1)+
  geom_line(aes(y=dreg$fitted[300:431],color='Fitted'),size=1)+
  labs(x='Month',y='Index',title='Actual vs Fitted Value',colour='Series')+
  theme_bw()+
  theme(legend.position = 'top')
knitr::kable(accuracy(dreg))
| | ME | RMSE | MAE | MPE | MAPE | MASE | ACF1 |
|---|---|---|---|---|---|---|---|
| Training set | -0.0099341 | 74.06231 | 48.22322 | -0.6350183 | 3.947053 | 0.9833254 | -0.0381431 |
Time-series analysis using OLS, especially with macroeconomic variables, is never an easy task because of the Gauss-Markov theorem: OLS is BLUE only when all of its assumptions hold, and that is very hard to achieve in practice. To deal with the bias and inefficiency caused by violating those assumptions, I utilized PCA to handle multicollinearity and a dynamic regression model to mitigate serial correlation as well as the endogeneity problem.
We found that the PC2 factor (a combination of Spot Crude Oil and the Dollar Index), combined with the ARIMA(2,1,2) + drift error structure, affects the S&P 500 index. However, this analysis leans more on the ARIMA model itself than on the macroeconomic variables, which makes it hard to conclude that the macroeconomic variables alone drive the S&P 500 index.