PCA

library(DT)
library(tidyquant)

## 필요한 패키지를 로딩중입니다: lubridate

## 
## 다음의 패키지를 부착합니다: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

## 필요한 패키지를 로딩중입니다: PerformanceAnalytics

## 필요한 패키지를 로딩중입니다: xts

## 필요한 패키지를 로딩중입니다: zoo

## 
## 다음의 패키지를 부착합니다: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## 
## 다음의 패키지를 부착합니다: 'PerformanceAnalytics'

## The following object is masked from 'package:graphics':
## 
##     legend

## 필요한 패키지를 로딩중입니다: quantmod

## 필요한 패키지를 로딩중입니다: TTR

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(magrittr)
library(readr)
library(tseries)

#Import data
data <- read_csv("~/Final Project/Final_Project_Data.csv")

## Rows: 431 Columns: 9
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl  (8): CPI, RDPI, Dollar Index, Treasury, Indsutrial Production, M2, Spot...
## date (1): Date
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

datatable(data)

# Set the start and end dates
start_date <- "1987-01-01"
end_date <- "2022-12-01"

# Get the S&P 500 monthly price data
sp500_data <- tq_get("^GSPC", 
                     from = start_date,
                     to = end_date,
                     get = "stock.prices",
                     periodicity = "monthly")

tail(sp500_data)

## # A tibble: 6 x 8
##   symbol date        open  high   low close       volume adjusted
##   <chr>  <date>     <dbl> <dbl> <dbl> <dbl>        <dbl>    <dbl>
## 1 ^GSPC  2022-06-01 4150. 4178. 3637. 3785. 106116710000    3785.
## 2 ^GSPC  2022-07-01 3781  4140. 3722. 4130.  81688320000    4130.
## 3 ^GSPC  2022-08-01 4112. 4325. 3955. 3955   92252350000    3955 
## 4 ^GSPC  2022-09-01 3937. 4119. 3584. 3586.  94241020000    3586.
## 5 ^GSPC  2022-10-01 3610. 3905. 3492. 3872.  95823760000    3872.
## 6 ^GSPC  2022-11-01 3902. 4080. 3698. 4080.  92671910000    4080.

#Extract adjusted price for S&P 500
sp500 <- sp500_data$adjusted

#Merge Dataframe
df <- cbind(data,sp500)

#Change the column names
colnames(df) <- c('Date','CPI','RDPI','DI','TNX','IP','M2','SCO','HPI','sp500')

head(df)

##         Date   CPI   RDPI    DI  TNX    IP     M2   SCO    HPI  sp500
## 1 1987-01-01 111.4 6159.5 99.86 7.18 56.13 2743.9 18.66 63.735 274.08
## 2 1987-02-01 111.8 6192.1 99.26 7.19 56.88 2747.5 17.73 64.134 284.20
## 3 1987-03-01 112.2 6200.0 97.11 7.51 56.95 2753.7 18.31 64.469 291.70
## 4 1987-04-01 112.7 5967.2 95.94 8.21 57.32 2767.7 18.64 64.973 288.36
## 5 1987-05-01 113.0 6209.1 97.77 8.49 57.68 2772.9 19.42 65.547 290.10
## 6 1987-06-01 113.5 6200.8 98.13 8.38 57.99 2774.6 20.03 66.218 304.00

#Revise data shape for visualization purpose
df_long <- tidyr::pivot_longer(df,
                      cols=-Date,
                      names_to = 'Variable',
                      values_to = 'Value')
head(df_long)

## # A tibble: 6 x 3
##   Date       Variable   Value
##   <date>     <chr>      <dbl>
## 1 1987-01-01 CPI       111.  
## 2 1987-01-01 RDPI     6160.  
## 3 1987-01-01 DI         99.9 
## 4 1987-01-01 TNX         7.18
## 5 1987-01-01 IP         56.1 
## 6 1987-01-01 M2       2744.

#Use facet wrap function to plot all the graphs in one grid
library(wesanderson)
library(ggplot2)
ggplot(df_long,aes(x = Date,y = Value, color = Variable)) +
  geom_line(color='#4373B6')+
  theme_bw()+
  facet_wrap(~ Variable,scales = "free_y")+
  theme(legend.position="none")

library(ggplot2)
library(gridExtra)

#Normalize the data using min-max normalization
norm_sp500 <- (df$sp500-min(df$sp500))/(max(df$sp500)-min(df$sp500))
norm_cpi <- (df$CPI-min(df$CPI))/(max(df$CPI)-min(df$CPI))
norm_rdpi <-  (df$RDPI-min(df$RDPI))/(max(df$RDPI)-min(df$RDPI))
norm_di <- (df$DI-min(df$DI))/(max(df$DI)-min(df$DI))
norm_tnx <- (df$TNX-min(df$TNX))/(max(df$TNX)-min(df$TNX))
norm_ip <- (df$IP-min(df$IP))/(max(df$IP)-min(df$IP))
norm_m2 <- (df$M2-min(df$M2))/(max(df$M2)-min(df$M2))
norm_sco <- (df$SCO-min(df$SCO))/(max(df$SCO)-min(df$SCO))
norm_hpi <- (df$HPI-min(df$HPI))/(max(df$HPI)-min(df$HPI))

#Create Visualization Plot for each Normalized Value
aa0 <- df %>%
ggplot(aes(x=Date,y=sp500))+
  geom_line(color='#4373B6')+
  theme_bw()+
  labs(x='Date',y='Value',title='S&P 500')

aa1 <- df %>%
ggplot(aes(x=Date,y=norm_sp500))+
  geom_line(color='#4373B6')+
  geom_line(aes(y=norm_cpi),color='red')+
  theme_bw()+
  labs(x='Date',y='Value',title='CPI')


aa2 <- df %>%
ggplot(aes(x=Date,y=norm_sp500))+
  geom_line(color='#4373B6')+
  geom_line(aes(y=norm_rdpi),color='red')+
  theme_bw()+
  labs(x='Date',y='Value',title='RDPI')

aa3 <- df %>%
ggplot(aes(x=Date,y=norm_sp500))+
  geom_line(color='#4373B6')+
  geom_line(aes(y=norm_di),color='red')+
  theme_bw()+
  labs(x='Date',y='Value',title='DI')

aa4 <- df %>%
ggplot(aes(x=Date,y=norm_sp500))+
  geom_line(color='#4373B6')+
  geom_line(aes(y=norm_tnx),color='red')+
  theme_bw()+
  labs(x='Date',y='Value',title='TNX')

aa5 <- df %>%
ggplot(aes(x=Date,y=norm_sp500))+
  geom_line(color='#4373B6')+
  geom_line(aes(y=norm_ip),color='red')+
  theme_bw()+
  labs(x='Date',y='Value',title='IP')

aa6 <- df %>%
ggplot(aes(x=Date,y=norm_sp500))+
  geom_line(color='#4373B6')+
  geom_line(aes(y=norm_m2),color='red')+
  theme_bw()+
  labs(x='Date',y='Value',title='M2')

aa7 <- df %>%
ggplot(aes(x=Date,y=norm_sp500))+
  geom_line(color='#4373B6')+
  geom_line(aes(y=norm_sco),color='red')+
  theme_bw()+
  labs(x='Date',y='Value',title='SCO')

aa8 <- df %>%
ggplot(aes(x=Date,y=norm_sp500))+
  geom_line(color='#4373B6')+
  geom_line(aes(y=norm_hpi),color='red')+
  theme_bw()+
  labs(x='Date',y='Value',title='HPI')

#Plot the graph
grid.arrange(aa0,aa1,aa2,aa3,aa4,aa5,aa6,aa7,aa8,ncol=3)

library(forecast)
library(TSstudio)
library(timetk)
library(tidyverse)

## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr   1.0.8     v stringr 1.5.0
## v forcats 1.0.0     v tibble  3.1.6
## v purrr   0.3.4     v tidyr   1.2.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::combine()   masks gridExtra::combine()
## x tidyr::extract()   masks magrittr::extract()
## x dplyr::filter()    masks stats::filter()
## x dplyr::first()     masks xts::first()
## x dplyr::lag()       masks stats::lag()
## x dplyr::last()      masks xts::last()
## x purrr::set_names() masks magrittr::set_names()
## i Use the [conflicted package](http://conflicted.r-lib.org/) to force all conflicts to become errors

library(stats)
library(dplyr)

# Convert data to time series object
ts_data <- ts(df, start = c(1987, 1), frequency = 12)
ts_sp500 <- ts(df$sp500, start=c(1987,1),frequency = 12)

#Dependent Variable Decomposition
fit_sp500 <- stl(ts_sp500,s.window = 'periodic')

#Plot the Decomposition
library(ggfortify)

## Registered S3 methods overwritten by 'ggfortify':
##   method                 from    
##   autoplot.Arima         forecast
##   autoplot.acf           forecast
##   autoplot.ar            forecast
##   autoplot.bats          forecast
##   autoplot.decomposed.ts forecast
##   autoplot.ets           forecast
##   autoplot.forecast      forecast
##   autoplot.glmnet        parsnip 
##   autoplot.stl           forecast
##   autoplot.ts            forecast
##   fitted.ar              forecast
##   fortify.ts             forecast
##   residuals.ar           forecast

autoplot(fit_sp500,ts.colour = '#4373B6',size=0.7) +
  theme_bw()

Stationarity

Before implementing time-series regression, we have to define the stationarity of the data. We have to be careful that regression based on OLS can be conducted between stationary & stationary, non-stationary & non-stationary variables. We are going to use a customized visualization tool called ggplot.corr to plot pacf and acf which was created by Kevin Liu https://rh8liuqy.github.io/ACF_PACF_by_ggplot2.html#correlogram-formula

#Creating customized acf & pacf plot 
ggplot.corr <- function(data, lag.max = 24, ci = 0.95, large.sample.size = TRUE, horizontal = TRUE,...) {
  options(dplyr.banner_expression = FALSE)
  
  require(ggplot2)
  require(dplyr)
  require(cowplot)
  
  if(horizontal == TRUE) {numofrow <- 1} else {numofrow <- 2}
  
  list.acf <- acf(data, lag.max = lag.max, type = "correlation", plot = FALSE)
  N <- as.numeric(list.acf$n.used)
  df1 <- data.frame(lag = list.acf$lag, acf = list.acf$acf)
  df1$lag.acf <- dplyr::lag(df1$acf, default = 0)
  df1$lag.acf[2] <- 0
  df1$lag.acf.cumsum <- cumsum((df1$lag.acf)^2)
  df1$acfstd <- sqrt(1/N * (1 + 2 * df1$lag.acf.cumsum))
  df1$acfstd[1] <- 0
  df1 <- dplyr::select(df1, lag, acf, acfstd)
  
  list.pacf <- acf(data, lag.max = lag.max, type = "partial", plot = FALSE)
  df2 <- data.frame(lag = list.pacf$lag,pacf = list.pacf$acf)
  df2$pacfstd <- sqrt(1/N)
  
  if(large.sample.size == TRUE) {
    plot.acf <- ggplot(data = df1, aes( x = lag, y = acf)) +
    geom_area(aes(x = lag, y = qnorm((1+ci)/2)*acfstd), fill = "#B9CFE7") +
    geom_area(aes(x = lag, y = -qnorm((1+ci)/2)*acfstd), fill = "#B9CFE7") +
    geom_col(fill = "#4373B6", width = 0.7) +
    scale_x_continuous(breaks = seq(0,max(df1$lag),6)) +
    scale_y_continuous(name = element_blank(), 
                       limits = c(min(df1$acf,df2$pacf),1)) +
    ggtitle("ACF") +
    theme_bw() +
      theme(plot.title = element_text(size=10)) 
    
    plot.pacf <- ggplot(data = df2, aes(x = lag, y = pacf)) +
    geom_area(aes(x = lag, y = qnorm((1+ci)/2)*pacfstd), fill = "#B9CFE7") +
    geom_area(aes(x = lag, y = -qnorm((1+ci)/2)*pacfstd), fill = "#B9CFE7") +
    geom_col(fill = "#4373B6", width = 0.7) +
    scale_x_continuous(breaks = seq(0,max(df2$lag, na.rm = TRUE),6)) +
    scale_y_continuous(name = element_blank(),
                       limits = c(min(df1$acf,df2$pacf),1)) +
    ggtitle("PACF") +
    theme_bw() +
      theme(plot.title = element_text(size=10)) 
  }
  else {
    plot.acf <- ggplot(data = df1, aes( x = lag, y = acf)) +
    geom_col(fill = "#4373B6", width = 0.7) +
    geom_hline(yintercept = qnorm((1+ci)/2)/sqrt(N), 
               colour = "sandybrown",
               linetype = "dashed",
               size=1) + 
    geom_hline(yintercept = - qnorm((1+ci)/2)/sqrt(N), 
               colour = "sandybrown",
               linetype = "dashed",
               size=1) + 
    scale_x_continuous(breaks = seq(0,max(df1$lag),6)) +
    scale_y_continuous(name = element_blank(), 
                       limits = c(min(df1$acf,df2$pacf),1)) +
    ggtitle("ACF") +
    theme_bw() +
      theme(plot.title = element_text(size=10)) 
    
    plot.pacf <- ggplot(data = df2, aes(x = lag, y = pacf)) +
    geom_col(fill = "#4373B6", width = 0.7) +
    geom_hline(yintercept = qnorm((1+ci)/2)/sqrt(N), 
               colour = "sandybrown",
               linetype = "dashed",
               size=1) + 
    geom_hline(yintercept = - qnorm((1+ci)/2)/sqrt(N), 
               colour = "sandybrown",
               linetype = "dashed",
               size=1) + 
    scale_x_continuous(breaks = seq(0,max(df2$lag, na.rm = TRUE),6)) +
    scale_y_continuous(name = element_blank(),
                       limits = c(min(df1$acf,df2$pacf),1)) +
    ggtitle("PACF") +
    theme_bw() +
      theme(plot.title = element_text(size=10)) 
  }
  cowplot::plot_grid(plot.acf, plot.pacf, nrow = numofrow)
}

ggplot.corr.acf <- function(data, lag.max = 24, ci = 0.95, large.sample.size = TRUE, horizontal = TRUE,...) {
  options(dplyr.banner_expression = FALSE)
  
  require(ggplot2)
  require(dplyr)
  require(cowplot)
  
  if(horizontal == TRUE) {numofrow <- 1} else {numofrow <- 2}
  
  list.acf <- acf(data, lag.max = lag.max, type = "correlation", plot = FALSE)
  N <- as.numeric(list.acf$n.used)
  df1 <- data.frame(lag = list.acf$lag, acf = list.acf$acf)
  df1$lag.acf <- dplyr::lag(df1$acf, default = 0)
  df1$lag.acf[2] <- 0
  df1$lag.acf.cumsum <- cumsum((df1$lag.acf)^2)
  df1$acfstd <- sqrt(1/N * (1 + 2 * df1$lag.acf.cumsum))
  df1$acfstd[1] <- 0
  df1 <- dplyr::select(df1, lag, acf, acfstd)
  
  list.pacf <- acf(data, lag.max = lag.max, type = "partial", plot = FALSE)
  df2 <- data.frame(lag = list.pacf$lag,pacf = list.pacf$acf)
  df2$pacfstd <- sqrt(1/N)
  
  if(large.sample.size == TRUE) {
    plot.acf <- ggplot(data = df1, aes( x = lag, y = acf)) +
    geom_area(aes(x = lag, y = qnorm((1+ci)/2)*acfstd), fill = "#B9CFE7") +
    geom_area(aes(x = lag, y = -qnorm((1+ci)/2)*acfstd), fill = "#B9CFE7") +
    geom_col(fill = "#4373B6", width = 0.7) +
    scale_x_continuous(breaks = seq(0,max(df1$lag),6)) +
    scale_y_continuous(name = element_blank(), 
                       limits = c(min(df1$acf,df2$pacf),1)) +
    ggtitle("ACF") +
    theme_bw() +
      theme(plot.title = element_text(size=10)) 
    
    plot.pacf <- ggplot(data = df2, aes(x = lag, y = pacf)) +
    geom_area(aes(x = lag, y = qnorm((1+ci)/2)*pacfstd), fill = "#B9CFE7") +
    geom_area(aes(x = lag, y = -qnorm((1+ci)/2)*pacfstd), fill = "#B9CFE7") +
    geom_col(fill = "#4373B6", width = 0.7) +
    scale_x_continuous(breaks = seq(0,max(df2$lag, na.rm = TRUE),6)) +
    scale_y_continuous(name = element_blank(),
                       limits = c(min(df1$acf,df2$pacf),1)) +
    ggtitle("PACF") +
    theme_bw() +
      theme(plot.title = element_text(size=10)) 
  }
  else {
    plot.acf <- ggplot(data = df1, aes( x = lag, y = acf)) +
    geom_col(fill = "#4373B6", width = 0.7) +
    geom_hline(yintercept = qnorm((1+ci)/2)/sqrt(N), 
               colour = "sandybrown",
               linetype = "dashed",
               size=1) + 
    geom_hline(yintercept = - qnorm((1+ci)/2)/sqrt(N), 
               colour = "sandybrown",
               linetype = "dashed",
               size=1) + 
    scale_x_continuous(breaks = seq(0,max(df1$lag),6)) +
    scale_y_continuous(name = element_blank(), 
                       limits = c(min(df1$acf,df2$pacf),1)) +
    ggtitle("ACF") +
    theme_bw() +
      theme(plot.title = element_text(size=10)) 
    
    plot.pacf <- ggplot(data = df2, aes(x = lag, y = pacf)) +
    geom_col(fill = "#4373B6", width = 0.7) +
    geom_hline(yintercept = qnorm((1+ci)/2)/sqrt(N), 
               colour = "sandybrown",
               linetype = "dashed",
               size=1) + 
    geom_hline(yintercept = - qnorm((1+ci)/2)/sqrt(N), 
               colour = "sandybrown",
               linetype = "dashed",
               size=1) + 
    scale_x_continuous(breaks = seq(0,max(df2$lag, na.rm = TRUE),6)) +
    scale_y_continuous(name = element_blank(),
                       limits = c(min(df1$acf,df2$pacf),1)) +
    ggtitle("PACF") +
    theme_bw() +
      theme(plot.title = element_text(size=10)) 
  }
  plot.acf
}

ggplot.corr.pacf <- function(data, lag.max = 24, ci = 0.95, large.sample.size = TRUE, horizontal = TRUE,...) {
  options(dplyr.banner_expression = FALSE)
  
  require(ggplot2)
  require(dplyr)
  require(cowplot)
  
  if(horizontal == TRUE) {numofrow <- 1} else {numofrow <- 2}
  
  list.acf <- acf(data, lag.max = lag.max, type = "correlation", plot = FALSE)
  N <- as.numeric(list.acf$n.used)
  df1 <- data.frame(lag = list.acf$lag, acf = list.acf$acf)
  df1$lag.acf <- dplyr::lag(df1$acf, default = 0)
  df1$lag.acf[2] <- 0
  df1$lag.acf.cumsum <- cumsum((df1$lag.acf)^2)
  df1$acfstd <- sqrt(1/N * (1 + 2 * df1$lag.acf.cumsum))
  df1$acfstd[1] <- 0
  df1 <- dplyr::select(df1, lag, acf, acfstd)
  
  list.pacf <- acf(data, lag.max = lag.max, type = "partial", plot = FALSE)
  df2 <- data.frame(lag = list.pacf$lag,pacf = list.pacf$acf)
  df2$pacfstd <- sqrt(1/N)
  
  if(large.sample.size == TRUE) {
    plot.acf <- ggplot(data = df1, aes( x = lag, y = acf)) +
    geom_area(aes(x = lag, y = qnorm((1+ci)/2)*acfstd), fill = "#B9CFE7") +
    geom_area(aes(x = lag, y = -qnorm((1+ci)/2)*acfstd), fill = "#B9CFE7") +
    geom_col(fill = "#4373B6", width = 0.7) +
    scale_x_continuous(breaks = seq(0,max(df1$lag),6)) +
    scale_y_continuous(name = element_blank(), 
                       limits = c(min(df1$acf,df2$pacf),1)) +
    ggtitle("ACF") +
    theme_bw() +
      theme(plot.title = element_text(size=10)) 
    
    plot.pacf <- ggplot(data = df2, aes(x = lag, y = pacf)) +
    geom_area(aes(x = lag, y = qnorm((1+ci)/2)*pacfstd), fill = "#B9CFE7") +
    geom_area(aes(x = lag, y = -qnorm((1+ci)/2)*pacfstd), fill = "#B9CFE7") +
    geom_col(fill = "#4373B6", width = 0.7) +
    scale_x_continuous(breaks = seq(0,max(df2$lag, na.rm = TRUE),6)) +
    scale_y_continuous(name = element_blank(),
                       limits = c(min(df1$acf,df2$pacf),1)) +
    ggtitle("PACF") +
    theme_bw() +
      theme(plot.title = element_text(size=10)) 
  }
  else {
    plot.acf <- ggplot(data = df1, aes( x = lag, y = acf)) +
    geom_col(fill = "#4373B6", width = 0.7) +
    geom_hline(yintercept = qnorm((1+ci)/2)/sqrt(N), 
               colour = "sandybrown",
               linetype = "dashed",
               size=1) + 
    geom_hline(yintercept = - qnorm((1+ci)/2)/sqrt(N), 
               colour = "sandybrown",
               linetype = "dashed",
               size=1) + 
    scale_x_continuous(breaks = seq(0,max(df1$lag),6)) +
    scale_y_continuous(name = element_blank(), 
                       limits = c(min(df1$acf,df2$pacf),1)) +
    ggtitle("ACF") +
    theme_bw() +
      theme(plot.title = element_text(size=10)) 
    
    plot.pacf <- ggplot(data = df2, aes(x = lag, y = pacf)) +
    geom_col(fill = "#4373B6", width = 0.7) +
    geom_hline(yintercept = qnorm((1+ci)/2)/sqrt(N), 
               colour = "sandybrown",
               linetype = "dashed",
               size=1) + 
    geom_hline(yintercept = - qnorm((1+ci)/2)/sqrt(N), 
               colour = "sandybrown",
               linetype = "dashed",
               size=1) + 
    scale_x_continuous(breaks = seq(0,max(df2$lag, na.rm = TRUE),6)) +
    scale_y_continuous(name = element_blank(),
                       limits = c(min(df1$acf,df2$pacf),1)) +
    ggtitle("PACF") +
    theme_bw() +
      theme(plot.title = element_text(size=10)) 
  }
  plot.pacf
}

#Create acf plot
a1 <- ggplot.corr.acf(data = df$sp500,large.sample.size = FALSE)+ggtitle('Sp500')

## 필요한 패키지를 로딩중입니다: cowplot

## 
## 다음의 패키지를 부착합니다: 'cowplot'

## The following object is masked from 'package:TSstudio':
## 
##     plot_grid

## The following object is masked from 'package:lubridate':
## 
##     stamp

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## i Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

b1 <- ggplot.corr.acf(data = df$CPI, lag.max = 24, ci= 0.95, large.sample.size = FALSE, 
                 horizontal = TRUE) + 
  ggtitle("CPI") 
  

c1 <- ggplot.corr.acf(data = df$RDPI, lag.max = 24, ci= 0.95, large.sample.size = FALSE,
                 horizontal = TRUE) + ggtitle('Real Disposable Income')
  

d1 <- ggplot.corr.acf(data = df$DI, lag.max = 24, ci= 0.95, large.sample.size = FALSE, 
                 horizontal = TRUE) + 
  ggtitle("Dollar Index") 
  

e1 <- ggplot.corr.acf(data = df$TNX, lag.max = 24, ci= 0.95, large.sample.size = FALSE, 
                 horizontal = TRUE) + 
  ggtitle("10Yr Treasury Yield") 
  

f1 <- ggplot.corr.acf(data = df$IP, lag.max = 24, ci= 0.95, large.sample.size = FALSE, 
                 horizontal = TRUE) + 
  ggtitle("Industrial Production") 
  

g1 <- ggplot.corr.acf(data = df$M2, lag.max = 24, ci= 0.95, large.sample.size = FALSE, 
                horizontal = TRUE) + 
  ggtitle("Money Supply (M2)") 
  

h1 <- ggplot.corr.acf(data = df$SCO, lag.max = 24, ci= 0.95, large.sample.size = FALSE, 
                 horizontal = TRUE) + 
  ggtitle("Spot Crude Oil") 
 

i1 <- ggplot.corr.acf(data = df$HPI, lag.max = 24, ci= 0.95, large.sample.size = FALSE, 
                 horizontal = TRUE) + 
  ggtitle("Home Price Index") 

#Create pacf plot  
a2 <- ggplot.corr.pacf(data = df$sp500,large.sample.size = TRUE)+ggtitle('Sp500')

b2 <- ggplot.corr.pacf(data = df$CPI, lag.max = 24, ci= 0.95, large.sample.size = FALSE, 
                 horizontal = TRUE) + 
  ggtitle("CPI") 
  

c2 <- ggplot.corr.pacf(data = df$RDPI, lag.max = 24, ci= 0.95, large.sample.size = FALSE,
                 horizontal = TRUE) + ggtitle('Real Disposable Income')
  

d2 <- ggplot.corr.pacf(data = df$DI, lag.max = 24, ci= 0.95, large.sample.size = FALSE, 
                 horizontal = TRUE) + 
  ggtitle("Dollar Index") 
  

e2 <- ggplot.corr.pacf(data = df$TNX, lag.max = 24, ci= 0.95, large.sample.size = FALSE, 
                 horizontal = TRUE) + 
  ggtitle("10Yr Treasury Yield") 
  

f2 <- ggplot.corr.pacf(data = df$IP, lag.max = 24, ci= 0.95, large.sample.size = FALSE, 
                 horizontal = TRUE) + 
  ggtitle("Industrial Production") 
  

g2 <- ggplot.corr.pacf(data = df$M2, lag.max = 24, ci= 0.95, large.sample.size = FALSE, 
                horizontal = TRUE) + 
  ggtitle("Money Supply (M2)") 
  

h2 <- ggplot.corr.pacf(data = df$SCO, lag.max = 24, ci= 0.95, large.sample.size = FALSE, 
                 horizontal = TRUE) + 
  ggtitle("Spot Crude Oil") 
 

i2 <- ggplot.corr.pacf(data = df$HPI, lag.max = 24, ci= 0.95, large.sample.size = FALSE, 
                 horizontal = TRUE) + 
  ggtitle("Home Price Index")

library(patchwork)

## 
## 다음의 패키지를 부착합니다: 'patchwork'

## The following object is masked from 'package:cowplot':
## 
##     align_plots

library(gridExtra)

#Plot ACF
grid.arrange(a1,b1,c1,d1,e1,f1,g1,h1,i1, ncol=3)

## Warning: Removed 1 rows containing missing values (`geom_hline()`).
## Removed 1 rows containing missing values (`geom_hline()`).
## Removed 1 rows containing missing values (`geom_hline()`).
## Removed 1 rows containing missing values (`geom_hline()`).

#Plot PACF
grid.arrange(a2,b2,c2,d2,e2,f2,g2,h2,i2, ncol=3)

## Warning: Removed 1 rows containing missing values (`geom_hline()`).
## Removed 1 rows containing missing values (`geom_hline()`).
## Removed 1 rows containing missing values (`geom_hline()`).
## Removed 1 rows containing missing values (`geom_hline()`).

From the ACF & PACF plot, we can see that none of the data is stationary meaning that there is a chance that dependent variables are cointegrated with independent variable (S&P 500).

Now we will also conduct adf test to double check the stationarity of the data

#CPI
adf.test(df$CPI)

## 
##  Augmented Dickey-Fuller Test
## 
## data:  df$CPI
## Dickey-Fuller = -0.55153, Lag order = 7, p-value = 0.9793
## alternative hypothesis: stationary

#Real Disposable Income
adf.test(df$RDPI)

## 
##  Augmented Dickey-Fuller Test
## 
## data:  df$RDPI
## Dickey-Fuller = -2.5523, Lag order = 7, p-value = 0.3442
## alternative hypothesis: stationary

#Dollar Index
adf.test(df$DI)

## 
##  Augmented Dickey-Fuller Test
## 
## data:  df$DI
## Dickey-Fuller = -1.916, Lag order = 7, p-value = 0.6131
## alternative hypothesis: stationary

#10-yr treasury yield
adf.test(df$TNX)

## 
##  Augmented Dickey-Fuller Test
## 
## data:  df$TNX
## Dickey-Fuller = -2.8651, Lag order = 7, p-value = 0.2119
## alternative hypothesis: stationary

#Industrial Production
adf.test(df$IP)

## 
##  Augmented Dickey-Fuller Test
## 
## data:  df$IP
## Dickey-Fuller = -1.9042, Lag order = 7, p-value = 0.6181
## alternative hypothesis: stationary

#M2
adf.test(df$M2)

## Warning in adf.test(df$M2): p-value greater than printed p-value

## 
##  Augmented Dickey-Fuller Test
## 
## data:  df$M2
## Dickey-Fuller = -0.16108, Lag order = 7, p-value = 0.99
## alternative hypothesis: stationary

#Spot Crude Oil
adf.test(df$SCO)

## 
##  Augmented Dickey-Fuller Test
## 
## data:  df$SCO
## Dickey-Fuller = -2.7648, Lag order = 7, p-value = 0.2544
## alternative hypothesis: stationary

#Home Price Index
adf.test(df$HPI)

## 
##  Augmented Dickey-Fuller Test
## 
## data:  df$HPI
## Dickey-Fuller = -2.2559, Lag order = 7, p-value = 0.4694
## alternative hypothesis: stationary

#S&P500 
adf.test(df$sp500)

## 
##  Augmented Dickey-Fuller Test
## 
## data:  df$sp500
## Dickey-Fuller = -1.2949, Lag order = 7, p-value = 0.8756
## alternative hypothesis: stationary

Since we couldn’t find any variable with p-value < 0.05, we could conclude that all the variables are non-stationary

However, since we are dealing with several macroeconomic variables, we can expect that there would be a severe multi-correlinearity issue.

Multicollinearity

library(ggcorrplot)

#Create correlation matrix
#Exclude date column
corr_matrix <- cor(df[,-1])
ggcorrplot(corr_matrix,hc.order =TRUE,
           outline.col='white',
           ggtheme = theme_bw(),
           colors=c("#4373B6", "white", "#FC4E07"),
           lab=TRUE,
           title='Correlation Heatmap')

We see that there are a lot of variables showing high correlation. Let’s see the regression results

reg <- lm(data=df,sp500~CPI+RDPI+DI+TNX+IP+M2+SCO+HPI)
summary(reg)

## 
## Call:
## lm(formula = sp500 ~ CPI + RDPI + DI + TNX + IP + M2 + SCO + 
##     HPI, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -647.73 -114.92   12.05  120.17  542.01 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.438e+03  2.375e+02  -6.057 3.07e-09 ***
## CPI         -1.394e+01  1.666e+00  -8.369 8.64e-16 ***
## RDPI        -6.970e-02  2.047e-02  -3.404 0.000727 ***
## DI          -8.359e-01  1.323e+00  -0.632 0.527995    
## TNX          7.454e+01  1.318e+01   5.657 2.85e-08 ***
## IP           4.505e+01  2.434e+00  18.509  < 2e-16 ***
## M2           3.163e-01  1.105e-02  28.624  < 2e-16 ***
## SCO         -4.366e+00  6.408e-01  -6.813 3.31e-11 ***
## HPI         -1.595e+00  5.967e-01  -2.673 0.007814 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 170.3 on 422 degrees of freedom
## Multiple R-squared:  0.9719, Adjusted R-squared:  0.9714 
## F-statistic:  1828 on 8 and 422 DF,  p-value: < 2.2e-16

From the regression, we see that RDPI has a negative coefficient, indiciating increase in real disposable income negatively affects S&P500. This does not make sense and this is due to multi-collinearity

#Plot residuals
autoplot(reg)+theme_bw()

#Plot acf & pacf of residuals from regression
ggplot.corr(reg$residuals,large.sample.size = FALSE)

library(lmtest)
dwtest(reg)

## 
##  Durbin-Watson test
## 
## data:  reg
## DW = 0.30647, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0

reset(reg)

## 
##  RESET test
## 
## data:  reg
## RESET = 168.67, df1 = 2, df2 = 420, p-value < 2.2e-16

Thus we will implement Principle Component Analysis to extract features that are not correlated to each other

Principal Component Analysis

Reference: http://www.sthda.com/english/articles/31-principal-component-methods-in-r-practical-guide/112-pca-principal-component-analysis-essentials/

#Exclude date and dependent variable for PCA and create new dataframe
prin_df <- df[,c(-1,-10)]
head(prin_df)

##     CPI   RDPI    DI  TNX    IP     M2   SCO    HPI
## 1 111.4 6159.5 99.86 7.18 56.13 2743.9 18.66 63.735
## 2 111.8 6192.1 99.26 7.19 56.88 2747.5 17.73 64.134
## 3 112.2 6200.0 97.11 7.51 56.95 2753.7 18.31 64.469
## 4 112.7 5967.2 95.94 8.21 57.32 2767.7 18.64 64.973
## 5 113.0 6209.1 97.77 8.49 57.68 2772.9 19.42 65.547
## 6 113.5 6200.8 98.13 8.38 57.99 2774.6 20.03 66.218

#Import tools for PCA visualization
library(FactoMineR)
library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

my_pca <- PCA(prin_df,scale.unit = TRUE,graph=FALSE)
my_pca

## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 431 individuals, described by 8 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"

get_eigenvalue(my_pca)

##        eigenvalue variance.percent cumulative.variance.percent
## Dim.1 6.019979233       75.2497404                    75.24974
## Dim.2 1.239799883       15.4974985                    90.74724
## Dim.3 0.325564347        4.0695543                    94.81679
## Dim.4 0.245780791        3.0722599                    97.88905
## Dim.5 0.116713521        1.4589190                    99.34797
## Dim.6 0.030041473        0.3755184                    99.72349
## Dim.7 0.014094207        0.1761776                    99.89967
## Dim.8 0.008026544        0.1003318                   100.00000

#Using prcomp function to extract the variance of principal components
pca <- prcomp(prin_df,scale=TRUE)

#Calculate variance contribution
pca.var <- pca$sdev^2
pca.var.per <- round(pca.var/sum(pca.var),3)

#Calculate Cumulative Variance 
pca.cumvar.per <- cumsum(pca.var.per)

#Create PCA data frame for visualization
pca.var.per.df <- data.frame(x=c('PC1','PC2','PC3','PC4','PC5','PC6','PC7','PC8'),
                            y=pca.var.per,
                            z=pca.cumvar.per)

#Visualization
pca1 <- pca.var.per.df %>%
  ggplot(aes(x=x,y=y,group=1))+
  geom_bar(fill='#4373B6',stat='identity')+
  labs(x='PCA',y='Variance %',title='Proportion of Variance')+
   scale_y_continuous(labels = scales::percent,
                      limits=c(0,0.8))+
  theme_bw()+
  geom_text(aes(label=scales::percent(y)), vjust=-0.2, size=4) 

pca2 <- pca.var.per.df %>%
  ggplot(aes(x=x,y=z,group=1))+
  geom_point(color='#4373B6',size=3)+
  geom_line(color='#4373B6',size=1)+
  labs(x='PCA',y='Variance %',title='Cumulative Variance ')+
  scale_y_continuous(labels = scales::percent,
                     limits = c(0.73,1.005))+
  theme_bw()+
  geom_text(aes(label=scales::percent(z)),
            vjust=1.1,hjust=-0.1, size=4) 

grid.arrange(pca1,pca2,ncol=1)

For the analysis, we will use 2 principal components to capture 90% of the variance of the original dataset.

#Correlation Circle
set.seed(123)
var <- get_pca_var(my_pca)

#Use k-means clustering to group the components to similar feature for visualization
res.km <- kmeans(var$coord,centers = 3,nstart=25)
grp <- as.factor(res.km$cluster)

#Visualize Correlation Circle
fviz_pca_var(my_pca,
             axes = c(1,2),
             col.var=grp,
             ggtheme=theme_bw(),
             palette = c("#4373B6", "red2", "forestgreen"),
             legend.title = "Cluster",
             title='Correlation Circle by Cluster (PC1 & PC2)',
             col.circle = 'grey40')

# Contributions of variables to PC1
# Note that, the total contribution of a given variable, on explaining the variations retained by two principal components, say PC1 and PC2, is calculated as contrib = [(C1 * Eig1) + (C2 * Eig2)]/(Eig1 + Eig2)

#The red dashed line on the graph above indicates the expected average contribution
#If the contribution of the variables were uniform, the expected value would be 1/length(variables) = 1/8 = 12.5%. 

cont1 <- fviz_contrib(my_pca, choice = "var", axes = 1, top = 8,
             fill='#4373B6',
             ggtheme= theme_bw(),
               ylim=c(0,17))

cont2 <- fviz_contrib(my_pca, choice = "var", axes = 2, top = 8,
             fill='#4373B6',
             ggtheme = theme_bw())


grid.arrange(cont1,cont2,ncol=1)

In PCA, the contribution of each principal component (PC) refers to the amount of variation in the data explained by that component. The contribution of a PC is measured by its eigenvalue, which represents the variance of the data along that component. The higher the eigenvalue, the more variation the PC explains.

On the other hand, the quality of representation of a variable in a PC refers to how well that variable is represented by that PC. The quality of representation of a variable is measured by its correlation with the PC. The higher the absolute value of the correlation, the better the variable is represented by the PC.

Therefore, while a PC may have a high contribution (e.g., high eigenvalue) to the total variation of the data, it may not necessarily have a high quality of representation for a particular variable. Conversely, a PC may have a low contribution to the total variation of the data, but may have a high quality of representation for a particular variable.

#Square Cosine - indicator of quality of representation
#The cos2 values are used to estimate the quality of the representation
cos1 <- fviz_cos2(my_pca,choice='var',axes=1,
          fill='#4373B6',
          ggtheme = theme_bw()
          
          )+scale_y_continuous(labels = scales::percent)

cos2 <- fviz_cos2(my_pca,choice='var',axes=2,
          fill='#4373B6',
          ggtheme = theme_bw()
          )+scale_y_continuous(labels = scales::percent)



grid.arrange(cos1,cos2,ncol=2)

#A high cos2 indicates a good representation of the variable on the principal component. 
#A low cos2 indicates that the variable is not perfectly represented by the PCs.

#Biplot
fviz_pca_biplot(my_pca, repel = TRUE,
                col.var = "red", # Variables color
                col.ind = "#4373B6"  # Individuals color
                )+theme_bw()

## Warning: ggrepel: 229 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

#Extract the principal components
PC1 <- pca$x[,1]
PC2 <- pca$x[,2]

#Create a new dataframe consiting of the principal components 
pca_df <- tibble(Date=df$Date,
                 sp500 = df$sp500,
                 PC1 = PC1,
                 PC2 = PC2)
datatable(pca_df)

#Normalize value of each PC to plot the graph
norm_PC1 <- (PC1-min(PC1))/(max(PC1)-min(PC1))
norm_PC2 <- (PC2-min(PC2))/(max(PC2)-min(PC2))

#Plot normalized principal components

apca1 <- df %>%
ggplot(aes(x=Date,y=norm_sp500))+
  geom_line(color='#4373B6')+
  geom_line(aes(y=norm_PC1),color='forestgreen')+
  theme_bw()+
  labs(x='Date',y='Value',title='PC1')

apca2 <- df %>%
ggplot(aes(x=Date,y=norm_sp500))+
  geom_line(color='#4373B6')+
  geom_line(aes(y=norm_PC2),color='forestgreen')+
  theme_bw()+
  labs(x='Date',y='Value',title='PC2')

#Visualization
grid.arrange(apca1,apca2,ncol=1)

#Check the acf & pacf of principal components
#PC1
aaaa1 <- ggplot.corr(PC1,large.sample.size = FALSE)+ggtitle('PC1')+theme(plot.title = element_text(hjust = 0.5,size=12,face = 'bold'))

## Warning: Removed 1 rows containing missing values (`geom_hline()`).
## Removed 1 rows containing missing values (`geom_hline()`).

#PC2
aaaa2 <- ggplot.corr(PC2,large.sample.size = FALSE)+ggtitle('PC2')+theme(plot.title = element_text(hjust = 0.5,size=12,face = 'bold'))

#Visualization
grid.arrange(aaaa1,aaaa2)

#adf test for PC1 & PC2
adf.test(PC1)

## 
##  Augmented Dickey-Fuller Test
## 
## data:  PC1
## Dickey-Fuller = -2.668, Lag order = 7, p-value = 0.2953
## alternative hypothesis: stationary

adf.test(PC2)

## 
##  Augmented Dickey-Fuller Test
## 
## data:  PC2
## Dickey-Fuller = -1.7716, Lag order = 7, p-value = 0.6741
## alternative hypothesis: stationary

Now, we can see that the multicollinearity issue has been resolved.

#Correlation between PC1 & PC2
ggcorrplot(cor(pca$x[,1:2]),hc.order =TRUE,
           outline.col='white',
           ggtheme = theme_bw(),
           colors=c("#4373B6", "white", "#FC4E07"),
           lab=TRUE,
           title='Principal Component Correlation Heatmap (PC1 & PC2)')

#Correlation between all the 8 PC
ggcorrplot(cor(pca$x),hc.order =TRUE,
           outline.col='white',
           ggtheme = theme_bw(),
           colors=c("#4373B6", "white", "#FC4E07"),
           lab=TRUE,
           title='Principal Component Correlation Heatmap')

#Implement Regression using PC1 & PC2 on S&P 500
regPCA <- lm(sp500~PC1+PC2)
summary(regPCA)

## 
## Call:
## lm(formula = sp500 ~ PC1 + PC2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -845.05 -245.73   31.71  204.18 1445.73 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1399.158     17.480   80.04   <2e-16 ***
## PC1          360.089      7.133   50.48   <2e-16 ***
## PC2         -288.689     15.717  -18.37   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 362.9 on 428 degrees of freedom
## Multiple R-squared:  0.8709, Adjusted R-squared:  0.8703 
## F-statistic:  1443 on 2 and 428 DF,  p-value: < 2.2e-16

#Check the residuals of regression
#The residuals are not stationary
#Regression is not valid
adf.test(regPCA$residuals)

## 
##  Augmented Dickey-Fuller Test
## 
## data:  regPCA$residuals
## Dickey-Fuller = -2.7942, Lag order = 7, p-value = 0.2419
## alternative hypothesis: stationary

#Plot the Residuals
autoplot(regPCA) + theme_bw()

#Plot acf & pacf of residuals
#There is a serial correlation
ggplot.corr(regPCA$residuals,large.sample.size = FALSE) +ggtitle('PCA Residuals')+theme(plot.title = element_text(hjust = 0.5,size=14,face = 'bold'))

library(lmtest)

# Durbin-Watson Test of Serial Correlation of Errors. Ho: No Serial Correlation
dwtest(regPCA)

## 
##  Durbin-Watson test
## 
## data:  regPCA
## DW = 0.084321, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0

#Bruesch Godfrey Test of Serial Correlation of Errors. Ho: No Serial Correlation
bgtest(regPCA)

## 
##  Breusch-Godfrey test for serial correlation of order up to 1
## 
## data:  regPCA
## LM test = 395.41, df = 1, p-value < 2.2e-16

# Ramsey Reset Test of Model Specification:  Ho: Model Specification is Correct
reset(regPCA)

## 
##  RESET test
## 
## data:  regPCA
## RESET = 259.48, df1 = 2, df2 = 426, p-value < 2.2e-16

Since logging the variable is not possible due to negative values, we will use dynamic model.

#Regression using ARIMA model 
cb <- cbind(PC1,PC2)
dreg <- auto.arima(sp500,xreg = cb)
summary(dreg)

## Series: sp500 
## Regression with ARIMA(2,1,2) errors 
## 
## Coefficients:
##           ar1      ar2      ma1     ma2   drift      PC1      PC2
##       -0.0678  -0.9598  -0.0302  0.8903  7.9838  45.5646  73.2396
## s.e.   0.0477   0.0287   0.0623  0.0373  3.4270  50.1625  18.2813
## 
## sigma^2 = 5589:  log likelihood = -2462.36
## AIC=4940.72   AICc=4941.06   BIC=4973.23
## 
## Training set error measures:
##                        ME     RMSE      MAE        MPE     MAPE      MASE
## Training set -0.009934148 74.06231 48.22322 -0.6350183 3.947053 0.9833254
##                     ACF1
## Training set -0.03814311

From the result, we see that the PC2 is significant. Since Dollar index account for most of the PC2, we can say that dollar index is the most important factor among the macro ecnomic variabls affecting the S&P500 Index.

#Residuals plot
res1 <- autoplot(dreg$residuals,colour='#4373B6')+theme_bw()+ggtitle('Residuals from ARIMA (2,1,2)')

#Histogram of residuals
res2 <- dreg$residuals %>% 
ggplot(aes(dreg$residuals))+
  geom_histogram(fill='#4373B6',position='identity',bins=30)+
  labs(x='Residuals',y='Count',title='Residuals from ARIMA (2,1,2)')+
  theme_bw()+
   geom_density(alpha=0.6)
  
  
#Visualization
grid.arrange(res1,res2,ncol=1)

##

## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.

#acf & pacf of residuals from dynamic model
ggplot.corr(dreg$residuals,large.sample.size = FALSE)+ggtitle('Residuals from ARIMA (2,1,2) ')+theme(plot.title = element_text(hjust = 0.5,size=12,face = 'bold'))

#adf test for residuals
#stationary
adf.test(dreg$residuals)

## Warning in adf.test(dreg$residuals): p-value smaller than printed p-value

## 
##  Augmented Dickey-Fuller Test
## 
## data:  dreg$residuals
## Dickey-Fuller = -6.3719, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary

unloadNamespace('ggfortify')
unloadNamespace('forecast')
library(forecast)

## Registered S3 methods overwritten by 'forecast':
##   method                 from     
##   autoplot.Arima         ggfortify
##   autoplot.acf           ggfortify
##   autoplot.ar            ggfortify
##   autoplot.bats          ggfortify
##   autoplot.decomposed.ts ggfortify
##   autoplot.ets           ggfortify
##   autoplot.forecast      ggfortify
##   autoplot.stl           ggfortify
##   autoplot.ts            ggfortify
##   fitted.ar              ggfortify
##   fortify.ts             ggfortify
##   residuals.ar           ggfortify

ggplot(data=df[421:432,],aes(x=Date,y=sp500))+
  scale_color_manual(values=c('Actual'='#4373B6','Fitted'='red'))+
  geom_line(aes(color='Actual'),size=1)+
  geom_line(aes(y=dreg$fitted[421:432],color='Fitted'),size=1)+
  labs(x='Month',y='Index',title='Actual vs Fitted for Y2022',colour='Series')+
  theme_bw()+
  theme(legend.position = 'top')

## Warning: Removed 1 row containing missing values (`geom_line()`).
## Removed 1 row containing missing values (`geom_line()`).

PCA

Jaehyun Ahn

2023-01-28

Stationarity

Multicollinearity

Principal Component Analysis