Project

Project Goal

Data Exploration

options(warn=-1)
library(ggplot2)
library(quantmod)
library(TSA)
library(readxl)
library(stringi)
library(zoo)
library(forecast)
library(tseries)
library(TTR)
library(ModelMetrics)

path = '/Users/mareksalamon/Desktop/School/Hunter/Spring Semester 2019/Time Series Analysis (715)/Final Project'
setwd(path)
data <- read_xlsx('Data/total_tech_invs_usa.xlsx', col_names = TRUE, range = 'A2:C98',
                      col_types = c('text','numeric','numeric'), na = '0', trim_ws = TRUE)
data$Quarter <- as.Date(as.yearqtr(data$Quarter, format = "Q%q %Y"))

For simplicity, we will look at the total amount of quarterly investment across all 20 sectors.

head(data)
## # A tibble: 6 x 3
##   Quarter    `Number of deals`    Amounts
##   <date>                 <dbl>      <dbl>
## 1 1995-01-01               473 1613610000
## 2 1995-04-01               434 1974500000
## 3 1995-07-01               397 1611750000
## 4 1995-10-01               449 2055159999
## 5 1996-01-01               544 2231810000
## 6 1996-04-01               605 2891970000
tail(data)
## # A tibble: 6 x 3
##   Quarter    `Number of deals`     Amounts
##   <date>                 <dbl>       <dbl>
## 1 2017-07-01              1500 20625070000
## 2 2017-10-01              1363 20065560000
## 3 2018-01-01              1425 22543850000
## 4 2018-04-01              1542 23871260000
## 5 2018-07-01              1308 27798870000
## 6 2018-10-01              1204 24546960000
# Let's first fix the 'Amounts' for inflation and turn all the values in terms of 2018 dollars

# Loding in CPIs
getSymbols("CPIAUCSL", src='FRED')
## [1] "CPIAUCSL"
# Grabbing the CPIs/conversion rates for the necessary years and months 
avg.cpi <- apply.monthly(CPIAUCSL, mean)
cf <- avg.cpi/as.numeric(avg.cpi['2018'])
desired.months <- cf[seq(1, length(cf), 3), ]
desired.months <- desired.months[194:length(desired.months)-1, ]

# Concatenating the CPIs to the original dataframe
data <- cbind(data, data.frame(desired.months))

# Reset the index
rownames(data) <- NULL

# Multiply the amounts and the conversion rates
data$Adj.Amounts <- data[, 3] * data[, 4]

Let’s take a look at the top 5 largest investments and smallest investments between 1995 and 2018.

# Top 5 largest investments
data[with(data,order(-Adj.Amounts)),][1:5,]
##       Quarter Number of deals     Amounts  CPIAUCSL Adj.Amounts
## 95 2018-07-01            1308 27798870000 1.0000000 27798870000
## 96 2018-10-01            1204 24546960000 1.0000000 24546960000
## 94 2018-04-01            1542 23871260000 1.0000000 23871260000
## 21 2000-01-01            2266 34030559999 0.6802366 23148831616
## 93 2018-01-01            1425 22543850000 1.0000000 22543850000
# Top 5 smallest investments
data[with(data,order(Adj.Amounts)),][1:5,]
##      Quarter Number of deals    Amounts  CPIAUCSL Adj.Amounts
## 1 1995-01-01             473 1613610000 0.6046994   975748963
## 3 1995-07-01             397 1611750000 0.6065255   977567499
## 2 1995-04-01             434 1974500000 0.6073069  1199127446
## 4 1995-10-01             449 2055159999 0.6072138  1247921469
## 5 1996-01-01             544 2231810000 0.6215747  1387236652
data.amount <- data[,c('Quarter', 'Adj.Amounts')]
colnames(data.amount)[2] <- "Amounts"
data.num.deals <- data[,c('Quarter', 'Number of deals')]

## Data Plot: Autocorrelation of Quarterly Investment Amounts

ts.data <- ts(data.amount$Amounts, start = c(1995,1), frequency = 4)

# Autocorrelation function of the data
acf(ts.data, main = "Correlogram")

# Partial Autocorrelation function of the data
pacf(ts.data, main = "Correlogram")

tsdisplay(ts.data)

# Test for stationarity
adf.test(ts.data, alternative = "stationary")
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts.data
## Dickey-Fuller = -2.6593, Lag order = 4, p-value = 0.304
## alternative hypothesis: stationary

It is clear that our data is not stationary. Let’s begin by removing the trend by taking the first difference.

ts.diff <- diff(ts.data)
# Plotting the transformed data
quick.plot <- function(var, title){

autoplot(var, color = "#00AFBB") +
  ggtitle(title) +
  xlab("Year") +
  ylab("Amount of Investment (USD)")
  
}
# Plotting the transformed data
quick.plot(ts.diff, "First Difference of Investment Amounts")

tsdisplay(ts.diff)

# Test for stationarity
adf.test(ts.diff)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts.diff
## Dickey-Fuller = -4.1977, Lag order = 4, p-value = 0.01
## alternative hypothesis: stationary

Based on the Dickey-Fuller Test, it seems like our data is now stationary and ready to be modeled.

Model Building

# Splitting the data into a train and test set 
train <- data.amount[1:85,]
test <- data.amount[86:96,]
# Baseline Drift Model

# Drift; equivalent to drawing a line between the first and last observations, and extrapolating it into the future
drift.fit <- rwf(train[2], level = 95, drift = TRUE)
# ARIMA Model
set.seed(2019)
arima.fit <- Arima(ts(train[2]), order = c(1,1,0))
set.seed(2019)
arima.auto.fit <- auto.arima(ts(train[2]), approximation = FALSE, stepwise = FALSE)

Model Diagnostics

# Model diagnostics

model.diag <- function(model){
  
  model[2]
  checkresiduals(model)
  
}
# Model summaries
arima.auto.fit # AIC = 3809.72
## Series: ts(train[2]) 
## ARIMA(1,1,0) 
## 
## Coefficients:
##          ar1
##       0.3432
## s.e.  0.1016
## 
## sigma^2 estimated as 2.807e+18:  log likelihood=-1902.86
## AIC=3809.72   AICc=3809.87   BIC=3814.58
arima.fit # AIC = 3809.72
## Series: ts(train[2]) 
## ARIMA(1,1,0) 
## 
## Coefficients:
##          ar1
##       0.3432
## s.e.  0.1016
## 
## sigma^2 estimated as 2.807e+18:  log likelihood=-1902.86
## AIC=3809.72   AICc=3809.87   BIC=3814.58
drift.fit$model
## Call: rwf(y = train[2], drift = TRUE, level = 95) 
## 
## Drift: 176080752.1103  (se 193969221.1159)
## Residual sd: 1777757276.5107
model.diag(drift.fit) 

## 
##  Ljung-Box test
## 
## data:  Residuals from Random walk with drift
## Q* = 27.445, df = 9, p-value = 0.00118
## 
## Model df: 1.   Total lags used: 10
model.diag(arima.fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,0)
## Q* = 6.5252, df = 9, p-value = 0.6864
## 
## Model df: 1.   Total lags used: 10
# Model fit plots
autoplot(ts(data.amount[1:85,2]), lwd = 1) +
  forecast::autolayer(arima.fit$fitted, series = "ARIMA", alpha = 0.6, lwd = 1) +
  forecast::autolayer(drift.fit$fitted, series = "Drift", alpha = 0.6, lwd = 1) +
  labs(title = "ARIMA Forecast vs. Drift Forecast") + xlab("Quarter") + ylab( "Investment (Billions of Dollars)") +
  guides(colour = guide_legend(title = 'Model Legend')) +
  theme(legend.position = c(0.12, 0.9),
        axis.text.x = element_text(size = 15),
        axis.text.y = element_text(size = 15),  
        axis.title.x = element_text(size = 20),
        axis.title.y = element_text(size = 20),
        title = element_text(size = 20))

# Model comparison plots
autoplot(ts(data.amount[2]), lwd = 0.8) +
  forecast::autolayer(forecast(arima.fit, h=10, level=95), series = "ARIMA", alpha = 0.5 , lwd = 1) +
  forecast::autolayer(forecast(drift.fit, h=10, level=95), series = "Drift", alpha = 0.6, lwd = 1) +
  labs(title = "ARIMA Forecast vs. Drift Forecast") + xlab("Quarter") + ylab( "Investment (Billions of Dollars)") +
  guides(colour = guide_legend(title = 'Model Legend')) +
  theme(legend.position = c(0.12, 0.9),
        axis.text.x = element_text(size = 15),
        axis.text.y = element_text(size = 15),  
        axis.title.x = element_text(size = 20),
        axis.title.y = element_text(size = 20),
        title = element_text(size = 20))

# Model predictions

accuracy(forecast(drift.fit, h=10), window(ts(data.amount), start=86)[,2])
##                         ME       RMSE        MAE       MPE    MAPE
## Training set -3.554409e-08 1767143705 1095537808 -2.962055 14.7569
## Test set      2.570835e+09 4676420209 3789259883  9.260021 18.4073
##                   MASE      ACF1 Theil's U
## Training set 0.9919185 0.3402014        NA
## Test set     3.4308600 0.6124931  1.506522
accuracy(forecast(arima.fit, h=10), window(ts(data.amount), start=86)[,2])
##                      ME       RMSE        MAE       MPE     MAPE      MASE
## Training set  112413709 1655676837 1102959826  0.910362 16.11667 0.9986385
## Test set     3751783705 5744372291 4526446990 15.173629 21.19132 4.0983217
##                     ACF1 Theil's U
## Training set -0.04364653        NA
## Test set      0.63904814  1.824753

Model Forecast

# Final fit
final.drift <- rwf(data.amount[2], level = 95, drift = TRUE)

# Final forecast predictions
forecast(final.drift, h=10, level=95)
##     Point Forecast       Lo 95       Hi 95
##  97    24795078011 21078596066 28511559956
##  98    25043196022 19732830120 30353561924
##  99    25291314033 18721440069 31861187996
## 100    25539432044 17877708236 33201155851
## 101    25787550055 17137909055 34437191054
## 102    26035668065 16469787183 35601548948
## 103    26283786076 15854434485 36713137668
## 104    26531904087 15279696211 37784111964
## 105    26780022098 14737244200 38822799996
## 106    27028140109 14221098030 39835182188