Time Series Analysis: Final Project

Project

Analyzing the amount of quarterly investment in technology companies, in the New York Metropolitan area, from 1995 to 2018.

Project Goal

Goal is to project future investment in US tech 5 years into the future.
Data Exploration
Model Building
Model Diagnostics
Forecasting

Data Exploration

20 different sectors in tech industry
1 observation per quarter, 96 observations per sector

options(warn=-1)
library(ggplot2)
library(quantmod)
library(TSA)
library(readxl)
library(stringi)
library(zoo)
library(forecast)
library(tseries)
library(TTR)
library(ModelMetrics)

path = '/Users/mareksalamon/Desktop/School/Hunter/Spring Semester 2019/Time Series Analysis (715)/Final Project'
setwd(path)
data <- read_xlsx('Data/total_tech_invs_usa.xlsx', col_names = TRUE, range = 'A2:C98',
                      col_types = c('text','numeric','numeric'), na = '0', trim_ws = TRUE)
data$Quarter <- as.Date(as.yearqtr(data$Quarter, format = "Q%q %Y"))

For simplicity, we will look at the total amount of quarterly investment across all 20 sectors.

head(data)

## # A tibble: 6 x 3
##   Quarter    `Number of deals`    Amounts
##   <date>                 <dbl>      <dbl>
## 1 1995-01-01               473 1613610000
## 2 1995-04-01               434 1974500000
## 3 1995-07-01               397 1611750000
## 4 1995-10-01               449 2055159999
## 5 1996-01-01               544 2231810000
## 6 1996-04-01               605 2891970000

tail(data)

## # A tibble: 6 x 3
##   Quarter    `Number of deals`     Amounts
##   <date>                 <dbl>       <dbl>
## 1 2017-07-01              1500 20625070000
## 2 2017-10-01              1363 20065560000
## 3 2018-01-01              1425 22543850000
## 4 2018-04-01              1542 23871260000
## 5 2018-07-01              1308 27798870000
## 6 2018-10-01              1204 24546960000

# Let's first fix the 'Amounts' for inflation and turn all the values in terms of 2018 dollars

# Loding in CPIs
getSymbols("CPIAUCSL", src='FRED')

## [1] "CPIAUCSL"

# Grabbing the CPIs/conversion rates for the necessary years and months 
avg.cpi <- apply.monthly(CPIAUCSL, mean)
cf <- avg.cpi/as.numeric(avg.cpi['2018'])
desired.months <- cf[seq(1, length(cf), 3), ]
desired.months <- desired.months[194:length(desired.months)-1, ]

# Concatenating the CPIs to the original dataframe
data <- cbind(data, data.frame(desired.months))

# Reset the index
rownames(data) <- NULL

# Multiply the amounts and the conversion rates
data$Adj.Amounts <- data[, 3] * data[, 4]

Let’s take a look at the top 5 largest investments and smallest investments between 1995 and 2018.

# Top 5 largest investments
data[with(data,order(-Adj.Amounts)),][1:5,]

##       Quarter Number of deals     Amounts  CPIAUCSL Adj.Amounts
## 95 2018-07-01            1308 27798870000 1.0000000 27798870000
## 96 2018-10-01            1204 24546960000 1.0000000 24546960000
## 94 2018-04-01            1542 23871260000 1.0000000 23871260000
## 21 2000-01-01            2266 34030559999 0.6802366 23148831616
## 93 2018-01-01            1425 22543850000 1.0000000 22543850000

# Top 5 smallest investments
data[with(data,order(Adj.Amounts)),][1:5,]

##      Quarter Number of deals    Amounts  CPIAUCSL Adj.Amounts
## 1 1995-01-01             473 1613610000 0.6046994   975748963
## 3 1995-07-01             397 1611750000 0.6065255   977567499
## 2 1995-04-01             434 1974500000 0.6073069  1199127446
## 4 1995-10-01             449 2055159999 0.6072138  1247921469
## 5 1996-01-01             544 2231810000 0.6215747  1387236652

data.amount <- data[,c('Quarter', 'Adj.Amounts')]
colnames(data.amount)[2] <- "Amounts"
data.num.deals <- data[,c('Quarter', 'Number of deals')]

No obvious seasonal trends
Upward trend after 2010
Some chnage in variance after 2015

Very positive, linear relationship
Autoregressive model rather than sinusoidal model for best fit
Not random

## Data Plot: Autocorrelation of Quarterly Investment Amounts

ts.data <- ts(data.amount$Amounts, start = c(1995,1), frequency = 4)

# Autocorrelation function of the data
acf(ts.data, main = "Correlogram")

# Partial Autocorrelation function of the data
pacf(ts.data, main = "Correlogram")

tsdisplay(ts.data)

Bands linearly decrease with the lag
Strong autocorrelation

# Test for stationarity
adf.test(ts.data, alternative = "stationary")

## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts.data
## Dickey-Fuller = -2.6593, Lag order = 4, p-value = 0.304
## alternative hypothesis: stationary

It is clear that our data is not stationary. Let’s begin by removing the trend by taking the first difference.

ts.diff <- diff(ts.data)

# Plotting the transformed data
quick.plot <- function(var, title){

autoplot(var, color = "#00AFBB") +
  ggtitle(title) +
  xlab("Year") +
  ylab("Amount of Investment (USD)")
  
}

# Plotting the transformed data
quick.plot(ts.diff, "First Difference of Investment Amounts")

tsdisplay(ts.diff)

# Test for stationarity
adf.test(ts.diff)

## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts.diff
## Dickey-Fuller = -4.1977, Lag order = 4, p-value = 0.01
## alternative hypothesis: stationary

Based on the Dickey-Fuller Test, it seems like our data is now stationary and ready to be modeled.

Model Building

# Splitting the data into a train and test set 
train <- data.amount[1:85,]
test <- data.amount[86:96,]

# Baseline Drift Model

# Drift; equivalent to drawing a line between the first and last observations, and extrapolating it into the future
drift.fit <- rwf(train[2], level = 95, drift = TRUE)

# ARIMA Model
set.seed(2019)
arima.fit <- Arima(ts(train[2]), order = c(1,1,0))
set.seed(2019)
arima.auto.fit <- auto.arima(ts(train[2]), approximation = FALSE, stepwise = FALSE)

Model Diagnostics

# Model diagnostics

model.diag <- function(model){
  
  model[2]
  checkresiduals(model)
  
}

# Model summaries
arima.auto.fit # AIC = 3809.72

## Series: ts(train[2]) 
## ARIMA(1,1,0) 
## 
## Coefficients:
##          ar1
##       0.3432
## s.e.  0.1016
## 
## sigma^2 estimated as 2.807e+18:  log likelihood=-1902.86
## AIC=3809.72   AICc=3809.87   BIC=3814.58

arima.fit # AIC = 3809.72

## Series: ts(train[2]) 
## ARIMA(1,1,0) 
## 
## Coefficients:
##          ar1
##       0.3432
## s.e.  0.1016
## 
## sigma^2 estimated as 2.807e+18:  log likelihood=-1902.86
## AIC=3809.72   AICc=3809.87   BIC=3814.58

drift.fit$model

## Call: rwf(y = train[2], drift = TRUE, level = 95) 
## 
## Drift: 176080752.1103  (se 193969221.1159)
## Residual sd: 1777757276.5107

model.diag(drift.fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from Random walk with drift
## Q* = 27.445, df = 9, p-value = 0.00118
## 
## Model df: 1.   Total lags used: 10

model.diag(arima.fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,0)
## Q* = 6.5252, df = 9, p-value = 0.6864
## 
## Model df: 1.   Total lags used: 10

# Model fit plots
autoplot(ts(data.amount[1:85,2]), lwd = 1) +
  forecast::autolayer(arima.fit$fitted, series = "ARIMA", alpha = 0.6, lwd = 1) +
  forecast::autolayer(drift.fit$fitted, series = "Drift", alpha = 0.6, lwd = 1) +
  labs(title = "ARIMA Forecast vs. Drift Forecast") + xlab("Quarter") + ylab( "Investment (Billions of Dollars)") +
  guides(colour = guide_legend(title = 'Model Legend')) +
  theme(legend.position = c(0.12, 0.9),
        axis.text.x = element_text(size = 15),
        axis.text.y = element_text(size = 15),  
        axis.title.x = element_text(size = 20),
        axis.title.y = element_text(size = 20),
        title = element_text(size = 20))

# Model comparison plots
autoplot(ts(data.amount[2]), lwd = 0.8) +
  forecast::autolayer(forecast(arima.fit, h=10, level=95), series = "ARIMA", alpha = 0.5 , lwd = 1) +
  forecast::autolayer(forecast(drift.fit, h=10, level=95), series = "Drift", alpha = 0.6, lwd = 1) +
  labs(title = "ARIMA Forecast vs. Drift Forecast") + xlab("Quarter") + ylab( "Investment (Billions of Dollars)") +
  guides(colour = guide_legend(title = 'Model Legend')) +
  theme(legend.position = c(0.12, 0.9),
        axis.text.x = element_text(size = 15),
        axis.text.y = element_text(size = 15),  
        axis.title.x = element_text(size = 20),
        axis.title.y = element_text(size = 20),
        title = element_text(size = 20))

# Model predictions

accuracy(forecast(drift.fit, h=10), window(ts(data.amount), start=86)[,2])

##                         ME       RMSE        MAE       MPE    MAPE
## Training set -3.554409e-08 1767143705 1095537808 -2.962055 14.7569
## Test set      2.570835e+09 4676420209 3789259883  9.260021 18.4073
##                   MASE      ACF1 Theil's U
## Training set 0.9919185 0.3402014        NA
## Test set     3.4308600 0.6124931  1.506522

accuracy(forecast(arima.fit, h=10), window(ts(data.amount), start=86)[,2])

##                      ME       RMSE        MAE       MPE     MAPE      MASE
## Training set  112413709 1655676837 1102959826  0.910362 16.11667 0.9986385
## Test set     3751783705 5744372291 4526446990 15.173629 21.19132 4.0983217
##                     ACF1 Theil's U
## Training set -0.04364653        NA
## Test set      0.63904814  1.824753

Model Forecast

# Final fit
final.drift <- rwf(data.amount[2], level = 95, drift = TRUE)

# Final forecast predictions
forecast(final.drift, h=10, level=95)

##     Point Forecast       Lo 95       Hi 95
##  97    24795078011 21078596066 28511559956
##  98    25043196022 19732830120 30353561924
##  99    25291314033 18721440069 31861187996
## 100    25539432044 17877708236 33201155851
## 101    25787550055 17137909055 34437191054
## 102    26035668065 16469787183 35601548948
## 103    26283786076 15854434485 36713137668
## 104    26531904087 15279696211 37784111964
## 105    26780022098 14737244200 38822799996
## 106    27028140109 14221098030 39835182188