Goal is to project future investment in US tech 5 years into the future.
options(warn=-1)
library(ggplot2)
library(quantmod)
library(TSA)
library(readxl)
library(stringi)
library(zoo)
library(forecast)
library(tseries)
library(TTR)
library(ModelMetrics)
path = '/Users/mareksalamon/Desktop/School/Hunter/Spring Semester 2019/Time Series Analysis (715)/Final Project'
setwd(path)
data <- read_xlsx('Data/total_tech_invs_usa.xlsx', col_names = TRUE, range = 'A2:C98',
col_types = c('text','numeric','numeric'), na = '0', trim_ws = TRUE)
data$Quarter <- as.Date(as.yearqtr(data$Quarter, format = "Q%q %Y"))
For simplicity, we will look at the total amount of quarterly investment across all 20 sectors.
head(data)
## # A tibble: 6 x 3
## Quarter `Number of deals` Amounts
## <date> <dbl> <dbl>
## 1 1995-01-01 473 1613610000
## 2 1995-04-01 434 1974500000
## 3 1995-07-01 397 1611750000
## 4 1995-10-01 449 2055159999
## 5 1996-01-01 544 2231810000
## 6 1996-04-01 605 2891970000
tail(data)
## # A tibble: 6 x 3
## Quarter `Number of deals` Amounts
## <date> <dbl> <dbl>
## 1 2017-07-01 1500 20625070000
## 2 2017-10-01 1363 20065560000
## 3 2018-01-01 1425 22543850000
## 4 2018-04-01 1542 23871260000
## 5 2018-07-01 1308 27798870000
## 6 2018-10-01 1204 24546960000
# Let's first fix the 'Amounts' for inflation and turn all the values in terms of 2018 dollars
# Loding in CPIs
getSymbols("CPIAUCSL", src='FRED')
## [1] "CPIAUCSL"
# Grabbing the CPIs/conversion rates for the necessary years and months
avg.cpi <- apply.monthly(CPIAUCSL, mean)
cf <- avg.cpi/as.numeric(avg.cpi['2018'])
desired.months <- cf[seq(1, length(cf), 3), ]
desired.months <- desired.months[194:length(desired.months)-1, ]
# Concatenating the CPIs to the original dataframe
data <- cbind(data, data.frame(desired.months))
# Reset the index
rownames(data) <- NULL
# Multiply the amounts and the conversion rates
data$Adj.Amounts <- data[, 3] * data[, 4]
Let’s take a look at the top 5 largest investments and smallest investments between 1995 and 2018.
# Top 5 largest investments
data[with(data,order(-Adj.Amounts)),][1:5,]
## Quarter Number of deals Amounts CPIAUCSL Adj.Amounts
## 95 2018-07-01 1308 27798870000 1.0000000 27798870000
## 96 2018-10-01 1204 24546960000 1.0000000 24546960000
## 94 2018-04-01 1542 23871260000 1.0000000 23871260000
## 21 2000-01-01 2266 34030559999 0.6802366 23148831616
## 93 2018-01-01 1425 22543850000 1.0000000 22543850000
# Top 5 smallest investments
data[with(data,order(Adj.Amounts)),][1:5,]
## Quarter Number of deals Amounts CPIAUCSL Adj.Amounts
## 1 1995-01-01 473 1613610000 0.6046994 975748963
## 3 1995-07-01 397 1611750000 0.6065255 977567499
## 2 1995-04-01 434 1974500000 0.6073069 1199127446
## 4 1995-10-01 449 2055159999 0.6072138 1247921469
## 5 1996-01-01 544 2231810000 0.6215747 1387236652
data.amount <- data[,c('Quarter', 'Adj.Amounts')]
colnames(data.amount)[2] <- "Amounts"
data.num.deals <- data[,c('Quarter', 'Number of deals')]
## Data Plot: Autocorrelation of Quarterly Investment Amounts
ts.data <- ts(data.amount$Amounts, start = c(1995,1), frequency = 4)
# Autocorrelation function of the data
acf(ts.data, main = "Correlogram")
# Partial Autocorrelation function of the data
pacf(ts.data, main = "Correlogram")
tsdisplay(ts.data)
# Test for stationarity
adf.test(ts.data, alternative = "stationary")
##
## Augmented Dickey-Fuller Test
##
## data: ts.data
## Dickey-Fuller = -2.6593, Lag order = 4, p-value = 0.304
## alternative hypothesis: stationary
It is clear that our data is not stationary. Let’s begin by removing the trend by taking the first difference.
ts.diff <- diff(ts.data)
# Plotting the transformed data
quick.plot <- function(var, title){
autoplot(var, color = "#00AFBB") +
ggtitle(title) +
xlab("Year") +
ylab("Amount of Investment (USD)")
}
# Plotting the transformed data
quick.plot(ts.diff, "First Difference of Investment Amounts")
tsdisplay(ts.diff)
# Test for stationarity
adf.test(ts.diff)
##
## Augmented Dickey-Fuller Test
##
## data: ts.diff
## Dickey-Fuller = -4.1977, Lag order = 4, p-value = 0.01
## alternative hypothesis: stationary
Based on the Dickey-Fuller Test, it seems like our data is now stationary and ready to be modeled.
# Splitting the data into a train and test set
train <- data.amount[1:85,]
test <- data.amount[86:96,]
# Baseline Drift Model
# Drift; equivalent to drawing a line between the first and last observations, and extrapolating it into the future
drift.fit <- rwf(train[2], level = 95, drift = TRUE)
# ARIMA Model
set.seed(2019)
arima.fit <- Arima(ts(train[2]), order = c(1,1,0))
set.seed(2019)
arima.auto.fit <- auto.arima(ts(train[2]), approximation = FALSE, stepwise = FALSE)
# Model diagnostics
model.diag <- function(model){
model[2]
checkresiduals(model)
}
# Model summaries
arima.auto.fit # AIC = 3809.72
## Series: ts(train[2])
## ARIMA(1,1,0)
##
## Coefficients:
## ar1
## 0.3432
## s.e. 0.1016
##
## sigma^2 estimated as 2.807e+18: log likelihood=-1902.86
## AIC=3809.72 AICc=3809.87 BIC=3814.58
arima.fit # AIC = 3809.72
## Series: ts(train[2])
## ARIMA(1,1,0)
##
## Coefficients:
## ar1
## 0.3432
## s.e. 0.1016
##
## sigma^2 estimated as 2.807e+18: log likelihood=-1902.86
## AIC=3809.72 AICc=3809.87 BIC=3814.58
drift.fit$model
## Call: rwf(y = train[2], drift = TRUE, level = 95)
##
## Drift: 176080752.1103 (se 193969221.1159)
## Residual sd: 1777757276.5107
model.diag(drift.fit)
##
## Ljung-Box test
##
## data: Residuals from Random walk with drift
## Q* = 27.445, df = 9, p-value = 0.00118
##
## Model df: 1. Total lags used: 10
model.diag(arima.fit)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,0)
## Q* = 6.5252, df = 9, p-value = 0.6864
##
## Model df: 1. Total lags used: 10
# Model fit plots
autoplot(ts(data.amount[1:85,2]), lwd = 1) +
forecast::autolayer(arima.fit$fitted, series = "ARIMA", alpha = 0.6, lwd = 1) +
forecast::autolayer(drift.fit$fitted, series = "Drift", alpha = 0.6, lwd = 1) +
labs(title = "ARIMA Forecast vs. Drift Forecast") + xlab("Quarter") + ylab( "Investment (Billions of Dollars)") +
guides(colour = guide_legend(title = 'Model Legend')) +
theme(legend.position = c(0.12, 0.9),
axis.text.x = element_text(size = 15),
axis.text.y = element_text(size = 15),
axis.title.x = element_text(size = 20),
axis.title.y = element_text(size = 20),
title = element_text(size = 20))
# Model comparison plots
autoplot(ts(data.amount[2]), lwd = 0.8) +
forecast::autolayer(forecast(arima.fit, h=10, level=95), series = "ARIMA", alpha = 0.5 , lwd = 1) +
forecast::autolayer(forecast(drift.fit, h=10, level=95), series = "Drift", alpha = 0.6, lwd = 1) +
labs(title = "ARIMA Forecast vs. Drift Forecast") + xlab("Quarter") + ylab( "Investment (Billions of Dollars)") +
guides(colour = guide_legend(title = 'Model Legend')) +
theme(legend.position = c(0.12, 0.9),
axis.text.x = element_text(size = 15),
axis.text.y = element_text(size = 15),
axis.title.x = element_text(size = 20),
axis.title.y = element_text(size = 20),
title = element_text(size = 20))
# Model predictions
accuracy(forecast(drift.fit, h=10), window(ts(data.amount), start=86)[,2])
## ME RMSE MAE MPE MAPE
## Training set -3.554409e-08 1767143705 1095537808 -2.962055 14.7569
## Test set 2.570835e+09 4676420209 3789259883 9.260021 18.4073
## MASE ACF1 Theil's U
## Training set 0.9919185 0.3402014 NA
## Test set 3.4308600 0.6124931 1.506522
accuracy(forecast(arima.fit, h=10), window(ts(data.amount), start=86)[,2])
## ME RMSE MAE MPE MAPE MASE
## Training set 112413709 1655676837 1102959826 0.910362 16.11667 0.9986385
## Test set 3751783705 5744372291 4526446990 15.173629 21.19132 4.0983217
## ACF1 Theil's U
## Training set -0.04364653 NA
## Test set 0.63904814 1.824753
# Final fit
final.drift <- rwf(data.amount[2], level = 95, drift = TRUE)
# Final forecast predictions
forecast(final.drift, h=10, level=95)
## Point Forecast Lo 95 Hi 95
## 97 24795078011 21078596066 28511559956
## 98 25043196022 19732830120 30353561924
## 99 25291314033 18721440069 31861187996
## 100 25539432044 17877708236 33201155851
## 101 25787550055 17137909055 34437191054
## 102 26035668065 16469787183 35601548948
## 103 26283786076 15854434485 36713137668
## 104 26531904087 15279696211 37784111964
## 105 26780022098 14737244200 38822799996
## 106 27028140109 14221098030 39835182188