library(Quandl) # to access data from Quandl, see Quandl.com or run ?Quandl() for details
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(tseries) # a variety of tools for TSE
library(stargazer) # regression outputs (or other tables) formatting, see ?stargazer for details
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.1. https://CRAN.R-project.org/package=stargazer
library(foreach) # conventilnal looping, see ?foreach() for details
library(forecast)
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ purrr::accumulate() masks foreach::accumulate()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks xts::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks xts::last()
## ✖ purrr::when() masks foreach::when()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
df <- Quandl("FRED/CP0213ATM086NEST", api_key="JJzFyVSwQ7zs9zdLeyLo", type='xts')
#nrow(df)
id <- c(1:(nrow(df)/2)*2)
df = df[id]
df = df[27:nrow(df)]
summary(df) # a quick summary of the variables in the set
## Index df
## Min. :2000 Min. : 71.87
## 1st Qu.:2004 1st Qu.: 77.00
## Median :2008 Median : 84.48
## Mean :2008 Mean : 85.15
## 3rd Qu.:2013 3rd Qu.: 92.98
## Max. :2017 Max. :102.02
head(df, 5) # first five rows of df
## [,1]
## Jun 2000 71.87
## Aug 2000 73.55
## Oct 2000 73.62
## Dec 2000 74.06
## Feb 2001 73.79
tail(df, 5) # last five rows of df
## [,1]
## Dec 2015 99.46
## Feb 2016 100.97
## Apr 2016 101.19
## Jun 2016 101.15
## Aug 2016 102.02
plot(df) # a fast plot of raw data
plot(df, main = 'CPI for Beer in Austria')
diff_1 <- diff(df) # taking the first diffs
diff_log_1 <- diff(log(df)) # taking the log-diffs
#plotting the raw and transformed data together
plot.new()
layout(matrix(c(1,2,3), 3, 1))
plot(df, main = '[A] CPI for Beer in Austria')
plot(diff_1, main = '[B] First diffs')
plot(diff_log_1, main = '[C] Log diffs')
adf.test(df) #ADF test, mind the warnings!
##
## Augmented Dickey-Fuller Test
##
## data: df
## Dickey-Fuller = -2.0354, Lag order = 4, p-value = 0.5619
## alternative hypothesis: stationary
kpss.test(df) #KPSS test, mind the warnings!
##
## KPSS Test for Level Stationarity
##
## data: df
## KPSS Level = 3.292, Truncation lag parameter = 2, p-value = 0.01
adf.test(diff_1[-1])
##
## Augmented Dickey-Fuller Test
##
## data: diff_1[-1]
## Dickey-Fuller = -4.5787, Lag order = 4, p-value = 0.01
## alternative hypothesis: stationary
kpss.test(diff_1[-1])
##
## KPSS Test for Level Stationarity
##
## data: diff_1[-1]
## KPSS Level = 0.12361, Truncation lag parameter = 2, p-value = 0.1
adf.test(diff_log_1[-1])
##
## Augmented Dickey-Fuller Test
##
## data: diff_log_1[-1]
## Dickey-Fuller = -4.4533, Lag order = 4, p-value = 0.01
## alternative hypothesis: stationary
kpss.test(diff_log_1[-1])
##
## KPSS Test for Level Stationarity
##
## data: diff_log_1[-1]
## KPSS Level = 0.071833, Truncation lag parameter = 2, p-value = 0.1
plot.new() #reset the plotting device (clear the layout)
acf(diff_log_1[-1])
pacf(diff_log_1[-1])
acf(diff_1[-1])
pacf(diff_1[-1])
plot.new()
layout(matrix(c(1,1,2,2,3,4), 3, 2, byrow = TRUE),
widths=c(1,1), heights=c(2,2,3))
plot(df, main='[A] CPI for Beer in Austria')
plot(diff_log_1, main='[B] the log diffs')
acf(diff_log_1[-1], main='[C] log diffs ACF')
pacf(diff_log_1[-1], main='[D] log diffs pACF')
plot.new()
layout(matrix(c(1,1,2,2,3,4), 3, 2, byrow = TRUE),
widths=c(1,1), heights=c(2,2,3))
plot(df, main='[A] CPI for Beer in Austria')
plot(diff_1, main='[B] the first diffs')
acf(diff_1[-1], main='[C] first diffs ACF')
pacf(diff_1[-1], main='[D] first diffs pACF')
df_fit = df[1:round(0.8*nrow(df))]
df_valid = df[79:round(0.9*nrow(df))]
df_test = df[89:98]
diff_fit = diff(df_fit)
df_col = df
col_vector = read_csv("~/Time Series/Книга1.csv")
## Parsed with column specification:
## cols(
## color = col_character()
## )
plot(df, col = c(rep("black", 78), rep("green", 10), rep("blue", 10)))
Тут я попытался сделать автопредсказание
plot.new()
autopilot <- forecast::auto.arima(diff_log_1, max.order = 10)
arima.prediction <- forecast(autopilot, h = 10)
plot(arima.prediction)
### ESTIMATING and ANALYZING SEVERAL SPECIFICATIONS
## Set the parameters for the procedure
# setting the caps for the specifications orders
phat <- 10
qhat <- 10
# setting critical level(s)
alpha <- 0.1
# other shortcuts
N <- length(df_fit)-1 # number of obs in the transformed series
pq <- expand.grid(0:phat,0:qhat) # all combinations of (p,q)
M <- nrow(pq) # number of specifications
Lhat <- 5 # max lags to use in residual ACF analysis
## Estiamte all the ARMA and collect the results
SKIPm <- c(0) # the specs to skip (if there are errors or something)
#loop over all the combinations of (p,q) and estimate the ARMAs
ARMAs <- foreach(m=seq(1:M)[!(1:M %in% SKIPm)], .combine=rbind) %do% {
# the orders
p_ <- pq[m,1]
q_ <- pq[m,2]
# the estimates
armapq <- arima(diff_1, c(p_,0,q_))
# the AR part coefficients (only valid for p>0)
if(p_>0) {arcoefpq <- armapq$coef[1:p_]}
# stationarity of AR part (only valid for p>0, TRUE for p=0)
if(p_>0) {
statAR <- sum(abs(polyroot(c(1,-arcoefpq)))>1)==p_
} else {
statAR <- TRUE
}
# the residuals
respq <- armapq$residuals
# the LB test results
LBpvalpq <- Box.test(respq[-1], lag = Lhat, type = c("Ljung-Box"), fitdf=p_+q_)$p.value
# info criteria
aicpq <- AIC(armapq)
bicpq <- AIC(armapq, k=log(N))
# combine the results together
data.frame(m = m, # spec number
p = p_, # AR order
q = q_, # MA order
stat = statAR, # AR part stationarity check
LB = LBpvalpq, # LB test p-value
AIC = aicpq, # AIC
SBIC = bicpq) # SBIC
}
Топ 5 моделек
# remove invalid LB test results
ARMAs <- ARMAs[!is.nan(ARMAs$LB),]
# keep only stationary & without residual correlation
ARMAs <- ARMAs[ARMAs$stat == 1 & ARMAs$LB >= alpha,] %>% arrange(desc(-SBIC))
# sort by SBIC (keep top-5)
#top5 = ARMAs[order(ARMAs$SBIC)[1:5],]
ARMAs
## m p q stat LB AIC SBIC
## 1 2 1 0 TRUE 0.1957278 252.7468 259.7782
## 2 12 0 1 TRUE 0.2248593 253.0472 260.0786
## 3 13 1 1 TRUE 0.1096387 254.6992 264.0744
## 4 3 2 0 TRUE 0.1079333 254.7250 264.1002
## 5 23 0 2 TRUE 0.1184396 254.9901 264.3653
## 6 34 0 3 TRUE 0.2223720 254.9816 266.7007
## 7 4 3 0 TRUE 0.1201760 255.8964 267.6154
## 8 5 4 0 TRUE 0.1106763 255.8346 269.8974
#estimate the models
diff_valid = diff(df_valid)
arma1 <- arima(diff_fit, order=c(1,0,0))
arma2 <- arima(diff_fit, order=c(0,0,1))
arma3 <- arima(diff_fit, order=c(1,0,1))
arma4 <- arima(diff_fit, order=c(2,0,0))
arma5 <- arima(diff_fit, order=c(0,0,2))
arma6 <- arima(diff_fit, order=c(0,0,3))
arma7 <- arima(diff_fit, order=c(3,0,0))
arma8 <- arima(diff_fit, order=c(4,0,0))
valid1=forecast(diff_fit, model = arma1, h = 10)
acc1=accuracy(valid1,diff_valid)
valid2=forecast(diff_fit, model = arma2, h = 10)
acc2=accuracy(valid2,diff_valid)
valid3=forecast(diff_fit, model = arma3, h = 10)
acc3=accuracy(valid3,diff_valid)
valid4=forecast(diff_fit, model = arma4, h = 10)
acc4=accuracy(valid4,diff_valid)
valid5=forecast(diff_fit, model = arma5, h = 10)
acc5=accuracy(valid5,diff_valid)
valid6=forecast(diff_fit, model = arma6, h = 10)
acc6=accuracy(valid6,diff_valid)
valid7=forecast(diff_fit, model = arma7, h = 10)
acc7=accuracy(valid7,diff_valid)
valid8=forecast(diff_fit, model = arma8, h = 10)
acc8=accuracy(valid8,diff_valid)
valid_models = data.frame(p = ARMAs$p,
q = ARMAs$q,
id = 1:nrow(ARMAs),
RMSE = c(acc1[2,2], acc2[2,2],
acc3[2,2], acc4[2,2],
acc5[2,2], acc6[2,2],
acc7[2,2], acc8[2,2]),
MAE = c(acc1[2,3], acc2[2,3],
acc3[2,3], acc4[2,3],
acc5[2,3], acc6[2,3],
acc7[2,3], acc8[2,3])) %>% arrange(desc(-MAE)) %>% top_n(5, MAE)
diff_testing = diff(df_test)
testing1=forecast(diff_testing, model = arma1, h = 10)
acc1=accuracy(testing1,diff_testing)
testing2=forecast(diff_testing, model = arma2, h = 10)
acc2=accuracy(testing2,diff_testing)
testing3=forecast(diff_testing, model = arma3, h = 10)
acc3=accuracy(testing3,diff_testing)
testing4=forecast(diff_testing, model = arma4, h = 10)
acc4=accuracy(testing4,diff_testing)
testing5=forecast(diff_testing, model = arma5, h = 10)
acc5=accuracy(testing5,diff_testing)
testing6=forecast(diff_testing, model = arma6, h = 10)
acc6=accuracy(testing6,diff_testing)
testing7=forecast(diff_testing, model = arma7, h = 10)
acc7=accuracy(testing7,diff_testing)
testing8=forecast(diff_testing, model = arma8, h = 10)
acc8=accuracy(testing8,diff_testing)
testing_models = data.frame(p = ARMAs$p,
q = ARMAs$q,
id = 1:nrow(ARMAs),
RMSE = c(acc1[2,2], acc2[2,2],
acc3[2,2], acc4[2,2],
acc5[2,2], acc6[2,2],
acc7[2,2], acc8[2,2]),
MAE = c(acc1[2,3], acc2[2,3],
acc3[2,3], acc4[2,3],
acc5[2,3], acc6[2,3],
acc7[2,3], acc8[2,3])) %>%
filter(id %in% valid_models$id) %>%
arrange(desc(-MAE)) %>% top_n(3, -MAE)
# # estimate the top-5 specs again and check the outputs
# ARMAs2 <- foreach(m=ARMAs[order(ARMAs$SBIC)[1:5],1]) %do% {
# arima(diff_1, c(pq[m,1],0,pq[m,2]))
# }
#
#
# ## PLOT the BEST MODEL RESULTS
#
# #create a new time series object
# best_arma <- data.frame(index = index(df),
# diff_1 = as.numeric(diff_1),
# d1fitted = as.numeric(diff_valid-ARMAs2[[1]]$residuals),
# d1residuals = as.numeric(ARMAs2[[1]]$residuals),
# df = as.numeric(df),
# dffitted = as.numeric(df-ARMAs2[[1]]$residuals))
# best_arma <- xts(best_arma[,2:6], order.by = best_arma$index)
# colnames(best_arma) <- c('Log diffs', 'Fitted log diffs', 'Residuals', 'Raw data in levels', 'Fitted levels')
#
#
# plot(best_arma,
# main='Fitted MA(1) for the log differences results')#, legend.loc='topright')
# addLegend("topright", on=1,
# lty = c(1, 1, 1, 1, 1),
# lwd = c(2, 2, 2, 2, 2))
plot.new()
layout(matrix(c(1,2,3), 3, 1, byrow = TRUE))
pred1 <- forecast(diff_1, h = 6,
model = arima(diff_fit,
order=c(testing_models$p[1],0,testing_models$q[1])))
plot(pred1)
pred2 <- forecast(diff_1, h = 6,
model = arima(diff_fit,
order=c(testing_models$p[2],0,testing_models$q[2])))
plot(pred2)
pred3 <- forecast(diff_1, h = 6,
model = arima(diff_fit,
order=c(testing_models$p[3],0,testing_models$q[3])))
plot(pred3)
df2 = data.frame(V1 = c(df[98] + pred3[[4]][1],
df[98] + pred3[[4]][1] + pred3[[4]][2],
df[98] + pred3[[4]][1] + pred3[[4]][2] + pred3[[4]][3],
df[98] + pred3[[4]][1] + pred3[[4]][2] + pred3[[4]][3] + pred3[[4]][4],
df[98] + pred3[[4]][1] + pred3[[4]][2] + pred3[[4]][3] + pred3[[4]][4] + pred3[[4]][5],
df[98] + pred3[[4]][1] + pred3[[4]][2] + pred3[[4]][3] + pred3[[4]][4] + pred3[[4]][5] + pred3[[4]][6]), time = c("2016/10/01", "2016/12/01", "2017/02/01", "2017/04/01", "2017/06/01", "2017/08/01"))
## Warning in data.row.names(row.names, rowsi, i): some row.names duplicated:
## 2,3,4,5,6 --> row.names NOT used
#row.names(df2) = c("October 2016", "December 2016", "February 2017", "April 2017", "June 2017", "August 2017")
df2$time = ymd(df2$time)
df2 = xts(dplyr::select(df2, -time), order.by = as.Date(df2$time))
df2 = xts(transform(df2, col = 1))
plot(df2$V1)
df_col = xts(transform(df, col = 2))
df3 = rbind(df_col, df2)
df4 = data.frame(df3)
df4$date = as.Date(as.yearmon(rownames(df4)))
ggplot(df4, aes(x = date, y = V1)) +
geom_line(aes(color = col), size = 1) +
#scale_color_hue(l = 40, c = 30) +
theme_minimal() +
xlab("Date") +
ylab("CPI") +
ggtitle("CPI for Beer in Austria. Prediction for 1 year") + theme(legend.position = "none")
plot(df3$V1, col = df4$col)