library(Quandl)     # to access data from Quandl, see Quandl.com or run ?Quandl() for details
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(tseries)    # a variety of tools for TSE
library(stargazer)  # regression outputs (or other tables) formatting, see ?stargazer for details
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.1. https://CRAN.R-project.org/package=stargazer
library(foreach)    # conventilnal looping, see ?foreach() for details
library(forecast)
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   0.8.3     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ purrr::accumulate() masks foreach::accumulate()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::first()      masks xts::first()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ dplyr::last()       masks xts::last()
## ✖ purrr::when()       masks foreach::when()
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
df <- Quandl("FRED/CP0213ATM086NEST", api_key="JJzFyVSwQ7zs9zdLeyLo", type='xts')

#nrow(df)

id <- c(1:(nrow(df)/2)*2)

df = df[id]

df = df[27:nrow(df)]
summary(df)    # a quick summary of the variables in the set
##      Index            df        
##  Min.   :2000   Min.   : 71.87  
##  1st Qu.:2004   1st Qu.: 77.00  
##  Median :2008   Median : 84.48  
##  Mean   :2008   Mean   : 85.15  
##  3rd Qu.:2013   3rd Qu.: 92.98  
##  Max.   :2017   Max.   :102.02
head(df, 5)    # first five rows of df
##           [,1]
## Jun 2000 71.87
## Aug 2000 73.55
## Oct 2000 73.62
## Dec 2000 74.06
## Feb 2001 73.79
tail(df, 5)    # last five rows of df
##            [,1]
## Dec 2015  99.46
## Feb 2016 100.97
## Apr 2016 101.19
## Jun 2016 101.15
## Aug 2016 102.02
plot(df)       # a fast plot of raw data

plot(df, main = 'CPI for Beer in Austria')

diff_1   <- diff(df)       # taking the first diffs
diff_log_1 <- diff(log(df))  # taking the log-diffs

#plotting the raw and transformed data together
plot.new()
layout(matrix(c(1,2,3), 3, 1))
plot(df,     main = '[A] CPI for Beer in Austria')
plot(diff_1,    main = '[B] First diffs')
plot(diff_log_1,  main = '[C] Log diffs')

adf.test(df)   #ADF test, mind the warnings!
## 
##  Augmented Dickey-Fuller Test
## 
## data:  df
## Dickey-Fuller = -2.0354, Lag order = 4, p-value = 0.5619
## alternative hypothesis: stationary
kpss.test(df)  #KPSS test, mind the warnings!
## 
##  KPSS Test for Level Stationarity
## 
## data:  df
## KPSS Level = 3.292, Truncation lag parameter = 2, p-value = 0.01
adf.test(diff_1[-1])
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff_1[-1]
## Dickey-Fuller = -4.5787, Lag order = 4, p-value = 0.01
## alternative hypothesis: stationary
kpss.test(diff_1[-1])
## 
##  KPSS Test for Level Stationarity
## 
## data:  diff_1[-1]
## KPSS Level = 0.12361, Truncation lag parameter = 2, p-value = 0.1
adf.test(diff_log_1[-1])
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff_log_1[-1]
## Dickey-Fuller = -4.4533, Lag order = 4, p-value = 0.01
## alternative hypothesis: stationary
kpss.test(diff_log_1[-1])
## 
##  KPSS Test for Level Stationarity
## 
## data:  diff_log_1[-1]
## KPSS Level = 0.071833, Truncation lag parameter = 2, p-value = 0.1
plot.new() #reset the plotting device (clear the layout)
acf(diff_log_1[-1])

pacf(diff_log_1[-1])

acf(diff_1[-1])

pacf(diff_1[-1])

plot.new()
layout(matrix(c(1,1,2,2,3,4), 3, 2, byrow = TRUE),
       widths=c(1,1), heights=c(2,2,3))
plot(df, main='[A] CPI for Beer in Austria')
plot(diff_log_1, main='[B] the log diffs')
acf(diff_log_1[-1], main='[C] log diffs ACF')
pacf(diff_log_1[-1], main='[D] log diffs pACF')

plot.new()
layout(matrix(c(1,1,2,2,3,4), 3, 2, byrow = TRUE),
       widths=c(1,1), heights=c(2,2,3))

plot(df, main='[A] CPI for Beer in Austria')
plot(diff_1, main='[B] the first diffs')
acf(diff_1[-1], main='[C] first diffs ACF')
pacf(diff_1[-1], main='[D] first diffs pACF')

df_fit = df[1:round(0.8*nrow(df))]
df_valid = df[79:round(0.9*nrow(df))]
df_test = df[89:98]
diff_fit = diff(df_fit)
df_col = df
col_vector = read_csv("~/Time Series/Книга1.csv")
## Parsed with column specification:
## cols(
##   color = col_character()
## )
plot(df, col = c(rep("black", 78), rep("green", 10), rep("blue", 10)))

Тут я попытался сделать автопредсказание

plot.new()
autopilot <- forecast::auto.arima(diff_log_1, max.order = 10)
arima.prediction <- forecast(autopilot, h = 10)
plot(arima.prediction)

### ESTIMATING and ANALYZING SEVERAL SPECIFICATIONS

## Set the parameters for the procedure

# setting the caps for the specifications orders
phat <- 10
qhat <- 10

# setting critical level(s)
alpha <- 0.1

# other shortcuts
N    <- length(df_fit)-1 # number of obs in the transformed series
pq   <- expand.grid(0:phat,0:qhat) # all combinations of (p,q)
M    <- nrow(pq) # number of specifications
Lhat <- 5 # max lags to use in residual ACF analysis


## Estiamte all the ARMA and collect the results

SKIPm <- c(0) # the specs to skip (if there are errors or something)

#loop over all the combinations of (p,q) and estimate the ARMAs
ARMAs <- foreach(m=seq(1:M)[!(1:M %in% SKIPm)], .combine=rbind) %do% {
  
  # the orders
  p_ <- pq[m,1]
  q_ <- pq[m,2]
  
  # the estimates
  armapq <- arima(diff_1, c(p_,0,q_))
  
  # the AR part coefficients (only valid for p>0)
  if(p_>0) {arcoefpq <- armapq$coef[1:p_]}
  
  # stationarity of AR part (only valid for p>0, TRUE for p=0)
  if(p_>0) {
    statAR <- sum(abs(polyroot(c(1,-arcoefpq)))>1)==p_
  } else {
    statAR <- TRUE
  }
  
  # the residuals
  respq <- armapq$residuals
  
  # the LB test results
  LBpvalpq <- Box.test(respq[-1], lag = Lhat, type = c("Ljung-Box"), fitdf=p_+q_)$p.value
  
  # info criteria
  aicpq <- AIC(armapq)
  bicpq <- AIC(armapq, k=log(N))
  
  # combine the results together
  data.frame(m    = m,         # spec number
             p    = p_,        # AR order
             q    = q_,        # MA order
             stat = statAR,    # AR part stationarity check
             LB   = LBpvalpq,  # LB test p-value
             AIC  = aicpq,     # AIC
             SBIC = bicpq)     # SBIC
}

Топ 5 моделек

# remove invalid LB test results
ARMAs <- ARMAs[!is.nan(ARMAs$LB),]

# keep only stationary & without residual correlation 
ARMAs <- ARMAs[ARMAs$stat == 1 & ARMAs$LB >= alpha,] %>% arrange(desc(-SBIC))

# sort by SBIC (keep top-5)
#top5 = ARMAs[order(ARMAs$SBIC)[1:5],]
ARMAs
##    m p q stat        LB      AIC     SBIC
## 1  2 1 0 TRUE 0.1957278 252.7468 259.7782
## 2 12 0 1 TRUE 0.2248593 253.0472 260.0786
## 3 13 1 1 TRUE 0.1096387 254.6992 264.0744
## 4  3 2 0 TRUE 0.1079333 254.7250 264.1002
## 5 23 0 2 TRUE 0.1184396 254.9901 264.3653
## 6 34 0 3 TRUE 0.2223720 254.9816 266.7007
## 7  4 3 0 TRUE 0.1201760 255.8964 267.6154
## 8  5 4 0 TRUE 0.1106763 255.8346 269.8974
#estimate the models
diff_valid = diff(df_valid)

arma1 <- arima(diff_fit, order=c(1,0,0))
arma2 <- arima(diff_fit, order=c(0,0,1))
arma3 <- arima(diff_fit, order=c(1,0,1))
arma4 <- arima(diff_fit, order=c(2,0,0))
arma5 <- arima(diff_fit, order=c(0,0,2))
arma6 <- arima(diff_fit, order=c(0,0,3))
arma7 <- arima(diff_fit, order=c(3,0,0))
arma8 <- arima(diff_fit, order=c(4,0,0))

valid1=forecast(diff_fit, model = arma1, h = 10)
acc1=accuracy(valid1,diff_valid)

valid2=forecast(diff_fit, model = arma2, h = 10)
acc2=accuracy(valid2,diff_valid)

valid3=forecast(diff_fit, model = arma3, h = 10)
acc3=accuracy(valid3,diff_valid)
valid4=forecast(diff_fit, model = arma4, h = 10)
acc4=accuracy(valid4,diff_valid)
valid5=forecast(diff_fit, model = arma5, h = 10)
acc5=accuracy(valid5,diff_valid)

valid6=forecast(diff_fit, model = arma6, h = 10)
acc6=accuracy(valid6,diff_valid)
valid7=forecast(diff_fit, model = arma7, h = 10)
acc7=accuracy(valid7,diff_valid)
valid8=forecast(diff_fit, model = arma8, h = 10)
acc8=accuracy(valid8,diff_valid)

valid_models = data.frame(p = ARMAs$p, 
                          q = ARMAs$q, 
                          id = 1:nrow(ARMAs), 
                          RMSE = c(acc1[2,2], acc2[2,2], 
                                   acc3[2,2], acc4[2,2], 
                                   acc5[2,2], acc6[2,2],
                                   acc7[2,2], acc8[2,2]),
                          MAE = c(acc1[2,3], acc2[2,3],
                                  acc3[2,3], acc4[2,3],
                                  acc5[2,3], acc6[2,3],
                                  acc7[2,3], acc8[2,3])) %>% arrange(desc(-MAE)) %>% top_n(5, MAE)
diff_testing = diff(df_test)
testing1=forecast(diff_testing, model = arma1, h = 10)
acc1=accuracy(testing1,diff_testing)
testing2=forecast(diff_testing, model = arma2, h = 10)
acc2=accuracy(testing2,diff_testing)
testing3=forecast(diff_testing, model = arma3, h = 10)
acc3=accuracy(testing3,diff_testing)
testing4=forecast(diff_testing, model = arma4, h = 10)
acc4=accuracy(testing4,diff_testing)
testing5=forecast(diff_testing, model = arma5, h = 10)
acc5=accuracy(testing5,diff_testing)
testing6=forecast(diff_testing, model = arma6, h = 10)
acc6=accuracy(testing6,diff_testing)
testing7=forecast(diff_testing, model = arma7, h = 10)
acc7=accuracy(testing7,diff_testing)
testing8=forecast(diff_testing, model = arma8, h = 10)
acc8=accuracy(testing8,diff_testing)


testing_models = data.frame(p = ARMAs$p, 
                            q = ARMAs$q, 
                            id = 1:nrow(ARMAs),
                            RMSE = c(acc1[2,2], acc2[2,2], 
                                   acc3[2,2], acc4[2,2], 
                                   acc5[2,2], acc6[2,2],
                                   acc7[2,2], acc8[2,2]),
                          MAE = c(acc1[2,3], acc2[2,3],
                                  acc3[2,3], acc4[2,3],
                                  acc5[2,3], acc6[2,3],
                                  acc7[2,3], acc8[2,3])) %>% 
  filter(id %in% valid_models$id) %>% 
  arrange(desc(-MAE)) %>% top_n(3, -MAE)
# # estimate the top-5 specs again and check the outputs
# ARMAs2 <- foreach(m=ARMAs[order(ARMAs$SBIC)[1:5],1]) %do% {
#   arima(diff_1, c(pq[m,1],0,pq[m,2]))
# }
# 
# 
# ## PLOT the BEST MODEL RESULTS
# 
# #create a new time series object
# best_arma <- data.frame(index       = index(df),
#                         diff_1         = as.numeric(diff_1),
#                         d1fitted    = as.numeric(diff_valid-ARMAs2[[1]]$residuals),
#                         d1residuals = as.numeric(ARMAs2[[1]]$residuals),
#                         df          = as.numeric(df),
#                         dffitted    = as.numeric(df-ARMAs2[[1]]$residuals))
# best_arma <- xts(best_arma[,2:6], order.by = best_arma$index)
# colnames(best_arma) <- c('Log diffs', 'Fitted log diffs', 'Residuals', 'Raw data in levels', 'Fitted levels')
# 
# 
# plot(best_arma,
#      main='Fitted MA(1) for the log differences results')#,     legend.loc='topright')
# addLegend("topright", on=1, 
#           lty = c(1, 1, 1, 1, 1),
#           lwd = c(2, 2, 2, 2, 2))
plot.new()
layout(matrix(c(1,2,3), 3, 1, byrow = TRUE))
pred1 <- forecast(diff_1, h = 6, 
                  model = arima(diff_fit, 
                                order=c(testing_models$p[1],0,testing_models$q[1])))
plot(pred1)
pred2 <- forecast(diff_1, h = 6, 
                  model = arima(diff_fit, 
                                order=c(testing_models$p[2],0,testing_models$q[2])))
plot(pred2)
pred3 <- forecast(diff_1, h = 6, 
                  model = arima(diff_fit, 
                                order=c(testing_models$p[3],0,testing_models$q[3])))
plot(pred3)

df2 = data.frame(V1 = c(df[98] + pred3[[4]][1],
                        df[98] + pred3[[4]][1] + pred3[[4]][2],
                        df[98] + pred3[[4]][1] + pred3[[4]][2] + pred3[[4]][3],
                        df[98] + pred3[[4]][1] + pred3[[4]][2] + pred3[[4]][3] + pred3[[4]][4],
                        df[98] + pred3[[4]][1] + pred3[[4]][2] + pred3[[4]][3] + pred3[[4]][4] + pred3[[4]][5],
                        df[98] + pred3[[4]][1] + pred3[[4]][2] + pred3[[4]][3] + pred3[[4]][4] + pred3[[4]][5] + pred3[[4]][6]), time = c("2016/10/01", "2016/12/01", "2017/02/01", "2017/04/01", "2017/06/01", "2017/08/01"))
## Warning in data.row.names(row.names, rowsi, i): some row.names duplicated:
## 2,3,4,5,6 --> row.names NOT used
#row.names(df2) = c("October 2016", "December 2016", "February 2017", "April 2017", "June 2017", "August 2017")

df2$time = ymd(df2$time)

df2 = xts(dplyr::select(df2, -time), order.by = as.Date(df2$time))
df2 = xts(transform(df2, col = 1))
plot(df2$V1)

df_col = xts(transform(df, col = 2))
df3 = rbind(df_col, df2)
df4 = data.frame(df3)
df4$date = as.Date(as.yearmon(rownames(df4)))


ggplot(df4, aes(x = date, y = V1)) + 
  geom_line(aes(color = col), size = 1) +
  #scale_color_hue(l = 40, c = 30) +
  theme_minimal() + 
  xlab("Date") + 
  ylab("CPI") + 
  ggtitle("CPI for Beer in Austria. Prediction for 1 year") + theme(legend.position = "none")

plot(df3$V1, col = df4$col)