Forecast Competition: Google Trends

Popularity of David Cameron…

library(Quandl)

## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

Quandl.auth("y8y3ezt48WZeqUqq1yQd")
library(quantmod)

## Loading required package: TTR
## Version 0.4-0 included new data defaults. See ?getSymbols.

library(zoo)
library(forecast)

## Loading required package: timeDate
## This is forecast 5.8

library(knitr)
options(scipen=13)
setwd("/home/readejj/Dropbox/Teaching/Reading/ec313/2015/Lecture-10")
dave09 <- read.csv("Dave_090215.csv",stringsAsFactors=F)
dave09$Week2 <- as.Date(substr(dave09$Week,1,10))
plot(dave09$Week2,dave09$david.cameron,
     main="Google Search Popularity of David Cameron",
     ylab="Search popularity",xlab="Date",type="l",ylim=range(0,20))

dave09.ts <- ts(dave09$david.cameron,start=c(2004,1),freq=52)
tsdisplay(dave09.ts)

dave09.arima <- auto.arima(dave09.ts)
dave09.forc <- forecast(dave09.arima,h=8)
plot(dave09.forc,include=100)

dave09.forc$mean

## Time Series:
## Start = c(2015, 8) 
## End = c(2015, 15) 
## Frequency = 52 
## [1] 4.977339 4.967643 4.963494 4.961719 4.960959 4.960634 4.960495 4.960435

dave23 <- read.csv("dave_230215.csv",stringsAsFactors=F)
tail(dave23,2)

##                        Week david.cameron
## 580 2015-02-08 - 2015-02-14             4
## 581 2015-02-15 - 2015-02-21             4

Forecast Competition: Inflation…

cpi <- read.csv("cpi_090215.csv",stringsAsFactors=F)
cpi.t <- ts(cpi$Actual[order(cpi$DateTime)],start=c(2007,1),freq=12)
cpi.t.a <- auto.arima(cpi.t)
cpi.t.f <- forecast(cpi.t.a,h=10)
plot(cpi.t.f,main="Forecasts of CPI Inflation",ylab="Inflation (%, YoY)",xlab="Date")

cpi.t.f$mean

##            Jan       Feb       Mar       Apr       May       Jun       Jul
## 2015 0.5015949 0.6033607 0.6889686 0.7836768 0.9564424 0.8226495 0.9617996
##            Aug       Sep       Oct
## 2015 1.0601266 1.2608149 1.2895016

cpi23 <- read.csv("cpi_230215.csv",stringsAsFactors=F)
tail(cpi23,1)

##             DateTime Actual Consensus Previous
## 97 20070213 09:30:00    2.7         3       NA

Can we reflect general expectation of lower inflation?
- “UK heading for negative inflation, Bank of England expected to warn”, The Guardian
One way would be to use a longer time series…

getSymbols("GBRCPIALLMINMEI",src="FRED",return.class="zoo") # CPI since 1955 (from FRED)

##     As of 0.4-0, 'getSymbols' uses env=parent.frame() and
##  auto.assign=TRUE by default.
## 
##  This  behavior  will be  phased out in 0.5-0  when the call  will
##  default to use auto.assign=FALSE. getOption("getSymbols.env") and 
##  getOptions("getSymbols.auto.assign") are now checked for alternate defaults
## 
##  This message is shown once per session and may be disabled by setting 
##  options("getSymbols.warning4.0"=FALSE). See ?getSymbol for more details

## [1] "GBRCPIALLMINMEI"

ukcpi <- as.numeric(coredata(diff(log(GBRCPIALLMINMEI),differences=1,lag=12)))
ukcpi.t <- ts(ukcpi,start=c(1956,1),freq=12)
tsdisplay(ukcpi.t)

tsdisplay(diff(ukcpi.t,differences=1))

inf.ar1 <- Arima(ukcpi.t,order=c(2,1,2))
inf.ar2 <- Arima(ukcpi.t,order=c(3,1,2))
inf.ar3 <- Arima(ukcpi.t,order=c(1,1,2))
inf.ar4 <- Arima(ukcpi.t,order=c(2,1,3))
inf.ar5 <- Arima(ukcpi.t,order=c(2,1,1))
aiccs <- c(inf.ar1$aicc,inf.ar2$aicc,inf.ar3$aicc,inf.ar4$aicc,inf.ar5$aicc)
inf.auto <- auto.arima(ukcpi.t)
inf.forc <- forecast(inf.ar1,h=60)
inf.auto.forc <- forecast(inf.auto,h=60)
plot(ukcpi.t,xlim=range(2008,2020),ylim=range(0,0.05),main="UK CPI Inflation",ylab="%",xlab="Date")
lines(inf.forc$mean,col="blue")
lines(inf.auto.forc$mean,col="red")

Submit forecast: http://goo.gl/forms/odKgSjyiU0

R: What is it?

R is a powerful computer programming language.
R is free. open source, and widely used in academia and industry.
- Free: Put it on your computer!
- Open source: Anyone can write code (centrally vetted).
  - New procedures usually coded in R first.
- Widely used: Lots of online help.
  - Impressive skill to add to CV.

Lots of online help: Resources

Exam Feedback

Help me understand: No bunching or other deadlines, yet evidence for lack of preparation.
Maths! I explicitly said pre-exam that Section A required maths.
- Why not thus attempt to understand the maths covered in lectures?
  - You might as a result have learnt that omitted variable bias only occurs with correlation between omitted variable(s) and included variable(s).
Be careful with your words! They betray lack of understanding.
- Omitted variable bias is a modelling problem when using in-sample data, it is not a forecast problem.
- An estimator is biased, or unbiased, it is not distorted, unreliable, efficient, etc.
- A forecast error is not the same as a regression equation error: \[ \begin{align} \text{Regression Equation:} & y_t = \beta_0 + \beta_1 x_t + e_t, \qquad e_t \sim \text{iid} (0,\sigma^2),\\ \text{Residuals:} & \widehat{e}_t = y_t - \widehat{y}_t = y_t - \widehat{\beta}_0 - \widehat{\beta}_1 x_{t},\\ \text{Forecast Equation:} & y_{T+h\left|T\right.} = \widehat{\beta}_0 + \widehat{\beta}_1 x_{T+h} + e_{T+h},\\ \text{Forecast Error:} & \widehat{e}_{T+h\left|T\right.} = y_{T+h} - \widehat{y}_{T+h\left|T\right.} = y_{T+h} - \widehat{\beta}_0 - \widehat{\beta}_1 x_{T+h\left|T\right.}. \end{align} \]
Main point: Bluffing is useless. No substitute for preparation.
- Even more so in the real world…
- If you use a term (e.g. reliable, robust, etc), define what you mean by it.
Feedback: https://www.dropbox.com/s/t6dbcefi0cjjzm4/midterm_i_notes.pdf?dl=0
- Minimal scrawling on your scripts.
- Happy to answer individual questions.

Today: Further Adventures in R

More on loading data.
Doing things with data:
- Plotting.
- Manipulating.
- Running models.

Loading Data: Quandl

FTSE last week

ftse <- Quandl("YAHOO/INDEX_FTSE")
ftse <- ftse[order(ftse$Date),]
ftse.0 <- ftse[ftse$Date<as.Date("2015-02-13"),]
ftse.auto <- auto.arima(ftse$Close)
ftse.forc <- forecast(ftse.auto,h=50)
plot(ftse.forc,include=100)

ftse.forc$mean[seq(1,35,5)]

## [1] 6948.161 6949.152 6952.735 6956.367 6960.000 6963.632 6967.264

ftse[ftse$Date==as.Date("2015-02-13") | ftse$Date==as.Date("2015-02-20"),]

##         Date   Open   High    Low  Close    Volume Adjusted Close
## 8 2015-02-13 6828.1 6887.6 6828.1 6873.5         0         6873.5
## 3 2015-02-20 6888.9 6920.5 6884.8 6915.2 764028800         6915.2

Loading Data: quantmod

getSymbols("YHOO",src="google",return.class="zoo") # from google finance

## [1] "YHOO"

plot(YHOO)

getSymbols("GOOG",src="yahoo",return.class="zoo") # from yahoo finance

## [1] "GOOG"

plot(GOOG)

getSymbols("DEXJPUS",src="FRED",return.class="zoo") # FX rates from FRED

## [1] "DEXJPUS"

plot(DEXJPUS)

Loading Data: Via CSV or Excel

read.csv() is easiest way:
- Other than Import Dataset…
Also better way: Reproducable.
Always include option stringsAsFactors=F.
Example: Collect data on government spending from:
- http://www.ukpublicspending.co.uk/download_multi_year
- Select percent GDP and go from 1700 to 2016.

gs <- read.csv("ukgs_1700_2016.csv",stringsAsFactors=F)
head(gs)

##   Year GDP.UK...163..billion Population.UK.million
## 1 1700               0.06058                 9.100
## 2 1701                0.0608                 9.129
## 3 1702               0.06102                 9.158
## 4 1703               0.06124                 9.188
## 5 1704               0.06146                 9.217
## 6 1705               0.06168                 9.247
##   Total.Spending.total.percent.GDP X
## 1                             5.28 a
## 2                             5.66 a
## 3                             8.21 a
## 4                             8.68 a
## 5                             8.99 a
## 6                             9.52 a

str(gs)

## 'data.frame':    317 obs. of  5 variables:
##  $ Year                            : int  1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 ...
##  $ GDP.UK...163..billion           : chr  "0.06058" "0.0608" "0.06102" "0.06124" ...
##  $ Population.UK.million           : num  9.1 9.13 9.16 9.19 9.22 ...
##  $ Total.Spending.total.percent.GDP: num  5.28 5.66 8.21 8.68 8.99 ...
##  $ X                               : chr  "a" "a" "a" "a" ...

Plotting Data: Line Plots

Vital for understanding data and its characteristics.
- Often need to set type to “l” for line, “p” for points, “o” for both.
- Can set colour from range of hundreds.
- Set title: main = "Title of plot", labels (xlab, ylab).

plot(gs$Year,gs$Total.Spending.total.percent.GDP,type="l")

Plotting Data: Scatter Plots

Is one series related to another?

getSymbols('JTSJOR',src='FRED',return.class="zoo")

## [1] "JTSJOR"

getSymbols('UNRATE',src='FRED',return.class="zoo")

## [1] "UNRATE"

getSymbols('CPIAUCSL',src='FRED',return.class="zoo")

## [1] "CPIAUCSL"

plot(UNRATE,JTSJOR)

For Phillips curve, want inflation from CPI:

infl <- diff(log(CPIAUCSL),differences=1,lag=12)
plot(UNRATE,infl,main="US Phillips Curve?",ylab="Inflation",xlab="Unemployment Rate")

Heard the one about the Japanese Phillips Curve?
- Data via Fred as it has good international coverage.
- http://qed.econ.queensu.ca/working_papers/papers/qed_wp_1083.pdf

getSymbols('JPNURHARMMDSMEI',src='FRED',return.class="zoo")

## [1] "JPNURHARMMDSMEI"

getSymbols('JPNCPIALLMINMEI',src='FRED',return.class="zoo")

## [1] "JPNCPIALLMINMEI"

infl.j <- diff(log(JPNCPIALLMINMEI),differences=1,lag=12)
plot(JPNURHARMMDSMEI,infl.j,main="Japan's Phillips Curve",ylab="Inflation (CPI)",xlab="Unemployment",pch=16)

plot(-JPNURHARMMDSMEI,infl.j,main="Japan's Phillips Curve",ylab="Inflation (CPI)",xlab="-Unemployment",pch=16)

plot(-JPNURHARMMDSMEI[index(JPNURHARMMDSMEI)>=as.Date("1980-01-01") & index(JPNURHARMMDSMEI)<=as.Date("2005-08-01")],
     infl.j[index(infl.j)>=as.Date("1980-01-01") & index(infl.j)<=as.Date("2005-08-01")],main="Japan's Phillips Curve",ylab="Inflation (CPI)",xlab="-Unemployment",pch=16)

Plotting Data: Distributions

hist() is the main command for density plots.
- breaks option can be used to create more bars (can be more informative).

hist(infl.j)

hist(infl.j,breaks=50)

Modelling: Linear Regression

Standard command for linear regression is lm().
- Formula takes a litte bit of familiarity:
- lm(dep.var ~ ind.var.1 + ind.var.2 + ...,data=dataset)

Modelling: Model Selection

Recall concern regarding model selection.
- Stepwise: No additional package needed.
- General-to-specific: Need gets package.
  - install.packages("gets")
  - library(gets)

Modelling: arx

An alternative method for modelling:
- Compatible with gets.
- Nice graphical output.
- Commands more R-like.

Model Selection: From Lecture 4…

Need many replications in simulations.
Hoover and Perez (1999):
- various DGPs, 0–5 significant variables.
- GUM always has 40 variables.
- 75% of time true model in selected model.
Numerous subsequent improvements in algorithm:
- Hendry and Krolzig (1999, 2001, 2004)
Software variants:
- Autometrics, coded by Jurgen Doornik in OxMetrics.
- gets 0.2, coded by Genaro Sucarrat.

Example of Model Selection

Beveridge Curve example: Vacancy rate on unemployment rate.
But how many lags are necessary? Which variables?
Might other variables matter?
- Matching process: Number of hires? Long-term unemployed?
- Participation rate?
- Frictional unemployment?

getSymbols('JTSQUR',src='FRED',return.class="zoo") #quits

## [1] "JTSQUR"

getSymbols('JTSHIL',src='FRED',return.class="zoo") #total hires

## [1] "JTSHIL"

getSymbols('LNU01300000',src='FRED',return.class="zoo") #participation rate

## [1] "LNU01300000"

getSymbols('UEMPMEAN',src='FRED',return.class="zoo") #average duration of unemployment

## [1] "UEMPMEAN"

plot(JTSJOR,main="Vacancy Rate through time",ylab="%",xlab="Date")

Current and lag of every variable:

partrate <- window(LNU01300000, start="2000-12-01",end="2014-12-01")
uempmean <- window(UEMPMEAN, start="2000-12-01",end="2014-12-01")
unrate <- window(UNRATE, start="2000-12-01",end="2014-12-01")
vars <- data.frame(partrate,uempmean,unrate,JTSHIL,JTSJOR,JTSQUR,
                   "partrate.1"=lag(partrate,-1,na.pad=TRUE),
                   "uempmean.1"=lag(uempmean,-1,na.pad=TRUE),
                   "unrate.1"=lag(unrate,-1,na.pad=TRUE),
                   "JTSHIL.1"=lag(JTSHIL,-1,na.pad=TRUE),
                   "JTSQUR.1"=lag(JTSQUR,-1,na.pad=TRUE),
                   "JTSJOR.1"=lag(JTSJOR,-1,na.pad=TRUE))

vac.mod <- lm(JTSJOR ~ JTSJOR.1 + unrate + unrate.1 + partrate + partrate.1 + 
                JTSHIL + JTSHIL.1 + JTSQUR + JTSQUR.1 + uempmean + uempmean.1, data=vars)
summary(vac.mod)

## 
## Call:
## lm(formula = JTSJOR ~ JTSJOR.1 + unrate + unrate.1 + partrate + 
##     partrate.1 + JTSHIL + JTSHIL.1 + JTSQUR + JTSQUR.1 + uempmean + 
##     uempmean.1, data = vars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.37444 -0.06651  0.00058  0.07390  0.34219 
## 
## Coefficients:
##                Estimate  Std. Error t value            Pr(>|t|)    
## (Intercept)  2.09266489  1.34423754   1.557              0.1216    
## JTSJOR.1     0.58912942  0.05940339   9.917 <0.0000000000000002 ***
## unrate       0.03834411  0.06948536   0.552              0.5819    
## unrate.1    -0.06202105  0.06767094  -0.917              0.3608    
## partrate    -0.01055820  0.03290480  -0.321              0.7487    
## partrate.1  -0.02959515  0.03351824  -0.883              0.3786    
## JTSHIL       0.00023295  0.00007807   2.984              0.0033 ** 
## JTSHIL.1     0.00002741  0.00008522   0.322              0.7481    
## JTSQUR      -0.10079557  0.15295015  -0.659              0.5109    
## JTSQUR.1     0.29100532  0.14559861   1.999              0.0474 *  
## uempmean     0.00405668  0.01464278   0.277              0.7821    
## uempmean.1   0.00442336  0.01385946   0.319              0.7500    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1199 on 156 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.9249, Adjusted R-squared:  0.9196 
## F-statistic: 174.6 on 11 and 156 DF,  p-value: < 0.00000000000000022

Stepwise selection:

vac.step <- step(vac.mod)

## Start:  AIC=-701.18
## JTSJOR ~ JTSJOR.1 + unrate + unrate.1 + partrate + partrate.1 + 
##     JTSHIL + JTSHIL.1 + JTSQUR + JTSQUR.1 + uempmean + uempmean.1
## 
##              Df Sum of Sq    RSS     AIC
## - uempmean    1   0.00110 2.2433 -703.09
## - uempmean.1  1   0.00146 2.2436 -703.07
## - partrate    1   0.00148 2.2436 -703.06
## - JTSHIL.1    1   0.00149 2.2437 -703.06
## - unrate      1   0.00438 2.2465 -702.85
## - JTSQUR      1   0.00624 2.2484 -702.71
## - partrate.1  1   0.01121 2.2534 -702.34
## - unrate.1    1   0.01207 2.2542 -702.27
## <none>                    2.2422 -701.18
## - JTSQUR.1    1   0.05742 2.2996 -698.93
## - JTSHIL      1   0.12798 2.3701 -693.85
## - JTSJOR.1    1   1.41365 3.6558 -621.04
## 
## Step:  AIC=-703.09
## JTSJOR ~ JTSJOR.1 + unrate + unrate.1 + partrate + partrate.1 + 
##     JTSHIL + JTSHIL.1 + JTSQUR + JTSQUR.1 + uempmean.1
## 
##              Df Sum of Sq    RSS     AIC
## - JTSHIL.1    1   0.00114 2.2444 -705.01
## - partrate    1   0.00134 2.2446 -704.99
## - unrate      1   0.00397 2.2472 -704.80
## - JTSQUR      1   0.00590 2.2492 -704.65
## - unrate.1    1   0.01112 2.2544 -704.26
## - partrate.1  1   0.01253 2.2558 -704.16
## <none>                    2.2433 -703.09
## - JTSQUR.1    1   0.05686 2.3001 -700.89
## - uempmean.1  1   0.06173 2.3050 -700.53
## - JTSHIL      1   0.13201 2.3753 -695.49
## - JTSJOR.1    1   1.42575 3.6690 -622.44
## 
## Step:  AIC=-705.01
## JTSJOR ~ JTSJOR.1 + unrate + unrate.1 + partrate + partrate.1 + 
##     JTSHIL + JTSQUR + JTSQUR.1 + uempmean.1
## 
##              Df Sum of Sq    RSS     AIC
## - partrate    1   0.00124 2.2456 -706.92
## - unrate      1   0.00327 2.2477 -706.76
## - JTSQUR      1   0.00591 2.2503 -706.57
## - unrate.1    1   0.01037 2.2548 -706.23
## - partrate.1  1   0.01229 2.2567 -706.09
## <none>                    2.2444 -705.01
## - uempmean.1  1   0.06657 2.3110 -702.10
## - JTSQUR.1    1   0.08556 2.3300 -700.72
## - JTSHIL      1   0.14729 2.3917 -696.33
## - JTSJOR.1    1   1.44845 3.6929 -623.35
## 
## Step:  AIC=-706.92
## JTSJOR ~ JTSJOR.1 + unrate + unrate.1 + partrate.1 + JTSHIL + 
##     JTSQUR + JTSQUR.1 + uempmean.1
## 
##              Df Sum of Sq    RSS     AIC
## - unrate      1   0.00269 2.2483 -708.71
## - JTSQUR      1   0.00569 2.2513 -708.49
## - unrate.1    1   0.00957 2.2552 -708.20
## <none>                    2.2456 -706.92
## - partrate.1  1   0.05835 2.3040 -704.61
## - uempmean.1  1   0.07168 2.3173 -703.64
## - JTSQUR.1    1   0.08433 2.3300 -702.72
## - JTSHIL      1   0.14608 2.3917 -698.33
## - JTSJOR.1    1   1.46549 3.7111 -624.52
## 
## Step:  AIC=-708.71
## JTSJOR ~ JTSJOR.1 + unrate.1 + partrate.1 + JTSHIL + JTSQUR + 
##     JTSQUR.1 + uempmean.1
## 
##              Df Sum of Sq    RSS     AIC
## - JTSQUR      1   0.00620 2.2545 -710.25
## - unrate.1    1   0.01965 2.2680 -709.25
## <none>                    2.2483 -708.71
## - partrate.1  1   0.05791 2.3062 -706.44
## - uempmean.1  1   0.06900 2.3173 -705.64
## - JTSQUR.1    1   0.08172 2.3301 -704.72
## - JTSHIL      1   0.14465 2.3930 -700.24
## - JTSJOR.1    1   1.48892 3.7373 -625.34
## 
## Step:  AIC=-710.25
## JTSJOR ~ JTSJOR.1 + unrate.1 + partrate.1 + JTSHIL + JTSQUR.1 + 
##     uempmean.1
## 
##              Df Sum of Sq    RSS     AIC
## - unrate.1    1   0.02018 2.2747 -710.75
## <none>                    2.2545 -710.25
## - partrate.1  1   0.06166 2.3162 -707.72
## - JTSQUR.1    1   0.07682 2.3314 -706.62
## - uempmean.1  1   0.07834 2.3329 -706.51
## - JTSHIL      1   0.18045 2.4350 -699.32
## - JTSJOR.1    1   1.52533 3.7799 -625.44
## 
## Step:  AIC=-710.75
## JTSJOR ~ JTSJOR.1 + partrate.1 + JTSHIL + JTSQUR.1 + uempmean.1
## 
##              Df Sum of Sq    RSS     AIC
## <none>                    2.2747 -710.75
## - uempmean.1  1   0.05937 2.3341 -708.43
## - partrate.1  1   0.09332 2.3680 -706.00
## - JTSQUR.1    1   0.11567 2.3904 -704.42
## - JTSHIL      1   0.25766 2.5324 -694.73
## - JTSJOR.1    1   1.97031 4.2450 -607.94

Gets selection:

xregs <- cbind(vars$unrate[2:NROW(vars)],vars$unrate.1[2:NROW(vars)],
                             vars$partrate[2:NROW(vars)],vars$partrate.1[2:NROW(vars)],
                             vars$JTSHIL[2:NROW(vars)],vars$JTSHIL.1[2:NROW(vars)],
                             vars$JTSQUR[2:NROW(vars)],vars$JTSQUR.1[2:NROW(vars)],
                             vars$uempmean[2:NROW(vars)],vars$uempmean.1[2:NROW(vars)])
colnames(xregs) <- c("unrate","unrate.1","partrate","partrate.1","hires","hires.1","quits","quits.1","unempdur","unempdur.1")
library(gets)
vac.arx <- arx(vars$JTSJOR[2:NROW(vars)],mc=T,ar=1:5,mxreg = xregs)

vac.gets <- getsm(vac.arx)

## Searching path no. 1 out of 12
## Searching path no. 2 out of 12
## Searching path no. 3 out of 12
## Searching path no. 4 out of 12
## Searching path no. 5 out of 12
## Searching path no. 6 out of 12
## Searching path no. 7 out of 12
## Searching path no. 8 out of 12
## Searching path no. 9 out of 12
## Searching path no. 10 out of 12
## Searching path no. 11 out of 12
## Searching path no. 12 out of 12

Modelling: Outlier/Break Detection

Package: strucchange
Package: gets

Exponential Smoothing

Recall model (recap at https://www.otexts.org/fpp/7: \[ \begin{align} y_{t+h\|t} &= \ell_{t} + hb_{t} + s_{t-m+h_m^+} \\ \ell_{t} &= \alpha(y_{t} - s_{t-m}) + (1 - \alpha)(\ell_{t-1} + b_{t-1})\\ b_{t} &= \beta^*(\ell_{t} - \ell_{t-1}) + (1 - \beta^*)b_{t-1}\\ s_{t} &= \gamma (y_{t}-\ell_{t-1}-b_{t-1}) + (1-\gamma)s_{t-m}, \end{align} \]

ets() function in R does everything.
- Selects best model via AICc.
- Tool, not gospel.

ukcpi.ets <- ets(ukcpi.t)
plot(ukcpi.ets)

Time Series Decomposition

Recall model (recap at https://www.otexts.org/fpp/6: \[ y_{t} = S_{t} + T_{t} + E_{t}, \]
Function for most general form of decomposition is stl().
Of constituent parts, from classical method, ma() carries out moving average:

plot(ukcpi.t)
lines(ma(ukcpi.t,50),col=2)

STL: “Seasonal and Trend decomposition using Loess”.
- Allows flexibility in trend and seasonal component. “periodic” means fixed window.
- Only additive however, although can use log function.

ukcpi.stl <- stl(ukcpi.t,s.window="periodic",t.window=10)
plot(ukcpi.stl)

plot(stl(ukcpi.t,s.window=10,t.window=100))

Recall aim: Remainder “looks” like white noise.

Forecast Function

forecast() function works well with standard models:
- ARIMA via Arima() function.
  - Can add explanatory variables also: dynamic modelling.
  - See https://www.otexts.org/fpp/9/1 in textbook.
- Time series decomposition via stl() function.
- Exponential smoothing models via ets() function.
Plotting forecasts straightforward using plot() function.
forecast() returns a list with various useful items:
- $mean contains the actual forecast values.
- $upper and $lower contains the upper and lower confidence intervals (80 and 95%).

ukcpi.ets.f <- forecast(ukcpi.ets,h=60)
ukcpi.stl.f <- forecast(ukcpi.stl,h=60)
inf.forc <- forecast(inf.ar1,h=60)
inf.auto.forc <- forecast(inf.auto,h=60)
plot(ukcpi.t,xlim=range(2008,2016),ylim=range(-0.01,0.05))
lines(inf.forc$mean,col=2)
lines(inf.auto.forc$mean,col=3)
lines(ukcpi.ets.f$mean,col=4)
lines(ukcpi.stl.f$mean,col=5)
legend("bottomleft",lty=1,col=c(1,2,3,4,5),legend=c("Data","AR","auto.arima","ETS","STL"),bty="n",ncol=2)

How to train your model

Standard procedure: Hold back some observations to forecast over.
E.g. end sample at end of 2012 for inflation:

ukcpi.0 <- window(ukcpi.t,end=c(2012,12))
ukcpi.1 <- window(ukcpi.t,start=c(2013,1))

ukcpi.stl.0 <- stl(ukcpi.0,s.window="periodic",t.window=10)
ukcpi.stl.f.0 <- forecast(ukcpi.stl.0,h=24)
ukcpi.ets.0 <- ets(ukcpi.0)
ukcpi.ets.f.0 <- forecast(ukcpi.ets.0,h=24)
inf.ar1.0 <- Arima(ukcpi.0,order=c(2,1,2))
inf.forc.0 <- forecast(inf.ar1.0,h=24)
inf.auto.0 <- auto.arima(ukcpi.0)
inf.auto.forc.0 <- forecast(inf.auto.0,h=24)

plot(ukcpi.t,xlim=range(2008,2015),ylim=range(0,0.05))
lines(inf.forc.0$mean,col=2)
lines(inf.auto.forc.0$mean,col=3)
lines(ukcpi.ets.f.0$mean,col=4)
lines(ukcpi.stl.f.0$mean,col=5)
legend("bottomleft",lty=1,col=c(1,2,3,4,5),legend=c("Data","AR","auto.arima","ETS","STL"),bty="n",ncol=2)

#check accuracy via various forecast measures
accuracy(inf.forc.0,ukcpi.1)

##                          ME        RMSE         MAE       MPE     MAPE
## Training set -0.00001795836 0.005052316 0.003364368      -Inf      Inf
## Test set     -0.00810631704 0.010338084 0.008212941 -64.68629 65.05564
##                   MASE        ACF1 Theil's U
## Training set 0.1688247 0.001958033        NA
## Test set     0.4121273 0.804603661  5.543466

accuracy(inf.auto.forc.0,ukcpi.1)

##                          ME        RMSE         MAE       MPE     MAPE
## Training set -0.00002114912 0.004142458 0.002845168      -Inf      Inf
## Test set     -0.01207163377 0.013701909 0.012071634 -87.35295 87.35295
##                   MASE         ACF1 Theil's U
## Training set 0.1427712 -0.003581477        NA
## Test set     0.6057573  0.794546894  6.893837

accuracy(ukcpi.ets.f.0,ukcpi.1)

##                          ME       RMSE         MAE       MPE     MAPE
## Training set -0.00002846032 0.00521723 0.003427318      -Inf      Inf
## Test set     -0.00846351111 0.01081692 0.008565600 -67.47745 67.83284
##                   MASE       ACF1 Theil's U
## Training set 0.1719836 0.06423661        NA
## Test set     0.4298238 0.81043717  5.779589

accuracy(ukcpi.stl.f.0,ukcpi.1)

##                          ME       RMSE         MAE
## Training set -0.00001372806 0.00519169 0.003410987
## Test set     -0.00820352912 0.01049068 0.008312254
##                                   MPE                    MAPE      MASE
## Training set -11360238444364688.00000 11360238444364706.00000 0.1711641
## Test set                    -65.46754                65.84349 0.4171108
##                    ACF1 Theil's U
## Training set 0.06229391        NA
## Test set     0.80201728  5.611123

#make table to print out in your output
forc.table <- rbind(accuracy(inf.forc.0,ukcpi.1)[2,],
                    accuracy(inf.auto.forc.0,ukcpi.1)[2,],
                    accuracy(ukcpi.ets.f.0,ukcpi.1)[2,],
                    accuracy(ukcpi.stl.f.0,ukcpi.1)[2,])
rownames(forc.table) <- c("ARIMA","auto.arima","ETS","STL")
kable(forc.table)

	ME	RMSE	MAE	MPE	MAPE	MASE	ACF1	Theil’s U
ARIMA	-0.0081063	0.0103381	0.0082129	-64.68629	65.05564	0.4121273	0.8046037	5.543466
auto.arima	-0.0120716	0.0137019	0.0120716	-87.35295	87.35295	0.6057573	0.7945469	6.893837
ETS	-0.0084635	0.0108169	0.0085656	-67.47745	67.83284	0.4298238	0.8104372	5.779590
STL	-0.0082035	0.0104907	0.0083123	-65.46754	65.84349	0.4171108	0.8020173	5.611123

Modelling: Very Long Term Data

As earlier: data downloaded as CSV, then manually checked and imported from:
- http://www.ukpublicspending.co.uk/download_multi_year_1700_2016UKp_14c1li111mcn_F0t
- http://www.ukpublicrevenue.co.uk/download_multi_year_1700_2016UKp_14c1li111mcn_F0t

gs <- read.csv("ukgs_1700_2016.csv",stringsAsFactors=F)
gr <- read.csv("ukgr_1700_2016.csv",stringsAsFactors=F)
plot(gs$Year,gs$Total.Spending.total.percent.GDP,type="l")
lines(gr$Year,gr$Total.Direct.Revenue.total.percent.GDP,type="l",col=2)

Forecast Competition

Submit via usual form: http://goo.gl/forms/gCr5fXTecg

Forecast Competition League Table

#load data
wk1 <- read.csv("Wk1.csv",stringsAsFactors=F)
wk1$Wk <- 1
wk1$X <- NULL
wk1 <- wk1[!(wk1$Timestamp=="15/01/2015 16:51:56" & wk1$Name=="James Reade"),]
wk2 <- read.csv("Wk2.csv",stringsAsFactors=F)
wk2$Wk <- 2
wk3 <- read.csv("Wk3.csv",stringsAsFactors=F)
wk3$Wk <- 3
wk4 <- read.csv("Wk4.csv",stringsAsFactors=F)
wk4$Wk <- 4
wk5 <- read.csv("Wk5.csv",stringsAsFactors=F)
wk5$Wk <- 5

#want to combine data so first collect all variable names
total.cols <- union(union(colnames(wk1),colnames(wk2)),union(colnames(wk3),colnames(wk4)))
total.cols <- union(total.cols,colnames(wk5))
#next add empty columns to datasets with variable names
for(i in setdiff(total.cols,colnames(wk1))) {wk1[[i]] <- NA}
for(i in setdiff(total.cols,colnames(wk2))) {wk2[[i]] <- NA}
for(i in setdiff(total.cols,colnames(wk3))) {wk3[[i]] <- NA}
for(i in setdiff(total.cols,colnames(wk4))) {wk4[[i]] <- NA}
for(i in setdiff(total.cols,colnames(wk5))) {wk5[[i]] <- NA}
forcs <- rbind(wk1,wk2,wk3,wk4,wk5)
forcs <- forcs[forcs$Name!="Outcome" & forcs$Name!="",]
forcs$Name <- gsub("Desiati Cosimo","Cosimo Desiati",forcs$Name)

#outcomes
outcomes <- data.frame("temp"=0,stringsAsFactors=F)
for(i in total.cols) {outcomes[[i]] <- NA}
outcomes$temp <- NULL
outcomes$What.value.will.FTSE.close.at.on.Friday.January.16. <- ftse$Close[ftse$Date==as.Date("2015-01-16")]
outcomes$What.value.will.the.FTSE.close.at.on.Friday.January.23. <- ftse$Close[ftse$Date==as.Date("2015-01-23")]
outcomes$What.value.will.the.FTSE.close.at.on.Friday.January.30. <- ftse$Close[ftse$Date==as.Date("2015-01-30")]
outcomes$What.value.will.the.FTSE.close.at.on.Friday.February.6. <- ftse$Close[ftse$Date==as.Date("2015-02-06")]
outcomes$What.value.will.the.FTSE.close.at.on.Friday.February.13. <- ftse$Close[ftse$Date==as.Date("2015-02-13")]
outcomes$What.value.will.the.FTSE.close.at.on.Friday.February.20. <- ftse$Close[ftse$Date==as.Date("2015-02-20")]
outcomes$What.will.be.the.relative.search.volume.for.David.Cameron.in.the.week.commencing.January.11. <- 
  dave09$david.cameron[dave09$Week=="2015-01-11 - 2015-01-17"]
outcomes$What.will.be.the.relative.search.volume.for.David.Cameron.in.the.week.commencing.January.18. <- 
  dave09$david.cameron[dave09$Week=="2015-01-18 - 2015-01-24"]
outcomes$What.will.be.the.relative.search.volume.for.David.Cameron.in.the.week.commencing.January.25. <- 
  dave09$david.cameron[dave09$Week=="2015-01-25 - 2015-01-31"]
outcomes$What.will.be.the.relative.search.volume.for.David.Cameron.in.the.week.commencing.February.1. <- 
  dave09$david.cameron[dave09$Week=="2015-02-01 - 2015-02-07"]
outcomes$What.will.be.the.relative.search.volume.for.David.Cameron.in.the.week.commencing.February.8. <- 
  dave23$david.cameron[dave23$Week=="2015-02-08 - 2015-02-14"]
outcomes$What.will.be.the.relative.search.volume.for.David.Cameron.in.the.week.commencing.February.15. <- 
  dave23$david.cameron[dave23$Week=="2015-02-15 - 2015-02-21"]
gdp09 <- read.csv("gdp09.csv",stringsAsFactors=F)
outcomes$What.will.GDP.growth..QoQ..be.for.2014Q4. <- gdp09$Actual[gdp09$DateTime=="20150127 09:30:00"]
trade09 <- read.csv("trade_090215.csv",stringsAsFactors=F)
outcomes$What.will.the.total.trade.balance.be.for.December.2014. <- 
  trade09$Actual[trade09$DateTime=="20150206 09:30:00"]
mgage09 <- read.csv("mortgage_090215.csv",stringsAsFactors=F)
outcomes$What.will.mortgage.approvals.be.for.December.2014. <- mgage09$Actual[mgage09$DateTime=="20150130 09:30:00"]
outcomes$What.will.the.increase.in.CPI..YoY..be.for.January.2015. <- tail(cpi23,1)

#first create forecast absolute % errors
abspc <- data.frame("Wk"=forcs$Wk,"Name"=forcs$Name)
for(i in colnames(outcomes)[grep("What.",colnames(outcomes))]) {
  if(is.na(outcomes[,i])==F) {
    abspc[[i]] <- abs((as.numeric(forcs[,i]) - as.numeric(outcomes[,i]))/as.numeric(outcomes[,i]))
  } else {
    abspc[[i]] <- NA
  }
}

## Warning in if (is.na(outcomes[, i]) == F) {: the condition has length > 1
## and only the first element will be used

## Warning: NAs introduced by coercion

## Warning in as.numeric(forcs[, i]) - as.numeric(outcomes[, i]): longer
## object length is not a multiple of shorter object length

## Warning: NAs introduced by coercion

## Warning in (as.numeric(forcs[, i]) - as.numeric(outcomes[,
## i]))/as.numeric(outcomes[, : longer object length is not a multiple of
## shorter object length

#first metric: APE to 1DP
#first create matrix with column per name and forecasts down columns
ppl <- data.frame("temp"=rep(NA,4*NROW(grep("What.",colnames(abspc)))))
names <- abspc$Name[duplicated(abspc$Name)==F]
for(i in names) {
  person.forcs <- abspc[abspc$Name==i,c(1,grep("What.",colnames(abspc)))]
  full.person.forcs <- c()
  for(j in 1:4) {
    if(j %in% person.forcs$Wk) {
      full.person.forcs <- cbind(full.person.forcs,
                                 t(as.numeric(person.forcs[person.forcs$Wk==j,
                                                           grep("What.",colnames(person.forcs))])))
    } else {
      full.person.forcs <- cbind(full.person.forcs,t(rep(NA,NROW(grep("What.",colnames(person.forcs))))))
    }    
  }
  ppl[[i]] <- t(full.person.forcs)
}
ppl$temp <- NULL
mape <- round(colMeans(ppl,na.rm=T),1)
#tie-breaker: number of forecasts
no.forcs <- colSums(is.na(ppl)==F)
#table
lge.tab <- data.frame("Name"=colnames(ppl),"MAPE"=as.numeric(mape),"No. Forecasts"=as.numeric(no.forcs),
                      "Adjusted"=as.numeric(mape)-as.numeric(no.forcs)/2000)
lge.tab <- lge.tab[order(lge.tab$Adjusted),]
kable(lge.tab[1:10,])

	Name	MAPE	No..Forecasts	Adjusted
6	James Reade	0.1	39	0.0805
12	Adam Shermon	0.3	5	0.2975
8	Lloyd Morrish-Thomas	0.4	32	0.3840
16	Chris Tucker	0.4	8	0.3960
17	Luke	0.4	2	0.3990
4	Will Colwell	0.5	15	0.4925
19	Matthew	0.5	3	0.4985
3	Neil Chandratreya	0.5	2	0.4990
13	Marios H	0.5	2	0.4990
18	JMW	0.5	2	0.4990

EC313 Lecture and Class, Week 7

Introduction

Forecast Competition: Google Trends

Forecast Competition: Inflation…

R: What is it?

Lots of online help: Resources

Exam Feedback

Today: Further Adventures in R

Loading Data: Quandl

Loading Data: quantmod

Loading Data: Via CSV or Excel

Plotting Data: Line Plots

Plotting Data: Scatter Plots

Plotting Data: Distributions

Modelling: Linear Regression

Modelling: Model Selection

Modelling: arx

Model Selection: From Lecture 4…

Example of Model Selection

Modelling: Outlier/Break Detection

Exponential Smoothing

Time Series Decomposition

Forecast Function

How to train your model

Modelling: Very Long Term Data

Forecast Competition

Forecast Competition League Table