EC313 Lecture 13

J James Reade

10/03/2015

Introduction

Project Basics

  1. Decide on your topics today!
    • Make best use of term time to get as much done as possible.
    • I will be uncontactable from 11am Friday 27th until April 7.
  2. Find data — plenty of historical data to use to create forecast model.
    • Be ready to show me your data files tomorrow!
  3. Construct models:
    • ARIMA, ETS, time series decomposition.
    • Check for structural breaks, check for outliers.
    • Make use of isat and Arima (plus explanatory variables).
  4. Construct forecasts on training datasets.

Presentation Basics

Data Analysis in R

mov <- read.csv("/home/readejj/Dropbox/Teaching/Reading/ec313/2015/movies/imdb.csv",
                stringsAsFactors=F)
mov$genre <- gsub("-",".",mov$genre) #the dash in sci-fi is really unhelpful!
table(mov$genre[regexpr(";",mov$genre)==-1])
## 
##                Action  Adventure  Animation  Biography     Comedy 
##       5861       1538        231        260        165       4046 
##      Crime      Drama     Family    Fantasy  Game.Show    History 
##        355       8286        421        204          1        107 
##     Horror      Music    Musical    Mystery       News Reality.TV 
##       1883        141        110        161          1          6 
##    Romance     Sci.Fi      Sport  Talk.Show   Thriller        War 
##        583        413         65          3       1842         66 
##    Western 
##         70
genres <- mov$genre[regexpr(";",mov$genre)==-1]
genres <- genres[duplicated(genres)==F]
for(g in genres) {
  print(g)
  mov[[g]] <- regexpr(g,mov$genre)>-1
}
## [1] "Mystery"
## [1] "Comedy"
## [1] "Thriller"
## [1] ""
## [1] "Drama"
## [1] "Action"
## [1] "Sci.Fi"
## [1] "Horror"
## [1] "Crime"
## [1] "Animation"
## [1] "Family"
## [1] "Music"
## [1] "Romance"
## [1] "Adventure"
## [1] "Western"
## [1] "Biography"
## [1] "Musical"
## [1] "War"
## [1] "Fantasy"
## [1] "Sport"
## [1] "History"
## [1] "Reality.TV"
## [1] "News"
## [1] "Talk.Show"
## [1] "Game.Show"
summary(mov$imdbrating)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   5.100   6.100   5.951   6.900  10.000   21976
summary(mov$imdbrating[mov$Romance==1])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1.30    5.50    6.20    6.12    6.90    9.30    1331
summary(mov$imdbrating[mov$Thriller==1])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.600   4.700   5.700   5.666   6.600   9.300    2906
summary(mov$imdbrating[mov$Action==1])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.300   4.600   5.600   5.505   6.500   9.300    2316
summary(mov$imdbrating[mov$Comedy==1])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.100   5.200   6.100   5.944   6.800  10.000    3820
summary(mov$imdbrating[mov$Crime==1])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1.20    5.20    6.10    5.96    6.90    9.00    1101
mov$imdbraters <- gsub(",","",mov$imdbraters)
mov$imdbraters <- as.numeric(mov$imdbraters)
mov.reg <- lm(imdbrating ~ imdbraters + Romance + Thriller + Action + Comedy + Crime + Drama + Sci.Fi + Animation + Family + as.character(year),data=mov)
options(scipen=13)
summary(mov.reg)
## 
## Call:
## lm(formula = imdbrating ~ imdbraters + Romance + Thriller + Action + 
##     Comedy + Crime + Drama + Sci.Fi + Animation + Family + as.character(year), 
##     data = mov)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.2620 -0.7438  0.1368  0.8647  3.4624 
## 
## Coefficients:
##                             Estimate    Std. Error t value
## (Intercept)             5.6636572466  0.0306151834 184.995
## imdbraters              0.0000063670  0.0000002665  23.888
## RomanceTRUE             0.0753478021  0.0256598008   2.936
## ThrillerTRUE           -0.2405065200  0.0264686494  -9.086
## ActionTRUE             -0.4234444344  0.0258949315 -16.352
## ComedyTRUE              0.0332618296  0.0196410551   1.693
## CrimeTRUE               0.0718481996  0.0301618917   2.382
## DramaTRUE               0.5940843872  0.0177721557  33.428
## Sci.FiTRUE             -0.4993431481  0.0452836284 -11.027
## AnimationTRUE           0.4541840041  0.0575386738   7.894
## FamilyTRUE             -0.0178219969  0.0408339130  -0.436
## as.character(year)1991  0.1041600881  0.0395548279   2.633
## as.character(year)1992  0.0098287194  0.0675161033   0.146
## as.character(year)2000 -0.1531266718  0.0365556448  -4.189
## as.character(year)2001 -0.1056397901  0.0361207101  -2.925
## as.character(year)2002 -0.1276340040  0.0359538728  -3.550
## as.character(year)2003 -0.0054292619  0.0830281761  -0.065
## as.character(year)2013  0.0173171896  0.0360975632   0.480
## as.character(year)2014  0.2798421231  0.0331623887   8.439
## as.character(year)2015  0.7026183030  0.0572242327  12.278
##                                    Pr(>|t|)    
## (Intercept)            < 0.0000000000000002 ***
## imdbraters             < 0.0000000000000002 ***
## RomanceTRUE                        0.003324 ** 
## ThrillerTRUE           < 0.0000000000000002 ***
## ActionTRUE             < 0.0000000000000002 ***
## ComedyTRUE                         0.090378 .  
## CrimeTRUE                          0.017224 *  
## DramaTRUE              < 0.0000000000000002 ***
## Sci.FiTRUE             < 0.0000000000000002 ***
## AnimationTRUE           0.00000000000000308 ***
## FamilyTRUE                         0.662514    
## as.character(year)1991             0.008462 ** 
## as.character(year)1992             0.884258    
## as.character(year)2000  0.00002814676780118 ***
## as.character(year)2001             0.003452 ** 
## as.character(year)2002             0.000386 ***
## as.character(year)2003             0.947864    
## as.character(year)2013             0.631422    
## as.character(year)2014 < 0.0000000000000002 ***
## as.character(year)2015 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.228 on 21549 degrees of freedom
##   (21976 observations deleted due to missingness)
## Multiple R-squared:  0.124,  Adjusted R-squared:  0.1233 
## F-statistic: 160.6 on 19 and 21549 DF,  p-value: < 0.00000000000000022
library(knitr)
kable(summary(mov.reg)$coef, digits=2)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.66 0.03 185.00 0.00
imdbraters 0.00 0.00 23.89 0.00
RomanceTRUE 0.08 0.03 2.94 0.00
ThrillerTRUE -0.24 0.03 -9.09 0.00
ActionTRUE -0.42 0.03 -16.35 0.00
ComedyTRUE 0.03 0.02 1.69 0.09
CrimeTRUE 0.07 0.03 2.38 0.02
DramaTRUE 0.59 0.02 33.43 0.00
Sci.FiTRUE -0.50 0.05 -11.03 0.00
AnimationTRUE 0.45 0.06 7.89 0.00
FamilyTRUE -0.02 0.04 -0.44 0.66
as.character(year)1991 0.10 0.04 2.63 0.01
as.character(year)1992 0.01 0.07 0.15 0.88
as.character(year)2000 -0.15 0.04 -4.19 0.00
as.character(year)2001 -0.11 0.04 -2.92 0.00
as.character(year)2002 -0.13 0.04 -3.55 0.00
as.character(year)2003 -0.01 0.08 -0.07 0.95
as.character(year)2013 0.02 0.04 0.48 0.63
as.character(year)2014 0.28 0.03 8.44 0.00
as.character(year)2015 0.70 0.06 12.28 0.00
hist(mov$imdbrating,breaks=100,main="Distribution of all ratings")

plot(mov$imdbrating,mov$imdbraters,main="Number of Raters against Rating")

Forecasting by Simulation

Forecast by Simulation

  1. Model each outcome (football/cricket match, F1 race) up to date.
  2. Input outcomes according to forecasts for upcoming events.
    • E.g. last night Swansea vs Liverpool, Liverpool 52% likely to win.
    • Generate random numbers s.t. Liverpool win 52% of time.
  3. Update league tables and continue forecasting to end of season.
  4. Repeat \(N\) times, where \(N\) as large as possible.
    • Ideally at least 1000 times.
    • Collect results, can present likely outcomes by team and most likely final standings.

Example: Premier League

dates <- c("2015-01-30","2015-02-06","2015-02-13","2015-02-20","2015-02-27","2015-03-06","2015-03-13")
date.1 <- tail(dates,1)
forecast.matches <- read.csv(paste("/home/readejj/Dropbox/Teaching/Reading/ec313/2015/Football-forecasts/forecasts_",date.1,".csv",sep=""),stringsAsFactors=F)
forecast.matches <- forecast.matches[is.na(forecast.matches$outcome)==F,]
prem.matches <- forecast.matches[forecast.matches$division=="English Premier",]
prem.matches <- prem.matches[order(prem.matches$date),]
prem.matches$id <- 1:NROW(prem.matches)
par(mar=c(9,4,4,5)+.1)
plot(prem.matches$id,prem.matches$outcome,xaxt="n",xlab="",ylim=range(0,1),
     main="Forecasts of Weekend Premier League Matches",
     ylab="Probability of Outcome")
lines(prem.matches$id,prem.matches$Ph,col=2,pch=15,type="p")
lines(prem.matches$id,prem.matches$Pd,col=3,pch=16,type="p")
lines(prem.matches$id,prem.matches$Pa,col=4,pch=17,type="p")
legend("topleft",ncol=4,pch=c(1,15,16,17),col=c(1:4),
       legend=c("OLS","OL (home)","OL (draw)","OL (away)"),bty="n")
abline(h=0.5,lty=2)
abline(h=0.6,lty=3)
abline(h=0.7,lty=2)
abline(h=0.4,lty=3)
axis(1,at=prem.matches$id,labels=paste(prem.matches$team1,prem.matches$team2,sep=" v "),las=2,cex.axis=0.65)
if(NROW(prem.matches)>1) {
  for(i in 2:NROW(prem.matches)){
    if(prem.matches$date[i]!=prem.matches$date[i-1]) {
      lines(rep(c(i-0.5),2),c(0,1),lty=2)
    }
  }
}

Modelling Individual Outcomes

Simulating to the end of the season