J James Reade
10/03/2015
isat and Arima (plus explanatory variables).mov <- read.csv("/home/readejj/Dropbox/Teaching/Reading/ec313/2015/movies/imdb.csv",
stringsAsFactors=F)
table. E.g. how many movies from each year: table(mov$year).summary. E.g. average ratings: summary(mov$imdbrating).is.na. E.g. how many ratings? summary(is.na(mov$imdbrating))rowMeans, rowSums, colMeans, colSums.duplicated: Identifies duplicate entries, e.g. table(duplicated(mov$code)).
mov <- mov[duplicated(mov$code)==F,]data$newvar <- mean(data$oldvar)NULL:
data$newvar <- NULLcolnames(data) function, and gsub.
gsub works by replacing first string with second in object listed third.gsub("newvar","bettername",colnames(data))gsub is useful independently:
gsub(",","",mov$imdbraters)regexpr function:
regexpr("Horror",mov$genre) returns where “horror” found in genre.regexpr("Horror",mov$genre)>-1 indicates whether horror or not.mov$genre <- gsub("-",".",mov$genre) #the dash in sci-fi is really unhelpful!
table(mov$genre[regexpr(";",mov$genre)==-1])
##
## Action Adventure Animation Biography Comedy
## 5861 1538 231 260 165 4046
## Crime Drama Family Fantasy Game.Show History
## 355 8286 421 204 1 107
## Horror Music Musical Mystery News Reality.TV
## 1883 141 110 161 1 6
## Romance Sci.Fi Sport Talk.Show Thriller War
## 583 413 65 3 1842 66
## Western
## 70
genres <- mov$genre[regexpr(";",mov$genre)==-1]
genres <- genres[duplicated(genres)==F]
for(g in genres) {
print(g)
mov[[g]] <- regexpr(g,mov$genre)>-1
}
## [1] "Mystery"
## [1] "Comedy"
## [1] "Thriller"
## [1] ""
## [1] "Drama"
## [1] "Action"
## [1] "Sci.Fi"
## [1] "Horror"
## [1] "Crime"
## [1] "Animation"
## [1] "Family"
## [1] "Music"
## [1] "Romance"
## [1] "Adventure"
## [1] "Western"
## [1] "Biography"
## [1] "Musical"
## [1] "War"
## [1] "Fantasy"
## [1] "Sport"
## [1] "History"
## [1] "Reality.TV"
## [1] "News"
## [1] "Talk.Show"
## [1] "Game.Show"
summary(mov$imdbrating)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 5.100 6.100 5.951 6.900 10.000 21976
summary(mov$imdbrating[mov$Romance==1])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.30 5.50 6.20 6.12 6.90 9.30 1331
summary(mov$imdbrating[mov$Thriller==1])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.600 4.700 5.700 5.666 6.600 9.300 2906
summary(mov$imdbrating[mov$Action==1])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.300 4.600 5.600 5.505 6.500 9.300 2316
summary(mov$imdbrating[mov$Comedy==1])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.100 5.200 6.100 5.944 6.800 10.000 3820
summary(mov$imdbrating[mov$Crime==1])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.20 5.20 6.10 5.96 6.90 9.00 1101
mov$imdbraters <- gsub(",","",mov$imdbraters)
mov$imdbraters <- as.numeric(mov$imdbraters)
mov.reg <- lm(imdbrating ~ imdbraters + Romance + Thriller + Action + Comedy + Crime + Drama + Sci.Fi + Animation + Family + as.character(year),data=mov)
options(scipen=13)
summary(mov.reg)
##
## Call:
## lm(formula = imdbrating ~ imdbraters + Romance + Thriller + Action +
## Comedy + Crime + Drama + Sci.Fi + Animation + Family + as.character(year),
## data = mov)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.2620 -0.7438 0.1368 0.8647 3.4624
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 5.6636572466 0.0306151834 184.995
## imdbraters 0.0000063670 0.0000002665 23.888
## RomanceTRUE 0.0753478021 0.0256598008 2.936
## ThrillerTRUE -0.2405065200 0.0264686494 -9.086
## ActionTRUE -0.4234444344 0.0258949315 -16.352
## ComedyTRUE 0.0332618296 0.0196410551 1.693
## CrimeTRUE 0.0718481996 0.0301618917 2.382
## DramaTRUE 0.5940843872 0.0177721557 33.428
## Sci.FiTRUE -0.4993431481 0.0452836284 -11.027
## AnimationTRUE 0.4541840041 0.0575386738 7.894
## FamilyTRUE -0.0178219969 0.0408339130 -0.436
## as.character(year)1991 0.1041600881 0.0395548279 2.633
## as.character(year)1992 0.0098287194 0.0675161033 0.146
## as.character(year)2000 -0.1531266718 0.0365556448 -4.189
## as.character(year)2001 -0.1056397901 0.0361207101 -2.925
## as.character(year)2002 -0.1276340040 0.0359538728 -3.550
## as.character(year)2003 -0.0054292619 0.0830281761 -0.065
## as.character(year)2013 0.0173171896 0.0360975632 0.480
## as.character(year)2014 0.2798421231 0.0331623887 8.439
## as.character(year)2015 0.7026183030 0.0572242327 12.278
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## imdbraters < 0.0000000000000002 ***
## RomanceTRUE 0.003324 **
## ThrillerTRUE < 0.0000000000000002 ***
## ActionTRUE < 0.0000000000000002 ***
## ComedyTRUE 0.090378 .
## CrimeTRUE 0.017224 *
## DramaTRUE < 0.0000000000000002 ***
## Sci.FiTRUE < 0.0000000000000002 ***
## AnimationTRUE 0.00000000000000308 ***
## FamilyTRUE 0.662514
## as.character(year)1991 0.008462 **
## as.character(year)1992 0.884258
## as.character(year)2000 0.00002814676780118 ***
## as.character(year)2001 0.003452 **
## as.character(year)2002 0.000386 ***
## as.character(year)2003 0.947864
## as.character(year)2013 0.631422
## as.character(year)2014 < 0.0000000000000002 ***
## as.character(year)2015 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.228 on 21549 degrees of freedom
## (21976 observations deleted due to missingness)
## Multiple R-squared: 0.124, Adjusted R-squared: 0.1233
## F-statistic: 160.6 on 19 and 21549 DF, p-value: < 0.00000000000000022
library(knitr)
kable(summary(mov.reg)$coef, digits=2)
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 5.66 | 0.03 | 185.00 | 0.00 |
| imdbraters | 0.00 | 0.00 | 23.89 | 0.00 |
| RomanceTRUE | 0.08 | 0.03 | 2.94 | 0.00 |
| ThrillerTRUE | -0.24 | 0.03 | -9.09 | 0.00 |
| ActionTRUE | -0.42 | 0.03 | -16.35 | 0.00 |
| ComedyTRUE | 0.03 | 0.02 | 1.69 | 0.09 |
| CrimeTRUE | 0.07 | 0.03 | 2.38 | 0.02 |
| DramaTRUE | 0.59 | 0.02 | 33.43 | 0.00 |
| Sci.FiTRUE | -0.50 | 0.05 | -11.03 | 0.00 |
| AnimationTRUE | 0.45 | 0.06 | 7.89 | 0.00 |
| FamilyTRUE | -0.02 | 0.04 | -0.44 | 0.66 |
| as.character(year)1991 | 0.10 | 0.04 | 2.63 | 0.01 |
| as.character(year)1992 | 0.01 | 0.07 | 0.15 | 0.88 |
| as.character(year)2000 | -0.15 | 0.04 | -4.19 | 0.00 |
| as.character(year)2001 | -0.11 | 0.04 | -2.92 | 0.00 |
| as.character(year)2002 | -0.13 | 0.04 | -3.55 | 0.00 |
| as.character(year)2003 | -0.01 | 0.08 | -0.07 | 0.95 |
| as.character(year)2013 | 0.02 | 0.04 | 0.48 | 0.63 |
| as.character(year)2014 | 0.28 | 0.03 | 8.44 | 0.00 |
| as.character(year)2015 | 0.70 | 0.06 | 12.28 | 0.00 |
hist(mov$imdbrating,breaks=100,main="Distribution of all ratings")
plot(mov$imdbrating,mov$imdbraters,main="Number of Raters against Rating")
dates <- c("2015-01-30","2015-02-06","2015-02-13","2015-02-20","2015-02-27","2015-03-06","2015-03-13")
date.1 <- tail(dates,1)
forecast.matches <- read.csv(paste("/home/readejj/Dropbox/Teaching/Reading/ec313/2015/Football-forecasts/forecasts_",date.1,".csv",sep=""),stringsAsFactors=F)
forecast.matches <- forecast.matches[is.na(forecast.matches$outcome)==F,]
prem.matches <- forecast.matches[forecast.matches$division=="English Premier",]
prem.matches <- prem.matches[order(prem.matches$date),]
prem.matches$id <- 1:NROW(prem.matches)
par(mar=c(9,4,4,5)+.1)
plot(prem.matches$id,prem.matches$outcome,xaxt="n",xlab="",ylim=range(0,1),
main="Forecasts of Weekend Premier League Matches",
ylab="Probability of Outcome")
lines(prem.matches$id,prem.matches$Ph,col=2,pch=15,type="p")
lines(prem.matches$id,prem.matches$Pd,col=3,pch=16,type="p")
lines(prem.matches$id,prem.matches$Pa,col=4,pch=17,type="p")
legend("topleft",ncol=4,pch=c(1,15,16,17),col=c(1:4),
legend=c("OLS","OL (home)","OL (draw)","OL (away)"),bty="n")
abline(h=0.5,lty=2)
abline(h=0.6,lty=3)
abline(h=0.7,lty=2)
abline(h=0.4,lty=3)
axis(1,at=prem.matches$id,labels=paste(prem.matches$team1,prem.matches$team2,sep=" v "),las=2,cex.axis=0.65)
if(NROW(prem.matches)>1) {
for(i in 2:NROW(prem.matches)){
if(prem.matches$date[i]!=prem.matches$date[i-1]) {
lines(rep(c(i-0.5),2),c(0,1),lty=2)
}
}
}
rmultinom function for ordinal events.