library(ggplot2)## Warning: package 'ggplot2' was built under R version 3.2.4
library(dplyr)## Warning: package 'dplyr' was built under R version 3.2.5
library(statsr)
library(BAS)## Warning: package 'BAS' was built under R version 3.2.5
load("movies.Rdata")As shown in the codebook, the data set is comprised of 651 randomly sampled movies produced and released before 2016. Hence, the inference method only implicates generalization rather than causality.
# create feature_film
movies <- mutate(movies, feature_film = factor(ifelse(title_type == 'Feature Film', 'yes', 'no')))
# create drama
movies <- mutate(movies, drama = factor(ifelse(genre =='Drama', 'yes', 'no')))
# create mpaa_rating_R
movies <- mutate(movies, mpaa_rating_R = factor(ifelse(mpaa_rating == 'R', 'yes', 'no')))
# create oscar_season
movies <- mutate(movies, oscar_season = factor(ifelse(thtr_rel_month >= 10, 'yes', 'no')))
# create summer_season
movies <- mutate(movies, summer_season = factor(ifelse(thtr_rel_month %in% c(5,6,7,8), 'yes', 'no')))
head(movies[c("title_type", "feature_film", "genre", "drama", "mpaa_rating", "mpaa_rating_R")])## # A tibble: 6 x 6
## title_type feature_film genre drama mpaa_rating mpaa_rating_R
## <fctr> <fctr> <fctr> <fctr> <fctr> <fctr>
## 1 Feature Film yes Drama yes R yes
## 2 Feature Film yes Drama yes PG-13 no
## 3 Feature Film yes Comedy no R yes
## 4 Feature Film yes Drama yes PG no
## 5 Feature Film yes Horror no R yes
## 6 Documentary no Documentary no Unrated no
head(movies[c("thtr_rel_month", "oscar_season", "summer_season")])## # A tibble: 6 x 3
## thtr_rel_month oscar_season summer_season
## <dbl> <fctr> <fctr>
## 1 4 no no
## 2 3 no no
## 3 8 no yes
## 4 10 yes no
## 5 9 no no
## 6 1 no no
new.movies <- movies[c("feature_film", "drama", "mpaa_rating_R", "oscar_season", "summer_season", "audience_score")]
summary(new.movies)## feature_film drama mpaa_rating_R oscar_season summer_season
## no : 60 no :346 no :322 no :460 no :443
## yes:591 yes:305 yes:329 yes:191 yes:208
##
##
##
##
## audience_score
## Min. :11.00
## 1st Qu.:46.00
## Median :65.00
## Mean :62.36
## 3rd Qu.:80.00
## Max. :97.00
All new features are binary type. Feature films takes the majority of title types. Drama and mpaa_rating_R have roughly half movies in their respective categories. Osaca_season and summer_season have more fillms released than other months.
“audience_score” has the median slightly higher than mean, which results in a slightly left skewed distribution.
ggplot(new.movies, aes(factor(feature_film), audience_score, fill=feature_film)) +
geom_boxplot() +
ggtitle('Audience Score Distribution by Feature Film') +
xlab('feature_film') +
ylab('audience_score')bayes_inference(y = audience_score, x = feature_film, data = new.movies, statistic = "mean", type = "ht", null = 0, alternative = "twosided")## Response variable: numerical, Explanatory variable: categorical (2 levels)
## n_no = 60, y_bar_no = 81.05, s_no = 13.5764
## n_yes = 591, y_bar_yes = 60.4653, s_yes = 19.824
## (Assuming intrinsic prior on parameters)
## Hypotheses:
## H1: mu_no = mu_yes
## H2: mu_no != mu_yes
##
## Priors:
## P(H1) = 0.5
## P(H2) = 0.5
##
## Results:
## BF[H2:H1] = 4.221452e+13
## P(H1|data) = 0
## P(H2|data) = 1
ggplot(new.movies, aes(factor(drama), audience_score, fill=drama)) +
geom_boxplot() +
ggtitle('Audience Score Distribution by Drama') +
xlab('drama') +
ylab('audience_score')bayes_inference(y = audience_score, x = drama, data = new.movies, statistic = "mean", type = "ht", null = 0, alternative = "twosided")## Response variable: numerical, Explanatory variable: categorical (2 levels)
## n_no = 346, y_bar_no = 59.7312, s_no = 21.2775
## n_yes = 305, y_bar_yes = 65.3475, s_yes = 18.5418
## (Assuming intrinsic prior on parameters)
## Hypotheses:
## H1: mu_no = mu_yes
## H2: mu_no != mu_yes
##
## Priors:
## P(H1) = 0.5
## P(H2) = 0.5
##
## Results:
## BF[H2:H1] = 22.6567
## P(H1|data) = 0.0423
## P(H2|data) = 0.9577
ggplot(new.movies, aes(factor(mpaa_rating_R), audience_score, fill=mpaa_rating_R)) +
geom_boxplot() +
ggtitle('Audience Score Distribution by MPAA Rating R') +
xlab('mpaa_rating_R') +
ylab('audience_score')bayes_inference(y = audience_score, x = mpaa_rating_R, data = new.movies, statistic = "mean", type = "ht", null = 0, alternative = "twosided")## Response variable: numerical, Explanatory variable: categorical (2 levels)
## n_no = 322, y_bar_no = 62.6894, s_no = 20.3167
## n_yes = 329, y_bar_yes = 62.0426, s_yes = 20.1559
## (Assuming intrinsic prior on parameters)
## Hypotheses:
## H1: mu_no = mu_yes
## H2: mu_no != mu_yes
##
## Priors:
## P(H1) = 0.5
## P(H2) = 0.5
##
## Results:
## BF[H1:H2] = 23.9679
## P(H1|data) = 0.9599
## P(H2|data) = 0.0401
ggplot(new.movies, aes(factor(oscar_season), audience_score, fill=oscar_season)) +
geom_boxplot() +
ggtitle('Audience Score Distribution by Oscar Season') +
xlab('oscar_season') +
ylab('audience_score')bayes_inference(y = audience_score, x = oscar_season, data = new.movies, statistic = "mean", type = "ht", null = 0, alternative = "twosided")## Response variable: numerical, Explanatory variable: categorical (2 levels)
## n_no = 460, y_bar_no = 61.813, s_no = 20.1196
## n_yes = 191, y_bar_yes = 63.6859, s_yes = 20.4612
## (Assuming intrinsic prior on parameters)
## Hypotheses:
## H1: mu_no = mu_yes
## H2: mu_no != mu_yes
##
## Priors:
## P(H1) = 0.5
## P(H2) = 0.5
##
## Results:
## BF[H1:H2] = 13.3993
## P(H1|data) = 0.9306
## P(H2|data) = 0.0694
ggplot(new.movies, aes(factor(summer_season), audience_score, fill=summer_season)) +
geom_boxplot() +
ggtitle('Audience Score Distribution by Summer Season') +
xlab('summer_season') +
ylab('audience_score')bayes_inference(y = audience_score, x = summer_season, data = new.movies, statistic = "mean", type = "ht", null = 0, alternative = "twosided")## Response variable: numerical, Explanatory variable: categorical (2 levels)
## n_no = 443, y_bar_no = 62.623, s_no = 20.3857
## n_yes = 208, y_bar_yes = 61.8077, s_yes = 19.9083
## (Assuming intrinsic prior on parameters)
## Hypotheses:
## H1: mu_no = mu_yes
## H2: mu_no != mu_yes
##
## Priors:
## P(H1) = 0.5
## P(H2) = 0.5
##
## Results:
## BF[H1:H2] = 21.7118
## P(H1|data) = 0.956
## P(H2|data) = 0.044
model.movies <- movies[c("feature_film", "drama", "runtime", "mpaa_rating_R",
"thtr_rel_year", "oscar_season", "summer_season", "imdb_rating", "imdb_num_votes",
"critics_score", "best_pic_nom", "best_pic_win", "best_actor_win", "best_actress_win",
"best_dir_win", "top200_box", "audience_score")]
# remove nans
colSums(is.na(model.movies))## feature_film drama runtime mpaa_rating_R
## 0 0 1 0
## thtr_rel_year oscar_season summer_season imdb_rating
## 0 0 0 0
## imdb_num_votes critics_score best_pic_nom best_pic_win
## 0 0 0 0
## best_actor_win best_actress_win best_dir_win top200_box
## 0 0 0 0
## audience_score
## 0
model.movies <- na.omit(model.movies)
# log transform to normalize the features
model.movies[c("runtime", "thtr_rel_year", "imdb_rating", "imdb_num_votes", "critics_score")] <- lapply(model.movies[c("runtime", "thtr_rel_year", "imdb_rating", "imdb_num_votes", "critics_score")], function(x) {log(1 + x)})
# model
set.seed(116)
n = nrow(model.movies)
n_cv = 10
ape = matrix(NA, ncol=4, nrow=n_cv)
colnames(ape) = c("BMA", "BPM", "HPM", "MPM")
for (i in 1:n_cv) {
train = sample(1:n, size=round(.90*n), replace=FALSE)
movies_train = model.movies[train,]
movies_test = model.movies[-train,]
bma_train_movie = bas.lm(audience_score ~ .-audience_score, data=movies_train, prior="ZS-null", modelprior=uniform(), initprobs="eplogp")
yhat_bma = predict(bma_train_movie, movies_test, estimator="BMA")$fit
yhat_hpm = predict(bma_train_movie, movies_test, estimator="HPM")$fit
yhat_mpm = predict(bma_train_movie, movies_test, estimator="MPM")$fit
yhat_bpm = predict(bma_train_movie, movies_test, estimator="BPM")$fit
ape[i, "BMA"] = cv.summary.bas(yhat_bma, movies_test$audience_score)
ape[i, "BPM"] = cv.summary.bas(yhat_bpm, movies_test$audience_score)
ape[i, "HPM"] = cv.summary.bas(yhat_hpm, movies_test$audience_score)
ape[i, "MPM"] = cv.summary.bas(yhat_mpm, movies_test$audience_score)
}boxplot(ape)apply(ape, 2, mean)## BMA BPM HPM MPM
## 10.22301 10.31527 10.32793 10.31611
bma_movies = bas.lm(audience_score ~ ., data = model.movies,
prior = "ZS-null",
method = "MCMC",
modelprior = uniform())# par(mfrow=c(2,2))
plot(bma_movies, which=1, ask=FALSE)plot(bma_movies, which=2, ask=FALSE)plot(bma_movies, which=3, ask=FALSE, cex.lab=0.5)plot(bma_movies, which=4, ask=FALSE, cex.lab=0.5)The “Residuals vs Fitted” doen’t show a constant spead over the prediction. And there are three outliers. Probably we need further transformation of numeric features.
The “Model Probabilities” shows that the model posterior probability begins to level off after 500 model trials. The model search stops at 3000 instead of enumerations of 2^16 combindations.
The “Model Complexity” shows that the highest log marginal can be reached from 2 to 14 dimensions. The log marginal becomes stable after 8 dimentions.
The “Inclusion Probabilities” shows the inclusion probabilities for features. The red lines are the features with top probablities to be included in the optimal model. As explored in the previous EDA part, feature film does contribute quite a bit to the final scores.
image(bma_movies, rotate=F)par(mfrow=c(2,3))
plot(coefficients(bma_movies), subset=c(1,2,6,9,10), ask=FALSE)Data source: http://www.imdb.com/title/tt1211837/?ref_=fn_al_tt_1 https://www.rottentomatoes.com/m/doctor_strange_2016
doctor.strange <- data.frame(feature_film="yes",drama="no",runtime=115,mpaa_rating_R="no",
thtr_rel_year=2016,oscar_season="yes",summer_season="no",
imdb_rating=8,
imdb_num_votes=67255,critics_score=90,best_pic_nom="no",
best_pic_win="no",best_actor_win="no",best_actress_win="no",
best_dir_win="no",top200_box="yes",audience_score=92)
# log transformation
doctor.strange[c("runtime", "thtr_rel_year", "imdb_rating", "imdb_num_votes", "critics_score")] <- lapply(doctor.strange[c("runtime", "thtr_rel_year", "imdb_rating", "imdb_num_votes", "critics_score")], function(x) {log(1 + x)})
model.movies2 <- rbind(model.movies, doctor.strange)
doctor.strange <-tail(model.movies2,1)
str(doctor.strange)## Classes 'tbl_df', 'tbl' and 'data.frame': 1 obs. of 17 variables:
## $ feature_film : Factor w/ 2 levels "no","yes": 2
## $ drama : Factor w/ 2 levels "no","yes": 1
## $ runtime : num 4.75
## $ mpaa_rating_R : Factor w/ 2 levels "no","yes": 1
## $ thtr_rel_year : num 7.61
## $ oscar_season : Factor w/ 2 levels "no","yes": 2
## $ summer_season : Factor w/ 2 levels "no","yes": 1
## $ imdb_rating : num 2.2
## $ imdb_num_votes : num 11.1
## $ critics_score : num 4.51
## $ best_pic_nom : Factor w/ 2 levels "no","yes": 1
## $ best_pic_win : Factor w/ 2 levels "no","yes": 1
## $ best_actor_win : Factor w/ 2 levels "no","yes": 1
## $ best_actress_win: Factor w/ 2 levels "no","yes": 1
## $ best_dir_win : Factor w/ 2 levels "no","yes": 1
## $ top200_box : Factor w/ 2 levels "no","yes": 2
## $ audience_score : num 92
# Make a prediction of audience_score using bayesian model averaging.
doctor.strange.pred <- predict(bma_movies, newdata=doctor.strange, estimator="BMA", se.fit=TRUE, interval="predict")
doctor.strange.pred$Ybma## [,1]
## [1,] 78.75834
In this project, the final prediction is not quite good, hence some places need improving: