Modeling and prediction for movies

Setup

Load packages

library(ggplot2)
library(gridExtra)
library(GGally)
library(dplyr)
library(statsr)
library(xtable)

Load data

Make sure your data and R Markdown files are in the same directory. When loaded your data file will be called movies. Delete this note when before you submit your work.

load("movies.Rdata")
nrow(movies)

## [1] 651

Part 1: Data

According to the codebook, the data is a random sample of movies produced before 2016. Given there are 600 movies made every year in US, this can be considered a good sample if all the movies have been considered when sampling was performed. Since that information is missing I would not generalize the results determined after the analysis and modeling.

Since this data is just random observations from a population and is not gathered under experimental setup, one cannot attribute the results to causation. But there may exist correlations among the predictors.

  movies <- movies %>% filter(!is.na(audience_score))
  nrow(movies)

## [1] 651

Part 2: Research question

We are going to create a multiple linear regression model that can estimate audience score given all the relevant predictors. What predictors will be choosen will depend on the exploratory data analysis performed on the variables.

Part 3: Exploratory data analysis

The predictors to choose for modeling

  names(movies)

##  [1] "title"            "title_type"       "genre"           
##  [4] "runtime"          "mpaa_rating"      "studio"          
##  [7] "thtr_rel_year"    "thtr_rel_month"   "thtr_rel_day"    
## [10] "dvd_rel_year"     "dvd_rel_month"    "dvd_rel_day"     
## [13] "imdb_rating"      "imdb_num_votes"   "critics_rating"  
## [16] "critics_score"    "audience_rating"  "audience_score"  
## [19] "best_pic_nom"     "best_pic_win"     "best_actor_win"  
## [22] "best_actress_win" "best_dir_win"     "top200_box"      
## [25] "director"         "actor1"           "actor2"          
## [28] "actor3"           "actor4"           "actor5"          
## [31] "imdb_url"         "rt_url"

The features imdb_url and rotten tomatoes url rt_url are just hyper links and can be omitted from modelling. Lets check whether there are collinear predictors and try to eliminate them or combine them as seem fit.

Lets see how the year, month and days affects the audience_score

The theatrical/dvd release month doesnt seem to be variable across months, but there are more variable in a month.

par(mfrow=c(2,2))
bp1<- ggplot(data=movies, aes(x=as.factor(thtr_rel_month) ,y=audience_score))+geom_boxplot()
bp2<- ggplot(data=movies, aes(x=as.factor(dvd_rel_month) ,y=audience_score))+geom_boxplot()

grid.arrange(bp1,bp2)

The theatrical/dvd release day does seem to have more variability.

par(mfrow=c(2,2))
bp1<- ggplot(data=movies, aes(x=as.factor(thtr_rel_day) ,y=audience_score))+geom_boxplot()
bp2<- ggplot(data=movies, aes(x=as.factor(dvd_rel_day) ,y=audience_score))+geom_boxplot()

grid.arrange(bp1,bp2)

The theatrical/dvd release year also has more variability

par(mfrow=c(2,2))
bp1<- ggplot(data=movies, aes(y=audience_score, x=as.factor(thtr_rel_year))) +
      geom_boxplot() +
      theme(axis.text.x = element_text(angle = 45, hjust = 1))
bp2<- ggplot(data=movies, aes(y=audience_score, x=as.factor(dvd_rel_year))) +
      geom_boxplot() +
      theme(axis.text.x = element_text(angle = 45, hjust = 1))
grid.arrange(bp1,bp2)

Feature best_pic_win does seem to be the best predictor for audience score give the amount of variablility seen.

par(mfrow=c(2,2))
bp1<- ggplot(data=movies, aes(x=best_actor_win ,y=audience_score))+geom_boxplot()
bp2<- ggplot(data=movies, aes(x=best_actress_win ,y=audience_score))+geom_boxplot()
bp3<- ggplot(data=movies, aes(x=best_dir_win ,y=audience_score))+geom_boxplot()
bp4<- ggplot(data=movies, aes(x=best_pic_win ,y=audience_score))+geom_boxplot()

grid.arrange(bp1,bp2,bp3,bp4)

Feature best_pic_nom and top200_box are good predictors too.

par(mfrow=c(2,1))
bp1<- ggplot(data=movies, aes(x=best_pic_nom ,y=audience_score))+geom_boxplot()
bp2<- ggplot(data=movies, aes(x=top200_box ,y=audience_score))+geom_boxplot()

grid.arrange(bp1,bp2)

Lets explore how runtime affects the audience score alont with ratings. We can clearly see how critics_rating, audience_rating and imdb_rating are able to capture difference range in the audience score.

par(mfrow=c(2,2))

bp1<- ggplot(data=movies, aes(x=runtime ,y=audience_score, color = mpaa_rating)) +
      geom_point()
bp2<- ggplot(data=movies, aes(x=runtime ,y=audience_score, color = critics_rating)) +
      geom_point()
bp3<- ggplot(data=movies, aes(x=runtime ,y=audience_score, color = audience_rating)) +
      geom_point()
bp4<- ggplot(data=movies, aes(x=runtime ,y=audience_score, color = imdb_rating)) +
      geom_point()

grid.arrange(bp1,bp2,bp3,bp4)

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

  ggplot(data=movies, aes(x=imdb_rating, y=imdb_num_votes)) +
  geom_point()

Two predictors imdb_rating and imdb_num_votes are correlated and hence lets form a variables imdb_score that can be more representative of these 2 features combined. we take log of the number of votes and add them to the score.

  movies$imdb_score <- movies$imdb_rating + log(movies$imdb_num_votes)

imdb_score can be a good predictor too sicne we can definitely see correlation.

  ggplot(data=movies, aes(x=imdb_score, y=audience_score)) +
  geom_point()

genre makes a good predictor too. Animation, Documentary and Musical & Performing Arts genres get high score from audience.

  ggplot(data=movies, aes(x=genre, y=audience_score)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Since audience_rating and audience_score are very similar or even inter changeable, lets omit audience_rating from modeling.

Part 4: Modeling

Lets build the model using the following predictors that has been explored in the previous section. We are also considering the new imdb_score predictor created.

genre
runtime
thtr_rel_month
dvd_rel_month
critics_rating
best_pic_nom
best_pic_win
top200_box
imdb_score

The variables selected seems to exhibit collinearity that is tolerable.

selected_cols <- c("genre","runtime","thtr_rel_month","critics_rating","best_pic_nom","best_pic_win","top200_box","imdb_score")

  c_movies <- na.omit(movies)
  ggpairs(c_movies, 
          columns = selected_cols,
          mapping = aes(color = audience_rating),
          lower = list(combo = wrap("facethist", binwidth = 40)))

forward elimination

Lets perform the forward elimination model selection with selecting high adjusted R-squared at each step.

reduced_cols <- c("genre","runtime","thtr_rel_month","critics_rating","best_pic_nom","best_pic_win","top200_box","imdb_score")

    selected_cols     <- c()
    selected_cols_str <- paste(selected_cols, collapse=", ")
    print(selected_cols_str)

## [1] ""

    for (i in 1:length(reduced_cols)) {
        f <- paste("audience_score","~", selected_cols_str, reduced_cols[i])
        model = lm(formula=f, data = movies)
        print(paste(f," => ", summary(model)$adj.r.squared))
    }

## [1] "audience_score ~  genre  =>  0.181556287946631"
## [1] "audience_score ~  runtime  =>  0.0312548989483175"
## [1] "audience_score ~  thtr_rel_month  =>  -0.000470529142357456"
## [1] "audience_score ~  critics_rating  =>  0.375578296208879"
## [1] "audience_score ~  best_pic_nom  =>  0.0436669258757576"
## [1] "audience_score ~  best_pic_win  =>  0.0117789507767483"
## [1] "audience_score ~  top200_box  =>  0.00702825728686585"
## [1] "audience_score ~  imdb_score  =>  0.328460350192648"

We select critics_rating with adjusted R-squared as 0.375578296208879

    selected_cols     <- c("critics_rating")
    reduced_cols      <- reduced_cols[reduced_cols != selected_cols]
    selected_cols_str <- paste(selected_cols, collapse=" + ")
    print(selected_cols_str)

## [1] "critics_rating"

    for (i in 1:length(reduced_cols)) {
        f <- paste("audience_score","~", selected_cols_str, " + " , reduced_cols[i])
        model = lm(formula=f, data = movies)
        print(paste(f," => ", summary(model)$adj.r.squared))
    }

## [1] "audience_score ~ critics_rating  +  genre  =>  0.432650982015913"
## [1] "audience_score ~ critics_rating  +  runtime  =>  0.38430769495162"
## [1] "audience_score ~ critics_rating  +  thtr_rel_month  =>  0.374755350539467"
## [1] "audience_score ~ critics_rating  +  best_pic_nom  =>  0.382112844281114"
## [1] "audience_score ~ critics_rating  +  best_pic_win  =>  0.375410015765499"
## [1] "audience_score ~ critics_rating  +  top200_box  =>  0.374943522235776"
## [1] "audience_score ~ critics_rating  +  imdb_score  =>  0.498889588051879"

We select imdb_score with adjusted R-squared as 0.498889588051879

    selected_cols     <- c("critics_rating", "imdb_score")
    reduced_cols      <- reduced_cols[reduced_cols != "imdb_score"]
    selected_cols_str <- paste(selected_cols, collapse=" + ")
    print(selected_cols_str)

## [1] "critics_rating + imdb_score"

    for (i in 1:length(reduced_cols)) {
        f <- paste("audience_score","~", selected_cols_str, " + " , reduced_cols[i])
        model = lm(formula=f, data = movies)
        print(paste(f," => ", summary(model)$adj.r.squared))
    }

## [1] "audience_score ~ critics_rating + imdb_score  +  genre  =>  0.609139098741606"
## [1] "audience_score ~ critics_rating + imdb_score  +  runtime  =>  0.498647668992818"
## [1] "audience_score ~ critics_rating + imdb_score  +  thtr_rel_month  =>  0.499246126758942"
## [1] "audience_score ~ critics_rating + imdb_score  +  best_pic_nom  =>  0.498645058637949"
## [1] "audience_score ~ critics_rating + imdb_score  +  best_pic_win  =>  0.498387610780303"
## [1] "audience_score ~ critics_rating + imdb_score  +  top200_box  =>  0.499380328995121"

We select thtr_rel_month with adjusted R-squared as 0.499246126758942

    selected_cols     <- c("critics_rating", "imdb_score", "thtr_rel_month")
    reduced_cols      <- reduced_cols[reduced_cols != "thtr_rel_month"]
    selected_cols_str <- paste(selected_cols, collapse=" + ")
    print(selected_cols_str)

## [1] "critics_rating + imdb_score + thtr_rel_month"

    for (i in 1:length(reduced_cols)) {
        f <- paste("audience_score","~", selected_cols_str, " + " , reduced_cols[i])
        model = lm(formula=f, data = movies)
        print(paste(f," => ", summary(model)$adj.r.squared))
    }

## [1] "audience_score ~ critics_rating + imdb_score + thtr_rel_month  +  genre  =>  0.610526828499511"
## [1] "audience_score ~ critics_rating + imdb_score + thtr_rel_month  +  runtime  =>  0.498862938964456"
## [1] "audience_score ~ critics_rating + imdb_score + thtr_rel_month  +  best_pic_nom  =>  0.499253277300928"
## [1] "audience_score ~ critics_rating + imdb_score + thtr_rel_month  +  best_pic_win  =>  0.498707939564811"
## [1] "audience_score ~ critics_rating + imdb_score + thtr_rel_month  +  top200_box  =>  0.499614583432354"

We select genre with adjusted R-squared as 0.610526828499511

    selected_cols     <- c("critics_rating", "imdb_score", "thtr_rel_month", "genre")
    reduced_cols      <- reduced_cols[reduced_cols != "genre"]
    selected_cols_str <- paste(selected_cols, collapse=" + ")
    print(selected_cols_str)

## [1] "critics_rating + imdb_score + thtr_rel_month + genre"

    for (i in 1:length(reduced_cols)) {
        f <- paste("audience_score","~", selected_cols_str, " + " , reduced_cols[i])
        model = lm(formula=f, data = movies)
        print(paste(f," => ", summary(model)$adj.r.squared))
    }

## [1] "audience_score ~ critics_rating + imdb_score + thtr_rel_month + genre  +  runtime  =>  0.610239642067133"
## [1] "audience_score ~ critics_rating + imdb_score + thtr_rel_month + genre  +  best_pic_nom  =>  0.611194599270699"
## [1] "audience_score ~ critics_rating + imdb_score + thtr_rel_month + genre  +  best_pic_win  =>  0.609984096763282"
## [1] "audience_score ~ critics_rating + imdb_score + thtr_rel_month + genre  +  top200_box  =>  0.610022406200753"

We select best_pic_nom with adjusted R-squared as 0.611194599270699

    selected_cols     <- c("critics_rating", "imdb_score", "thtr_rel_month", "genre", "best_pic_nom")
    reduced_cols      <- reduced_cols[reduced_cols != "best_pic_nom"]
    selected_cols_str <- paste(selected_cols, collapse=" + ")
    print(selected_cols_str)

## [1] "critics_rating + imdb_score + thtr_rel_month + genre + best_pic_nom"

    for (i in 1:length(reduced_cols)) {
        f <- paste("audience_score","~", selected_cols_str, " + " , reduced_cols[i])
        model = lm(formula=f, data = movies)
        print(paste(f," => ", summary(model)$adj.r.squared))
    }

## [1] "audience_score ~ critics_rating + imdb_score + thtr_rel_month + genre + best_pic_nom  +  runtime  =>  0.611171107708812"
## [1] "audience_score ~ critics_rating + imdb_score + thtr_rel_month + genre + best_pic_nom  +  best_pic_win  =>  0.611292953208765"
## [1] "audience_score ~ critics_rating + imdb_score + thtr_rel_month + genre + best_pic_nom  +  top200_box  =>  0.610698366559542"

We select best_pic_win with adjusted R-squared as 0.611292953208765

    selected_cols     <- c("critics_rating", "imdb_score", "thtr_rel_month", "genre", "best_pic_nom", "best_pic_win")
    reduced_cols      <- reduced_cols[reduced_cols != "best_pic_win"]
    selected_cols_str <- paste(selected_cols, collapse=" + ")
    print(selected_cols_str)

## [1] "critics_rating + imdb_score + thtr_rel_month + genre + best_pic_nom + best_pic_win"

    for (i in 1:length(reduced_cols)) {
        f <- paste("audience_score","~", selected_cols_str, " + " , reduced_cols[i])
        model = lm(formula=f, data = movies)
        print(paste(f," => ", summary(model)$adj.r.squared))
    }

## [1] "audience_score ~ critics_rating + imdb_score + thtr_rel_month + genre + best_pic_nom + best_pic_win  +  runtime  =>  0.611158749075273"
## [1] "audience_score ~ critics_rating + imdb_score + thtr_rel_month + genre + best_pic_nom + best_pic_win  +  top200_box  =>  0.610777760906041"

We can see there is no increment in the adjusted R-squared value and hence we have selected the best parsimonious predictors for the model after 7 steps of forward elimination. The predictors are

critics_rating
imdb_score
thtr_rel_month
genre
best_pic_nom
best_pic_win

  fw_model <- lm(audience_score ~ critics_rating + imdb_score + thtr_rel_month + genre + best_pic_nom + best_pic_win, data = movies)
  summary(fw_model)

## 
## Call:
## lm(formula = audience_score ~ critics_rating + imdb_score + thtr_rel_month + 
##     genre + best_pic_nom + best_pic_win, data = movies)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.006  -8.057   0.247   8.483  44.360 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -14.8207     5.6932  -2.603 0.009451 ** 
## critics_ratingFresh              1.1911     1.5420   0.772 0.440114    
## critics_ratingRotten           -11.0971     1.6285  -6.814 2.21e-11 ***
## imdb_score                       4.7626     0.2838  16.782  < 2e-16 ***
## thtr_rel_month                  -0.2898     0.1427  -2.031 0.042694 *  
## genreAnimation                   7.9413     4.5004   1.765 0.078119 .  
## genreArt House & International  13.0397     3.7509   3.476 0.000543 ***
## genreComedy                      3.2802     2.0884   1.571 0.116756    
## genreDocumentary                26.8919     2.5451  10.566  < 2e-16 ***
## genreDrama                       7.5041     1.7715   4.236 2.61e-05 ***
## genreHorror                     -2.9652     3.0774  -0.964 0.335640    
## genreMusical & Performing Arts  22.1549     4.0125   5.521 4.91e-08 ***
## genreMystery & Suspense         -2.2905     2.2902  -1.000 0.317627    
## genreOther                       6.7248     3.5756   1.881 0.060464 .  
## genreScience Fiction & Fantasy  -3.1510     4.5074  -0.699 0.484760    
## best_pic_nomyes                  5.7818     3.2638   1.771 0.076960 .  
## best_pic_winyes                 -5.9208     5.4958  -1.077 0.281736    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.61 on 634 degrees of freedom
## Multiple R-squared:  0.6209, Adjusted R-squared:  0.6113 
## F-statistic: 64.89 on 16 and 634 DF,  p-value: < 2.2e-16

backward elimination

Lets perform the backward elimination model selection with selecting high adjusted R-squared at each step. To start with lets make the model with all the variable.

    all_cols     <- c("genre","runtime","thtr_rel_month","critics_rating","best_pic_nom","best_pic_win","top200_box","imdb_score")

    f <- paste("audience_score","~", paste(all_cols, collapse=" + "))
    model = lm(formula=f, data = movies)
    print(summary(model)$adj.r.squared)

## [1] 0.6106133

    all_cols     <- c("genre","runtime","thtr_rel_month","critics_rating","best_pic_nom","best_pic_win","top200_box","imdb_score")

    reduced_cols <- all_cols
    reduced_cols_str <- paste("audience_score","~",paste(reduced_cols, collapse=" + "))
    print(reduced_cols_str)

## [1] "audience_score ~ genre + runtime + thtr_rel_month + critics_rating + best_pic_nom + best_pic_win + top200_box + imdb_score"

    for (i in 1:length(all_cols)) {
        reduced_cols      <- all_cols
        reduced_cols      <- reduced_cols[reduced_cols != all_cols[i]]
        reduced_cols_str  <- paste(reduced_cols, collapse=" + ")
        
        f <- paste("audience_score","~", reduced_cols_str)
        model = lm(formula=f, data = movies)
        print(paste(all_cols[i]," => ", summary(model)$adj.r.squared))
    }

## [1] "genre  =>  0.499390651120926"
## [1] "runtime  =>  0.610777760906041"
## [1] "thtr_rel_month  =>  0.609247712586131"
## [1] "critics_rating  =>  0.547227021042473"
## [1] "best_pic_nom  =>  0.609118441306124"
## [1] "best_pic_win  =>  0.610638216061963"
## [1] "top200_box  =>  0.611158749075273"
## [1] "imdb_score  =>  0.445561248389702"

We eliminate top200_box and obtain high adjusted R-squared 0.611158749075273

    all_cols     <- c("genre","runtime","thtr_rel_month","critics_rating","best_pic_nom","best_pic_win","imdb_score")

    reduced_cols <- all_cols
    reduced_cols_str <- paste("audience_score","~",paste(reduced_cols, collapse=" + "))
    print(reduced_cols_str)

## [1] "audience_score ~ genre + runtime + thtr_rel_month + critics_rating + best_pic_nom + best_pic_win + imdb_score"

    for (i in 1:length(all_cols)) {
        reduced_cols      <- all_cols
        reduced_cols      <- reduced_cols[reduced_cols != all_cols[i]]
        reduced_cols_str  <- paste(reduced_cols, collapse=" + ")
        
        f <- paste("audience_score","~", reduced_cols_str)
        model = lm(formula=f, data = movies)
        print(paste(all_cols[i]," => ", summary(model)$adj.r.squared))
    }

## [1] "genre  =>  0.49912260723431"
## [1] "runtime  =>  0.611292953208765"
## [1] "thtr_rel_month  =>  0.609755653600492"
## [1] "critics_rating  =>  0.547941096632871"
## [1] "best_pic_nom  =>  0.609655640761888"
## [1] "best_pic_win  =>  0.611171107708812"
## [1] "imdb_score  =>  0.44550250141041"

We eliminate runtime and obtain high adjusted R-squared 0.611292953208765

    all_cols     <- c("genre","thtr_rel_month","critics_rating","best_pic_nom","best_pic_win","imdb_score")

    reduced_cols <- all_cols
    reduced_cols_str <- paste("audience_score","~",paste(reduced_cols, collapse=" + "))
    print(reduced_cols_str)

## [1] "audience_score ~ genre + thtr_rel_month + critics_rating + best_pic_nom + best_pic_win + imdb_score"

    for (i in 1:length(all_cols)) {
        reduced_cols      <- all_cols
        reduced_cols      <- reduced_cols[reduced_cols != all_cols[i]]
        reduced_cols_str  <- paste(reduced_cols, collapse=" + ")
        
        f <- paste("audience_score","~", reduced_cols_str)
        model = lm(formula=f, data = movies)
        print(paste(all_cols[i]," => ", summary(model)$adj.r.squared))
    }

## [1] "genre  =>  0.499414944081151"
## [1] "thtr_rel_month  =>  0.609380587587638"
## [1] "critics_rating  =>  0.547667743940504"
## [1] "best_pic_nom  =>  0.609984096763282"
## [1] "best_pic_win  =>  0.611194599270698"
## [1] "imdb_score  =>  0.439506852607386"

We can see that we cannot improve the model anymore with eliminating any variable. we have achieved the parsimonious model with the following predictors after 3 steps of backward elimination 1. genre 2. thtr_rel_month 3. critics_rating 4. best_pic_nom 5. best_pic_win 6. imdb_score

  bw_model <- lm(audience_score ~ genre + thtr_rel_month + critics_rating + best_pic_nom + best_pic_win + imdb_score, data = movies)
  summary(bw_model)

## 
## Call:
## lm(formula = audience_score ~ genre + thtr_rel_month + critics_rating + 
##     best_pic_nom + best_pic_win + imdb_score, data = movies)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.006  -8.057   0.247   8.483  44.360 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -14.8207     5.6932  -2.603 0.009451 ** 
## genreAnimation                   7.9413     4.5004   1.765 0.078119 .  
## genreArt House & International  13.0397     3.7509   3.476 0.000543 ***
## genreComedy                      3.2802     2.0884   1.571 0.116756    
## genreDocumentary                26.8919     2.5451  10.566  < 2e-16 ***
## genreDrama                       7.5041     1.7715   4.236 2.61e-05 ***
## genreHorror                     -2.9652     3.0774  -0.964 0.335640    
## genreMusical & Performing Arts  22.1549     4.0125   5.521 4.91e-08 ***
## genreMystery & Suspense         -2.2905     2.2902  -1.000 0.317627    
## genreOther                       6.7248     3.5756   1.881 0.060464 .  
## genreScience Fiction & Fantasy  -3.1510     4.5074  -0.699 0.484760    
## thtr_rel_month                  -0.2898     0.1427  -2.031 0.042694 *  
## critics_ratingFresh              1.1911     1.5420   0.772 0.440114    
## critics_ratingRotten           -11.0971     1.6285  -6.814 2.21e-11 ***
## best_pic_nomyes                  5.7818     3.2638   1.771 0.076960 .  
## best_pic_winyes                 -5.9208     5.4958  -1.077 0.281736    
## imdb_score                       4.7626     0.2838  16.782  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.61 on 634 degrees of freedom
## Multiple R-squared:  0.6209, Adjusted R-squared:  0.6113 
## F-statistic: 64.89 on 16 and 634 DF,  p-value: < 2.2e-16

Checking Model Assumptions:

The follwoing plots of the forward eliminated model residuals shows that they are nearly normal.

  par(mfrow=c(1,2))
  hist(fw_model$residuals)
  qqnorm(fw_model$residuals)
  qqline(fw_model$residuals)

The follwoing plots of the backward eliminated model residuals shows that they are nearly normal.

  par(mfrow=c(1,2))
  hist(bw_model$residuals)
  qqnorm(bw_model$residuals)
  qqline(bw_model$residuals)

Both the models doest show any heteroscedasticity in the plots with model$fitted.values vs model$residuals

  par(mfrow=c(1,2))
  plot(fw_model$fitted.values, fw_model$residuals)
  plot(bw_model$fitted.values, bw_model$residuals)

Part 5: Prediction

Both the model has high adjusted R-square value, but we will choose backward eliminated model for making predictions on the audience score base on selected predictors

  predictors <- c("genre","thtr_rel_month","critics_rating","best_pic_nom","best_pic_win","imdb_score","audience_score")
predict.movie.audience_score <- function(title) {
  movie <- movies[movies$title %in% c(title),]
  predict(bw_model, newdata=movie, interval="prediction")
}

Lets predict the audience score for the following movies

  print(xtable(movies[movies$title %in% c("Titanic"),][predictors]), type = "html")

	genre	thtr_rel_month	critics_rating	best_pic_nom	best_pic_win	imdb_score	audience_score
1	Drama	12.00	Certified Fresh	yes	yes	21.24	69.00

  predict.movie.audience_score("Titanic")

##       fit      lwr      upr
## 1 90.2077 63.70446 116.7109

  print(xtable(movies[movies$title %in% c("Pina"),][predictors]), type = "html")

	genre	thtr_rel_month	critics_rating	best_pic_nom	best_pic_win	imdb_score	audience_score
1	Musical & Performing Arts	12.00	Certified Fresh	no	no	17.02	81.00

  predict.movie.audience_score("Pina")

##      fit      lwr      upr
## 1 84.932 59.07315 110.7909

  print(xtable(movies[movies$title %in% c("Taps"),][predictors]), type = "html")

	genre	thtr_rel_month	critics_rating	best_pic_nom	best_pic_win	imdb_score	audience_score
1	Drama	12.00	Fresh	no	no	16.13	67.00

  predict.movie.audience_score("Taps")

##        fit     lwr      upr
## 1 67.23332 42.3462 92.12045

  print(xtable(movies[movies$title %in% c("A Good Woman"),][predictors]), type = "html")

	genre	thtr_rel_month	critics_rating	best_pic_nom	best_pic_win	imdb_score	audience_score
1	Drama	2.00	Rotten	no	no	15.65	49.00

  predict.movie.audience_score("A Good Woman")

##        fit      lwr      upr
## 1 55.53302 30.66597 80.40007

  print(xtable(movies[movies$title %in% c("Serpico"),][predictors]), type = "html")

	genre	thtr_rel_month	critics_rating	best_pic_nom	best_pic_win	imdb_score	audience_score
1	Drama	12.00	Fresh	no	no	19.03	88.00

  predict.movie.audience_score("Serpico")

##        fit      lwr      upr
## 1 81.03588 56.11271 105.9591

We can see the predicted range for the movie’s audience score interval always contains the real audience score.

Part 6: Conclusion

The audience score for a movie will be based on its oscar wins and nominations, critics rating from sites like rotten tomatoes. Imdb ratings and scores also plays an important role since it can have ripple effect on people selecting the movie to watch it and rate it again higher. These arguments seems to be matching coherently with the predictors chosen by the model. We can also see the significant level on most of the features in the parsimoniuous model is very small. Also we can see the documentary genre has high audience ratings in general.