Setup

Load packages

library(ggplot2)
library(dplyr)
library(tidyverse)
library(statsr)
library(skimr)
library(GGally)
library(kableExtra)
library(grid)
library(gridExtra)
library(scales)
library(corrplot)

Load data

#Clear the environment
rm(list=ls(all=TRUE))
#Set working directory
setwd("C:/Users/satos/Documents/project/Quarto/07.linear-regression")

load("movies.Rdata.gz")

Part 1: Data

The data set comprises 651 randomly sampled movies produced and released before 2016. Because the movies were randomly sampled, the findings can be generalized to comparable movies from that period; however, as this is an observational study with no random assignment, no causal conclusions can be drawn.


Part 2: Research question

What genres contribute to high social media engagement for movies? Can we make a prediction from the tendency of scores within each genre? The data set includes rating information from both IMDB and Rotten Tomatoes.


Part 3: Exploratory data analysis

#Look at the summary of the dataset
skim(movies)
Data summary
Name movies
Number of rows 651
Number of columns 32
_______________________
Column type frequency:
character 9
factor 12
numeric 11
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
title 0 1.00 3 70 0 647 0
director 2 1.00 6 27 0 532 0
actor1 2 1.00 4 26 0 485 0
actor2 7 0.99 5 27 0 572 0
actor3 9 0.99 3 27 0 601 0
actor4 13 0.98 4 25 0 607 0
actor5 15 0.98 4 28 0 615 0
imdb_url 0 1.00 36 36 0 650 0
rt_url 0 1.00 31 85 0 650 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
title_type 0 1.00 FALSE 3 Fea: 591, Doc: 55, TV : 5
genre 0 1.00 FALSE 11 Dra: 305, Com: 87, Act: 65, Mys: 59
mpaa_rating 0 1.00 FALSE 6 R: 329, PG-: 133, PG: 118, Unr: 50
studio 8 0.99 FALSE 211 Par: 37, War: 30, Son: 27, Uni: 23
critics_rating 0 1.00 FALSE 3 Rot: 307, Fre: 209, Cer: 135
audience_rating 0 1.00 FALSE 2 Upr: 376, Spi: 275
best_pic_nom 0 1.00 FALSE 2 no: 629, yes: 22
best_pic_win 0 1.00 FALSE 2 no: 644, yes: 7
best_actor_win 0 1.00 FALSE 2 no: 558, yes: 93
best_actress_win 0 1.00 FALSE 2 no: 579, yes: 72
best_dir_win 0 1.00 FALSE 2 no: 608, yes: 43
top200_box 0 1.00 FALSE 2 no: 636, yes: 15

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
runtime 1 1.00 105.82 19.45 39.0 92.0 103.0 115.75 267 ▁▇▁▁▁
thtr_rel_year 0 1.00 1997.94 10.97 1970.0 1990.0 2000.0 2007.00 2014 ▂▃▆▆▇
thtr_rel_month 0 1.00 6.74 3.55 1.0 4.0 7.0 10.00 12 ▆▃▅▅▇
thtr_rel_day 0 1.00 14.42 8.86 1.0 7.0 15.0 21.00 31 ▇▆▆▆▃
dvd_rel_year 8 0.99 2004.43 4.64 1991.0 2001.0 2004.0 2008.00 2015 ▁▃▇▃▃
dvd_rel_month 8 0.99 6.33 3.38 1.0 3.0 6.0 9.00 12 ▇▆▅▅▇
dvd_rel_day 8 0.99 15.01 8.87 1.0 7.0 15.0 23.00 31 ▇▆▆▅▅
imdb_rating 0 1.00 6.49 1.08 1.9 5.9 6.6 7.30 9 ▁▁▃▇▂
imdb_num_votes 0 1.00 57532.98 112124.39 180.0 4545.5 15116.0 58300.50 893008 ▇▁▁▁▁
critics_score 0 1.00 57.69 28.40 1.0 33.0 61.0 83.00 100 ▅▅▅▆▇
audience_score 0 1.00 62.36 20.22 11.0 46.0 65.0 80.00 97 ▂▅▆▇▇
#For this analysis, the character columns are not needed, so we drop them, except for the title column. 
df <- movies %>%
  select(-c(director,actor1, actor2, actor3, actor4, actor5, imdb_url, rt_url))

#check title type
table(df$title_type)
## 
##  Documentary Feature Film     TV Movie 
##           55          591            5
#Feature Film is by far the most common title type. 

#Check and remove NA. 
sum(is.na(df))
## [1] 33
#33
df <- df %>%
  na.omit() %>%
  filter(title_type == "Feature Film") 

dim(df) 
## [1] 579  24
# 579  24

sum(is.na(df))
## [1] 0
# 0

Scores of IMDB and Rotten Tomatoes

Among the columns, imdb_rating from IMDB and critics_score and audience_score from Rotten Tomatoes are the most useful rating variables. Let’s look at the distribution of these scores!

#distribution
score <- df %>%
  select(c(imdb_rating, critics_score, audience_score))
skim(score)
Data summary
Name score
Number of rows 579
Number of columns 3
_______________________
Column type frequency:
numeric 3
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
imdb_rating 0 1 6.40 1.05 1.9 5.9 6.5 7.1 9 ▁▁▅▇▂
critics_score 0 1 55.09 27.70 1.0 31.0 59.0 79.0 100 ▅▆▆▇▇
audience_score 0 1 60.73 19.80 11.0 45.0 63.0 78.0 97 ▂▆▆▇▆
#visualisation by histogram
g1 <- ggplot(data = score, mapping = aes(x= imdb_rating))+geom_histogram()
#Left-skewed; values range from 1 to 10.
g2 <- ggplot(data = score, mapping = aes(x= critics_score))+geom_histogram() 
#Fairly left-skewed; values range from 1 to 100.
g3 <- ggplot(data = score, mapping = aes(x= audience_score))+geom_histogram()
#Fairly left-skewed; values range from 1 to 100.
grid.arrange(g1,g2,g3, nrow = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#visualisation by plot
plot(score$imdb_rating)

plot(score$critics_score)

plot(score$audience_score)

#visualisation by jitter
g4<- ggplot(data = score, mapping = aes(x = imdb_rating, y= audience_score))+
  geom_jitter()+
  geom_smooth(method = "lm")

#critics score vs rating imdb
g5<- ggplot(data = score, mapping = aes(x = imdb_rating, y= critics_score))+
  geom_jitter()+
  geom_smooth(method = "lm")

#critics score vs audience score
g6 <-ggplot(data = score, mapping = aes(x = audience_score, y= critics_score))+
  geom_jitter()+
  geom_smooth(method = "lm")
grid.arrange(g4,g5,g6, nrow = 1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

According to the charts, these three scores are correlated with one another, so we create a new variable that averages the three.
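To back up the visual impression with a number, the pairwise correlations between the three scores can be computed directly (a quick check on the score data frame created above, not part of the original output):

#Pairwise correlations between the three score variables
round(cor(score), 2)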

Average scores

Calculate the average score from the IMDB rating and the audience and critics scores from Rotten Tomatoes.

#Rescale the IMDB rating (1-10) to match the 0-100 scale of the Rotten Tomatoes scores. 
#Create a new variable holding the average score. 
df2 <- df %>%
  mutate( score_imdb = imdb_rating * 10)%>%  
  mutate( score_avg = (critics_score + audience_score+ score_imdb)/3)

Explore average score

#summary
skim(df2$score_avg)
Data summary
Name df2$score_avg
Number of rows 579
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
data 0 1 59.95 17.61 12.67 47.33 60 74.17 94.67 ▁▅▇▇▅
#n: 579
#Mean: 59.95
#Standard deviation: 17.61
#Min: 12.67
#Median: 60
#Max: 94.67

#Visualization
plot(df2$score_avg) # Randomly scattered

hist(df2$score_avg) #Left-skewed

#Scores vs top 200 box office titles, according to Box Office Mojo
df2%>%
  ggplot(aes(x = top200_box, y = score_avg)) +
  geom_boxplot()

#scores vs best picture nomination
df2%>%
  ggplot(aes(x = best_pic_nom, y = score_avg)) +
  geom_boxplot()

#scores vs best picture win
df2%>%
  ggplot(aes(x = best_pic_win, y = score_avg)) +
  geom_boxplot()

#How many titles score higher than 50 points?
df2 %>%
  filter(score_avg > 50)%>%
  count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1   408
#408 titles, about 70% of the data set.
#Most movies receive an average score above 50 points. 
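The proportion itself can be computed in the same pipeline rather than by hand (a minimal sketch using df2; the column names in the output are illustrative):

#Count and share of titles with an average score above 50 points
df2 %>%
  summarise(n_above_50 = sum(score_avg > 50),
            prop_above_50 = mean(score_avg > 50))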

Response variable and explanatory variables

Having set the average score as the response variable, we now look for suitable explanatory variables by visualizing their relationships and correlations with the ggpairs function.

#Keep only the numerical variables we want to compare against "score_avg". 
df3 <- df2 %>%
  select(runtime, thtr_rel_year, thtr_rel_month, imdb_num_votes, score_avg)


lowerFn <- function(data, mapping, method = "lm", ...) {
  p <- ggplot(data = data, mapping = mapping) +
    geom_point(colour = "black") +
    geom_smooth(method = method, color = "red", ...)
  p
} #A helper function that adds a regression line to the scatter plots in the ggpairs output.


#Next, a sapply call inside ggpairs ensures that all categorical variables 
#are excluded from the plot.

ggpairs(df3[ ,sapply(df3,is.numeric)], 
        lower = list(continuous = wrap(lowerFn, method = "lm")), 
        diag = list(continuous = wrap("barDiag", colour = "white")), 
        upper = list(continuous = wrap("cor", size = 8)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `geom_smooth()` using formula = 'y ~ x'
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

According to the matrix, the average score is only moderately correlated with the number of IMDB votes (about 0.36), and the theatrical release year, month, and runtime show even weaker correlations with the average score.
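Because the corrplot package is already loaded, the same correlations can also be summarized in a compact matrix plot (a minimal sketch on the numeric columns of df3, not part of the original figures):

#Correlation matrix of the candidate predictors and the response
corr_mat <- cor(df3[, sapply(df3, is.numeric)])
corrplot(corr_mat, method = "number", type = "upper")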


Part 4: Modeling

What factors are more significant for the average scores?

#Since the response variable is "score_avg", the other score variables, "imdb_rating", "audience_score", and "critics_score", are removed from df2. 
#In addition, "audience_rating", "critics_rating", "best_pic_nom", and "best_pic_win" are no longer found on the current IMDB and Rotten Tomatoes websites, so those variables are removed as well. 
df4<- df2%>%
  select(genre, runtime, mpaa_rating, thtr_rel_year, imdb_num_votes, score_avg)
str(df4)
## tibble [579 × 6] (S3: tbl_df/tbl/data.frame)
##  $ genre         : Factor w/ 11 levels "Action & Adventure",..: 6 6 4 6 7 6 6 6 1 6 ...
##  $ runtime       : num [1:579] 80 101 84 139 90 142 93 119 127 108 ...
##  $ mpaa_rating   : Factor w/ 6 levels "G","NC-17","PG",..: 5 4 5 3 5 4 5 6 3 4 ...
##  $ thtr_rel_year : num [1:579] 2013 2001 1996 1993 2004 ...
##  $ imdb_num_votes: int [1:579] 899 12285 22381 35096 2386 5016 2272 12496 71979 9669 ...
##  $ score_avg     : num [1:579] 57.7 83.3 86 76 37 ...
##  - attr(*, "na.action")= 'omit' Named int [1:17] 94 100 184 207 223 261 282 308 334 345 ...
##   ..- attr(*, "names")= chr [1:17] "94" "100" "184" "207" ...

We have 5 explanatory variables and a response variable. Now, we will create a linear regression model with all the variables and assess the results.

#Linear regression model
m_avg <- lm(formula= score_avg ~ genre + runtime + mpaa_rating + thtr_rel_year + imdb_num_votes, data = df4)
summary(m_avg)
## 
## Call:
## lm(formula = score_avg ~ genre + runtime + mpaa_rating + thtr_rel_year + 
##     imdb_num_votes, data = df4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -47.453  -9.327   0.838  10.078  35.474 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     6.900e+02  1.285e+02   5.369 1.16e-07 ***
## genreAnimation                  3.550e+00  5.993e+00   0.592  0.55386    
## genreArt House & International  1.226e+01  4.773e+00   2.568  0.01048 *  
## genreComedy                     1.833e+00  2.452e+00   0.748  0.45503    
## genreDocumentary                1.006e+01  8.649e+00   1.163  0.24517    
## genreDrama                      1.348e+01  2.091e+00   6.448 2.46e-10 ***
## genreHorror                    -2.268e+00  3.702e+00  -0.613  0.54036    
## genreMusical & Performing Arts  2.219e+01  5.464e+00   4.062 5.55e-05 ***
## genreMystery & Suspense         6.630e+00  2.694e+00   2.461  0.01415 *  
## genreOther                      1.003e+01  4.214e+00   2.379  0.01767 *  
## genreScience Fiction & Fantasy  1.732e+00  5.426e+00   0.319  0.74977    
## runtime                         5.245e-02  4.075e-02   1.287  0.19859    
## mpaa_ratingNC-17                1.537e+00  1.506e+01   0.102  0.91872    
## mpaa_ratingPG                  -9.124e+00  4.422e+00  -2.063  0.03954 *  
## mpaa_ratingPG-13               -1.432e+01  4.600e+00  -3.113  0.00194 ** 
## mpaa_ratingR                   -8.793e+00  4.449e+00  -1.976  0.04859 *  
## mpaa_ratingUnrated              7.030e+00  6.037e+00   1.165  0.24471    
## thtr_rel_year                  -3.198e-01  6.420e-02  -4.981 8.43e-07 ***
## imdb_num_votes                  5.971e-05  5.843e-06  10.221  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.39 on 560 degrees of freedom
## Multiple R-squared:  0.3529, Adjusted R-squared:  0.3321 
## F-statistic: 16.96 on 18 and 560 DF,  p-value: < 2.2e-16

At present, we have an R-squared value of 35.29%, meaning around 35% of the variability in the response variable is explained by our model. The adjusted R-squared is a modified version of R-squared that penalizes the model for predictors that add little explanatory power. Here the adjusted R-squared is 33.21%, which is not much lower than the R-squared value.
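The adjusted R-squared can be recovered from the ordinary R-squared with the usual correction for the number of estimated slopes (a small sanity check against the summary output above, not part of the original code):

#Adjusted R-squared = 1 - (1 - R^2) * (n - 1) / (n - p - 1)
n  <- nrow(df4)                       #579 observations
p  <- length(coef(m_avg)) - 1         #18 estimated slopes
r2 <- summary(m_avg)$r.squared
1 - (1 - r2) * (n - 1) / (n - p - 1)  #about 0.332, matching the summary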

Check which variables are significant with the ANOVA function

anova(m_avg)
## Analysis of Variance Table
## 
## Response: score_avg
##                 Df Sum Sq Mean Sq  F value    Pr(>F)    
## genre           10  25618  2561.8  12.3629 < 2.2e-16 ***
## runtime          1   6570  6569.7  31.7052 2.842e-08 ***
## mpaa_rating      5   8223  1644.7   7.9371 3.072e-07 ***
## thtr_rel_year    1   1217  1217.1   5.8737   0.01568 *  
## imdb_num_votes   1  21646 21645.6 104.4605 < 2.2e-16 ***
## Residuals      560 116039   207.2                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We confirm that all variables are significant: each p-value is below 0.05.

Check for and eliminate weak predictors with the step function.

m_avg2 <- step(m_avg, direction = "both", scope = m_avg)
## Start:  AIC=3106.92
## score_avg ~ genre + runtime + mpaa_rating + thtr_rel_year + imdb_num_votes
## 
##                  Df Sum of Sq    RSS    AIC
## - runtime         1     343.3 116383 3106.6
## <none>                        116039 3106.9
## - thtr_rel_year   1    5141.5 121181 3130.0
## - mpaa_rating     5    7863.7 123903 3134.9
## - genre          10   17900.2 133939 3170.0
## - imdb_num_votes  1   21645.6 137685 3204.0
## 
## Step:  AIC=3106.63
## score_avg ~ genre + mpaa_rating + thtr_rel_year + imdb_num_votes
## 
##                  Df Sum of Sq    RSS    AIC
## <none>                        116383 3106.6
## - mpaa_rating     5    7648.1 124031 3133.5
## - thtr_rel_year   1    6098.5 122481 3134.2
## - genre          10   21511.6 137894 3184.8
## - imdb_num_votes  1   28904.2 145287 3233.1

We have arrived at a more parsimonious model with four predictors: genre, mpaa_rating, thtr_rel_year, and imdb_num_votes.

summary(m_avg2)
## 
## Call:
## lm(formula = score_avg ~ genre + mpaa_rating + thtr_rel_year + 
##     imdb_num_votes, data = df4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -47.758  -9.305   0.683  10.237  34.910 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     7.328e+02  1.242e+02   5.900 6.30e-09 ***
## genreAnimation                  3.328e+00  5.994e+00   0.555  0.57894    
## genreArt House & International  1.233e+01  4.775e+00   2.581  0.01010 *  
## genreComedy                     1.494e+00  2.439e+00   0.613  0.54040    
## genreDocumentary                9.980e+00  8.654e+00   1.153  0.24931    
## genreDrama                      1.384e+01  2.074e+00   6.672 6.05e-11 ***
## genreHorror                    -2.775e+00  3.683e+00  -0.754  0.45145    
## genreMusical & Performing Arts  2.289e+01  5.440e+00   4.208 2.99e-05 ***
## genreMystery & Suspense         6.931e+00  2.685e+00   2.581  0.01010 *  
## genreOther                      1.016e+01  4.216e+00   2.410  0.01628 *  
## genreScience Fiction & Fantasy  1.520e+00  5.427e+00   0.280  0.77954    
## mpaa_ratingNC-17                5.639e-01  1.505e+01   0.037  0.97012    
## mpaa_ratingPG                  -8.689e+00  4.412e+00  -1.970  0.04937 *  
## mpaa_ratingPG-13               -1.349e+01  4.557e+00  -2.960  0.00321 ** 
## mpaa_ratingR                   -8.264e+00  4.433e+00  -1.864  0.06278 .  
## mpaa_ratingUnrated              8.021e+00  5.991e+00   1.339  0.18115    
## thtr_rel_year                  -3.389e-01  6.250e-02  -5.422 8.78e-08 ***
## imdb_num_votes                  6.282e-05  5.322e-06  11.804  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.4 on 561 degrees of freedom
## Multiple R-squared:  0.351,  Adjusted R-squared:  0.3313 
## F-statistic: 17.84 on 17 and 561 DF,  p-value: < 2.2e-16

After eliminating runtime, we have lost almost nothing in terms of either the R-squared or the adjusted R-squared value.
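A partial F-test makes this explicit by comparing the reduced model against the full model; dropping runtime should not significantly worsen the fit (a minimal check, not part of the original output):

#Compare the reduced model (without runtime) to the full model
anova(m_avg2, m_avg)
#AIC comparison; note that AIC() differs from step()'s extractAIC by a constant,
#but the ranking of the two models is the same.
AIC(m_avg, m_avg2)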

Finally, the p-value of the overall F-test is below 0.05, meaning the model as a whole is statistically significant, i.e. at least one predictor has a non-zero coefficient.

Interpretation of model coefficients

  1. The intercept of 732.8 is the estimated average score when all other variables are zero. This value is not meaningful on its own; it only serves to set the height of the regression line.
  2. Out of the 11 genre categories, the average score can be higher or lower than that of Action & Adventure films depending on which genre applies and its corresponding coefficient. For example, a Documentary is expected to have an average score 9.980 points higher than an Action & Adventure film, and an Animation film is expected to score 3.328 points higher than an Action & Adventure film.
  3. All else held constant, for every one-unit increase in theater release year, the model predicts a 0.339 decrease in score_avg on average.
  4. All else held constant, for every one-unit increase in the IMDB number of votes, the model predicts a 0.00006282 increase in score_avg on average.
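Point estimates alone do not convey their uncertainty; confidence intervals for the coefficients can be pulled straight from the fitted model (a minimal sketch using base R, not part of the original output):

#95% confidence intervals for the coefficients of the reduced model
confint(m_avg2)
#For example, the interval for thtr_rel_year should exclude zero,
#consistent with its small p-value in the summary above.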

Model Diagnostics

Now, we will check whether our model follows the four assumptions of linear regression.

  1. Linear relationship: There exists a linear relationship between the independent variable, x, and the dependent variable, y.

  2. Independence: The residuals are independent. In particular, there is no correlation between consecutive residuals in time series data.

  3. Homoscedasticity: The residuals have constant variance at every level of x.

  4. Normality: The residuals of the model are normally distributed.

We already checked for linearity in the previous section; the numerical variables thtr_rel_year, imdb_num_votes, and runtime show only weak linear relationships with the average score.

We have also taken care of multicollinearity in the previous section by creating the average score and dropping critics_score, audience_score, and imdb_rating from the model.
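If the car package is available (it is not loaded in the setup above, so this is an assumption), variance inflation factors give a direct numerical check on multicollinearity among the remaining predictors:

#Generalized VIFs for the reduced model; values well below 5 suggest
#multicollinearity is not a concern (assumes the car package is installed).
car::vif(m_avg2)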

Let us now visualize our model for checking the remaining conditions.

g7 <- ggplot(data = m_avg2, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  xlab("Fitted values") +
  ylab("Residuals") + theme(plot.title = element_text(hjust = 0.5)) + 
  labs(title = "a) Variability Condition")

g8 <- ggplot(data = m_avg2, aes(x = .resid)) +
  geom_histogram(binwidth = 1, fill='white', color='black', aes(y = after_stat(density))) +
  xlab("Residuals") + geom_density(lwd = 0.8) + theme(plot.title = element_text(hjust = 0.5)) +
  labs(title = "b) Normality Condition 1")

g9 <- ggplot(data = m_avg2, aes(sample = .resid)) +
  stat_qq() + stat_qq_line(col = "red") + theme(plot.title = element_text(hjust = 0.5)) + 
  labs(title = "c) Normality Condition 2")

grid.arrange(g7,g8,g9, nrow = 1)
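The plots above cover linearity, constant variance, and normality (conditions 1, 3, and 4). Independence (condition 2) can be checked informally by plotting the residuals in the order of the observations and looking for any pattern (a minimal sketch, not part of the original figures):

#Residuals in order of observation; a random scatter around zero
#supports the independence condition.
res_df <- data.frame(order = seq_along(resid(m_avg2)), resid = resid(m_avg2))
ggplot(res_df, aes(x = order, y = resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(x = "Order of observation", y = "Residuals", title = "d) Independence Condition")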


Part 5: Prediction

Suppose we want to use the model created earlier, m_avg2, to predict the average score for a new data frame of films.

Test data frame of movies

Use five randomly selected films from 2017, together with their Rotten Tomatoes scores:

  The Great Wall (2017, 38% on Rotten Tomatoes)
  The Mummy (2017, 16% on Rotten Tomatoes)
  Ghost in the Shell (2017, 43% on Rotten Tomatoes)
  King Arthur: Legend of the Sword (2017, 30% on Rotten Tomatoes)
  Valerian and the City of a Thousand Planets (2017, 53% on Rotten Tomatoes)

df_test <- data.frame(genre = c("Action & Adventure", "Horror", "Science Fiction & Fantasy", "Science Fiction & Fantasy", "Science Fiction & Fantasy"), 
                      runtime = c(135,107,107,126,137), 
                      thtr_rel_year = c(2017,2017,2017, 2017, 2017), 
                      imdb_rating = c(5.9, 5.4 ,6.3 ,6.7, 6.4), 
                      imdb_num_votes = c(144644, 204162, 225966, 229947, 194791), 
                      mpaa_rating = c("PG-13", "PG-13", "PG-13", 
                                          "PG-13", "PG-13"))

df_test
##                       genre runtime thtr_rel_year imdb_rating imdb_num_votes
## 1        Action & Adventure     135          2017         5.9         144644
## 2                    Horror     107          2017         5.4         204162
## 3 Science Fiction & Fantasy     107          2017         6.3         225966
## 4 Science Fiction & Fantasy     126          2017         6.7         229947
## 5 Science Fiction & Fantasy     137          2017         6.4         194791
##   mpaa_rating
## 1       PG-13
## 2       PG-13
## 3       PG-13
## 4       PG-13
## 5       PG-13

Predict scores

df_predict <- predict(m_avg2, df_test, interval = "predict")
df_predict2 <- cbind(df_test, round(df_predict))
df_predict2$title <- c("The Great Wall", "The Mummy", 
                      "Ghost in the Shell   ", "King Arthur: Legend of the Sword", 
                      "Valerian and the City of a Thousand Planets")
df_predict2 <- df_predict2 %>%
  select(title, everything()) %>%
  mutate(Actual_score = c(42,35,51,69,53), 
         estimate = ifelse(abs(Actual_score - fit) <= 5, "Good Prediction", 
                           ifelse(Actual_score >= lwr & Actual_score <= upr, 
                                  "Not great but within the range", "Bad prediction")))
df_predict2 %>%
  kbl() %>%
  column_spec(c(1,8,11), bold = T) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 
                full_width = F, position = "center", font_size = 11)         
title genre runtime thtr_rel_year imdb_rating imdb_num_votes mpaa_rating fit lwr upr Actual_score estimate
The Great Wall Action & Adventure 135 2017 5.9 144644 PG-13 45 16 74 42 Good Prediction
The Mummy Horror 107 2017 5.4 204162 PG-13 46 17 75 35 Not great but within the range
Ghost in the Shell Science Fiction & Fantasy 107 2017 6.3 225966 PG-13 52 21 82 51 Good Prediction
King Arthur: Legend of the Sword Science Fiction & Fantasy 126 2017 6.7 229947 PG-13 52 22 82 69 Not great but within the range
Valerian and the City of a Thousand Planets Science Fiction & Fantasy 137 2017 6.4 194791 PG-13 50 19 80 53 Good Prediction

According to the table, the estimates for three out of the five movies fit the model well. The remaining two fall outside the “Good Prediction” band but still lie within the prediction interval.
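As a single summary of prediction accuracy on these five films, the root mean squared error of the point predictions can be computed from the table above (a quick check, not part of the original analysis):

#RMSE of the point predictions against the actual Rotten Tomatoes scores;
#roughly 9 points on the 0-100 scale for these five films.
sqrt(mean((df_predict2$Actual_score - df_predict2$fit)^2))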


Part 6: Conclusion

In conclusion, we have created a linear regression model to predict the average score of a movie on online movie databases under certain conditions, and we have identified, more or less, the factors needed to predict whether a movie will be popular among internet users.

On the other hand, this model is not perfect. Additional attributes might be needed to make predictions more accurate, and preferences also depend on each viewer’s personal background. Even with this limited model, I hope it helps you find an ideal movie for the weekend.