Team 3 - Aderinsola, Robert, Fausat, Vanessa

Set Up Directory and needed packages

getwd()
## [1] "/Users/user/Documents/Year 4/MKTG 3P98"
setwd("/Users/user/Documents/Year 4/MKTG 3P98")
library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode

Cleaning Data

MovieFinancials <- read_excel("/Users/user/Downloads/Movie Dataset_Financials.xlsx")

MovieAudience <- read_excel("/Users/user/Downloads/Movie Dataset_General Audience.xlsx")

Merge both Datasets

Movie_Total<-merge(MovieAudience, MovieFinancials, by="original_title")

Remove NA Rows for “Runtime”

summary(Movie_Total)
##  original_title         type              genre              runtime      
##  Length:665         Length:665         Length:665         Min.   : 39.00  
##  Class :character   Class :character   Class :character   1st Qu.: 92.75  
##  Mode  :character   Mode  :character   Mode  :character   Median :102.00  
##                                                           Mean   :105.71  
##                                                           3rd Qu.:115.00  
##                                                           Max.   :267.00  
##                                                           NA's   :1       
##  mpaa_rating         imdb_rating     imdb_num_votes   critics_rating    
##  Length:665         Min.   : 1.200   Min.   :   180   Length:665        
##  Class :character   1st Qu.: 4.200   1st Qu.:  3998   Class :character  
##  Mode  :character   Median : 6.100   Median : 10535   Mode  :character  
##                     Mean   : 5.789   Mean   : 49127                     
##                     3rd Qu.: 7.100   3rd Qu.: 56128                     
##                     Max.   :55.000   Max.   :893008                     
##                                                                         
##  critics_score    audience_rating    audience_score best_pic_nom      
##  Min.   :  5.00   Length:665         Min.   :13.0   Length:665        
##  1st Qu.: 36.00   Class :character   1st Qu.:38.0   Class :character  
##  Median : 54.00   Mode  :character   Median :55.0   Mode  :character  
##  Mean   : 54.84                      Mean   :56.4                     
##  3rd Qu.: 73.00                      3rd Qu.:76.0                     
##  Max.   :100.00                      Max.   :96.0                     
##                                                                       
##  Facebook_Likes   budget (Millions) revenue (Millions)    language        
##  Min.   :     2   Min.   :  0.318   Min.   :   0.0093   Length:665        
##  1st Qu.:  6479   1st Qu.: 10.000   1st Qu.:  19.6829   Class :character  
##  Median : 26057   Median : 25.000   Median :  61.5487   Mode  :character  
##  Mean   : 63971   Mean   : 40.319   Mean   : 137.4735                     
##  3rd Qu.: 82622   3rd Qu.: 51.000   3rd Qu.: 153.9630                     
##  Max.   :555609   Max.   :280.000   Max.   :2068.1782                     
##                                                                           
##    country         
##  Length:665        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
mean(Movie_Total$runtime)
## [1] NA
na_rows<-Movie_Total[is.na(Movie_Total$runtime),]
print(na_rows)
##          original_title        type       genre runtime mpaa_rating imdb_rating
## 392 Sex and the City 2  Documentary Documentary      NA     Unrated         4.3
##     imdb_num_votes critics_rating critics_score audience_rating audience_score
## 392            739          Fresh            53         Upright             41
##     best_pic_nom Facebook_Likes budget (Millions) revenue (Millions) language
## 392           no          44555              19.8           41.29632  English
##     country
## 392     USA
meanruntime<-mean(Movie_Total$runtime,na.rm=TRUE)
Movie_Total[is.na(Movie_Total$runtime), "runtime"]<-meanruntime

Deal with duplicates

n_distinct(MovieAudience$original_title)
## [1] 644
n_distinct(MovieFinancials$original_title)
## [1] 644
#Find duplicate rows 
duplicate_rows<-Movie_Total[duplicated(Movie_Total$original_title),]

#print duplicate rows
print(duplicate_rows)
##                                 original_title         type              genre
## 147                            Fantastic Four  Feature Film Mystery & Suspense
## 148                            Fantastic Four  Feature Film              Drama
## 149                            Fantastic Four  Feature Film              Drama
## 174                       Godzilla Resurgence  Feature Film             Comedy
## 175                       Godzilla Resurgence  Feature Film              Drama
## 176                       Godzilla Resurgence  Feature Film              Drama
## 203                                  Hercules  Feature Film              Drama
## 204                                  Hercules   Documentary        Documentary
## 205                                  Hercules   Documentary        Documentary
## 339                                       Pan  Feature Film              Drama
## 340                                       Pan  Feature Film              Other
## 341                                       Pan  Feature Film              Other
## 492                  The Fast and the Furious  Feature Film              Drama
## 493                  The Fast and the Furious  Feature Film              Drama
## 494                  The Fast and the Furious  Feature Film              Drama
## 532                      The Legend of Tarzan   Documentary        Documentary
## 533                      The Legend of Tarzan  Feature Film              Drama
## 534                      The Legend of Tarzan  Feature Film              Drama
## 588 The Twilight Saga: Breaking Dawn - Part 2   Documentary        Documentary
## 589 The Twilight Saga: Breaking Dawn - Part 2  Feature Film             Comedy
## 590 The Twilight Saga: Breaking Dawn - Part 2  Feature Film             Comedy
##     runtime mpaa_rating imdb_rating imdb_num_votes  critics_rating
## 147     106           R         6.3          25054          Rotten
## 148     101           R         2.1           9904           Fresh
## 149     101           R         2.1           9904           Fresh
## 174      93          PG         4.2         182983 Certified Fresh
## 175      98       PG-13         6.3          50340           Fresh
## 176      98       PG-13         6.3          50340           Fresh
## 203     105           R         3.0           9216          Rotten
## 204     107     Unrated         3.8          10522           Fresh
## 205     107     Unrated         3.8          10522           Fresh
## 339     130           R         2.0           9216          Rotten
## 340      90          PG         3.6           1010           Fresh
## 341      90          PG         3.6           1010           Fresh
## 492     104           R         7.1         128361           Fresh
## 493      95           R         7.2          35635 Certified Fresh
## 494      95           R         7.2          35635 Certified Fresh
## 532      92     Unrated         6.8           1942 Certified Fresh
## 533      94           R         3.4          57933          Rotten
## 534      94           R         3.4          57933          Rotten
## 588      98     Unrated         3.1           1010           Fresh
## 589      95          PG         7.5            880           Fresh
## 590      95          PG         7.5            880           Fresh
##     critics_score audience_rating audience_score best_pic_nom Facebook_Likes
## 147            43         Spilled             49           no           1261
## 148            35         Upright             26           no          51261
## 149            35         Upright             26           no          51261
## 174            34         Upright             33          yes           5699
## 175            60         Spilled             67           no            699
## 176            60         Spilled             67           no            699
## 203            50         Upright             40           no          16235
## 204            21         Upright             20           no            235
## 205            21         Upright             20           no            235
## 339            34         Upright             24           no            393
## 340            33         Upright             24           no            393
## 341            33         Upright             24           no            393
## 492            68         Upright             78           no         125327
## 493            93         Upright             80           no          45327
## 494            93         Upright             80           no          45327
## 532            66         Upright             68           no         121175
## 533            25         Spilled             39           no          11175
## 534            25         Spilled             39           no          11175
## 588            45         Upright             46           no           1177
## 589            90         Upright             89           no         359177
## 590            90         Upright             89           no         359177
##     budget (Millions) revenue (Millions) language country
## 147              13.0          45.300000  English     USA
## 148              17.0         136.621271  English     USA
## 149              13.0          45.300000  English     USA
## 174               1.9         157.107755 Japanese   Japan
## 175               8.9           4.160000 Japanese   Japan
## 176               1.9         157.107755 Japanese   Japan
## 203               6.0           1.270522  English     USA
## 204              20.0          34.077920  English     USA
## 205               6.0           1.270522  English     USA
## 339               3.0           0.695269  English     USA
## 340               4.0           1.600896  English     USA
## 341               3.0           0.695269  English     USA
## 492              20.0          18.220000  English     USA
## 493              35.0          77.477008  English     USA
## 494              20.0          18.220000  English     USA
## 532               6.0          11.122090  English     USA
## 533              35.0          82.347656  English     USA
## 534               6.0          11.122090  English     USA
## 588               0.5        1025.467110  English     USA
## 589               1.0           3.387000  English     USA
## 590               0.5        1025.467110  English     USA
#Remove duplicates 
CleanedMovie_Total<-Movie_Total[!duplicated(Movie_Total$original_title),]

Cut IMDb Rating into 3 groups

CleanedMovie_Total$IMDbGrp<-cut(CleanedMovie_Total$imdb_rating,
                      breaks = c(0, 4, 7.5, Inf),
                      Labels = c("Low Rated", "Medium Rated", "Highly Rated"),
                      right=FALSE)

Remove IMDB Scores greater than 10

CleanedMovie_Total <- subset(CleanedMovie_Total, imdb_rating <= 10)

Remove the movie with no language or country

CleanedMovie_Total <- CleanedMovie_Total[-c(1), ]

Create Profit Column

CleanedMovie_Total <- CleanedMovie_Total %>%
  mutate(profit = `revenue (Millions)` - `budget (Millions)`)

Descriptive Analysis

Comparing Critic Rating to IMDb Rating

ggplot(CleanedMovie_Total,aes(x=critics_rating, fill = IMDbGrp))+
  theme_bw()+
  geom_bar()+
  labs(y="Number of Movies",
       x = "Critic Rating",
       title = "Critic Rating by IMDb Score Rating ")

### Average IMDb ratings by genre

avgimdb_rating <- CleanedMovie_Total %>%
  group_by(genre) %>%
  summarise(average_imdb_rating = mean(imdb_rating, na.rm = TRUE))

ggplot(avgimdb_rating, aes(x = genre, y = average_imdb_rating)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  theme_minimal() +
  labs(title = "Average IMDb Rating by Genre",
       x = "Genre",
       y = "Average IMDb Rating") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

### Plot b/w Budget and Profit

ggplot(CleanedMovie_Total, aes(x = `budget (Millions)`, y = profit )) +
  geom_point() +
  ggtitle("Scatter Plot between Budget and Profit") +
  xlab("Budget (Millions)") +
  ylab("Profit") +
  theme_minimal()

## Predictive Analysis ### #T-Test for IMDB Rating and Budget

boxplot(CleanedMovie_Total$imdb_rating ~ CleanedMovie_Total$`budget (Millions)`, col=c(5,7))

shapiro.test(CleanedMovie_Total$imdb_rating)
## 
##  Shapiro-Wilk normality test
## 
## data:  CleanedMovie_Total$imdb_rating
## W = 0.93249, p-value < 2.2e-16

Perform one-way ANOVA test for genre and budget

anova_budget_genre <- aov(`budget (Millions)` ~ genre, data = CleanedMovie_Total)
summary(anova_budget_genre)
##              Df  Sum Sq Mean Sq F value Pr(>F)
## genre        10   13933    1393    0.65  0.771
## Residuals   630 1350531    2144
# Create the boxplot of budgets by genre
ggplot(CleanedMovie_Total, aes(x = genre, y = `budget (Millions)`)) +
  geom_boxplot(fill = "skyblue") +
  theme_minimal() +
  labs(title = "Boxplot of Budgets by Genre",
       x = "Genre",
       y = "Budget (Millions)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## Multiple Linear Regression

model <- lm(imdb_rating ~ `budget (Millions)` + `revenue (Millions)` + runtime + 
              critics_score + audience_score, data = CleanedMovie_Total)
# Summary of the regression model
summary(model)
## 
## Call:
## lm(formula = imdb_rating ~ `budget (Millions)` + `revenue (Millions)` + 
##     runtime + critics_score + audience_score, data = CleanedMovie_Total)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.90446 -0.41104 -0.00605  0.54473  2.78058 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.660e+00  2.161e-01   7.679 6.07e-14 ***
## `budget (Millions)`  2.046e-03  1.076e-03   1.901   0.0578 .  
## `revenue (Millions)` 8.124e-05  2.461e-04   0.330   0.7415    
## runtime              1.445e-04  1.865e-03   0.077   0.9383    
## critics_score        4.103e-03  2.478e-03   1.656   0.0982 .  
## audience_score       6.512e-02  2.821e-03  23.082  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9145 on 635 degrees of freedom
## Multiple R-squared:  0.7271, Adjusted R-squared:  0.7249 
## F-statistic: 338.4 on 5 and 635 DF,  p-value: < 2.2e-16
#check if variance inflation factor (vif) <10
vif(model)
##  `budget (Millions)` `revenue (Millions)`              runtime 
##             1.889045             2.018807             1.011418 
##        critics_score       audience_score 
##             2.517059             2.675179
#Check linearity of residuels 
plot(model, which = 1)

#Check Normality of residuals 
plot(model, which = 2)