getwd()
## [1] "/Users/user/Documents/Year 4/MKTG 3P98"
setwd("/Users/user/Documents/Year 4/MKTG 3P98")
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
MovieFinancials <- read_excel("/Users/user/Downloads/Movie Dataset_Financials.xlsx")
MovieAudience <- read_excel("/Users/user/Downloads/Movie Dataset_General Audience.xlsx")
Movie_Total<-merge(MovieAudience, MovieFinancials, by="original_title")
summary(Movie_Total)
## original_title type genre runtime
## Length:665 Length:665 Length:665 Min. : 39.00
## Class :character Class :character Class :character 1st Qu.: 92.75
## Mode :character Mode :character Mode :character Median :102.00
## Mean :105.71
## 3rd Qu.:115.00
## Max. :267.00
## NA's :1
## mpaa_rating imdb_rating imdb_num_votes critics_rating
## Length:665 Min. : 1.200 Min. : 180 Length:665
## Class :character 1st Qu.: 4.200 1st Qu.: 3998 Class :character
## Mode :character Median : 6.100 Median : 10535 Mode :character
## Mean : 5.789 Mean : 49127
## 3rd Qu.: 7.100 3rd Qu.: 56128
## Max. :55.000 Max. :893008
##
## critics_score audience_rating audience_score best_pic_nom
## Min. : 5.00 Length:665 Min. :13.0 Length:665
## 1st Qu.: 36.00 Class :character 1st Qu.:38.0 Class :character
## Median : 54.00 Mode :character Median :55.0 Mode :character
## Mean : 54.84 Mean :56.4
## 3rd Qu.: 73.00 3rd Qu.:76.0
## Max. :100.00 Max. :96.0
##
## Facebook_Likes budget (Millions) revenue (Millions) language
## Min. : 2 Min. : 0.318 Min. : 0.0093 Length:665
## 1st Qu.: 6479 1st Qu.: 10.000 1st Qu.: 19.6829 Class :character
## Median : 26057 Median : 25.000 Median : 61.5487 Mode :character
## Mean : 63971 Mean : 40.319 Mean : 137.4735
## 3rd Qu.: 82622 3rd Qu.: 51.000 3rd Qu.: 153.9630
## Max. :555609 Max. :280.000 Max. :2068.1782
##
## country
## Length:665
## Class :character
## Mode :character
##
##
##
##
mean(Movie_Total$runtime)
## [1] NA
na_rows<-Movie_Total[is.na(Movie_Total$runtime),]
print(na_rows)
## original_title type genre runtime mpaa_rating imdb_rating
## 392 Sex and the City 2 Documentary Documentary NA Unrated 4.3
## imdb_num_votes critics_rating critics_score audience_rating audience_score
## 392 739 Fresh 53 Upright 41
## best_pic_nom Facebook_Likes budget (Millions) revenue (Millions) language
## 392 no 44555 19.8 41.29632 English
## country
## 392 USA
meanruntime<-mean(Movie_Total$runtime,na.rm=TRUE)
Movie_Total[is.na(Movie_Total$runtime), "runtime"]<-meanruntime
n_distinct(MovieAudience$original_title)
## [1] 644
n_distinct(MovieFinancials$original_title)
## [1] 644
#Find duplicate rows
duplicate_rows<-Movie_Total[duplicated(Movie_Total$original_title),]
#print duplicate rows
print(duplicate_rows)
## original_title type genre
## 147 Fantastic Four Feature Film Mystery & Suspense
## 148 Fantastic Four Feature Film Drama
## 149 Fantastic Four Feature Film Drama
## 174 Godzilla Resurgence Feature Film Comedy
## 175 Godzilla Resurgence Feature Film Drama
## 176 Godzilla Resurgence Feature Film Drama
## 203 Hercules Feature Film Drama
## 204 Hercules Documentary Documentary
## 205 Hercules Documentary Documentary
## 339 Pan Feature Film Drama
## 340 Pan Feature Film Other
## 341 Pan Feature Film Other
## 492 The Fast and the Furious Feature Film Drama
## 493 The Fast and the Furious Feature Film Drama
## 494 The Fast and the Furious Feature Film Drama
## 532 The Legend of Tarzan Documentary Documentary
## 533 The Legend of Tarzan Feature Film Drama
## 534 The Legend of Tarzan Feature Film Drama
## 588 The Twilight Saga: Breaking Dawn - Part 2 Documentary Documentary
## 589 The Twilight Saga: Breaking Dawn - Part 2 Feature Film Comedy
## 590 The Twilight Saga: Breaking Dawn - Part 2 Feature Film Comedy
## runtime mpaa_rating imdb_rating imdb_num_votes critics_rating
## 147 106 R 6.3 25054 Rotten
## 148 101 R 2.1 9904 Fresh
## 149 101 R 2.1 9904 Fresh
## 174 93 PG 4.2 182983 Certified Fresh
## 175 98 PG-13 6.3 50340 Fresh
## 176 98 PG-13 6.3 50340 Fresh
## 203 105 R 3.0 9216 Rotten
## 204 107 Unrated 3.8 10522 Fresh
## 205 107 Unrated 3.8 10522 Fresh
## 339 130 R 2.0 9216 Rotten
## 340 90 PG 3.6 1010 Fresh
## 341 90 PG 3.6 1010 Fresh
## 492 104 R 7.1 128361 Fresh
## 493 95 R 7.2 35635 Certified Fresh
## 494 95 R 7.2 35635 Certified Fresh
## 532 92 Unrated 6.8 1942 Certified Fresh
## 533 94 R 3.4 57933 Rotten
## 534 94 R 3.4 57933 Rotten
## 588 98 Unrated 3.1 1010 Fresh
## 589 95 PG 7.5 880 Fresh
## 590 95 PG 7.5 880 Fresh
## critics_score audience_rating audience_score best_pic_nom Facebook_Likes
## 147 43 Spilled 49 no 1261
## 148 35 Upright 26 no 51261
## 149 35 Upright 26 no 51261
## 174 34 Upright 33 yes 5699
## 175 60 Spilled 67 no 699
## 176 60 Spilled 67 no 699
## 203 50 Upright 40 no 16235
## 204 21 Upright 20 no 235
## 205 21 Upright 20 no 235
## 339 34 Upright 24 no 393
## 340 33 Upright 24 no 393
## 341 33 Upright 24 no 393
## 492 68 Upright 78 no 125327
## 493 93 Upright 80 no 45327
## 494 93 Upright 80 no 45327
## 532 66 Upright 68 no 121175
## 533 25 Spilled 39 no 11175
## 534 25 Spilled 39 no 11175
## 588 45 Upright 46 no 1177
## 589 90 Upright 89 no 359177
## 590 90 Upright 89 no 359177
## budget (Millions) revenue (Millions) language country
## 147 13.0 45.300000 English USA
## 148 17.0 136.621271 English USA
## 149 13.0 45.300000 English USA
## 174 1.9 157.107755 Japanese Japan
## 175 8.9 4.160000 Japanese Japan
## 176 1.9 157.107755 Japanese Japan
## 203 6.0 1.270522 English USA
## 204 20.0 34.077920 English USA
## 205 6.0 1.270522 English USA
## 339 3.0 0.695269 English USA
## 340 4.0 1.600896 English USA
## 341 3.0 0.695269 English USA
## 492 20.0 18.220000 English USA
## 493 35.0 77.477008 English USA
## 494 20.0 18.220000 English USA
## 532 6.0 11.122090 English USA
## 533 35.0 82.347656 English USA
## 534 6.0 11.122090 English USA
## 588 0.5 1025.467110 English USA
## 589 1.0 3.387000 English USA
## 590 0.5 1025.467110 English USA
#Remove duplicates
CleanedMovie_Total<-Movie_Total[!duplicated(Movie_Total$original_title),]
CleanedMovie_Total$IMDbGrp<-cut(CleanedMovie_Total$imdb_rating,
breaks = c(0, 4, 7.5, Inf),
Labels = c("Low Rated", "Medium Rated", "Highly Rated"),
right=FALSE)
CleanedMovie_Total <- subset(CleanedMovie_Total, imdb_rating <= 10)
CleanedMovie_Total <- CleanedMovie_Total[-c(1), ]
CleanedMovie_Total <- CleanedMovie_Total %>%
mutate(profit = `revenue (Millions)` - `budget (Millions)`)
ggplot(CleanedMovie_Total,aes(x=critics_rating, fill = IMDbGrp))+
theme_bw()+
geom_bar()+
labs(y="Number of Movies",
x = "Critic Rating",
title = "Critic Rating by IMDb Score Rating ")
### Average IMDb ratings by genre
avgimdb_rating <- CleanedMovie_Total %>%
group_by(genre) %>%
summarise(average_imdb_rating = mean(imdb_rating, na.rm = TRUE))
ggplot(avgimdb_rating, aes(x = genre, y = average_imdb_rating)) +
geom_bar(stat = "identity", fill = "skyblue") +
theme_minimal() +
labs(title = "Average IMDb Rating by Genre",
x = "Genre",
y = "Average IMDb Rating") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
### Plot b/w Budget and Profit
ggplot(CleanedMovie_Total, aes(x = `budget (Millions)`, y = profit )) +
geom_point() +
ggtitle("Scatter Plot between Budget and Profit") +
xlab("Budget (Millions)") +
ylab("Profit") +
theme_minimal()
## Predictive Analysis ### #T-Test for IMDB Rating and Budget
boxplot(CleanedMovie_Total$imdb_rating ~ CleanedMovie_Total$`budget (Millions)`, col=c(5,7))
shapiro.test(CleanedMovie_Total$imdb_rating)
##
## Shapiro-Wilk normality test
##
## data: CleanedMovie_Total$imdb_rating
## W = 0.93249, p-value < 2.2e-16
anova_budget_genre <- aov(`budget (Millions)` ~ genre, data = CleanedMovie_Total)
summary(anova_budget_genre)
## Df Sum Sq Mean Sq F value Pr(>F)
## genre 10 13933 1393 0.65 0.771
## Residuals 630 1350531 2144
# Create the boxplot of budgets by genre
ggplot(CleanedMovie_Total, aes(x = genre, y = `budget (Millions)`)) +
geom_boxplot(fill = "skyblue") +
theme_minimal() +
labs(title = "Boxplot of Budgets by Genre",
x = "Genre",
y = "Budget (Millions)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Multiple Linear Regression
model <- lm(imdb_rating ~ `budget (Millions)` + `revenue (Millions)` + runtime +
critics_score + audience_score, data = CleanedMovie_Total)
# Summary of the regression model
summary(model)
##
## Call:
## lm(formula = imdb_rating ~ `budget (Millions)` + `revenue (Millions)` +
## runtime + critics_score + audience_score, data = CleanedMovie_Total)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.90446 -0.41104 -0.00605 0.54473 2.78058
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.660e+00 2.161e-01 7.679 6.07e-14 ***
## `budget (Millions)` 2.046e-03 1.076e-03 1.901 0.0578 .
## `revenue (Millions)` 8.124e-05 2.461e-04 0.330 0.7415
## runtime 1.445e-04 1.865e-03 0.077 0.9383
## critics_score 4.103e-03 2.478e-03 1.656 0.0982 .
## audience_score 6.512e-02 2.821e-03 23.082 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9145 on 635 degrees of freedom
## Multiple R-squared: 0.7271, Adjusted R-squared: 0.7249
## F-statistic: 338.4 on 5 and 635 DF, p-value: < 2.2e-16
#check if variance inflation factor (vif) <10
vif(model)
## `budget (Millions)` `revenue (Millions)` runtime
## 1.889045 2.018807 1.011418
## critics_score audience_score
## 2.517059 2.675179
#Check linearity of residuels
plot(model, which = 1)
#Check Normality of residuals
plot(model, which = 2)