Load Libraries:
library(knitr)
library(stringr)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Read the movie data set straight from the GitHub repository and preview
# its first six rows.
movies_2008 <- read.csv(
  file = "https://raw.githubusercontent.com/Riteshlohiya/Data606_Project_Proposal/master/movies_2008_V1.csv",
  sep = ","
)
head(movies_2008, n = 6L)
## Rank Movie Release_Date
## 1 1 The Dark Knight 7/18/2008
## 2 2 Iron Man 5/2/2008
## 3 3 Indiana Jones and the Kingdom of the Crystal Skull 5/22/2008
## 4 4 Hancock 7/2/2008
## 5 5 WALL-E 6/27/2008
## 6 6 Kung Fu Panda 6/6/2008
## Distributor Genre MPAA Gross_Collection Tickets_Sold
## 1 Warner Bros. Action/Adventure PG-13 531001578 73955652
## 2 Paramount Pictures Action/Adventure PG-13 318412101 44347089
## 3 Paramount Pictures Action/Adventure PG-13 317101119 44164501
## 4 Sony Pictures Action/Adventure PG-13 227946274 31747392
## 5 Buena Vista Comedy G 223806889 31170876
## 6 Paramount Pictures Action/Adventure PG 215434591 30004818
## Inflation_Adjusted_Gross Tickets_TT Gross_ML RotTom
## 1 531001581 7395.565 531.0016 8.2
## 2 318412099 4434.709 318.4121 7.4
## 3 317101117 4416.450 317.1011 5.7
## 4 227946275 3174.739 227.9463 6.2
## 5 223806890 3117.088 223.8069 7.8
## 6 215434593 3000.482 215.4346 6.9
The data is collected from StatCrunch and has the details of movies released in 2008. In this study we will see whether the genre and the RotTom (Rotten Tomatoes rating) of a movie have any effect on its gross collection.
# Number of rows (cases) and columns (variables) in the data set.
c(nrow(movies_2008), ncol(movies_2008))
## [1] 227 12
Each case represents a single movie and its details. The dataset has 227 observations and 12 variables.
The data is collected from the StatCrunch website; the link is given below. The RotTom ratings are taken from Rotten Tomatoes. The collected data is then stored in a GitHub repository: https://raw.githubusercontent.com/Riteshlohiya/Data606_Project_Proposal/master/movies_2008_V1.csv
Link for StatCrunch: https://www.statcrunch.com/app/index.php?dataid=398031
Link for Rotten Tomatoes ratings: https://www.kaggle.com/tmdb/tmdb-movie-metadata/data
This is an observational study.
The data is from the StatCrunch website; the link is given below. The RotTom ratings are taken from Rotten Tomatoes.
Link for StatCrunch: https://www.statcrunch.com/app/index.php?dataid=398031
Link for Rotten Tomatoes ratings: https://www.kaggle.com/tmdb/tmdb-movie-metadata/data
Gross collection (Gross_Collection) is the response variable, which is numerical.
Genre is the explanatory variable, which is categorical. RotTom is another explanatory variable, which is numerical ordinal.
# Coerce to a plain data frame, then print summary statistics for every
# column.
movies_2008 <- movies_2008 %>% as.data.frame()
summary(object = movies_2008)
## Rank Movie Release_Date
## Min. : 1.0 21 : 1 10/24/2008: 7
## 1st Qu.: 69.0 27 Dresses : 1 10/3/2008 : 7
## Median :139.0 88 Minutes : 1 10/31/2008: 7
## Mean :190.2 All Hat : 1 1/18/2008 : 6
## 3rd Qu.:240.5 Alvin and the Chipmunks: 1 10/10/2008: 5
## Max. :719.0 American Gangster : 1 10/17/2008: 5
## (Other) :221 (Other) :190
## Distributor Genre MPAA
## Warner Bros. : 21 Action/Adventure :39 : 1
## Universal : 19 Comedy :74 G : 6
## 20th Century Fox : 18 Documentary : 8 Not Rated:15
## Lionsgate : 14 Drama/Musical :71 PG :21
## Paramount Pictures: 14 Horror :13 PG-13 :87
## Sony Pictures : 14 Thriller/Suspense:22 R :97
## (Other) :127
## Gross_Collection Tickets_Sold Inflation_Adjusted_Gross
## Min. : 744 Min. : 104 Min. : 747
## 1st Qu.: 1309738 1st Qu.: 182415 1st Qu.: 1309740
## Median : 13367624 Median : 1861786 Median : 13367623
## Mean : 36038215 Mean : 5019250 Mean : 36038215
## 3rd Qu.: 43049146 3rd Qu.: 5995702 3rd Qu.: 43049144
## Max. :531001578 Max. :73955652 Max. :531001581
##
## Tickets_TT Gross_ML RotTom
## Min. : 0.01 Min. : 0.0007 Min. : 0.00
## 1st Qu.: 18.24 1st Qu.: 1.3097 1st Qu.: 5.70
## Median : 186.18 Median : 13.3676 Median : 6.30
## Mean : 501.93 Mean : 36.0382 Mean : 6.14
## 3rd Qu.: 599.57 3rd Qu.: 43.0491 3rd Qu.: 6.70
## Max. :7395.56 Max. :531.0016 Max. :10.00
##
# Horizontal bar chart of the number of movies in each genre, with the bars
# ordered by ascending count.
movies_Genre <- table(movies_2008$Genre)

# Set outer margins of one line on every side and inner margins with extra
# room on the left for the horizontal genre labels.
par(oma = c(1, 1, 1, 1), mar = c(4, 5, 2, 1))

barplot(
  sort(movies_Genre),
  horiz = TRUE,
  las = 1,
  # One colour per genre instead of a hard-coded palette size of 6, so the
  # colours are not silently recycled if the number of genres changes.
  col = rainbow(length(movies_Genre)),
  border = NA,
  main = "Frequency of Movie Genre",
  xlab = "Count",
  font.lab = 2,
  cex.names = 0.75
)
# Box plot of gross collection for each genre.
ggplot(movies_2008, aes(x = Genre, y = Gross_Collection)) +
  geom_boxplot()

# Histogram of the RotTom ratings. The bin count is stated explicitly (30 is
# the value geom_histogram() falls back to when none is given), so the figure
# is unchanged but stat_bin() no longer emits its "Pick better value with
# `binwidth`" message.
ggplot(movies_2008, aes(x = RotTom)) +
  geom_histogram(bins = 30)
# Per-genre summary of gross collection: number of movies plus the mean,
# standard deviation and maximum of Gross_Collection.
movies_2008_Gross_Collection <- movies_2008 %>%
  group_by(Genre) %>%
  summarise(
    count = n(),
    mean_collection = mean(Gross_Collection),
    sd_collection = sd(Gross_Collection),
    max_collection = max(Gross_Collection)
  )

movies_2008_Gross_Collection
## # A tibble: 6 x 5
## Genre count mean_collection sd_collection max_collection
## <fct> <int> <dbl> <dbl> <dbl>
## 1 Action/Adventure 39 83659231 110078672 531001578
## 2 Comedy 74 37259824 47182426 223806889
## 3 Documentary 8 3958010 5023598 13011160
## 4 Drama/Musical 71 16275324 30706119 176922850
## 5 Horror 13 21937404 20537708 56746769
## 6 Thriller/Suspense 22 31288157 29490391 101401695
# Per-genre summary of the RotTom rating: number of movies plus the mean,
# standard deviation and maximum of RotTom.
movies_2008_RotTom <- movies_2008 %>%
  group_by(Genre) %>%
  summarise(
    count = n(),
    mean_RotTom = mean(RotTom),
    sd_RotTom = sd(RotTom),
    max_RotTom = max(RotTom)
  )

movies_2008_RotTom
## # A tibble: 6 x 5
## Genre count mean_RotTom sd_RotTom max_RotTom
## <fct> <int> <dbl> <dbl> <dbl>
## 1 Action/Adventure 39 6.02 0.804 8.20
## 2 Comedy 74 5.80 1.17 10.0
## 3 Documentary 8 6.88 0.423 7.50
## 4 Drama/Musical 71 6.55 1.08 7.90
## 5 Horror 13 5.78 0.685 6.90
## 6 Thriller/Suspense 22 6.14 0.657 7.30