Load Libraries:

library(knitr)
library(stringr)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

Data Preparation

# Load the 2008 movie data set from the project's GitHub repository.
# stringsAsFactors = TRUE is set explicitly: from R 4.0.0 on, read.csv() reads
# text columns as character by default, while the summaries in this report
# show them as factors (per-level counts in summary(), <fct> in the tibbles).
movies_2008 <- read.csv(
  "https://raw.githubusercontent.com/Riteshlohiya/Data606_Project_Proposal/master/movies_2008_V1.csv",
  stringsAsFactors = TRUE
)
# Preview the first rows of the data.
head(movies_2008)
##   Rank                                              Movie Release_Date
## 1    1                                    The Dark Knight    7/18/2008
## 2    2                                           Iron Man     5/2/2008
## 3    3 Indiana Jones and the Kingdom of the Crystal Skull    5/22/2008
## 4    4                                            Hancock     7/2/2008
## 5    5                                             WALL-E    6/27/2008
## 6    6                                      Kung Fu Panda     6/6/2008
##          Distributor            Genre  MPAA Gross_Collection Tickets_Sold
## 1       Warner Bros. Action/Adventure PG-13        531001578     73955652
## 2 Paramount Pictures Action/Adventure PG-13        318412101     44347089
## 3 Paramount Pictures Action/Adventure PG-13        317101119     44164501
## 4      Sony Pictures Action/Adventure PG-13        227946274     31747392
## 5        Buena Vista           Comedy     G        223806889     31170876
## 6 Paramount Pictures Action/Adventure    PG        215434591     30004818
##   Inflation_Adjusted_Gross Tickets_TT Gross_ML RotTom
## 1                531001581   7395.565 531.0016    8.2
## 2                318412099   4434.709 318.4121    7.4
## 3                317101117   4416.450 317.1011    5.7
## 4                227946275   3174.739 227.9463    6.2
## 5                223806890   3117.088 223.8069    7.8
## 6                215434593   3000.482 215.4346    6.9

Research question

The data is collected from StatCrunch and contains details of movies released in 2008. In this study we will examine whether the genre and the RotTom (Rotten Tomatoes) rating of a movie have any effect on its gross collection.

Cases

# Dimensions of the data: number of rows (cases) and number of columns (variables).
dim(movies_2008)
## [1] 227  12

Each case represents one movie and its details. The dataset has 227 observations and 12 variables.

Data collection

The data is collected from the StatCrunch website (link given below). The RotTom ratings are taken from Rotten Tomatoes. The collected data is then stored in a GitHub repository: https://raw.githubusercontent.com/Riteshlohiya/Data606_Project_Proposal/master/movies_2008_V1.csv

Link for StatCrunch: https://www.statcrunch.com/app/index.php?dataid=398031

Link for Rotten Tomatoes ratings: https://www.kaggle.com/tmdb/tmdb-movie-metadata/data

Type of study

This is an observational study.

Data Source

The data is from the StatCrunch website (link given below). The RotTom ratings are taken from Rotten Tomatoes.

Link for StatCrunch: https://www.statcrunch.com/app/index.php?dataid=398031

Link for Rotten Tomatoes ratings: https://www.kaggle.com/tmdb/tmdb-movie-metadata/data

Response

Gross collection (Gross_Collection) is the response variable, which is numerical.

Explanatory

Genre is the explanatory variable, which is categorical. RotTom is another explanatory variable, which is numerical.

Relevant summary statistics

# movies_2008 was created with read.csv(), which returns a data.frame, so this
# coercion should be a no-op; it only makes the expected class explicit.
movies_2008 <- as.data.frame(movies_2008) 
# Column-wise summary of every variable in the data set.
summary(movies_2008)
##       Rank                           Movie         Release_Date
##  Min.   :  1.0   21                     :  1   10/24/2008:  7  
##  1st Qu.: 69.0   27 Dresses             :  1   10/3/2008 :  7  
##  Median :139.0   88 Minutes             :  1   10/31/2008:  7  
##  Mean   :190.2   All Hat                :  1   1/18/2008 :  6  
##  3rd Qu.:240.5   Alvin and the Chipmunks:  1   10/10/2008:  5  
##  Max.   :719.0   American Gangster      :  1   10/17/2008:  5  
##                  (Other)                :221   (Other)   :190  
##              Distributor                Genre           MPAA   
##  Warner Bros.      : 21   Action/Adventure :39            : 1  
##  Universal         : 19   Comedy           :74   G        : 6  
##  20th Century Fox  : 18   Documentary      : 8   Not Rated:15  
##  Lionsgate         : 14   Drama/Musical    :71   PG       :21  
##  Paramount Pictures: 14   Horror           :13   PG-13    :87  
##  Sony Pictures     : 14   Thriller/Suspense:22   R        :97  
##  (Other)           :127                                        
##  Gross_Collection     Tickets_Sold      Inflation_Adjusted_Gross
##  Min.   :      744   Min.   :     104   Min.   :      747       
##  1st Qu.:  1309738   1st Qu.:  182415   1st Qu.:  1309740       
##  Median : 13367624   Median : 1861786   Median : 13367623       
##  Mean   : 36038215   Mean   : 5019250   Mean   : 36038215       
##  3rd Qu.: 43049146   3rd Qu.: 5995702   3rd Qu.: 43049144       
##  Max.   :531001578   Max.   :73955652   Max.   :531001581       
##                                                                 
##    Tickets_TT         Gross_ML            RotTom     
##  Min.   :   0.01   Min.   :  0.0007   Min.   : 0.00  
##  1st Qu.:  18.24   1st Qu.:  1.3097   1st Qu.: 5.70  
##  Median : 186.18   Median : 13.3676   Median : 6.30  
##  Mean   : 501.93   Mean   : 36.0382   Mean   : 6.14  
##  3rd Qu.: 599.57   3rd Qu.: 43.0491   3rd Qu.: 6.70  
##  Max.   :7395.56   Max.   :531.0016   Max.   :10.00  
## 
# Horizontal bar chart of the number of movies per genre, ordered by count.
movies_Genre <- table(movies_2008$Genre)
# Set the outer margins (oma) and the plot margins (mar) in one call and keep
# the previous values so the graphics state can be restored after plotting.
old_par <- par(oma = c(1, 1, 1, 1), mar = c(4, 5, 2, 1))
barplot(sort(movies_Genre),
        horiz = TRUE,
        las = 1,
        # One colour per genre, derived from the table instead of hard-coded.
        col = rainbow(length(movies_Genre)),
        border = NA,
        main = "Frequency of Movie Genre",
        xlab = "Count",
        font.lab = 2,
        cex.names = 0.75)
# Restore the graphics parameters that were in effect before this plot.
par(old_par)

Genre Boxplot:

# Box plot of gross collection for each genre.
ggplot(data = movies_2008, mapping = aes(x = Genre, y = Gross_Collection)) +
  geom_boxplot()

RotTom Ratings Histogram:

# Histogram of the RotTom ratings, using geom_histogram()'s default binning.
ggplot(data = movies_2008, mapping = aes(x = RotTom)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Summary of the data for Gross_Collection by Genre:

# Per-genre summary of Gross_Collection: number of movies, mean, standard
# deviation and maximum.
movies_2008_Gross_Collection <- movies_2008 %>%
  group_by(Genre) %>%
  summarise(
    count = n(),
    mean_collection = mean(Gross_Collection),
    sd_collection = sd(Gross_Collection),
    max_collection = max(Gross_Collection)
  )

# Print the summary table.
movies_2008_Gross_Collection
## # A tibble: 6 x 5
##   Genre             count mean_collection sd_collection max_collection
##   <fct>             <int>           <dbl>         <dbl>          <dbl>
## 1 Action/Adventure     39        83659231     110078672      531001578
## 2 Comedy               74        37259824      47182426      223806889
## 3 Documentary           8         3958010       5023598       13011160
## 4 Drama/Musical        71        16275324      30706119      176922850
## 5 Horror               13        21937404      20537708       56746769
## 6 Thriller/Suspense    22        31288157      29490391      101401695

Summary of the data for RotTom Ratings by Genre:

# Per-genre summary of the RotTom rating: number of movies, mean, standard
# deviation and maximum.
movies_2008_RotTom <- movies_2008 %>%
  group_by(Genre) %>%
  summarise(
    count = n(),
    mean_RotTom = mean(RotTom),
    sd_RotTom = sd(RotTom),
    max_RotTom = max(RotTom)
  )

# Print the summary table.
movies_2008_RotTom
## # A tibble: 6 x 5
##   Genre             count mean_RotTom sd_RotTom max_RotTom
##   <fct>             <int>       <dbl>     <dbl>      <dbl>
## 1 Action/Adventure     39        6.02     0.804       8.20
## 2 Comedy               74        5.80     1.17       10.0 
## 3 Documentary           8        6.88     0.423       7.50
## 4 Drama/Musical        71        6.55     1.08        7.90
## 5 Horror               13        5.78     0.685       6.90
## 6 Thriller/Suspense    22        6.14     0.657       7.30