Data Preparation

# Load libraries
library(knitr)
library(stringr)
library(ggplot2)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Read the Blockbuster csv file from GitHub ("," is read.csv's default
# separator, so it does not need to be spelled out)
movies <- read.csv("https://raw.githubusercontent.com/Galanopoulog/DATA606-Project-CSV/master/Blockbuster.csv")

# Keep only the columns used in the analysis and drop rows with NA values.
# The columns are selected by name rather than by position, so a change in the
# upstream column layout raises an "undefined columns" error instead of
# silently keeping the wrong variables.
movies <- na.omit(movies[, c("rt_score", "adjusted", "imdb_rating",
                             "rank_in_year", "studio", "title", "year")])
names(movies)
## [1] "rt_score"     "adjusted"     "imdb_rating"  "rank_in_year"
## [5] "studio"       "title"        "year"
head(movies)
##   rt_score         adjusted imdb_rating rank_in_year
## 1      7.5 $712,903,691.09          7.8            7
## 2      7.9 $706,988,165.89          7.7            9
## 3      7.7 $772,158,880.00          8.1            3
## 4      7.0 $671,220,455.10          8.7           10
## 5      5.7 $756,677,675.77          7.1            4
## 6      5.9 $707,732,953.69          6.9            8
##                              studio                               title
## 1                    Marvel Studios Captain America: The Winter Soldier
## 2                  20th Century Fox      Dawn of the Planet of the Apes
## 3                    Marvel Studios             Guardians of the Galaxy
## 4 Paramount Pictures / Warner Bros.                        Interstellar
## 5              Walt Disney Pictures                          Maleficent
## 6                 Columbia Pictures            The Amazing Spider-Man 2
##   year
## 1 2014
## 2 2014
## 3 2014
## 4 2014
## 5 2014
## 6 2014
# Put the columns in presentation order (picked by their current names),
# then give them their display names
movies <- movies[, c("title", "studio", "year", "rank_in_year",
                     "adjusted", "rt_score", "imdb_rating")]
names(movies) <- c("Title", "Studio", "Year", "Year_Rank",
                   "Gross_Profit", "RotTom", "IMDB")
kable(head(movies))
Title Studio Year Year_Rank Gross_Profit RotTom IMDB
Captain America: The Winter Soldier Marvel Studios 2014 7 $712,903,691.09 7.5 7.8
Dawn of the Planet of the Apes 20th Century Fox 2014 9 $706,988,165.89 7.9 7.7
Guardians of the Galaxy Marvel Studios 2014 3 $772,158,880.00 7.7 8.1
Interstellar Paramount Pictures / Warner Bros. 2014 10 $671,220,455.10 7.0 8.7
Maleficent Walt Disney Pictures 2014 4 $756,677,675.77 5.7 7.1
The Amazing Spider-Man 2 Columbia Pictures 2014 8 $707,732,953.69 5.9 6.9
# Strip the currency formatting ("$" and the "," thousands separators) from
# Gross Profit and convert it to numeric. The characters are removed by
# pattern rather than by position, so a value that has no leading "$" keeps
# its first digit instead of losing it.
movies$Gross_Profit <- as.numeric(gsub("[$,]", "", movies$Gross_Profit))

kable(head(movies))
Title Studio Year Year_Rank Gross_Profit RotTom IMDB
Captain America: The Winter Soldier Marvel Studios 2014 7 712903691 7.5 7.8
Dawn of the Planet of the Apes 20th Century Fox 2014 9 706988166 7.9 7.7
Guardians of the Galaxy Marvel Studios 2014 3 772158880 7.7 8.1
Interstellar Paramount Pictures / Warner Bros. 2014 10 671220455 7.0 8.7
Maleficent Walt Disney Pictures 2014 4 756677676 5.7 7.1
The Amazing Spider-Man 2 Columbia Pictures 2014 8 707732954 5.9 6.9

Research Question

You should phrase your research question in a way that matches up with the scope of inference your dataset allows for.

Is the gross profit predictive of the rank of a movie? Does the profit have an effect on the rating?

Cases

What are the cases, and how many are there?

# Dimensions of the cleaned data: number of rows (one per movie), then columns
dim(movies)
## [1] 398   7

There are 398 cases: each row is a movie, and each movie is a case.

Data Collection

Describe the method of data collection.

Data is collected by CrowdFlower, a crowdsourcing company that has an open data library (the Data for Everyone library, found here: “https://www.crowdflower.com/data-for-everyone/”).

Type of study

What type of study is this (observational/experiment)?

It is an observational study.

Data Source

A citation/link and minor data description.

The Blockbuster database can be found and downloaded in csv format from “https://www.crowdflower.com/data-for-everyone/”. It includes data from the ten most popular movies of each year from 1975 to 2015. Categories within include: Movie titles, Poster URLs, Genre, Run time, Box Office Receipts and ratings (from MPAA, IMDB and Rotten Tomatoes). The raw data from the csv can be found here: “https://raw.githubusercontent.com/Galanopoulog/DATA606-Project-CSV/master/Blockbuster.csv”.

Response

What is the response variable, and what type is it (numerical/categorical)?

The response variables are rank (categorical, ordinal) and rating (numerical).

Explanatory

What is the explanatory variable, and what type is it (numerical/categorical)?

The explanatory variable is gross profit and is numerical.

Relevant Summary Statistics

Summary statistics relevant to the research question.

# Column-by-column summary of the cleaned data set
summary(movies)
##                    Title                      Studio         Year     
##  King Kong            :  2   Warner Bros.        : 45   Min.   :1975  
##  Signs                :  2   Paramount Pictures  : 30   1st Qu.:1985  
##  "Crocodile" Dundee   :  1   Universal Pictures  : 22   Median :1995  
##  "Crocodile" Dundee II:  1   20th Century Fox    : 20   Mean   :1995  
##  10                   :  1   Columbia Pictures   : 19   3rd Qu.:2005  
##  101 Dalmatians       :  1   Walt Disney Pictures: 14   Max.   :2014  
##  (Other)              :390   (Other)             :248                 
##    Year_Rank      Gross_Profit           RotTom           IMDB      
##  Min.   : 1.00   Min.   :1.100e+08   Min.   :0.000   Min.   :4.400  
##  1st Qu.: 3.00   1st Qu.:2.936e+08   1st Qu.:5.700   1st Qu.:6.500  
##  Median : 6.00   Median :4.880e+08   Median :6.600   Median :7.000  
##  Mean   : 5.53   Mean   :5.428e+08   Mean   :6.503   Mean   :7.053  
##  3rd Qu.: 8.00   3rd Qu.:6.986e+08   3rd Qu.:7.500   3rd Qu.:7.675  
##  Max.   :10.00   Max.   :3.026e+09   Max.   :9.100   Max.   :9.000  
## 

Mean Ratings and Profits by Year

# Yearly averages of gross and of both ratings (rounded to two decimals),
# together with the number of movies in each year.
# The column name MeanIMBD (sic) is kept as-is.
movstats <- movies %>%
  group_by(Year) %>%
  summarise(
    MeanGross = round(mean(Gross_Profit), digits = 2),
    MeanRott  = round(mean(RotTom), digits = 2),
    MeanIMBD  = round(mean(IMDB), digits = 2),
    Counts    = n()
  )
kable(head(movstats))
Year MeanGross MeanRott MeanIMBD Counts
1975 179796017 4.60 6.64 7
1976 275627492 5.32 6.80 10
1977 382176754 7.07 7.26 10
1978 323918184 6.83 7.00 10
1979 262983710 6.86 7.16 10
1980 254279292 6.37 6.83 10

Side by side plots of Mean Gross Profits by Year for different Rating sites

# Reshape the two mean-rating columns (columns 3 and 4 of movstats) to long
# form so there is one facet per rating site, then draw mean gross by year.
# colour is mapped inside aes() so that it actually takes effect. The `fill`
# aesthetic is not understood by geom_line() and position_dodge() had no width
# to work with -- both only raised warnings -- so they are dropped.
# NOTE(review): the gathered value column `n` is never plotted, so both facets
# show the same MeanGross series -- confirm whether y was meant to be `n`.
gather(movstats, "Ratings", "n", 3:4) %>%
  ggplot(aes(x = Year, y = MeanGross, colour = Ratings)) +
  geom_line() +
  facet_grid(~ Ratings)

Gross Profit histogram

# Distribution of gross profit across all movies (default binning)
ggplot(data = movies, mapping = aes(x = Gross_Profit)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Ratings histograms

# Distribution of the RotTom ratings (default binning)
ggplot(data = movies, mapping = aes(x = RotTom)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Distribution of the IMDB ratings (default binning)
ggplot(data = movies, mapping = aes(x = IMDB)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.