# Load libraries
library(knitr)
library(stringr)
library(ggplot2)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Download the Blockbuster data set into a data frame
movies <- read.csv(
  file = "https://raw.githubusercontent.com/Galanopoulog/DATA606-Project-CSV/master/Blockbuster.csv",
  sep = ","
)
# Drop the columns that are not needed (by position), then discard every row
# that still contains an NA
movies <- movies[, -c(1:4, 6, 8, 9:11, 13, 15:16, 19)]
movies <- na.omit(movies)
names(movies)
## [1] "rt_score" "adjusted" "imdb_rating" "rank_in_year"
## [5] "studio" "title" "year"
# Show the first six rows of the cleaned data
head(movies, n = 6L)
## rt_score adjusted imdb_rating rank_in_year
## 1 7.5 $712,903,691.09 7.8 7
## 2 7.9 $706,988,165.89 7.7 9
## 3 7.7 $772,158,880.00 8.1 3
## 4 7.0 $671,220,455.10 8.7 10
## 5 5.7 $756,677,675.77 7.1 4
## 6 5.9 $707,732,953.69 6.9 8
## studio title
## 1 Marvel Studios Captain America: The Winter Soldier
## 2 20th Century Fox Dawn of the Planet of the Apes
## 3 Marvel Studios Guardians of the Galaxy
## 4 Paramount Pictures / Warner Bros. Interstellar
## 5 Walt Disney Pictures Maleficent
## 6 Columbia Pictures The Amazing Spider-Man 2
## year
## 1 2014
## 2 2014
## 3 2014
## 4 2014
## 5 2014
## 6 2014
# Put the columns in presentation order and give them readable names
movies <- setNames(
  movies[, c(6, 5, 7, 4, 2, 1, 3)],
  c("Title", "Studio", "Year", "Year_Rank", "Gross_Profit", "RotTom", "IMDB")
)
movies %>%
  head() %>%
  kable()
| Title | Studio | Year | Year_Rank | Gross_Profit | RotTom | IMDB |
|---|---|---|---|---|---|---|
| Captain America: The Winter Soldier | Marvel Studios | 2014 | 7 | $712,903,691.09 | 7.5 | 7.8 |
| Dawn of the Planet of the Apes | 20th Century Fox | 2014 | 9 | $706,988,165.89 | 7.9 | 7.7 |
| Guardians of the Galaxy | Marvel Studios | 2014 | 3 | $772,158,880.00 | 7.7 | 8.1 |
| Interstellar | Paramount Pictures / Warner Bros. | 2014 | 10 | $671,220,455.10 | 7.0 | 8.7 |
| Maleficent | Walt Disney Pictures | 2014 | 4 | $756,677,675.77 | 5.7 | 7.1 |
| The Amazing Spider-Man 2 | Columbia Pictures | 2014 | 8 | $707,732,953.69 | 5.9 | 6.9 |
# Convert Gross_Profit from a currency string (e.g. "$712,903,691.09") to a
# number. The "$" and "," characters are stripped by pattern instead of by
# character position, so the conversion does not depend on "$" being the first
# character or on the value fitting within 30 characters.
movies$Gross_Profit <- as.numeric(
  gsub("[$,]", "", as.character(movies$Gross_Profit))
)
kable(head(movies))
| Title | Studio | Year | Year_Rank | Gross_Profit | RotTom | IMDB |
|---|---|---|---|---|---|---|
| Captain America: The Winter Soldier | Marvel Studios | 2014 | 7 | 712903691 | 7.5 | 7.8 |
| Dawn of the Planet of the Apes | 20th Century Fox | 2014 | 9 | 706988166 | 7.9 | 7.7 |
| Guardians of the Galaxy | Marvel Studios | 2014 | 3 | 772158880 | 7.7 | 8.1 |
| Interstellar | Paramount Pictures / Warner Bros. | 2014 | 10 | 671220455 | 7.0 | 8.7 |
| Maleficent | Walt Disney Pictures | 2014 | 4 | 756677676 | 5.7 | 7.1 |
| The Amazing Spider-Man 2 | Columbia Pictures | 2014 | 8 | 707732954 | 5.9 | 6.9 |
You should phrase your research question in a way that matches up with the scope of inference your dataset allows for.
Is the gross profit predictive of the rank of a movie? Does the profit have an effect on the rating?
What are the cases, and how many are there?
# Number of cases (rows) and number of variables (columns)
c(nrow(movies), ncol(movies))
## [1] 398 7
There are 398 cases: each row is a movie, and each movie is a case.
Describe the method of data collection.
Data is collected by CrowdFlower, a crowdsourcing company that has an open data library (Data for Everyone library found here: “https://www.crowdflower.com/data-for-everyone/”).
What type of study is this (observational/experiment)?
It is an observational study.
A citation/link and minor data description.
The Blockbuster database can be found and downloaded in csv format from “https://www.crowdflower.com/data-for-everyone/”. It includes data from the ten most popular movies of each year from 1975 to 2015. Categories within include: Movie titles, Poster URLs, Genre, Run time, Box Office Receipts and ratings (from MPAA, IMDB and Rotten Tomatoes). The raw data from the csv can be found here: “https://raw.githubusercontent.com/Galanopoulog/DATA606-Project-CSV/master/Blockbuster.csv”
What is the response variable, and what type is it (numerical/categorical)?
The response variables are rank (categorical ordinal) and rating (numerical ordinal).
What is the explanatory variable, and what type is it (numerical/categorical)?
The explanatory variable is gross profit and is numerical.
Summary statistics relevant to the research question.
# Per-column summary of the cleaned data
summary(object = movies)
## Title Studio Year
## King Kong : 2 Warner Bros. : 45 Min. :1975
## Signs : 2 Paramount Pictures : 30 1st Qu.:1985
## "Crocodile" Dundee : 1 Universal Pictures : 22 Median :1995
## "Crocodile" Dundee II: 1 20th Century Fox : 20 Mean :1995
## 10 : 1 Columbia Pictures : 19 3rd Qu.:2005
## 101 Dalmatians : 1 Walt Disney Pictures: 14 Max. :2014
## (Other) :390 (Other) :248
## Year_Rank Gross_Profit RotTom IMDB
## Min. : 1.00 Min. :1.100e+08 Min. :0.000 Min. :4.400
## 1st Qu.: 3.00 1st Qu.:2.936e+08 1st Qu.:5.700 1st Qu.:6.500
## Median : 6.00 Median :4.880e+08 Median :6.600 Median :7.000
## Mean : 5.53 Mean :5.428e+08 Mean :6.503 Mean :7.053
## 3rd Qu.: 8.00 3rd Qu.:6.986e+08 3rd Qu.:7.500 3rd Qu.:7.675
## Max. :10.00 Max. :3.026e+09 Max. :9.100 Max. :9.000
##
Mean Ratings and Profits by Year
# Per-year means of gross and of both ratings (each rounded to two decimals),
# plus the number of movies counted in that year
movstats <- summarise(
  group_by(movies, Year),
  MeanGross = round(mean(Gross_Profit), digits = 2),
  MeanRott = round(mean(RotTom), digits = 2),
  MeanIMBD = round(mean(IMDB), digits = 2),
  Counts = n()
)
movstats %>%
  head() %>%
  kable()
| Year | MeanGross | MeanRott | MeanIMBD | Counts |
|---|---|---|---|---|
| 1975 | 179796017 | 4.60 | 6.64 | 7 |
| 1976 | 275627492 | 5.32 | 6.80 | 10 |
| 1977 | 382176754 | 7.07 | 7.26 | 10 |
| 1978 | 323918184 | 6.83 | 7.00 | 10 |
| 1979 | 262983710 | 6.86 | 7.16 | 10 |
| 1980 | 254279292 | 6.37 | 6.83 | 10 |
Side by side plots of Mean Gross Profits by Year for different Rating sites
# Reshape the two mean-rating columns into long form (one row per year and
# rating site), then draw mean gross by year as a line, one facet per site.
# `colour` is mapped inside aes() so it actually takes effect; geom_line() has
# no `fill` aesthetic and needs no dodging, so both were dropped.
# NOTE(review): the gathered value column `n` is never plotted, so both panels
# show the same MeanGross series -- confirm whether y was meant to be `n`.
movstats %>%
  gather(key = "Ratings", value = "n", MeanRott, MeanIMBD) %>%
  ggplot(aes(x = Year, y = MeanGross, colour = Ratings)) +
  geom_line() +
  facet_grid(~ Ratings)
Gross Profit histogram
# Distribution of gross profit. The bin count is stated explicitly (30, the
# value stat_bin() would otherwise pick by default) so the binning is a visible
# choice and no "Pick better value" message is emitted.
ggplot(movies, aes(x = Gross_Profit)) +
  geom_histogram(bins = 30)
Ratings histograms
# Distribution of each rating. The bin count is stated explicitly (30, the
# value stat_bin() would otherwise pick by default) so the binning is a visible
# choice and no "Pick better value" message is emitted.
ggplot(movies, aes(x = RotTom)) +
  geom_histogram(bins = 30)
ggplot(movies, aes(x = IMDB)) +
  geom_histogram(bins = 30)