Install required packages
# Make sure tidyverse is available, then attach it.
# requireNamespace() only checks whether the package can be loaded, so the
# install step runs just when it is missing. library() then attaches it and
# stops with an error if the package is still unavailable, whereas require()
# would only return FALSE.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse", dependencies = TRUE)
}
library("tidyverse")
## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.4.2 ✔ dplyr 0.7.4
## ✔ tidyr 0.7.2 ✔ stringr 1.2.0
## ✔ readr 1.1.1 ✔ forcats 0.2.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Make sure corrplot is available, then attach it.
# requireNamespace() only checks whether the package can be loaded, so the
# install step runs just when it is missing. library() then attaches it and
# stops with an error if the package is still unavailable, whereas require()
# would only return FALSE.
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot", dependencies = TRUE)
}
library("corrplot")
## Loading required package: corrplot
## corrplot 0.84 loaded
# Make sure tidyverse is available, then attach it.
# NOTE(review): tidyverse is already installed/attached by the first block of
# this script, so this repeat does nothing new when that block has run.
# requireNamespace() only checks availability; library() attaches the package
# and stops with an error if it is still unavailable.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse", dependencies = TRUE)
}
library("tidyverse")
# Make sure plotly is available, then attach it.
# requireNamespace() only checks whether the package can be loaded, so the
# install step runs just when it is missing. library() then attaches it and
# stops with an error if the package is still unavailable, whereas require()
# would only return FALSE.
if (!requireNamespace("plotly", quietly = TRUE)) {
  install.packages("plotly", dependencies = TRUE)
}
library("plotly")
## Loading required package: plotly
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Load the data set from data/action.csv and print a per-column summary.
# stringsAsFactors is set explicitly because its default changed from TRUE to
# FALSE in R 4.0.0; the per-level counts shown for title and the genre columns
# in the summary only appear when text columns are read as factors.
mydata <- read.csv("data/action.csv", stringsAsFactors = TRUE)
summary(mydata)
## title budget genre.1
## King Kong : 3 Min. :7.00e+03 Action:905
## The Fast and the Furious : 3 1st Qu.:2.80e+07
## Around the World in 80 Days: 2 Median :5.40e+07
## Clash of the Titans : 2 Mean :7.11e+07
## Dawn of the Dead : 2 3rd Qu.:9.80e+07
## Dredd : 2 Max. :6.00e+08
## (Other) :891
## genre.2 genre.3 actor1_fb_likes actor2_fb_likes
## Adventure:370 Drama :156 Min. : 0 Min. : 0
## Crime :179 Thriller:145 1st Qu.: 833 1st Qu.: 464
## Comedy :126 Sci-Fi :128 Median : 4000 Median : 794
## Drama : 82 Crime : 87 Mean : 9316 Mean : 2527
## Horror : 29 Fantasy : 80 3rd Qu.: 14000 3rd Qu.: 1000
## Sci-Fi : 29 : 76 Max. :260000 Max. :27000
## (Other) : 90 (Other) :233
## actor3_fb_likes sum_actor_likes total_cast_likes fb_likes
## Min. : 0 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 244 1st Qu.: 1759 1st Qu.: 2403 1st Qu.: 0
## Median : 490 Median : 5899 Median : 6727 Median : 244
## Mean : 928 Mean : 12771 Mean : 13969 Mean : 12535
## 3rd Qu.: 798 3rd Qu.: 18504 3rd Qu.: 19764 3rd Qu.: 15000
## Max. :23000 Max. :261875 Max. :263584 Max. :197000
##
## critic_reviews users_reviews total_reviews users_votes
## Min. : 1.0 Min. : 1.0 Min. : 3.0 Min. : 40
## 1st Qu.: 94.0 1st Qu.: 141.0 1st Qu.: 264.0 1st Qu.: 34948
## Median :171.0 Median : 286.0 Median : 471.0 Median : 83097
## Mean :202.2 Mean : 463.5 Mean : 665.7 Mean : 141752
## 3rd Qu.:275.0 3rd Qu.: 552.0 3rd Qu.: 846.0 3rd Qu.: 182899
## Max. :813.0 Max. :5060.0 Max. :5357.0 Max. :1676169
##
## score aspect_ratio gross year
## Min. :2.100 Min. :1.330 Min. : 162 Min. :1990
## 1st Qu.:5.700 1st Qu.:2.350 1st Qu.: 18208078 1st Qu.:2000
## Median :6.300 Median :2.350 Median : 45434443 Median :2006
## Mean :6.249 Mean :2.229 Mean : 76867713 Mean :2005
## 3rd Qu.:6.900 3rd Qu.:2.350 3rd Qu.:102981571 3rd Qu.:2011
## Max. :9.000 Max. :2.390 Max. :760505847 Max. :2016
## NA's :11
The maximum number of Facebook likes is 260,000 for actor 1, 27,000 for actor 2, and 23,000 for actor 3. The maximum score is 9/10 and the minimum is 2.1/10. Total reviews range from a minimum of 3 to a maximum of 5,357. Total cast likes range from 0 to 263,584, with a mean of 13,969.
# Preview the first six rows of the data.
head(mydata, n = 6L)
## title budget genre.1 genre.2 genre.3 actor1_fb_likes
## 1 Hardflip 1.0e+06 Action Drama 260000
## 2 Feast 3.2e+06 Action Comedy Horror 164000
## 3 Armored 2.7e+07 Action Crime Thriller 137000
## 4 Hostage 5.2e+07 Action Crime Drama 87000
## 5 Poseidon 1.6e+08 Action Adventure Drama 87000
## 6 Night at the Museum 1.1e+08 Action Adventure Comedy 49000
## actor2_fb_likes actor3_fb_likes sum_actor_likes total_cast_likes
## 1 984 891 261875 263584
## 2 2000 898 166898 170118
## 3 459 163 137622 137712
## 4 13000 759 100759 103354
## 5 2000 702 89702 92456
## 6 3000 1000 53000 55486
## fb_likes critic_reviews users_reviews total_reviews users_votes score
## 1 706 2 5 7 606 5.6
## 2 0 130 252 382 25542 6.4
## 3 0 107 110 217 26236 5.7
## 4 0 152 288 440 93790 6.6
## 5 0 231 629 860 82380 5.6
## 6 3000 179 444 623 234480 6.4
## aspect_ratio gross year
## 1 1.85 96734 2012
## 2 2.35 690872 2005
## 3 2.35 15988876 2009
## 4 2.35 34636443 2005
## 5 2.35 60655503 2006
## 6 1.85 250863268 2006
## Initial Data Cleaning in R
# Strip commas from the two money columns and store them as numbers.
mydata$budget <- as.numeric(gsub(",", "", mydata$budget))
mydata$gross <- as.numeric(gsub(",", "", mydata$gross))
# Round-trip the cleaned data through readr so mydata comes back as parsed by
# read_csv().
# NOTE(review): this writes over "data/action.csv", the same path the data was
# originally read from - confirm that replacing the input file is intended.
write_csv(mydata, "data/action.csv")
mydata <- read_csv("data/action.csv")
## Parsed with column specification:
## cols(
## title = col_character(),
## budget = col_double(),
## genre.1 = col_character(),
## genre.2 = col_character(),
## genre.3 = col_character(),
## actor1_fb_likes = col_integer(),
## actor2_fb_likes = col_integer(),
## actor3_fb_likes = col_integer(),
## sum_actor_likes = col_integer(),
## total_cast_likes = col_integer(),
## fb_likes = col_integer(),
## critic_reviews = col_integer(),
## users_reviews = col_integer(),
## total_reviews = col_integer(),
## users_votes = col_integer(),
## score = col_double(),
## aspect_ratio = col_double(),
## gross = col_double(),
## year = col_integer()
## )
# Copy selected columns of mydata into standalone vectors.
title <- mydata[["title"]]
year <- mydata[["year"]]
budget <- mydata[["budget"]]
gross <- mydata[["gross"]]
score <- mydata[["score"]]
total_reviews <- mydata[["total_reviews"]]
# sum and actorlikes both hold the sum_actor_likes column.
# NOTE(review): a variable called sum shares its name with base R's sum().
sum <- mydata[["sum_actor_likes"]]
actorlikes <- mydata[["sum_actor_likes"]]
My project is about using the Rotten Tomatoes data set to find out information about action movies.
Excel - data cleaning: got rid of bad data, filled in gaps where I could, eliminated unnecessary columns, and deleted rows with multiple missing data points. Watson Analytics - initial analysis to help me see what kind of questions I could answer. RStudio - statistical analysis. Tableau - data visualization. Google Slides - putting the final presentation together.
The data covers movies that originated from multiple countries but had a presence in the US. The data is from Rotten Tomatoes and covers the years 1990 and later. Only action movies are included, as this is the focus of my project. There are 905 rows and 19 columns of data.
DO ROTTEN TOMATOES’ BUDGET, LIKES, AND SCORE DATA PREDICT AN ACTION MOVIE’S GROSS REVENUE? Does budget affect gross revenue? Do the score and the number of likes affect gross revenue?
There are 905 rows and 19 columns. Originally there were 20 genres and I consolidated them to one. The Watson quality is 55 percent. The data is from Rotten Tomatoes. It covers the years 1990 to 2016 (after cleaning).
Cleaning consisted of removing the special characters from the title column. I did this using the Ablebits Excel extension. Separating the genre column was also an essential part of the cleaning. I simplified the genres from six (for some movies) to three in the final data set. I deleted any data from years earlier than 1990. Some data was missing from the set, so I had to fill it in by finding the values online; for example, I had to add the budget for some rows. I also had to delete rows that had more than one blank value.
The numeric data includes budget, actor likes (1, 2, and 3), sum of total actor likes, total cast likes, FB likes, critic reviews, user reviews, total reviews, users votes, score, aspect ratio, and gross revenue.
Non-numeric data includes title, genres 1-3, and year.
# Largest budget; the outer parentheses print the assigned value.
(budgetmax <- max(budget))
## [1] 6e+08
# Smallest budget; the outer parentheses print the assigned value.
(budgetmin <- min(budget))
## [1] 7000
The maximum budget is 600 million and the minimum is 7,000. The movie with the 600 million dollar budget only grossed 1,602,466. The movie with a 7,000 production budget grossed 2,040,920.
# Largest gross revenue; the outer parentheses print the assigned value.
(grossmax <- max(gross))
## [1] 760505847
# Smallest gross revenue; the outer parentheses print the assigned value.
(grossmin <- min(gross))
## [1] 162
Gross max is about 760 million whereas the minimum is 162 dollars!
# Highest score; the outer parentheses print the assigned value.
(scoremax <- max(score))
## [1] 9
# Lowest score; the outer parentheses print the assigned value.
(scoremin <- min(score))
## [1] 2.1
The highest scoring movie is The Dark Knight! The Dark Knight is a Christopher Nolan film that came out in 2008.
# Scatter plot of score (x) against gross (y).
# Built with ggplot() + geom_point() instead of qplot(), which is deprecated
# in ggplot2 3.4.0 and later.
scoregrossplot <- ggplot(mydata, aes(x = score, y = gross)) +
  geom_point()
# Show the points with a fitted straight line (linear model y ~ x) on top.
scoregrossplot + geom_smooth(method = "lm", formula = y ~ x)
# Print ggplot2's description of the stored plot object. The smoothing layer
# was only added for display and is not part of scoregrossplot, so only the
# point layer is listed.
summary(scoregrossplot)
## data: title, budget, genre.1, genre.2, genre.3, actor1_fb_likes,
## actor2_fb_likes, actor3_fb_likes, sum_actor_likes,
## total_cast_likes, fb_likes, critic_reviews, users_reviews,
## total_reviews, users_votes, score, aspect_ratio, gross, year
## [905x19]
## mapping: x = score, y = gross
## faceting: <ggproto object: Class FacetNull, Facet>
## compute_layout: function
## draw_back: function
## draw_front: function
## draw_labels: function
## draw_panels: function
## finish_data: function
## init_scales: function
## map: function
## map_data: function
## params: list
## render_back: function
## render_front: function
## render_panels: function
## setup_data: function
## setup_params: function
## shrink: TRUE
## train: function
## train_positions: function
## train_scales: function
## vars: function
## super: <ggproto object: Class FacetNull, Facet>
## -----------------------------------
## geom_point: na.rm = FALSE
## stat_identity: na.rm = FALSE
## position_identity
The gross and budget relationship is mapped in Watson. It could not be visualized well in R because outliers need to be removed.
# Multiple linear regression model
# Template:
# mlr1 <- lm(DEPENDENT_VARIABLE ~ INDEPENDENT_VARIABLE1 + INDEPENDENT_VARIABLE2)
# Here score is the response and gross and budget are the predictors.
# data = mydata is passed so the variables are taken from the data frame
# rather than from free-standing vectors that happen to share their names.
# NOTE(review): despite its name, mlrm_gross models score, not gross.
mlrm_gross <- lm(score ~ gross + budget, data = mydata)
mlrm_gross
##
## Call:
## lm(formula = score ~ gross + budget)
##
## Coefficients:
## (Intercept) gross budget
## 5.933e+00 4.904e-09 -8.592e-10
# Residual summary, coefficient table and fit statistics for mlrm_gross.
summary(object = mlrm_gross)
##
## Call:
## lm(formula = score ~ gross + budget)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8625 -0.5065 0.0696 0.6046 2.4507
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.933e+00 4.753e-02 124.819 <2e-16 ***
## gross 4.904e-09 4.109e-10 11.937 <2e-16 ***
## budget -8.592e-10 5.948e-10 -1.445 0.149
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9346 on 902 degrees of freedom
## Multiple R-squared: 0.168, Adjusted R-squared: 0.1661
## F-statistic: 91.05 on 2 and 902 DF, p-value: < 2.2e-16
R squared is very low at .168
# Scatter plot of year (x) against score (y).
# Built with ggplot() + geom_point() instead of qplot(), which is deprecated
# in ggplot2 3.4.0 and later.
yearscoreplot <- ggplot(mydata, aes(x = year, y = score)) +
  geom_point()
yearscoreplot
I chose to visualize this because I wanted to show that the year the movie came out did not affect the score. The score given on Rotten Tomatoes is completely reliant on user preference and website use. This relationship shows there wasn't some larger trend like “movies were more successful in 1995”. From this visualization, you can see that the scores are scattered across the board.
But, according to the watson produced graph below, budget has increased by over 500% in the 26 years of data.
# Embed the Watson-produced "Budget by Year" image in the rendered document.
knitr::include_graphics(path = "images/Budget by Year!.png")
# Scatter plot of gross (x) against total_reviews (y).
# Built with ggplot() + geom_point() instead of qplot(), which is deprecated
# in ggplot2 3.4.0 and later.
grosstotalreviewsplot <- ggplot(mydata, aes(x = gross, y = total_reviews)) +
  geom_point()
# Show the points with a fitted straight line (linear model y ~ x) on top.
grosstotalreviewsplot + geom_smooth(method = "lm", formula = y ~ x)
There seems to be a relationship here.
# Scatter plot of gross (x) against budget (y).
# Built with ggplot() + geom_point() instead of qplot(), which is deprecated
# in ggplot2 3.4.0 and later.
grossbudgetplot <- ggplot(mydata, aes(x = gross, y = budget)) +
  geom_point()
# Show the points with a fitted straight line (linear model y ~ x) on top.
grossbudgetplot + geom_smooth(method = "lm", formula = y ~ x)
# Keep only the rows whose sum_actor_likes lies strictly between 1759 and
# 18504. These thresholds match the "1st Qu." and "3rd Qu." values shown for
# sum_actor_likes in the summary(mydata) output.
actor_data <- mydata[
  mydata$sum_actor_likes > 1759 & mydata$sum_actor_likes < 18504,
]
# Scatter plot of gross (x) against sum_actor_likes (y) for that subset.
# Built with ggplot() + geom_point() instead of qplot(), which is deprecated
# in ggplot2 3.4.0 and later.
grossactorlikesplot <- ggplot(actor_data, aes(x = gross, y = sum_actor_likes)) +
  geom_point()
# Show the points with a fitted straight line (linear model y ~ x) on top.
grossactorlikesplot + geom_smooth(method = "lm", formula = y ~ x)
# Multiple linear regression model
# Template:
# mlr1 <- lm(DEPENDENT_VARIABLE ~ INDEPENDENT_VARIABLE1 + INDEPENDENT_VARIABLE2)
# Here gross is the response and score and budget are the predictors.
# NOTE(review): despite its name, mlrm_score models gross, not score.
mlrm_score <- lm(formula = gross ~ score + budget, data = mydata)
# Residual summary, coefficient table and fit statistics for the model.
summary(object = mlrm_score)
##
## Call:
## lm(formula = gross ~ score + budget, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -456620608 -35939239 -9600399 22688444 517936044
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.483e+08 1.448e+07 -10.24 <2e-16 ***
## score 2.782e+07 2.330e+06 11.94 <2e-16 ***
## budget 7.219e-01 3.786e-02 19.07 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 70380000 on 902 degrees of freedom
## Multiple R-squared: 0.4056, Adjusted R-squared: 0.4043
## F-statistic: 307.8 on 2 and 902 DF, p-value: < 2.2e-16
R squared of .4! This is for budget and score predicting gross.
# 3D scatter plot of gross (x), budget (y) and score (z).
# Template:
# p <- plot_ly(mydata, x = ~VARIABLE_1, y = ~VARIABLE_2, z = ~VARIABLE_3,
#              marker = list(size = 5)) %>%
#   add_markers()
# p
# The trace type and mode are given explicitly so plotly does not have to
# infer them and print its "No trace type specified" / "No scatter3d mode
# specifed" messages.
p <- plot_ly(
  mydata,
  x = ~gross, y = ~budget, z = ~score,
  type = "scatter3d", mode = "markers",
  marker = list(size = 5)
)
p
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plot.ly/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
This is a plot of gross against its most predictive variables: score and budget.