Install required packages

# Install tidyverse only when it is missing, then attach it.
# requireNamespace() tests availability without attaching anything, and the
# unconditional library() call stops with an error if the package still
# cannot be loaded (require() would only warn and return FALSE).
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse", dependencies = TRUE)
}
library("tidyverse")
## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.4.2     ✔ dplyr   0.7.4
## ✔ tidyr   0.7.2     ✔ stringr 1.2.0
## ✔ readr   1.1.1     ✔ forcats 0.2.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# Install corrplot only when it is missing, then attach it.
# requireNamespace() tests availability without attaching anything, and the
# unconditional library() call stops with an error if the package still
# cannot be loaded (require() would only warn and return FALSE).
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot", dependencies = TRUE)
}
library("corrplot")
## Loading required package: corrplot
## corrplot 0.84 loaded
# tidyverse is already checked, installed and attached at the top of this
# document, so a second install-and-load block is redundant. library() does
# nothing when the package is already attached and fails loudly otherwise.
library("tidyverse")

# Install plotly only when it is missing, then attach it.
# requireNamespace() tests availability without attaching anything, and the
# unconditional library() call stops with an error if the package still
# cannot be loaded (require() would only warn and return FALSE).
if (!requireNamespace("plotly", quietly = TRUE)) {
  install.packages("plotly", dependencies = TRUE)
}
library("plotly")
## Loading required package: plotly
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Read the Data and summarize

# Load the action-movie data and print per-column summary statistics.
mydata <- read.csv(file = "data/action.csv")
print(summary(mydata))
##                          title         budget           genre.1   
##  King Kong                  :  3   Min.   :7.00e+03   Action:905  
##  The Fast and the Furious   :  3   1st Qu.:2.80e+07               
##  Around the World in 80 Days:  2   Median :5.40e+07               
##  Clash of the Titans        :  2   Mean   :7.11e+07               
##  Dawn of the Dead           :  2   3rd Qu.:9.80e+07               
##  Dredd                      :  2   Max.   :6.00e+08               
##  (Other)                    :891                                  
##       genre.2        genre.3    actor1_fb_likes  actor2_fb_likes
##  Adventure:370   Drama   :156   Min.   :     0   Min.   :    0  
##  Crime    :179   Thriller:145   1st Qu.:   833   1st Qu.:  464  
##  Comedy   :126   Sci-Fi  :128   Median :  4000   Median :  794  
##  Drama    : 82   Crime   : 87   Mean   :  9316   Mean   : 2527  
##  Horror   : 29   Fantasy : 80   3rd Qu.: 14000   3rd Qu.: 1000  
##  Sci-Fi   : 29           : 76   Max.   :260000   Max.   :27000  
##  (Other)  : 90   (Other) :233                                   
##  actor3_fb_likes sum_actor_likes  total_cast_likes    fb_likes     
##  Min.   :    0   Min.   :     0   Min.   :     0   Min.   :     0  
##  1st Qu.:  244   1st Qu.:  1759   1st Qu.:  2403   1st Qu.:     0  
##  Median :  490   Median :  5899   Median :  6727   Median :   244  
##  Mean   :  928   Mean   : 12771   Mean   : 13969   Mean   : 12535  
##  3rd Qu.:  798   3rd Qu.: 18504   3rd Qu.: 19764   3rd Qu.: 15000  
##  Max.   :23000   Max.   :261875   Max.   :263584   Max.   :197000  
##                                                                    
##  critic_reviews  users_reviews    total_reviews     users_votes     
##  Min.   :  1.0   Min.   :   1.0   Min.   :   3.0   Min.   :     40  
##  1st Qu.: 94.0   1st Qu.: 141.0   1st Qu.: 264.0   1st Qu.:  34948  
##  Median :171.0   Median : 286.0   Median : 471.0   Median :  83097  
##  Mean   :202.2   Mean   : 463.5   Mean   : 665.7   Mean   : 141752  
##  3rd Qu.:275.0   3rd Qu.: 552.0   3rd Qu.: 846.0   3rd Qu.: 182899  
##  Max.   :813.0   Max.   :5060.0   Max.   :5357.0   Max.   :1676169  
##                                                                     
##      score        aspect_ratio       gross                year     
##  Min.   :2.100   Min.   :1.330   Min.   :      162   Min.   :1990  
##  1st Qu.:5.700   1st Qu.:2.350   1st Qu.: 18208078   1st Qu.:2000  
##  Median :6.300   Median :2.350   Median : 45434443   Median :2006  
##  Mean   :6.249   Mean   :2.229   Mean   : 76867713   Mean   :2005  
##  3rd Qu.:6.900   3rd Qu.:2.350   3rd Qu.:102981571   3rd Qu.:2011  
##  Max.   :9.000   Max.   :2.390   Max.   :760505847   Max.   :2016  
##                  NA's   :11

Max actor 1 Facebook likes is 260,000. Max actor 2 is 27,000. Max actor 3 Facebook likes is 23,000. Max score is 9/10; the minimum is 2.1/10. Total reviews has a max of 5357 with a minimum of 3. Total cast likes has a minimum of 0 and a max of 263,584; the mean is 13,969.

# Preview the first six rows of the data.
head(mydata, n = 6L)
##                 title  budget genre.1   genre.2  genre.3 actor1_fb_likes
## 1            Hardflip 1.0e+06  Action     Drama                   260000
## 2               Feast 3.2e+06  Action    Comedy   Horror          164000
## 3             Armored 2.7e+07  Action     Crime Thriller          137000
## 4             Hostage 5.2e+07  Action     Crime    Drama           87000
## 5            Poseidon 1.6e+08  Action Adventure    Drama           87000
## 6 Night at the Museum 1.1e+08  Action Adventure   Comedy           49000
##   actor2_fb_likes actor3_fb_likes sum_actor_likes total_cast_likes
## 1             984             891          261875           263584
## 2            2000             898          166898           170118
## 3             459             163          137622           137712
## 4           13000             759          100759           103354
## 5            2000             702           89702            92456
## 6            3000            1000           53000            55486
##   fb_likes critic_reviews users_reviews total_reviews users_votes score
## 1      706              2             5             7         606   5.6
## 2        0            130           252           382       25542   6.4
## 3        0            107           110           217       26236   5.7
## 4        0            152           288           440       93790   6.6
## 5        0            231           629           860       82380   5.6
## 6     3000            179           444           623      234480   6.4
##   aspect_ratio     gross year
## 1         1.85     96734 2012
## 2         2.35    690872 2005
## 3         2.35  15988876 2009
## 4         2.35  34636443 2005
## 5         2.35  60655503 2006
## 6         1.85 250863268 2006
## Initial Data Cleaning in R
# Strip commas from the two money columns and coerce them to numeric, then
# round-trip the data through CSV so it is re-read with readr's read_csv().
# NOTE(review): write_csv() overwrites the same file that read.csv() loaded
# earlier ("data/action.csv") - confirm that replacing the input in place is
# intended.
mydata[["budget"]] <- as.numeric(
  gsub(pattern = ",", replacement = "", x = mydata[["budget"]])
)
mydata[["gross"]] <- as.numeric(
  gsub(pattern = ",", replacement = "", x = mydata[["gross"]])
)
write_csv(mydata, "data/action.csv")
mydata <- read_csv("data/action.csv")
## Parsed with column specification:
## cols(
##   title = col_character(),
##   budget = col_double(),
##   genre.1 = col_character(),
##   genre.2 = col_character(),
##   genre.3 = col_character(),
##   actor1_fb_likes = col_integer(),
##   actor2_fb_likes = col_integer(),
##   actor3_fb_likes = col_integer(),
##   sum_actor_likes = col_integer(),
##   total_cast_likes = col_integer(),
##   fb_likes = col_integer(),
##   critic_reviews = col_integer(),
##   users_reviews = col_integer(),
##   total_reviews = col_integer(),
##   users_votes = col_integer(),
##   score = col_double(),
##   aspect_ratio = col_double(),
##   gross = col_double(),
##   year = col_integer()
## )
# Copy frequently used columns of mydata into stand-alone vectors.
# NOTE(review): `sum` shares its name with base::sum() and holds the same
# column as `actorlikes`; both names are kept because code outside this
# section may rely on either - confirm before removing one.
title <- mydata[["title"]]
budget <- mydata[["budget"]]
sum <- mydata[["sum_actor_likes"]]
total_reviews <- mydata[["total_reviews"]]
score <- mydata[["score"]]
gross <- mydata[["gross"]]
year <- mydata[["year"]]
actorlikes <- mydata[["sum_actor_likes"]]

Task 1: About this Project


1A) Brief description of your project.

My project is about using the Rotten Tomatoes data set to find out information about action movies.

1B) Resources - Packages/software use on the project (R, Tableau, Watson)

Excel - data cleaning: got rid of bad data, filled in gaps where I could, eliminated unnecessary columns, and deleted rows with multiple missing data points. Watson Analytics - initial analysis to help me see what kind of questions I could answer. RStudio - statistical analysis. Tableau - data visualization. Google Slides - putting together the final presentation.

1C) Data Description source, year, country.

The data has movies that originated from multiple countries but had a presence in the US. The data is from Rotten Tomatoes. Years 1990 and greater. Only action movies are included, as this is the focus of my project. There are 905 rows and 19 columns of data.

1D) Potential business cases (hypothesis)

DOES ROTTEN TOMATOES’ BUDGET, LIKES, AND SCORE DATA PREDICT AN ACTION MOVIE’S GROSS REVENUE? Does budget affect gross revenue? Do the score and the number of likes affect gross revenue?


Task #2: Data Collection - Rotten Tomatoes - Movie Data (Action)


Paragraph using descriptive statistics

There are 905 rows, 19 columns. Originally there were 20 genres and I consolidated them to one. The Watson quality is 55 percent. The data is from Rotten Tomatoes. It has data from 2016 to 1990 (after cleaning).


Task #3: Data Preparation - Cleaning and preparing the data for analysis


3A) Describe the steps to cleaning and prepare the data for analysis. Make an argument of why those steps are necessary.

Cleaning consisted of removing the special characters from the title data column. I did this using the Ablebits Excel extension software. Separation of the genre column was also an essential part of the cleaning. I simplified the genres from 6 (for some) to three in the final data set. I deleted any data from years earlier than 1990. Some data was missing from the set, so I had to fill it in by finding the data online. I had to add in things like budget for some lines of data. I also had to delete rows that had more than one blank in their data.


Task #4: Data Analysis - Descriptive Statistics, Correlations


4A) Basic descriptive statistics of the new clean dataset (write down any observations)

The numeric data includes budget, actor likes (1, 2, and 3), sum of total actor likes, total cast likes, FB likes, critic reviews, user reviews, total reviews, users votes, score, aspect ratio, and gross revenue.

Non-numeric data includes title, genres 1-3, and year.

4B) Using descriptive statistics explore dataset investigate min and max values

# Largest and smallest production budget in the data set.
budgetmax <- max(budget)
print(budgetmax)
## [1] 6e+08
budgetmin <- min(budget)
print(budgetmin)
## [1] 7000

Max is 600 million min is 7000. The movie with the 600 million dollar budget only grossed 1,602,466. The movie with a 7,000 production budget grossed 2,040,920.

# Largest and smallest gross revenue in the data set.
grossmax <- max(gross)
print(grossmax)
## [1] 760505847
grossmin <- min(gross)
print(grossmin)
## [1] 162

Gross max is about 760 million whereas the minimum is 162 dollars!

# Highest and lowest score in the data set.
scoremax <- max(score)
print(scoremax)
## [1] 9
scoremin <- min(score)
print(scoremin)
## [1] 2.1

The highest scoring movie is The Dark Knight! The Dark Knight is a Christopher Nolan film that came out in 2008.

4C) Create a correlation table ( only numeric data ). Note any significant values. Give a brief explanation/guess of why some variables are correlated

# Correlation matrix of the numeric measures.
# Columns are excluded by name instead of by position (formerly
# -c(1, 3, 4, 5, 17, 19)) so the selection survives a change in column order.
# Left out: the text columns, year, and aspect_ratio (which contains NA's).
data_corr <- cor(
  mydata[setdiff(
    names(mydata),
    c("title", "genre.1", "genre.2", "genre.3", "aspect_ratio", "year")
  )]
)
data_corr
##                     budget actor1_fb_likes actor2_fb_likes actor3_fb_likes
## budget           1.0000000       0.1189832       0.2779575       0.2635146
## actor1_fb_likes  0.1189832       1.0000000       0.3518770       0.2260792
## actor2_fb_likes  0.2779575       0.3518770       1.0000000       0.6558972
## actor3_fb_likes  0.2635146       0.2260792       0.6558972       1.0000000
## sum_actor_likes  0.2018233       0.9446933       0.6260907       0.4788459
## total_cast_likes 0.2212162       0.9209036       0.6514523       0.5401854
## fb_likes         0.3503768       0.1473195       0.3414708       0.4161650
## critic_reviews   0.5044912       0.1759502       0.3574420       0.3522662
## users_reviews    0.4204286       0.1153323       0.2729345       0.2914162
## total_reviews    0.4699324       0.1370190       0.3115330       0.3262868
## users_votes      0.4142651       0.1876055       0.3857886       0.4385345
## score            0.1911196       0.1408218       0.2373021       0.2048637
## gross            0.5583407       0.1444920       0.3542932       0.3718417
##                  sum_actor_likes total_cast_likes  fb_likes critic_reviews
## budget                 0.2018233        0.2212162 0.3503768      0.5044912
## actor1_fb_likes        0.9446933        0.9209036 0.1473195      0.1759502
## actor2_fb_likes        0.6260907        0.6514523 0.3414708      0.3574420
## actor3_fb_likes        0.4788459        0.5401854 0.4161650      0.3522662
## sum_actor_likes        1.0000000        0.9944666 0.2605864      0.2802152
## total_cast_likes       0.9944666        1.0000000 0.2897387      0.3036360
## fb_likes               0.2605864        0.2897387 1.0000000      0.7407261
## critic_reviews         0.2802152        0.3036360 0.7407261      1.0000000
## users_reviews          0.2010673        0.2210067 0.3168884      0.5825896
## total_reviews          0.2331485        0.2553266 0.4320092      0.7161413
## users_votes            0.3079165        0.3335460 0.4580604      0.6720448
## score                  0.2022292        0.2089786 0.2944280      0.4342300
## gross                  0.2558692        0.2901325 0.4208311      0.5809444
##                  users_reviews total_reviews users_votes     score
## budget               0.4204286     0.4699324   0.4142651 0.1911196
## actor1_fb_likes      0.1153323     0.1370190   0.1876055 0.1408218
## actor2_fb_likes      0.2729345     0.3115330   0.3857886 0.2373021
## actor3_fb_likes      0.2914162     0.3262868   0.4385345 0.2048637
## sum_actor_likes      0.2010673     0.2331485   0.3079165 0.2022292
## total_cast_likes     0.2210067     0.2553266   0.3335460 0.2089786
## fb_likes             0.3168884     0.4320092   0.4580604 0.2944280
## critic_reviews       0.5825896     0.7161413   0.6720448 0.4342300
## users_reviews        1.0000000     0.9844912   0.8144106 0.3765109
## total_reviews        0.9844912     1.0000000   0.8444268 0.4170528
## users_votes          0.8144106     0.8444268   1.0000000 0.5624382
## score                0.3765109     0.4170528   0.5624382 1.0000000
## gross                0.6586304     0.6909882   0.7250525 0.4074871
##                      gross
## budget           0.5583407
## actor1_fb_likes  0.1444920
## actor2_fb_likes  0.3542932
## actor3_fb_likes  0.3718417
## sum_actor_likes  0.2558692
## total_cast_likes 0.2901325
## fb_likes         0.4208311
## critic_reviews   0.5809444
## users_reviews    0.6586304
## total_reviews    0.6909882
## users_votes      0.7250525
## score            0.4074871
## gross            1.0000000

I took out a lot of variables for the correlation analysis. I took out non-numeric variables, and consolidated different columns into totals like total cast likes and sum of actor likes. The highest correlation with gross is users_votes (about 0.73). The lowest correlation in the table is between actor1_fb_likes and users_reviews (about 0.12). Gross and budget have a correlation of about 0.56. This means that there is some information that can be gathered from my analysis. Because there is some relationship, albeit not too strong, there is something to be discovered.

I did not discuss the relationship between total cast likes and sum actor likes because they are similar numbers. One is a number I calculated and one is a number I was provided. They have a high correlation because they are essentially the same thing. I created the sum.

Visual Analytics: Use Tableau or R to create plots


5A) Based on your hypothesis and observations create a predictive model

# Scatter plot of gross revenue against score. Built with ggplot() and
# geom_point() instead of qplot(), which is deprecated in current ggplot2.
scoregrossplot <- ggplot(mydata, aes(x = score, y = gross)) +
  geom_point()
# Print the scatter with a fitted straight line drawn on top.
scoregrossplot + geom_smooth(method = "lm", formula = y ~ x)

# summary() of a ggplot object lists its data, mapping and layers; it does
# not report a model fit.
summary(scoregrossplot)
## data: title, budget, genre.1, genre.2, genre.3, actor1_fb_likes,
##   actor2_fb_likes, actor3_fb_likes, sum_actor_likes,
##   total_cast_likes, fb_likes, critic_reviews, users_reviews,
##   total_reviews, users_votes, score, aspect_ratio, gross, year
##   [905x19]
## mapping:  x = score, y = gross
## faceting: <ggproto object: Class FacetNull, Facet>
##     compute_layout: function
##     draw_back: function
##     draw_front: function
##     draw_labels: function
##     draw_panels: function
##     finish_data: function
##     init_scales: function
##     map: function
##     map_data: function
##     params: list
##     render_back: function
##     render_front: function
##     render_panels: function
##     setup_data: function
##     setup_params: function
##     shrink: TRUE
##     train: function
##     train_positions: function
##     train_scales: function
##     vars: function
##     super:  <ggproto object: Class FacetNull, Facet>
## -----------------------------------
## geom_point: na.rm = FALSE
## stat_identity: na.rm = FALSE
## position_identity

The Gross and Budget relationship is mapped on Watson. Could not visualize well on R because outliers need to be removed.

# Multiple linear regression model
# Template:
# mlr1 <- lm(DEPENDENT_VARIABLE ~ INDEPENDENT_VARIABLE1 + INDEPENDENT_VARIABLE2,
#            data = DATA)

# Score modelled from gross revenue and budget. The columns are taken from
# mydata explicitly, matching the gross model fitted later with
# `data = mydata`, instead of relying on free-standing global vectors.
mlrm_gross <- lm(score ~ gross + budget, data = mydata)
mlrm_gross
## 
## Call:
## lm(formula = score ~ gross + budget)
## 
## Coefficients:
## (Intercept)        gross       budget  
##   5.933e+00    4.904e-09   -8.592e-10
# Coefficient table and fit statistics for the score model.
print(summary(mlrm_gross))
## 
## Call:
## lm(formula = score ~ gross + budget)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8625 -0.5065  0.0696  0.6046  2.4507 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.933e+00  4.753e-02 124.819   <2e-16 ***
## gross        4.904e-09  4.109e-10  11.937   <2e-16 ***
## budget      -8.592e-10  5.948e-10  -1.445    0.149    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9346 on 902 degrees of freedom
## Multiple R-squared:  0.168,  Adjusted R-squared:  0.1661 
## F-statistic: 91.05 on 2 and 902 DF,  p-value: < 2.2e-16

R squared is very low at .168

# Scatter plot of score against release year. Built with ggplot() and
# geom_point() instead of qplot(), which is deprecated in current ggplot2.
yearscoreplot <- ggplot(mydata, aes(x = year, y = score)) +
  geom_point()
yearscoreplot

I chose to visualize this because I wanted to show that the year the movie came out did not affect the score. The score given on Rotten Tomatoes is completely reliant on user preference and website use. This relationship shows there wasn't some larger trend like “movies were more successful in 1995”. From this visualization, you can see that the scores are scattered across the board.

But, according to the watson produced graph below, budget has increased by over 500% in the 26 years of data.

# Embed the saved budget-by-year chart image.
knitr::include_graphics(path = 'images/Budget by Year!.png')

# Scatter plot of total reviews against gross revenue. Built with ggplot()
# and geom_point() instead of qplot(), which is deprecated in current ggplot2.
grosstotalreviewsplot <- ggplot(mydata, aes(x = gross, y = total_reviews)) +
  geom_point()

# Print the scatter with a fitted straight line drawn on top.
grosstotalreviewsplot + geom_smooth(method = "lm", formula = y ~ x)

There seems to be a relationship here.

# Scatter plot of budget against gross revenue. Built with ggplot() and
# geom_point() instead of qplot(), which is deprecated in current ggplot2.
grossbudgetplot <- ggplot(mydata, aes(x = gross, y = budget)) +
  geom_point()

# Print the scatter with a fitted straight line drawn on top.
grossbudgetplot + geom_smooth(method = "lm", formula = y ~ x)

# Keep only movies whose summed actor likes lie strictly between 1759 and
# 18504 (the 1st and 3rd quartiles printed by summary(mydata)).
actor_data <- mydata[
  mydata$sum_actor_likes > 1759 & mydata$sum_actor_likes < 18504,
]

# Scatter plot of summed actor likes against gross revenue for that subset.
# Built with ggplot() and geom_point() instead of qplot(), which is
# deprecated in current ggplot2.
grossactorlikesplot <- ggplot(
  actor_data,
  aes(x = gross, y = sum_actor_likes)
) +
  geom_point()

# Print the scatter with a fitted straight line drawn on top.
grossactorlikesplot + geom_smooth(method = "lm", formula = y ~ x)

5B) Evaluate the efficiency of model. Note R Square and Adjusted R Square, be suspicious of very high R Squares.

# Multiple linear regression model
# Template:
# mlr1 <- lm(DEPENDENT_VARIABLE ~ INDEPENDENT_VARIABLE1 + INDEPENDENT_VARIABLE2)

# Gross revenue modelled from score and budget, followed by the coefficient
# table and fit statistics.
mlrm_score <- lm(formula = gross ~ score + budget, data = mydata)
print(summary(mlrm_score))
## 
## Call:
## lm(formula = gross ~ score + budget, data = mydata)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -456620608  -35939239   -9600399   22688444  517936044 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.483e+08  1.448e+07  -10.24   <2e-16 ***
## score        2.782e+07  2.330e+06   11.94   <2e-16 ***
## budget       7.219e-01  3.786e-02   19.07   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 70380000 on 902 degrees of freedom
## Multiple R-squared:  0.4056, Adjusted R-squared:  0.4043 
## F-statistic: 307.8 on 2 and 902 DF,  p-value: < 2.2e-16

R squared of about .4 when budget and score are used to predict gross!

# 3-D scatter of gross, budget and score.
# Template:
# p <- plot_ly(mydata, x = ~VARIABLE_1, y = ~VARIABLE_2, z = ~VARIABLE_3,
#              type = "scatter3d", mode = "markers", marker = list(size = 5))
# p
# The trace type and mode are stated explicitly so plotly does not have to
# guess them and print its "No trace type specified" messages.
p <- plot_ly(
  mydata,
  x = ~gross, y = ~budget, z = ~score,
  type = "scatter3d", mode = "markers",
  marker = list(size = 5)
)
p
## No trace type specified:
##   Based on info supplied, a 'scatter3d' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter3d
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode

This is a plot of gross against its most predictive variables: score and budget.