# IMDb Data ----

# Copy every column of `mydata` into a free-standing vector so later code can
# refer to `gross`, `budget`, `score`, ... without the `mydata$` prefix.
# NOTE(review): the vector `length` shares its name with base::length().
title    <- mydata$title
genre    <- mydata$genre
director <- mydata$director
actor.1  <- mydata$actor1
actor.2  <- mydata$actor2
length   <- mydata$length
budget   <- mydata$budget
score    <- mydata$score
gross    <- mydata$gross
year     <- mydata$year

# Column-by-column overview of the whole data set.
summary(mydata)
##     title              genre             director        
##  Length:191         Length:191         Length:191        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##     actor1             actor2              length          budget         
##  Length:191         Length:191         Min.   : 72.0   Min.   :   100000  
##  Class :character   Class :character   1st Qu.: 99.0   1st Qu.: 12310000  
##  Mode  :character   Mode  :character   Median :110.0   Median : 30000000  
##                                        Mean   :112.2   Mean   : 54944718  
##                                        3rd Qu.:122.0   3rd Qu.: 75000000  
##                                        Max.   :187.0   Max.   :250000000  
##      score           gross                year     
##  Min.   :3.400   Min.   :     3330   Min.   :2015  
##  1st Qu.:5.950   1st Qu.: 12232726   1st Qu.:2015  
##  Median :6.600   Median : 38317535   Median :2015  
##  Mean   :6.476   Mean   : 70622179   Mean   :2015  
##  3rd Qu.:7.200   3rd Qu.: 81448087   3rd Qu.:2016  
##  Max.   :8.400   Max.   :652177271   Max.   :2016

# Gross ----

# Descriptive statistics for `gross`: moments, a 3-sd band, quartiles and an
# outlier screen. Lines starting with `##` are the recorded console output.
mean.gross <- mean(gross)
mean.gross
## [1] 70622179
sd.gross <- sd(gross)
sd.gross
## [1] 95640638
max.gross <- max(gross)
max.gross
## [1] 652177271
min.gross <- min(gross)
min.gross
## [1] 3330
# Ratio of mean to standard deviation.
snr.gross <- mean.gross / sd.gross
snr.gross
## [1] 0.7384118
var.gross <- var(gross)
var.gross
## [1] 9.147132e+15
# Band of three standard deviations either side of the mean.
lower.gross <- mean.gross - 3 * sd.gross
upper.gross <- mean.gross + 3 * sd.gross
lower.gross
## [1] -216299735
upper.gross
## [1] 357544094
quantile(gross)
##        0%       25%       50%       75%      100% 
##      3330  12232726  38317535  81448087 652177271
# names = FALSE stops the "25%"/"75%" labels from leaking into derived values;
# with named quartiles the IQR and both thresholds all printed under "75%".
lowerq.gross <- quantile(gross, probs = 0.25, names = FALSE)
upperq.gross <- quantile(gross, probs = 0.75, names = FALSE)
IQR.gross <- upperq.gross - lowerq.gross
IQR.gross
## [1] 69215361
# NOTE(review): the thresholds add 1.5 * IQR to the 3-sd bounds
# (upper.gross / lower.gross), not to the quartiles (upperq.gross /
# lowerq.gross) as a Tukey fence would -- confirm this is intended.
upperthreshold.gross <- upper.gross + 1.5 * IQR.gross
upperthreshold.gross
## [1] 461367135
lowerthreshold.gross <- lower.gross - 1.5 * IQR.gross
lowerthreshold.gross
## [1] -320122777
# Observations above the upper threshold.
gross[gross > upperthreshold.gross]
## [1] 652177271

# Budget ----

# Descriptive statistics for `budget`: moments, a 3-sd band, quartiles and an
# outlier screen. Lines starting with `##` are the recorded console output.
mean.budget <- mean(budget)
mean.budget
## [1] 54944718
sd.budget <- sd(budget)
sd.budget
## [1] 58986872
max.budget <- max(budget)
max.budget
## [1] 2.5e+08
min.budget <- min(budget)
min.budget
## [1] 1e+05
# Ratio of mean to standard deviation.
snr.budget <- mean.budget / sd.budget
snr.budget
## [1] 0.9314737
var.budget <- var(budget)
var.budget
## [1] 3.479451e+15
# Band of three standard deviations either side of the mean.
lower.budget <- mean.budget - 3 * sd.budget
upper.budget <- mean.budget + 3 * sd.budget
lower.budget
## [1] -122015897
upper.budget
## [1] 231905333
quantile(budget)
##        0%       25%       50%       75%      100% 
##    100000  12310000  30000000  75000000 250000000
# names = FALSE stops the "25%"/"75%" labels from leaking into derived values;
# with named quartiles the IQR and both thresholds all printed under "75%".
lowerq.budget <- quantile(budget, probs = 0.25, names = FALSE)
upperq.budget <- quantile(budget, probs = 0.75, names = FALSE)
IQR.budget <- upperq.budget - lowerq.budget
IQR.budget
## [1] 62690000
# NOTE(review): the thresholds add 1.5 * IQR to the 3-sd bounds
# (upper.budget / lower.budget), not to the quartiles (upperq.budget /
# lowerq.budget) as a Tukey fence would -- confirm this is intended.
upperthreshold.budget <- upper.budget + 1.5 * IQR.budget
upperthreshold.budget
## [1] 325940333
lowerthreshold.budget <- lower.budget - 1.5 * IQR.budget
lowerthreshold.budget
## [1] -216050897
# Observations above the upper threshold.
budget[budget > upperthreshold.budget]
## numeric(0)

# Score ----

# Descriptive statistics for `score`: moments, a 3-sd band, quartiles and an
# outlier screen. Lines starting with `##` are the recorded console output.
mean.score <- mean(score)
mean.score
## [1] 6.475916
sd.score <- sd(score)
sd.score
## [1] 0.9818597
max.score <- max(score)
max.score
## [1] 8.4
min.score <- min(score)
min.score
## [1] 3.4
# Ratio of mean to standard deviation.
snr.score <- mean.score / sd.score
snr.score
## [1] 6.595562
var.score <- var(score)
var.score
## [1] 0.9640485
# Band of three standard deviations either side of the mean.
lower.score <- mean.score - 3 * sd.score
upper.score <- mean.score + 3 * sd.score
lower.score
## [1] 3.530337
upper.score
## [1] 9.421495
quantile(score)
##   0%  25%  50%  75% 100% 
## 3.40 5.95 6.60 7.20 8.40
# names = FALSE stops the "25%"/"75%" labels from leaking into derived values;
# with named quartiles the IQR and both thresholds all printed under "75%".
lowerq.score <- quantile(score, probs = 0.25, names = FALSE)
upperq.score <- quantile(score, probs = 0.75, names = FALSE)
IQR.score <- upperq.score - lowerq.score
IQR.score
## [1] 1.25
# NOTE(review): the thresholds add 1.5 * IQR to the 3-sd bounds
# (upper.score / lower.score), not to the quartiles (upperq.score /
# lowerq.score) as a Tukey fence would -- confirm this is intended.
upperthreshold.score <- upper.score + 1.5 * IQR.score
upperthreshold.score
## [1] 11.2965
lowerthreshold.score <- lower.score - 1.5 * IQR.score
lowerthreshold.score
## [1] 1.655337
# Observations above the upper threshold.
score[score > upperthreshold.score]
## numeric(0)

# Tables ----

# Preview the first six rows of the data set.
head(mydata, n = 6L)
## # A tibble: 6 × 10
##                        title                      genre        director
##                        <chr>                      <chr>           <chr>
## 1             Jurassic World    Action|Adventure|Sci-Fi Colin Trevorrow
## 2    Avengers: Age of Ultron    Action|Adventure|Sci-Fi     Joss Whedon
## 3 Captain America: Civil War    Action|Adventure|Sci-Fi   Anthony Russo
## 4                   Deadpool    Action|Adventure|Comedy      Tim Miller
## 5            The Jungle Book     Adventure|Drama|Family     Jon Favreau
## 6                 Inside Out Adventure|Animation|Comedy     Pete Docter
## # ... with 7 more variables: actor1 <chr>, actor2 <chr>, length <dbl>,
## #   budget <dbl>, score <dbl>, gross <dbl>, year <dbl>
# Top 20 titles by gross. The cut-off is the 20th-largest gross value taken
# from the data rather than a hard-coded figure, so it stays correct when the
# data set changes. Rows keep their original order; ties at the cut-off are
# all kept.
gross.ranked <- sort(mydata$gross, decreasing = TRUE)  # sort() drops NAs
gross.cutoff <- gross.ranked[min(20, length(gross.ranked))]
gross.top20 <- subset(mydata, gross >= gross.cutoff)
gross.top20
## # A tibble: 20 × 10
##                                    title                      genre
##                                    <chr>                      <chr>
## 1                         Jurassic World    Action|Adventure|Sci-Fi
## 2                Avengers: Age of Ultron    Action|Adventure|Sci-Fi
## 3             Captain America: Civil War    Action|Adventure|Sci-Fi
## 4                               Deadpool    Action|Adventure|Comedy
## 5                        The Jungle Book     Adventure|Drama|Family
## 6                             Inside Out Adventure|Animation|Comedy
## 7                              Furious 7      Action|Crime|Thriller
## 8                                Minions    Action|Animation|Comedy
## 9     Batman v Superman: Dawn of Justice    Action|Adventure|Sci-Fi
## 10               The Secret Life of Pets    Animation|Comedy|Family
## 11 The Hunger Games: Mockingjay - Part 2           Adventure|Sci-Fi
## 12                           The Martian     Adventure|Drama|Sci-Fi
## 13                            Cinderella       Drama|Family|Fantasy
## 14                               Spectre  Action|Adventure|Thriller
## 15    Mission: Impossible - Rogue Nation  Action|Adventure|Thriller
## 16                          The Revenant   Adventure|Drama|Thriller
## 17                       Pitch Perfect 2               Comedy|Music
## 18                               Ant-Man    Action|Adventure|Comedy
## 19                                  Home Adventure|Animation|Comedy
## 20                  Hotel Transylvania 2    Animation|Comedy|Family
## # ... with 8 more variables: director <chr>, actor1 <chr>, actor2 <chr>,
## #   length <dbl>, budget <dbl>, score <dbl>, gross <dbl>, year <dbl>
# Titles whose budget is at least 155,000,000.
# NOTE(review): the recorded output for this cut-off lists 18 rows, not 20,
# despite the `top20` name -- confirm the threshold is intended.
budget.top20 <- subset(mydata, subset = budget >= 155000000)
budget.top20
## # A tibble: 18 × 10
##                                    title                      genre
##                                    <chr>                      <chr>
## 1                Avengers: Age of Ultron    Action|Adventure|Sci-Fi
## 2             Captain America: Civil War    Action|Adventure|Sci-Fi
## 3                        The Jungle Book     Adventure|Drama|Family
## 4                             Inside Out Adventure|Animation|Comedy
## 5                              Furious 7      Action|Crime|Thriller
## 6     Batman v Superman: Dawn of Justice    Action|Adventure|Sci-Fi
## 7  The Hunger Games: Mockingjay - Part 2           Adventure|Sci-Fi
## 8                                Spectre  Action|Adventure|Thriller
## 9                          Suicide Squad    Action|Adventure|Comedy
## 10                     X-Men: Apocalypse    Action|Adventure|Sci-Fi
## 11                      Star Trek Beyond    Action|Adventure|Sci-Fi
## 12                  The Legend of Tarzan     Action|Adventure|Drama
## 13          Independence Day: Resurgence    Action|Adventure|Sci-Fi
## 14                          Tomorrowland    Action|Adventure|Family
## 15                    Terminator Genisys    Action|Adventure|Sci-Fi
## 16       Alice Through the Looking Glass   Adventure|Family|Fantasy
## 17                     Jupiter Ascending    Action|Adventure|Sci-Fi
## 18                              Warcraft   Action|Adventure|Fantasy
## # ... with 8 more variables: director <chr>, actor1 <chr>, actor2 <chr>,
## #   length <dbl>, budget <dbl>, score <dbl>, gross <dbl>, year <dbl>
# Titles whose score is at least 7.7.
# NOTE(review): the recorded output for this cut-off lists 18 rows, not 20,
# despite the `top20` name -- confirm the threshold is intended.
score.top20 <- subset(mydata, subset = score >= 7.7)
score.top20
## # A tibble: 18 × 10
##                         title                      genre
##                         <chr>                      <chr>
## 1  Captain America: Civil War    Action|Adventure|Sci-Fi
## 2                    Deadpool    Action|Adventure|Comedy
## 3             The Jungle Book     Adventure|Drama|Family
## 4                  Inside Out Adventure|Animation|Comedy
## 5                 The Martian     Adventure|Drama|Sci-Fi
## 6                The Revenant   Adventure|Drama|Thriller
## 7      Straight Outta Compton      Biography|Crime|Drama
## 8          Mad Max: Fury Road    Action|Adventure|Sci-Fi
## 9                       Creed                Drama|Sport
## 10            The Conjuring 2    Horror|Mystery|Thriller
## 11              The Big Short     Biography|Comedy|Drama
## 12          The Hateful Eight        Crime|Drama|Mystery
## 13                  Spotlight      Biography|Crime|Drama
## 14                 Ex Machina       Drama|Mystery|Sci-Fi
## 15                       Room                      Drama
## 16   Baahubali: The Beginning     Action|Adventure|Drama
## 17          The Little Prince  Adventure|Animation|Drama
## 18          The Second Mother               Comedy|Drama
## # ... with 8 more variables: director <chr>, actor1 <chr>, actor2 <chr>,
## #   length <dbl>, budget <dbl>, score <dbl>, gross <dbl>, year <dbl>
# Pairwise correlations between budget, score and gross.
# Columns are picked by name instead of by position (7, 8, 9), so the
# selection keeps working if columns are added, removed or reordered.
corr01 <- mydata[c("budget", "gross")]
cor(corr01)
##           budget     gross
## budget 1.0000000 0.6658252
## gross  0.6658252 1.0000000
corr02 <- mydata[c("budget", "score")]
cor(corr02)
##           budget     score
## budget 1.0000000 0.1973812
## score  0.1973812 1.0000000
corr03 <- mydata[c("score", "gross")]
cor(corr03)
##          score    gross
## score 1.000000 0.284871
## gross 0.284871 1.000000
# All three variables in one matrix (contains the three pairs above).
corr04 <- mydata[c("budget", "score", "gross")]
cor(corr04)
##           budget     score     gross
## budget 1.0000000 0.1973812 0.6658252
## score  0.1973812 1.0000000 0.2848710
## gross  0.6658252 0.2848710 1.0000000
# Regress gross on budget over all titles, then draw the scatter plot with the
# fitted line on top. The formula keeps the `mydata$` form so the plot's axis
# labels and the model's printed Call stay as recorded.
gross.budget.linear <- lm(mydata$gross ~ mydata$budget)
plot(mydata$gross ~ mydata$budget)
abline(gross.budget.linear, col = "blue", lwd = 2)

summary(gross.budget.linear)
## 
## Call:
## lm(formula = mydata$gross ~ mydata$budget)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -153933453  -27310329  -10907441   17464413  478937086 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.131e+07  7.084e+06   1.596    0.112    
## mydata$budget 1.080e+00  8.799e-02  12.268   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 71550000 on 189 degrees of freedom
## Multiple R-squared:  0.4433, Adjusted R-squared:  0.4404 
## F-statistic: 150.5 on 1 and 189 DF,  p-value: < 2.2e-16

# Top 20 ----

# Split the data set by release year and show both halves.
imdb2015 <- subset(mydata, year == 2015)
imdb2016 <- subset(mydata, year == 2016)
imdb2015
## # A tibble: 127 × 10
##                                    title                      genre
##                                    <chr>                      <chr>
## 1                         Jurassic World    Action|Adventure|Sci-Fi
## 2                Avengers: Age of Ultron    Action|Adventure|Sci-Fi
## 3                             Inside Out Adventure|Animation|Comedy
## 4                              Furious 7      Action|Crime|Thriller
## 5                                Minions    Action|Animation|Comedy
## 6  The Hunger Games: Mockingjay - Part 2           Adventure|Sci-Fi
## 7                            The Martian     Adventure|Drama|Sci-Fi
## 8                             Cinderella       Drama|Family|Fantasy
## 9                                Spectre  Action|Adventure|Thriller
## 10    Mission: Impossible - Rogue Nation  Action|Adventure|Thriller
## # ... with 117 more rows, and 8 more variables: director <chr>,
## #   actor1 <chr>, actor2 <chr>, length <dbl>, budget <dbl>, score <dbl>,
## #   gross <dbl>, year <dbl>
imdb2016
## # A tibble: 64 × 10
##                                 title                      genre
##                                 <chr>                      <chr>
## 1          Captain America: Civil War    Action|Adventure|Sci-Fi
## 2                            Deadpool    Action|Adventure|Comedy
## 3                     The Jungle Book     Adventure|Drama|Family
## 4  Batman v Superman: Dawn of Justice    Action|Adventure|Sci-Fi
## 5             The Secret Life of Pets    Animation|Comedy|Family
## 6                       Suicide Squad    Action|Adventure|Comedy
## 7                   X-Men: Apocalypse    Action|Adventure|Sci-Fi
## 8                     Kung Fu Panda 3 Action|Adventure|Animation
## 9                    Star Trek Beyond    Action|Adventure|Sci-Fi
## 10               Central Intelligence        Action|Comedy|Crime
## # ... with 54 more rows, and 8 more variables: director <chr>,
## #   actor1 <chr>, actor2 <chr>, length <dbl>, budget <dbl>, score <dbl>,
## #   gross <dbl>, year <dbl>
# Keep the rows of `df` whose gross is at least the n-th largest gross value.
# Rows keep their original order and ties at the cut-off are all kept.
# Replaces per-table hard-coded cut-offs, which silently go stale when the
# data changes.
top_by_gross <- function(df, n = 20) {
  ranked <- sort(df$gross, decreasing = TRUE)  # sort() drops NAs
  if (length(ranked) == 0) {
    return(df[0, ])
  }
  cutoff <- ranked[min(n, length(ranked))]
  df[!is.na(df$gross) & df$gross >= cutoff, ]
}

# Top 20 by gross within each year and over the whole data set.
top20_2015 <- top_by_gross(imdb2015)
top20_2016 <- top_by_gross(imdb2016)
top20_imdb <- top_by_gross(mydata)
top20_2015
## # A tibble: 20 × 10
##                                       title                      genre
##                                       <chr>                      <chr>
## 1                            Jurassic World    Action|Adventure|Sci-Fi
## 2                   Avengers: Age of Ultron    Action|Adventure|Sci-Fi
## 3                                Inside Out Adventure|Animation|Comedy
## 4                                 Furious 7      Action|Crime|Thriller
## 5                                   Minions    Action|Animation|Comedy
## 6     The Hunger Games: Mockingjay - Part 2           Adventure|Sci-Fi
## 7                               The Martian     Adventure|Drama|Sci-Fi
## 8                                Cinderella       Drama|Family|Fantasy
## 9                                   Spectre  Action|Adventure|Thriller
## 10       Mission: Impossible - Rogue Nation  Action|Adventure|Thriller
## 11                             The Revenant   Adventure|Drama|Thriller
## 12                          Pitch Perfect 2               Comedy|Music
## 13                                  Ant-Man    Action|Adventure|Comedy
## 14                                     Home Adventure|Animation|Comedy
## 15                     Hotel Transylvania 2    Animation|Comedy|Family
## 16                     Fifty Shades of Grey              Drama|Romance
## 17 The SpongeBob Movie: Sponge Out of Water Adventure|Animation|Comedy
## 18                   Straight Outta Compton      Biography|Crime|Drama
## 19                              San Andreas     Action|Adventure|Drama
## 20                       Mad Max: Fury Road    Action|Adventure|Sci-Fi
## # ... with 8 more variables: director <chr>, actor1 <chr>, actor2 <chr>,
## #   length <dbl>, budget <dbl>, score <dbl>, gross <dbl>, year <dbl>
# Show the top-grossing 2016 subset.
top20_2016 
## # A tibble: 20 × 10
##                                               title
##                                               <chr>
## 1                        Captain America: Civil War
## 2                                          Deadpool
## 3                                   The Jungle Book
## 4                Batman v Superman: Dawn of Justice
## 5                           The Secret Life of Pets
## 6                                     Suicide Squad
## 7                                 X-Men: Apocalypse
## 8                                   Kung Fu Panda 3
## 9                                  Star Trek Beyond
## 10                             Central Intelligence
## 11                             The Legend of Tarzan
## 12                                     Ghostbusters
## 13                                     Jason Bourne
## 14                            The Angry Birds Movie
## 15                     Independence Day: Resurgence
## 16                                  The Conjuring 2
## 17                                     Ride Along 2
## 18 Teenage Mutant Ninja Turtles: Out of the Shadows
## 19                         The Purge: Election Year
## 20                  Alice Through the Looking Glass
## # ... with 9 more variables: genre <chr>, director <chr>, actor1 <chr>,
## #   actor2 <chr>, length <dbl>, budget <dbl>, score <dbl>, gross <dbl>,
## #   year <dbl>
# Show the overall top-grossing subset.
top20_imdb
## # A tibble: 20 × 10
##                                    title                      genre
##                                    <chr>                      <chr>
## 1                         Jurassic World    Action|Adventure|Sci-Fi
## 2                Avengers: Age of Ultron    Action|Adventure|Sci-Fi
## 3             Captain America: Civil War    Action|Adventure|Sci-Fi
## 4                               Deadpool    Action|Adventure|Comedy
## 5                        The Jungle Book     Adventure|Drama|Family
## 6                             Inside Out Adventure|Animation|Comedy
## 7                              Furious 7      Action|Crime|Thriller
## 8                                Minions    Action|Animation|Comedy
## 9     Batman v Superman: Dawn of Justice    Action|Adventure|Sci-Fi
## 10               The Secret Life of Pets    Animation|Comedy|Family
## 11 The Hunger Games: Mockingjay - Part 2           Adventure|Sci-Fi
## 12                           The Martian     Adventure|Drama|Sci-Fi
## 13                            Cinderella       Drama|Family|Fantasy
## 14                               Spectre  Action|Adventure|Thriller
## 15    Mission: Impossible - Rogue Nation  Action|Adventure|Thriller
## 16                          The Revenant   Adventure|Drama|Thriller
## 17                       Pitch Perfect 2               Comedy|Music
## 18                               Ant-Man    Action|Adventure|Comedy
## 19                                  Home Adventure|Animation|Comedy
## 20                  Hotel Transylvania 2    Animation|Comedy|Family
## # ... with 8 more variables: director <chr>, actor1 <chr>, actor2 <chr>,
## #   length <dbl>, budget <dbl>, score <dbl>, gross <dbl>, year <dbl>
# Regress gross on budget within the overall top-grossing subset, then draw
# the scatter plot with the fitted line on top. The formula keeps the
# `top20_imdb$` form so the printed Call stays as recorded.
linear.model <- lm(top20_imdb$gross ~ top20_imdb$budget)
plot(top20_imdb$gross ~ top20_imdb$budget)
abline(linear.model, col = "blue", lwd = 2)

summary(linear.model)
## 
## Call:
## lm(formula = top20_imdb$gross ~ top20_imdb$budget)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -156612734  -92077429  -26704988   53231190  352548220 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)   
## (Intercept)       2.095e+08  6.470e+07   3.238  0.00456 **
## top20_imdb$budget 6.006e-01  4.047e-01   1.484  0.15510   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 119100000 on 18 degrees of freedom
## Multiple R-squared:  0.109,  Adjusted R-squared:  0.05952 
## F-statistic: 2.202 on 1 and 18 DF,  p-value: 0.1551
# Per-year views of the top-grossing subsets: histogram of log gross and a
# budget-vs-gross scatter plot.
# The scatter plots now put budget on x and gross on y, matching the
# gross ~ budget orientation of every other plot and model in this file
# (they previously had the axes the other way round).
top20_2015$logGross <- log(top20_2015$gross)
hist(top20_2015$logGross)

plot(top20_2015$gross ~ top20_2015$budget)

top20_2016$logGross <- log(top20_2016$gross)
hist(top20_2016$logGross)

plot(top20_2016$gross ~ top20_2016$budget)

# Side-by-side boxplots of gross for each release year.
boxplot(
  mydata$gross ~ mydata$year,
  names = c("2015", "2016"),
  ylab = "Gross",
  main = "Boxplot of Gross Income by Year"
)

# Fit gross on budget within `top20_2015` and print the fit summary.
# Despite the `corr_` prefix, this object is an lm() fit, not a correlation.
corr_2015 <-lm(top20_2015$gross~top20_2015$budget)
summary(corr_2015)
## 
## Call:
## lm(formula = top20_2015$gross ~ top20_2015$budget)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -154564832  -73105917  -12191666   27680498  382529449 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)  
## (Intercept)       1.355e+08  6.020e+07   2.250   0.0372 *
## top20_2015$budget 8.946e-01  4.316e-01   2.073   0.0528 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 117900000 on 18 degrees of freedom
## Multiple R-squared:  0.1927, Adjusted R-squared:  0.1478 
## F-statistic: 4.296 on 1 and 18 DF,  p-value: 0.05284
# Fit gross on budget within `top20_2016` and print the fit summary.
# Despite the `corr_` prefix, this object is an lm() fit, not a correlation.
corr_2016 <-lm(top20_2016$gross~top20_2016$budget)
summary(corr_2016)
## 
## Call:
## lm(formula = top20_2016$gross ~ top20_2016$budget)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -120039547  -66746593  -36597097   20010209  229766023 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)  
## (Intercept)       1.003e+08  5.199e+07   1.929   0.0696 .
## top20_2016$budget 5.681e-01  3.525e-01   1.612   0.1244  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 107200000 on 18 degrees of freedom
## Multiple R-squared:  0.1261, Adjusted R-squared:  0.07757 
## F-statistic: 2.598 on 1 and 18 DF,  p-value: 0.1244