IMDb Data
# Copy every column of `mydata` into a top-level vector so the sections
# below can refer to them by short names.
# Note: the `length` vector shares its name with base R's length() function.
title    <- mydata[["title"]]
genre    <- mydata[["genre"]]
director <- mydata[["director"]]
actor.1  <- mydata[["actor1"]]
actor.2  <- mydata[["actor2"]]
length   <- mydata[["length"]]
budget   <- mydata[["budget"]]
score    <- mydata[["score"]]
gross    <- mydata[["gross"]]
year     <- mydata[["year"]]

# Per-column overview of the whole data set.
summary(mydata)
## title genre director
## Length:191 Length:191 Length:191
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## actor1 actor2 length budget
## Length:191 Length:191 Min. : 72.0 Min. : 100000
## Class :character Class :character 1st Qu.: 99.0 1st Qu.: 12310000
## Mode :character Mode :character Median :110.0 Median : 30000000
## Mean :112.2 Mean : 54944718
## 3rd Qu.:122.0 3rd Qu.: 75000000
## Max. :187.0 Max. :250000000
## score gross year
## Min. :3.400 Min. : 3330 Min. :2015
## 1st Qu.:5.950 1st Qu.: 12232726 1st Qu.:2015
## Median :6.600 Median : 38317535 Median :2015
## Mean :6.476 Mean : 70622179 Mean :2015
## 3rd Qu.:7.200 3rd Qu.: 81448087 3rd Qu.:2016
## Max. :8.400 Max. :652177271 Max. :2016
Gross
# Descriptive statistics for `gross`: location, spread, range, quartiles
# and a cutoff used to list unusually large values.
mean.gross <- mean(gross)
mean.gross
## [1] 70622179
sd.gross <- sd(gross)
sd.gross
## [1] 95640638
max.gross <- max(gross)
max.gross
## [1] 652177271
min.gross <- min(gross)
min.gross
## [1] 3330
# Ratio of the mean to the standard deviation.
snr.gross <- mean.gross / sd.gross
snr.gross
## [1] 0.7384118
var.gross <- var(gross)
var.gross
## [1] 9.147132e+15
# Band of three standard deviations either side of the mean.
lower.gross <- mean.gross - 3 * sd.gross
upper.gross <- mean.gross + 3 * sd.gross
lower.gross
## [1] -216299735
upper.gross
## [1] 357544094
quantile(gross)
## 0% 25% 50% 75% 100%
## 3330 12232726 38317535 81448087 652177271
# names = FALSE stops the "25%"/"75%" labels from being carried into the
# IQR and the thresholds derived from it, where they would be misleading.
lowerq.gross <- quantile(gross, probs = 0.25, names = FALSE)
upperq.gross <- quantile(gross, probs = 0.75, names = FALSE)
IQR.gross <- upperq.gross - lowerq.gross
IQR.gross
## [1] 69215361
# NOTE(review): these thresholds add 1.5 * IQR on top of the mean +/- 3 sd
# band instead of on top of the quartiles (Tukey's fences would be
# Q1 - 1.5 * IQR and Q3 + 1.5 * IQR) -- confirm the wider band is intended.
upperthreshold.gross <- (IQR.gross * 1.5) + upper.gross
upperthreshold.gross
## [1] 461367135
lowerthreshold.gross <- lower.gross - (IQR.gross * 1.5)
lowerthreshold.gross
## [1] -320122777
# Values above the upper threshold.
gross[gross > upperthreshold.gross]
## [1] 652177271
Budget
# Descriptive statistics for `budget`: location, spread, range, quartiles
# and a cutoff used to list unusually large values.
mean.budget <- mean(budget)
mean.budget
## [1] 54944718
sd.budget <- sd(budget)
sd.budget
## [1] 58986872
max.budget <- max(budget)
max.budget
## [1] 2.5e+08
min.budget <- min(budget)
min.budget
## [1] 1e+05
# Ratio of the mean to the standard deviation.
snr.budget <- mean.budget / sd.budget
snr.budget
## [1] 0.9314737
var.budget <- var(budget)
var.budget
## [1] 3.479451e+15
# Band of three standard deviations either side of the mean.
lower.budget <- mean.budget - 3 * sd.budget
upper.budget <- mean.budget + 3 * sd.budget
lower.budget
## [1] -122015897
upper.budget
## [1] 231905333
quantile(budget)
## 0% 25% 50% 75% 100%
## 100000 12310000 30000000 75000000 250000000
# names = FALSE stops the "25%"/"75%" labels from being carried into the
# IQR and the thresholds derived from it, where they would be misleading.
lowerq.budget <- quantile(budget, probs = 0.25, names = FALSE)
upperq.budget <- quantile(budget, probs = 0.75, names = FALSE)
IQR.budget <- upperq.budget - lowerq.budget
IQR.budget
## [1] 62690000
# NOTE(review): these thresholds add 1.5 * IQR on top of the mean +/- 3 sd
# band instead of on top of the quartiles (Tukey's fences would be
# Q1 - 1.5 * IQR and Q3 + 1.5 * IQR) -- confirm the wider band is intended.
upperthreshold.budget <- (IQR.budget * 1.5) + upper.budget
upperthreshold.budget
## [1] 325940333
lowerthreshold.budget <- lower.budget - (IQR.budget * 1.5)
lowerthreshold.budget
## [1] -216050897
# Values above the upper threshold (none in the rendered output).
budget[budget > upperthreshold.budget]
## numeric(0)
Score
# Descriptive statistics for `score`: location, spread, range, quartiles
# and a cutoff used to list unusually large values.
mean.score <- mean(score)
mean.score
## [1] 6.475916
sd.score <- sd(score)
sd.score
## [1] 0.9818597
max.score <- max(score)
max.score
## [1] 8.4
min.score <- min(score)
min.score
## [1] 3.4
# Ratio of the mean to the standard deviation.
snr.score <- mean.score / sd.score
snr.score
## [1] 6.595562
var.score <- var(score)
var.score
## [1] 0.9640485
# Band of three standard deviations either side of the mean.
lower.score <- mean.score - 3 * sd.score
upper.score <- mean.score + 3 * sd.score
lower.score
## [1] 3.530337
upper.score
## [1] 9.421495
quantile(score)
## 0% 25% 50% 75% 100%
## 3.40 5.95 6.60 7.20 8.40
# names = FALSE stops the "25%"/"75%" labels from being carried into the
# IQR and the thresholds derived from it, where they would be misleading.
lowerq.score <- quantile(score, probs = 0.25, names = FALSE)
upperq.score <- quantile(score, probs = 0.75, names = FALSE)
IQR.score <- upperq.score - lowerq.score
IQR.score
## [1] 1.25
# NOTE(review): these thresholds add 1.5 * IQR on top of the mean +/- 3 sd
# band instead of on top of the quartiles (Tukey's fences would be
# Q1 - 1.5 * IQR and Q3 + 1.5 * IQR) -- confirm the wider band is intended.
upperthreshold.score <- (IQR.score * 1.5) + upper.score
upperthreshold.score
## [1] 11.2965
lowerthreshold.score <- lower.score - (IQR.score * 1.5)
lowerthreshold.score
## [1] 1.655337
# Values above the upper threshold (none in the rendered output).
score[score > upperthreshold.score]
## numeric(0)
Tables
# Preview the first six rows of the data set.
head(mydata, n = 6L)
## # A tibble: 6 × 10
## title genre director
## <chr> <chr> <chr>
## 1 Jurassic World Action|Adventure|Sci-Fi Colin Trevorrow
## 2 Avengers: Age of Ultron Action|Adventure|Sci-Fi Joss Whedon
## 3 Captain America: Civil War Action|Adventure|Sci-Fi Anthony Russo
## 4 Deadpool Action|Adventure|Comedy Tim Miller
## 5 The Jungle Book Adventure|Drama|Family Jon Favreau
## 6 Inside Out Adventure|Animation|Comedy Pete Docter
## # ... with 7 more variables: actor1 <chr>, actor2 <chr>, length <dbl>,
## # budget <dbl>, score <dbl>, gross <dbl>, year <dbl>
# Rows of `mydata` whose gross is at least the hard-coded cutoff
# 169692572; rows with a missing gross are left out.
gross.top20 <- mydata[which(mydata$gross >= 169692572), ]
gross.top20
## # A tibble: 20 × 10
## title genre
## <chr> <chr>
## 1 Jurassic World Action|Adventure|Sci-Fi
## 2 Avengers: Age of Ultron Action|Adventure|Sci-Fi
## 3 Captain America: Civil War Action|Adventure|Sci-Fi
## 4 Deadpool Action|Adventure|Comedy
## 5 The Jungle Book Adventure|Drama|Family
## 6 Inside Out Adventure|Animation|Comedy
## 7 Furious 7 Action|Crime|Thriller
## 8 Minions Action|Animation|Comedy
## 9 Batman v Superman: Dawn of Justice Action|Adventure|Sci-Fi
## 10 The Secret Life of Pets Animation|Comedy|Family
## 11 The Hunger Games: Mockingjay - Part 2 Adventure|Sci-Fi
## 12 The Martian Adventure|Drama|Sci-Fi
## 13 Cinderella Drama|Family|Fantasy
## 14 Spectre Action|Adventure|Thriller
## 15 Mission: Impossible - Rogue Nation Action|Adventure|Thriller
## 16 The Revenant Adventure|Drama|Thriller
## 17 Pitch Perfect 2 Comedy|Music
## 18 Ant-Man Action|Adventure|Comedy
## 19 Home Adventure|Animation|Comedy
## 20 Hotel Transylvania 2 Animation|Comedy|Family
## # ... with 8 more variables: director <chr>, actor1 <chr>, actor2 <chr>,
## # length <dbl>, budget <dbl>, score <dbl>, gross <dbl>, year <dbl>
# Rows of `mydata` whose budget is at least the hard-coded cutoff
# 155000000; rows with a missing budget are left out.
# NOTE(review): the rendered table for this cutoff has 18 rows, not 20 --
# confirm whether the cutoff or the "top20" name should change.
budget.top20 <- mydata[which(mydata$budget >= 155000000), ]
budget.top20
## # A tibble: 18 × 10
## title genre
## <chr> <chr>
## 1 Avengers: Age of Ultron Action|Adventure|Sci-Fi
## 2 Captain America: Civil War Action|Adventure|Sci-Fi
## 3 The Jungle Book Adventure|Drama|Family
## 4 Inside Out Adventure|Animation|Comedy
## 5 Furious 7 Action|Crime|Thriller
## 6 Batman v Superman: Dawn of Justice Action|Adventure|Sci-Fi
## 7 The Hunger Games: Mockingjay - Part 2 Adventure|Sci-Fi
## 8 Spectre Action|Adventure|Thriller
## 9 Suicide Squad Action|Adventure|Comedy
## 10 X-Men: Apocalypse Action|Adventure|Sci-Fi
## 11 Star Trek Beyond Action|Adventure|Sci-Fi
## 12 The Legend of Tarzan Action|Adventure|Drama
## 13 Independence Day: Resurgence Action|Adventure|Sci-Fi
## 14 Tomorrowland Action|Adventure|Family
## 15 Terminator Genisys Action|Adventure|Sci-Fi
## 16 Alice Through the Looking Glass Adventure|Family|Fantasy
## 17 Jupiter Ascending Action|Adventure|Sci-Fi
## 18 Warcraft Action|Adventure|Fantasy
## # ... with 8 more variables: director <chr>, actor1 <chr>, actor2 <chr>,
## # length <dbl>, budget <dbl>, score <dbl>, gross <dbl>, year <dbl>
# Rows of `mydata` whose score is at least the hard-coded cutoff 7.7;
# rows with a missing score are left out.
# NOTE(review): the rendered table for this cutoff has 18 rows, not 20 --
# confirm whether the cutoff or the "top20" name should change.
score.top20 <- mydata[which(mydata$score >= 7.7), ]
score.top20
## # A tibble: 18 × 10
## title genre
## <chr> <chr>
## 1 Captain America: Civil War Action|Adventure|Sci-Fi
## 2 Deadpool Action|Adventure|Comedy
## 3 The Jungle Book Adventure|Drama|Family
## 4 Inside Out Adventure|Animation|Comedy
## 5 The Martian Adventure|Drama|Sci-Fi
## 6 The Revenant Adventure|Drama|Thriller
## 7 Straight Outta Compton Biography|Crime|Drama
## 8 Mad Max: Fury Road Action|Adventure|Sci-Fi
## 9 Creed Drama|Sport
## 10 The Conjuring 2 Horror|Mystery|Thriller
## 11 The Big Short Biography|Comedy|Drama
## 12 The Hateful Eight Crime|Drama|Mystery
## 13 Spotlight Biography|Crime|Drama
## 14 Ex Machina Drama|Mystery|Sci-Fi
## 15 Room Drama
## 16 Baahubali: The Beginning Action|Adventure|Drama
## 17 The Little Prince Adventure|Animation|Drama
## 18 The Second Mother Comedy|Drama
## # ... with 8 more variables: director <chr>, actor1 <chr>, actor2 <chr>,
## # length <dbl>, budget <dbl>, score <dbl>, gross <dbl>, year <dbl>
# Correlation matrices (cor() defaults) between budget, score and gross.
# Columns are selected by name rather than by position so the result does
# not silently change if the column order of `mydata` ever changes.
corr01 <- mydata[c("budget", "gross")]
cor(corr01)
## budget gross
## budget 1.0000000 0.6658252
## gross 0.6658252 1.0000000
corr02 <- mydata[c("budget", "score")]
cor(corr02)
## budget score
## budget 1.0000000 0.1973812
## score 0.1973812 1.0000000
corr03 <- mydata[c("score", "gross")]
cor(corr03)
## score gross
## score 1.000000 0.284871
## gross 0.284871 1.000000
# All three variables at once; contains every pairwise value shown above.
corr04 <- mydata[c("budget", "score", "gross")]
cor(corr04)
## budget score gross
## budget 1.0000000 0.1973812 0.6658252
## score 0.1973812 1.0000000 0.2848710
## gross 0.6658252 0.2848710 1.0000000
# Scatter plot of gross against budget for the full data set, with the
# least-squares line overlaid in blue.
# The formula names the columns and `data =` supplies the data frame,
# instead of embedding `mydata$...` in the formula: axis and coefficient
# labels become plain column names and the model can be used with
# predict(newdata = ...).
plot(gross ~ budget, data = mydata)
gross.budget.linear <- lm(gross ~ budget, data = mydata)
abline(gross.budget.linear, col = "blue", lwd = 2)

summary(gross.budget.linear)
##
## Call:
## lm(formula = gross ~ budget, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -153933453 -27310329 -10907441 17464413 478937086
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.131e+07 7.084e+06 1.596 0.112
## budget 1.080e+00 8.799e-02 12.268 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 71550000 on 189 degrees of freedom
## Multiple R-squared: 0.4433, Adjusted R-squared: 0.4404
## F-statistic: 150.5 on 1 and 189 DF, p-value: < 2.2e-16
Top 20
# Split the data set by release year, then show the 2015 slice.
# Inside subset() the bare name `year` refers to the column of `mydata`.
imdb2015 <- subset(mydata, year == 2015)
imdb2016 <- subset(mydata, year == 2016)
imdb2015
## # A tibble: 127 × 10
## title genre
## <chr> <chr>
## 1 Jurassic World Action|Adventure|Sci-Fi
## 2 Avengers: Age of Ultron Action|Adventure|Sci-Fi
## 3 Inside Out Adventure|Animation|Comedy
## 4 Furious 7 Action|Crime|Thriller
## 5 Minions Action|Animation|Comedy
## 6 The Hunger Games: Mockingjay - Part 2 Adventure|Sci-Fi
## 7 The Martian Adventure|Drama|Sci-Fi
## 8 Cinderella Drama|Family|Fantasy
## 9 Spectre Action|Adventure|Thriller
## 10 Mission: Impossible - Rogue Nation Action|Adventure|Thriller
## # ... with 117 more rows, and 8 more variables: director <chr>,
## # actor1 <chr>, actor2 <chr>, length <dbl>, budget <dbl>, score <dbl>,
## # gross <dbl>, year <dbl>
# Show the 2016 slice of the data set.
imdb2016
## # A tibble: 64 × 10
## title genre
## <chr> <chr>
## 1 Captain America: Civil War Action|Adventure|Sci-Fi
## 2 Deadpool Action|Adventure|Comedy
## 3 The Jungle Book Adventure|Drama|Family
## 4 Batman v Superman: Dawn of Justice Action|Adventure|Sci-Fi
## 5 The Secret Life of Pets Animation|Comedy|Family
## 6 Suicide Squad Action|Adventure|Comedy
## 7 X-Men: Apocalypse Action|Adventure|Sci-Fi
## 8 Kung Fu Panda 3 Action|Adventure|Animation
## 9 Star Trek Beyond Action|Adventure|Sci-Fi
## 10 Central Intelligence Action|Comedy|Crime
## # ... with 54 more rows, and 8 more variables: director <chr>,
## # actor1 <chr>, actor2 <chr>, length <dbl>, budget <dbl>, score <dbl>,
## # gross <dbl>, year <dbl>
# Keep the rows at or above a hard-coded gross cutoff, separately for the
# 2015 slice, the 2016 slice and the full data set; rows with a missing
# gross are left out. The full-data cutoff is the same value used for
# `gross.top20`.
top20_2015 <- imdb2015[which(imdb2015$gross >= 153629485), ]
top20_2016 <- imdb2016[which(imdb2016$gross >= 76846624), ]
top20_imdb <- mydata[which(mydata$gross >= 169692572), ]
top20_2015
## # A tibble: 20 × 10
## title genre
## <chr> <chr>
## 1 Jurassic World Action|Adventure|Sci-Fi
## 2 Avengers: Age of Ultron Action|Adventure|Sci-Fi
## 3 Inside Out Adventure|Animation|Comedy
## 4 Furious 7 Action|Crime|Thriller
## 5 Minions Action|Animation|Comedy
## 6 The Hunger Games: Mockingjay - Part 2 Adventure|Sci-Fi
## 7 The Martian Adventure|Drama|Sci-Fi
## 8 Cinderella Drama|Family|Fantasy
## 9 Spectre Action|Adventure|Thriller
## 10 Mission: Impossible - Rogue Nation Action|Adventure|Thriller
## 11 The Revenant Adventure|Drama|Thriller
## 12 Pitch Perfect 2 Comedy|Music
## 13 Ant-Man Action|Adventure|Comedy
## 14 Home Adventure|Animation|Comedy
## 15 Hotel Transylvania 2 Animation|Comedy|Family
## 16 Fifty Shades of Grey Drama|Romance
## 17 The SpongeBob Movie: Sponge Out of Water Adventure|Animation|Comedy
## 18 Straight Outta Compton Biography|Crime|Drama
## 19 San Andreas Action|Adventure|Drama
## 20 Mad Max: Fury Road Action|Adventure|Sci-Fi
## # ... with 8 more variables: director <chr>, actor1 <chr>, actor2 <chr>,
## # length <dbl>, budget <dbl>, score <dbl>, gross <dbl>, year <dbl>
# Show the 2016 rows that passed the gross cutoff.
top20_2016
## # A tibble: 20 × 10
## title
## <chr>
## 1 Captain America: Civil War
## 2 Deadpool
## 3 The Jungle Book
## 4 Batman v Superman: Dawn of Justice
## 5 The Secret Life of Pets
## 6 Suicide Squad
## 7 X-Men: Apocalypse
## 8 Kung Fu Panda 3
## 9 Star Trek Beyond
## 10 Central Intelligence
## 11 The Legend of Tarzan
## 12 Ghostbusters
## 13 Jason Bourne
## 14 The Angry Birds Movie
## 15 Independence Day: Resurgence
## 16 The Conjuring 2
## 17 Ride Along 2
## 18 Teenage Mutant Ninja Turtles: Out of the Shadows
## 19 The Purge: Election Year
## 20 Alice Through the Looking Glass
## # ... with 9 more variables: genre <chr>, director <chr>, actor1 <chr>,
## # actor2 <chr>, length <dbl>, budget <dbl>, score <dbl>, gross <dbl>,
## # year <dbl>
# Show the rows of the full data set that passed the gross cutoff.
top20_imdb
## # A tibble: 20 × 10
## title genre
## <chr> <chr>
## 1 Jurassic World Action|Adventure|Sci-Fi
## 2 Avengers: Age of Ultron Action|Adventure|Sci-Fi
## 3 Captain America: Civil War Action|Adventure|Sci-Fi
## 4 Deadpool Action|Adventure|Comedy
## 5 The Jungle Book Adventure|Drama|Family
## 6 Inside Out Adventure|Animation|Comedy
## 7 Furious 7 Action|Crime|Thriller
## 8 Minions Action|Animation|Comedy
## 9 Batman v Superman: Dawn of Justice Action|Adventure|Sci-Fi
## 10 The Secret Life of Pets Animation|Comedy|Family
## 11 The Hunger Games: Mockingjay - Part 2 Adventure|Sci-Fi
## 12 The Martian Adventure|Drama|Sci-Fi
## 13 Cinderella Drama|Family|Fantasy
## 14 Spectre Action|Adventure|Thriller
## 15 Mission: Impossible - Rogue Nation Action|Adventure|Thriller
## 16 The Revenant Adventure|Drama|Thriller
## 17 Pitch Perfect 2 Comedy|Music
## 18 Ant-Man Action|Adventure|Comedy
## 19 Home Adventure|Animation|Comedy
## 20 Hotel Transylvania 2 Animation|Comedy|Family
## # ... with 8 more variables: director <chr>, actor1 <chr>, actor2 <chr>,
## # length <dbl>, budget <dbl>, score <dbl>, gross <dbl>, year <dbl>
# Scatter plot of gross against budget for the `top20_imdb` rows, with the
# least-squares line overlaid in blue.
# The formula names the columns and `data =` supplies the data frame,
# instead of embedding `top20_imdb$...` in the formula: axis and
# coefficient labels become plain column names and the model can be used
# with predict(newdata = ...).
plot(gross ~ budget, data = top20_imdb)
linear.model <- lm(gross ~ budget, data = top20_imdb)
abline(linear.model, col = "blue", lwd = 2)

summary(linear.model)
##
## Call:
## lm(formula = gross ~ budget, data = top20_imdb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -156612734 -92077429 -26704988 53231190 352548220
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.095e+08 6.470e+07 3.238 0.00456 **
## budget 6.006e-01 4.047e-01 1.484 0.15510
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 119100000 on 18 degrees of freedom
## Multiple R-squared: 0.109, Adjusted R-squared: 0.05952
## F-statistic: 2.202 on 1 and 18 DF, p-value: 0.1551
# Add a natural-log gross column to the 2015 top-20 table and plot its
# distribution.
top20_2015$logGross <- log(top20_2015$gross)
hist(top20_2015$logGross)

# Scatter plot with gross on the x-axis and budget on the y-axis.
# NOTE(review): the regressions in this file model gross as a function of
# budget, i.e. the opposite axis order -- confirm this orientation is intended.
plot(top20_2015$gross,top20_2015$budget)

# Same log transform and histogram for the 2016 top-20 table.
top20_2016$logGross <- log(top20_2016$gross)
hist(top20_2016$logGross)

# Scatter plot with gross on the x-axis and budget on the y-axis (2016).
plot(top20_2016$gross,top20_2016$budget)

# Gross by release year over the full data set; the group labels are
# supplied explicitly through `names`.
boxplot(mydata$gross~mydata$year,names=c("2015","2016"), ylab="Gross", main="Boxplot of Gross Income by Year")

# Linear regressions of gross on budget, fitted separately to the 2015 and
# 2016 top-20 tables. Despite the `corr_` prefix these objects are lm fits.
# The formula names the columns and `data =` supplies the data frame,
# instead of embedding `top20_201x$...` in the formula: coefficient labels
# become plain column names and the models can be used with
# predict(newdata = ...).
corr_2015 <- lm(gross ~ budget, data = top20_2015)
summary(corr_2015)
##
## Call:
## lm(formula = gross ~ budget, data = top20_2015)
##
## Residuals:
## Min 1Q Median 3Q Max
## -154564832 -73105917 -12191666 27680498 382529449
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.355e+08 6.020e+07 2.250 0.0372 *
## budget 8.946e-01 4.316e-01 2.073 0.0528 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 117900000 on 18 degrees of freedom
## Multiple R-squared: 0.1927, Adjusted R-squared: 0.1478
## F-statistic: 4.296 on 1 and 18 DF, p-value: 0.05284
corr_2016 <- lm(gross ~ budget, data = top20_2016)
summary(corr_2016)
##
## Call:
## lm(formula = gross ~ budget, data = top20_2016)
##
## Residuals:
## Min 1Q Median 3Q Max
## -120039547 -66746593 -36597097 20010209 229766023
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.003e+08 5.199e+07 1.929 0.0696 .
## budget 5.681e-01 3.525e-01 1.612 0.1244
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 107200000 on 18 degrees of freedom
## Multiple R-squared: 0.1261, Adjusted R-squared: 0.07757
## F-statistic: 2.598 on 1 and 18 DF, p-value: 0.1244