Let’s load libraries.

library(ggplot2)
library(dplyr)
library(scales)
library(ggthemes)
library(VIM)
library(stringr)
library(ggrepel)
library(plotly)
library(corrplot)
library(GGally)

I want to disable scientific notation - I am not comfortable with it.

# A large positive scipen penalty biases number printing towards fixed
# notation instead of scientific (exponential) notation.
options(scipen = 999)

Structure of the given data. We have 5043 observations and 28 variables.

# Compact overview of data1: dimensions, column names, types and first values.
str(data1)
## 'data.frame':    5043 obs. of  28 variables:
##  $ color                  : chr  "Color" "Color" "Color" "Color" ...
##  $ director.name          : chr  "James Cameron" "Gore Verbinski" "Sam Mendes" "Christopher Nolan" ...
##  $ num.critic.review      : int  723 302 602 813 NA 462 392 324 635 375 ...
##  $ duration               : int  178 169 148 164 NA 132 156 100 141 153 ...
##  $ director.fbook.likes   : int  0 563 0 22000 131 475 0 15 0 282 ...
##  $ actor.fbook.likes      : int  855 1000 161 23000 NA 530 4000 284 19000 10000 ...
##  $ actor2.name            : chr  "Joel David Moore" "Orlando Bloom" "Rory Kinnear" "Christian Bale" ...
##  $ actor1.fbook.likes     : int  1000 40000 11000 27000 131 640 24000 799 26000 25000 ...
##  $ gross                  : int  760505847 309404152 200074175 448130642 NA 73058679 336530303 200807262 458991599 301956980 ...
##  $ genres                 : chr  "Action|Adventure|Fantasy|Sci-Fi" "Action|Adventure|Fantasy" "Action|Adventure|Thriller" "Action|Thriller" ...
##  $ actor1.name            : chr  "CCH Pounder" "Johnny Depp" "Christoph Waltz" "Tom Hardy" ...
##  $ movie.title            : chr  "Avatar " "Pirates of the Caribbean: At World's End " "Spectre " "The Dark Knight Rises " ...
##  $ users.vote             : int  886204 471220 275868 1144337 8 212204 383056 294810 462669 321795 ...
##  $ cast.tot.fbook.likes   : int  4834 48350 11700 106759 143 1873 46055 2036 92000 58753 ...
##  $ actor3.names           : chr  "Wes Studi" "Jack Davenport" "Stephanie Sigman" "Joseph Gordon-Levitt" ...
##  $ faces.in.poster        : int  0 0 1 0 0 1 0 1 4 3 ...
##  $ plot.keywords          : chr  "avatar|future|marine|native|paraplegic" "goddess|marriage ceremony|marriage proposal|pirate|singapore" "bomb|espionage|sequel|spy|terrorist" "deception|imprisonment|lawlessness|police officer|terrorist plot" ...
##  $ imdb.link              : chr  "http://www.imdb.com/title/tt0499549/?ref_=fn_tt_tt_1" "http://www.imdb.com/title/tt0449088/?ref_=fn_tt_tt_1" "http://www.imdb.com/title/tt2379713/?ref_=fn_tt_tt_1" "http://www.imdb.com/title/tt1345836/?ref_=fn_tt_tt_1" ...
##  $ num.of.users.for.review: int  3054 1238 994 2701 NA 738 1902 387 1117 973 ...
##  $ language               : chr  "English" "English" "English" "English" ...
##  $ country                : chr  "USA" "USA" "UK" "USA" ...
##  $ content.rating         : chr  "PG-13" "PG-13" "PG-13" "PG-13" ...
##  $ budget                 : num  237000000 300000000 245000000 250000000 NA ...
##  $ title.year             : int  2009 2007 2015 2012 NA 2012 2007 2010 2015 2009 ...
##  $ actor.fbookl.likes     : int  936 5000 393 23000 12 632 11000 553 21000 11000 ...
##  $ imdb.score             : num  7.9 7.1 6.8 8.5 7.1 6.6 6.2 7.8 7.5 7.5 ...
##  $ aspect.ratio           : num  1.78 2.35 2.35 2.35 NA 2.35 2.35 1.85 2.35 2.35 ...
##  $ movie.fbook.likes      : int  33000 0 85000 164000 0 24000 0 29000 118000 10000 ...
# Per-column summary: quartiles, mean and NA count for numeric columns;
# length, class and mode for character columns.
summary(data1)
##     color           director.name      num.critic.review    duration    
##  Length:5043        Length:5043        Min.   :  1.0     Min.   :  7.0  
##  Class :character   Class :character   1st Qu.: 50.0     1st Qu.: 93.0  
##  Mode  :character   Mode  :character   Median :110.0     Median :103.0  
##                                        Mean   :140.2     Mean   :107.2  
##                                        3rd Qu.:195.0     3rd Qu.:118.0  
##                                        Max.   :813.0     Max.   :511.0  
##                                        NA's   :50        NA's   :15     
##  director.fbook.likes actor.fbook.likes actor2.name       
##  Min.   :    0.0      Min.   :    0.0   Length:5043       
##  1st Qu.:    7.0      1st Qu.:  133.0   Class :character  
##  Median :   49.0      Median :  371.5   Mode  :character  
##  Mean   :  686.5      Mean   :  645.0                     
##  3rd Qu.:  194.5      3rd Qu.:  636.0                     
##  Max.   :23000.0      Max.   :23000.0                     
##  NA's   :104          NA's   :23                          
##  actor1.fbook.likes     gross              genres         
##  Min.   :     0     Min.   :      162   Length:5043       
##  1st Qu.:   614     1st Qu.:  5340988   Class :character  
##  Median :   988     Median : 25517500   Mode  :character  
##  Mean   :  6560     Mean   : 48468408                     
##  3rd Qu.: 11000     3rd Qu.: 62309438                     
##  Max.   :640000     Max.   :760505847                     
##  NA's   :7          NA's   :884                           
##  actor1.name        movie.title          users.vote     
##  Length:5043        Length:5043        Min.   :      5  
##  Class :character   Class :character   1st Qu.:   8594  
##  Mode  :character   Mode  :character   Median :  34359  
##                                        Mean   :  83668  
##                                        3rd Qu.:  96309  
##                                        Max.   :1689764  
##                                                         
##  cast.tot.fbook.likes actor3.names       faces.in.poster 
##  Min.   :     0       Length:5043        Min.   : 0.000  
##  1st Qu.:  1411       Class :character   1st Qu.: 0.000  
##  Median :  3090       Mode  :character   Median : 1.000  
##  Mean   :  9699                          Mean   : 1.371  
##  3rd Qu.: 13756                          3rd Qu.: 2.000  
##  Max.   :656730                          Max.   :43.000  
##                                          NA's   :13      
##  plot.keywords       imdb.link         num.of.users.for.review
##  Length:5043        Length:5043        Min.   :   1.0         
##  Class :character   Class :character   1st Qu.:  65.0         
##  Mode  :character   Mode  :character   Median : 156.0         
##                                        Mean   : 272.8         
##                                        3rd Qu.: 326.0         
##                                        Max.   :5060.0         
##                                        NA's   :21             
##    language           country          content.rating    
##  Length:5043        Length:5043        Length:5043       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##      budget              title.year   actor.fbookl.likes   imdb.score   
##  Min.   :        218   Min.   :1916   Min.   :     0     Min.   :1.600  
##  1st Qu.:    6000000   1st Qu.:1999   1st Qu.:   281     1st Qu.:5.800  
##  Median :   20000000   Median :2005   Median :   595     Median :6.600  
##  Mean   :   39752620   Mean   :2002   Mean   :  1652     Mean   :6.442  
##  3rd Qu.:   45000000   3rd Qu.:2011   3rd Qu.:   918     3rd Qu.:7.200  
##  Max.   :12215500000   Max.   :2016   Max.   :137000     Max.   :9.500  
##  NA's   :492           NA's   :108    NA's   :13                        
##   aspect.ratio   movie.fbook.likes
##  Min.   : 1.18   Min.   :     0   
##  1st Qu.: 1.85   1st Qu.:     0   
##  Median : 2.35   Median :   166   
##  Mean   : 2.22   Mean   :  7526   
##  3rd Qu.: 2.35   3rd Qu.:  3000   
##  Max.   :16.00   Max.   :349000   
##  NA's   :329

The dataset contains 2698 NAs.

# Number of missing values per column. is.na() on a data frame already returns
# a logical matrix with one column per variable, so no sapply() wrapper is
# needed (and the result type no longer depends on the shape of the input).
colSums(is.na(data1))
##                   color           director.name       num.critic.review 
##                      19                     104                      50 
##                duration    director.fbook.likes       actor.fbook.likes 
##                      15                     104                      23 
##             actor2.name      actor1.fbook.likes                   gross 
##                      13                       7                     884 
##                  genres             actor1.name             movie.title 
##                       0                       7                       0 
##              users.vote    cast.tot.fbook.likes            actor3.names 
##                       0                       0                      23 
##         faces.in.poster           plot.keywords               imdb.link 
##                      13                     153                       0 
## num.of.users.for.review                language                 country 
##                      21                      12                       5 
##          content.rating                  budget              title.year 
##                     303                     492                     108 
##      actor.fbookl.likes              imdb.score            aspect.ratio 
##                      13                       0                     329 
##       movie.fbook.likes 
##                       0
# Total number of missing cells across the whole data frame.
sum(is.na(data1))
## [1] 2698

NAs distribution

I really want to see how NAs are distributed in the dataset.

# Aggregate and plot the missing values of data1 with VIM::aggr(), showing
# proportions rather than counts, and keep the returned summary object.
# sortVars = TRUE also prints the variables ordered by share of missing values.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables.
missing.values <- aggr(
  data1,
  sortVars = TRUE,
  prop = TRUE,
  sortCombs = TRUE,
  cex.lab = 1.5,
  cex.axis = 0.6,
  cex.numbers = 5,
  combined = FALSE,
  gap = -0.2
)

## 
##  Variables sorted by number of missings: 
##                 Variable        Count
##                    gross 0.1752924846
##                   budget 0.0975609756
##             aspect.ratio 0.0652389451
##           content.rating 0.0600832838
##            plot.keywords 0.0303390839
##               title.year 0.0214158239
##            director.name 0.0206226453
##     director.fbook.likes 0.0206226453
##        num.critic.review 0.0099147333
##        actor.fbook.likes 0.0045607773
##             actor3.names 0.0045607773
##  num.of.users.for.review 0.0041641880
##                    color 0.0037675987
##                 duration 0.0029744200
##              actor2.name 0.0025778307
##          faces.in.poster 0.0025778307
##       actor.fbookl.likes 0.0025778307
##                 language 0.0023795360
##       actor1.fbook.likes 0.0013880627
##              actor1.name 0.0013880627
##                  country 0.0009914733
##                   genres 0.0000000000
##              movie.title 0.0000000000
##               users.vote 0.0000000000
##     cast.tot.fbook.likes 0.0000000000
##                imdb.link 0.0000000000
##               imdb.score 0.0000000000
##        movie.fbook.likes 0.0000000000

We can see that most of the NAs come from gross, budget and aspect.ratio, where roughly 17.5%, 9.8% and 6.5% of the values are missing, respectively. We can also see that only a few variables (7 of 28) have no missing values at all; imdb.score is one of them. I am going to leave the NAs as they are and deal with them as I progress.

Histogram of Movie released

Movie production exploded after 1990. This could be due to advances in technology and the commercialisation of the internet.

# Bar chart of the number of movies per release year.
ggplot(data = data1, mapping = aes(x = title.year)) +
  geom_bar() +
  labs(
    x = "Year movie was released",
    y = "Movie Count",
    title = "Histogram of Movie released"
  )

Profitable movies

We are given the revenue (gross) and the cost (budget) of each movie; therefore, we can easily compute the profit.

# Profit is gross minus budget; it is NA whenever either input is missing.
data1 <- mutate(data1, Profit = gross - budget)

# Scatter of profit against budget (both in millions) for movies with a
# positive profit, plus a fitted linear trend. filter() also drops rows whose
# Profit is NA.
data1 %>%
  filter(Profit > 0) %>%
  ggplot(aes(x = budget / 1e6, y = Profit / 1e6)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(x = "Budget $M", y = "Profit $ M", title = "Profit vs Budget")

Let’s transform the scales into Log10

# Same profit-vs-budget scatter as before, but with both axes on a log10 scale
# so the bulk of low-budget movies is no longer squeezed into one corner.
# Explicit labels are set so the axes do not show the raw expressions
# (e.g. "budget/1000000") and the plot matches the linear-scale version.
data1 %>%
  filter(Profit > 0) %>%
  ggplot(aes(x = budget / 1e6, y = Profit / 1e6)) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10() +
  geom_smooth(method = "lm") +
  labs(
    x = "Budget $M (log10 scale)",
    y = "Profit $ M (log10 scale)",
    title = "Profit vs Budget (log10 scales)"
  )

I just found out from one of the Kagglers that the revenue and cost of international movies are recorded in local currency, which means our Profit vs Budget plots above are not meaningful for those movies.

Regardless of that let’s find top 10 profitable movies from the dataset.

# Ten most profitable movies (ties kept by top_n), plotted as budget vs profit
# in millions, coloured by director and labelled with the movie title.
data1 %>%
  filter(Profit > 0) %>%
  top_n(10, Profit) %>%
  arrange(Profit) %>%
  ggplot(
    aes(
      x = budget / 1e6,
      y = Profit / 1e6,
      colour = factor(director.name)
    )
  ) +
  geom_point() +
  geom_text_repel(aes(label = movie.title), nudge_x = 0, nudge_y = 5) +
  labs(x = "Budget $M", y = "Profit $ M", title = "Top 10 Profitable Movies")

The movie at rank 6 is duplicated in the dataset. Let’s find the duplicate and remove it.

# Locate every row carrying the duplicated profit figure seen in the plot.
dup.rows <- which(data1$Profit == 403279547)
dup.rows
## [1]  18 795
# Drop only the first copy, and only while more than one copy exists. A
# hard-coded data1[-18, ] would delete an unrelated row if this chunk were
# run a second time.
if (length(dup.rows) > 1) {
  data1 <- data1[-dup.rows[1], ]
}

We can also see that a stray "Â" character is attached at the end of the movie titles. I am not sure what it is (possibly an encoding artifact), but I don’t think it should be there. Let’s remove it along with any leading or trailing blank spaces.

# Inspect the first ten titles to see the trailing characters.
head(data1$movie.title,10)
##  [1] "Avatar "                                                
##  [2] "Pirates of the Caribbean: At World's End "              
##  [3] "Spectre "                                               
##  [4] "The Dark Knight Rises "                                 
##  [5] "Star Wars: Episode VII - The Force Awakens             "
##  [6] "John Carter "                                           
##  [7] "Spider-Man 3 "                                          
##  [8] "Tangled "                                               
##  [9] "Avengers: Age of Ultron "                               
## [10] "Harry Potter and the Half-Blood Prince "
# Strip the stray "Â" from every title. gsub() is already vectorised, so it is
# applied to the whole column at once; wrapping it in sapply() would also
# attach the original titles as element names to the column.
data1$movie.title <- gsub("Â", "", data1$movie.title)

# Remove leading and trailing whitespace from the titles.
data1$movie.title <- str_trim(data1$movie.title, side = "both")

Now, let’s plot it again

# Redraw the ten most profitable movies (ties kept by top_n) as budget vs
# profit in millions, coloured by director and labelled with the movie title.
data1 %>%
  filter(Profit > 0) %>%
  top_n(10, Profit) %>%
  arrange(Profit) %>%
  ggplot(
    aes(
      x = budget / 1e6,
      y = Profit / 1e6,
      colour = factor(director.name)
    )
  ) +
  geom_point() +
  geom_text_repel(aes(label = movie.title), nudge_x = 0, nudge_y = 5) +
  labs(x = "Budget $M", y = "Profit $ M", title = "Top 10 Profitable Movies")

From the plot it seems that all of the top 10 movies were produced in the USA. Let’s verify it.

# Keep the ten most profitable movies, ordered by ascending profit.
top10movies <- data1 %>%
  filter(Profit > 0) %>%
  top_n(10, Profit) %>%
  arrange(Profit)

# Show each movie's director next to its country of production.
head(data.frame(top10movies$director.name, top10movies$country), n = 10)
##    top10movies.director.name top10movies.country
## 1                  Gary Ross                 USA
## 2          Christopher Nolan                 USA
## 3               George Lucas                 USA
## 4               Roger Allers                 USA
## 5                Joss Whedon                 USA
## 6           Steven Spielberg                 USA
## 7               George Lucas                 USA
## 8              James Cameron                 USA
## 9            Colin Trevorrow                 USA
## 10             James Cameron                 USA

Since all of the top 10 profitable movies were made in the USA, the exchange-rate problem does not affect them.

Movie directors vs average IMDB ratings

Let’s plot directors against their average IMDB ratings. The IMDB ratings are grouped by director and averaged. The plot will only display directors who have made more than 5 movies.

# Film count and mean IMDB score per director. Only directors with more than
# 5 films are kept; na.omit() then drops the group whose director name is NA.
directors <- data1 %>%
  select(director.name, imdb.score) %>%
  group_by(director.name) %>%
  summarise(
    Tot.films = n(),
    avg.imdb.score = mean(imdb.score, na.rm = TRUE)
  ) %>%
  filter(Tot.films > 5) %>%
  na.omit() %>%
  arrange(desc(avg.imdb.score))

# `text` is not a ggplot2 aesthetic (ggplot2 warns about it); it is supplied
# so that ggplotly() can show the director's name in the hover tooltip.
p <- ggplot(directors, aes(avg.imdb.score, Tot.films)) +
  geom_point(aes(text = director.name)) +
  labs(
    x = "Averaged IMDB Score",
    y = "Total films made by Directors",
    title = "Directors vs average IMDB ratings"
  )

# Convert the static plot into an interactive plotly widget.
ggplotly(p)

Actor vs average IMDB rating

Let’s plot Actor vs average IMDB rating with standard error as well.

# Mean IMDB score (M), its standard error (SE) and the number of movies
# (Length) per first-billed actor, restricted to actors with more than 20
# movies. Rows with a missing actor name or score are removed once up front,
# so the summaries need no further NA handling and n() is the sample size.
rating.actors <- data1 %>%
  select(actor1.name, imdb.score) %>%
  filter(!is.na(actor1.name), !is.na(imdb.score)) %>%
  group_by(actor1.name) %>%
  summarise(
    M = mean(imdb.score),
    SE = sd(imdb.score) / sqrt(n()),
    Length = n()
  ) %>%
  filter(Length > 20) %>%
  arrange(desc(M))

# Order the actor factor levels by mean score so the plot axis is sorted.
rating.actors$actor1.name <- reorder(
  factor(rating.actors$actor1.name),
  rating.actors$M
)

The plot displays the average IMDB rating of actors who have appeared in more than 20 movies.

# Mean IMDB score per actor with +/- one standard error bars, drawn
# horizontally so the actor names are readable.
# The original geom_line() layer is dropped: with a discrete x and no `group`
# aesthetic every actor forms a one-point group, so it drew nothing and only
# produced a "each group consists of only one observation" message.
ggplot(rating.actors, aes(x = actor1.name, y = M)) +
  geom_errorbar(aes(ymin = M - SE, ymax = M + SE)) +
  geom_point(size = 3, shape = 21, fill = "white") +
  coord_flip() +
  labs(
    x = "Actors",
    y = "Averaged Mean",
    title = "Actors vs average IMDB rating"
  )

Lastly, we’ll plot correlation matrix.

Let’s select all numeric variables.

# Flag the numeric columns; vapply() guarantees a logical vector.
numeric <- vapply(data1, is.numeric, logical(1))
data1.numeric <- data1[, numeric, drop = FALSE]

# Most numeric columns contain NAs, and cor() would return NA for every pair
# involving them by default. Pairwise-complete observations are used so each
# coefficient is computed from the rows where both variables are present.
data1.numeric <- cor(data1.numeric, use = "pairwise.complete.obs")

# Lower-triangle correlation plot with circle glyphs.
corrplot(
  data1.numeric,
  method = "circle",
  main = "Correlation Matrix",
  type = "lower",
  tl.cex = 0.8
)