Let’s load libraries.
library(ggplot2)
library(dplyr)
library(scales)
library(ggthemes)
library(VIM)
library(stringr)
library(ggrepel)
library(plotly)
library(corrplot)
library(GGally)
I want to disable scientific notation - I am not comfortable with it.
# A large positive scipen penalty biases printing towards fixed notation
# instead of exponential (scientific) notation.
options(scipen = 999)
Structure of the given data. We have 5043 observations and 28 variables.
# Compact overview of data1: one line per column with its type and first values.
str(data1)
## 'data.frame': 5043 obs. of 28 variables:
## $ color : chr "Color" "Color" "Color" "Color" ...
## $ director.name : chr "James Cameron" "Gore Verbinski" "Sam Mendes" "Christopher Nolan" ...
## $ num.critic.review : int 723 302 602 813 NA 462 392 324 635 375 ...
## $ duration : int 178 169 148 164 NA 132 156 100 141 153 ...
## $ director.fbook.likes : int 0 563 0 22000 131 475 0 15 0 282 ...
## $ actor.fbook.likes : int 855 1000 161 23000 NA 530 4000 284 19000 10000 ...
## $ actor2.name : chr "Joel David Moore" "Orlando Bloom" "Rory Kinnear" "Christian Bale" ...
## $ actor1.fbook.likes : int 1000 40000 11000 27000 131 640 24000 799 26000 25000 ...
## $ gross : int 760505847 309404152 200074175 448130642 NA 73058679 336530303 200807262 458991599 301956980 ...
## $ genres : chr "Action|Adventure|Fantasy|Sci-Fi" "Action|Adventure|Fantasy" "Action|Adventure|Thriller" "Action|Thriller" ...
## $ actor1.name : chr "CCH Pounder" "Johnny Depp" "Christoph Waltz" "Tom Hardy" ...
## $ movie.title : chr "Avatar " "Pirates of the Caribbean: At World's End " "Spectre " "The Dark Knight Rises " ...
## $ users.vote : int 886204 471220 275868 1144337 8 212204 383056 294810 462669 321795 ...
## $ cast.tot.fbook.likes : int 4834 48350 11700 106759 143 1873 46055 2036 92000 58753 ...
## $ actor3.names : chr "Wes Studi" "Jack Davenport" "Stephanie Sigman" "Joseph Gordon-Levitt" ...
## $ faces.in.poster : int 0 0 1 0 0 1 0 1 4 3 ...
## $ plot.keywords : chr "avatar|future|marine|native|paraplegic" "goddess|marriage ceremony|marriage proposal|pirate|singapore" "bomb|espionage|sequel|spy|terrorist" "deception|imprisonment|lawlessness|police officer|terrorist plot" ...
## $ imdb.link : chr "http://www.imdb.com/title/tt0499549/?ref_=fn_tt_tt_1" "http://www.imdb.com/title/tt0449088/?ref_=fn_tt_tt_1" "http://www.imdb.com/title/tt2379713/?ref_=fn_tt_tt_1" "http://www.imdb.com/title/tt1345836/?ref_=fn_tt_tt_1" ...
## $ num.of.users.for.review: int 3054 1238 994 2701 NA 738 1902 387 1117 973 ...
## $ language : chr "English" "English" "English" "English" ...
## $ country : chr "USA" "USA" "UK" "USA" ...
## $ content.rating : chr "PG-13" "PG-13" "PG-13" "PG-13" ...
## $ budget : num 237000000 300000000 245000000 250000000 NA ...
## $ title.year : int 2009 2007 2015 2012 NA 2012 2007 2010 2015 2009 ...
## $ actor.fbookl.likes : int 936 5000 393 23000 12 632 11000 553 21000 11000 ...
## $ imdb.score : num 7.9 7.1 6.8 8.5 7.1 6.6 6.2 7.8 7.5 7.5 ...
## $ aspect.ratio : num 1.78 2.35 2.35 2.35 NA 2.35 2.35 1.85 2.35 2.35 ...
## $ movie.fbook.likes : int 33000 0 85000 164000 0 24000 0 29000 118000 10000 ...
# Per-column summary: quantiles, mean and NA count for numeric columns,
# length/class/mode for character columns.
summary(data1)
## color director.name num.critic.review duration
## Length:5043 Length:5043 Min. : 1.0 Min. : 7.0
## Class :character Class :character 1st Qu.: 50.0 1st Qu.: 93.0
## Mode :character Mode :character Median :110.0 Median :103.0
## Mean :140.2 Mean :107.2
## 3rd Qu.:195.0 3rd Qu.:118.0
## Max. :813.0 Max. :511.0
## NA's :50 NA's :15
## director.fbook.likes actor.fbook.likes actor2.name
## Min. : 0.0 Min. : 0.0 Length:5043
## 1st Qu.: 7.0 1st Qu.: 133.0 Class :character
## Median : 49.0 Median : 371.5 Mode :character
## Mean : 686.5 Mean : 645.0
## 3rd Qu.: 194.5 3rd Qu.: 636.0
## Max. :23000.0 Max. :23000.0
## NA's :104 NA's :23
## actor1.fbook.likes gross genres
## Min. : 0 Min. : 162 Length:5043
## 1st Qu.: 614 1st Qu.: 5340988 Class :character
## Median : 988 Median : 25517500 Mode :character
## Mean : 6560 Mean : 48468408
## 3rd Qu.: 11000 3rd Qu.: 62309438
## Max. :640000 Max. :760505847
## NA's :7 NA's :884
## actor1.name movie.title users.vote
## Length:5043 Length:5043 Min. : 5
## Class :character Class :character 1st Qu.: 8594
## Mode :character Mode :character Median : 34359
## Mean : 83668
## 3rd Qu.: 96309
## Max. :1689764
##
## cast.tot.fbook.likes actor3.names faces.in.poster
## Min. : 0 Length:5043 Min. : 0.000
## 1st Qu.: 1411 Class :character 1st Qu.: 0.000
## Median : 3090 Mode :character Median : 1.000
## Mean : 9699 Mean : 1.371
## 3rd Qu.: 13756 3rd Qu.: 2.000
## Max. :656730 Max. :43.000
## NA's :13
## plot.keywords imdb.link num.of.users.for.review
## Length:5043 Length:5043 Min. : 1.0
## Class :character Class :character 1st Qu.: 65.0
## Mode :character Mode :character Median : 156.0
## Mean : 272.8
## 3rd Qu.: 326.0
## Max. :5060.0
## NA's :21
## language country content.rating
## Length:5043 Length:5043 Length:5043
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## budget title.year actor.fbookl.likes imdb.score
## Min. : 218 Min. :1916 Min. : 0 Min. :1.600
## 1st Qu.: 6000000 1st Qu.:1999 1st Qu.: 281 1st Qu.:5.800
## Median : 20000000 Median :2005 Median : 595 Median :6.600
## Mean : 39752620 Mean :2002 Mean : 1652 Mean :6.442
## 3rd Qu.: 45000000 3rd Qu.:2011 3rd Qu.: 918 3rd Qu.:7.200
## Max. :12215500000 Max. :2016 Max. :137000 Max. :9.500
## NA's :492 NA's :108 NA's :13
## aspect.ratio movie.fbook.likes
## Min. : 1.18 Min. : 0
## 1st Qu.: 1.85 1st Qu.: 0
## Median : 2.35 Median : 166
## Mean : 2.22 Mean : 7526
## 3rd Qu.: 2.35 3rd Qu.: 3000
## Max. :16.00 Max. :349000
## NA's :329
The dataset contains 2698 NAs.
# Number of missing values in each column.
# is.na() on a data frame already returns a logical matrix with one column per
# variable, so no sapply() is needed (and the result type no longer depends on
# how sapply() happens to simplify its output).
colSums(is.na(data1))
## color director.name num.critic.review
## 19 104 50
## duration director.fbook.likes actor.fbook.likes
## 15 104 23
## actor2.name actor1.fbook.likes gross
## 13 7 884
## genres actor1.name movie.title
## 0 7 0
## users.vote cast.tot.fbook.likes actor3.names
## 0 0 23
## faces.in.poster plot.keywords imdb.link
## 13 153 0
## num.of.users.for.review language country
## 21 12 5
## content.rating budget title.year
## 303 492 108
## actor.fbookl.likes imdb.score aspect.ratio
## 13 0 329
## movie.fbook.likes
## 0
# Total number of missing cells across the whole data frame.
sum(is.na(data1))
## [1] 2698
I really want to see how NAs are distributed in the dataset.
# Plot and tabulate the share of missing values per variable (sorted) and per
# combination of variables. The proportions printed below come from this call.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can be
# reassigned.
missing.values <- aggr(
  data1,
  sortVars = TRUE,
  prop = TRUE,
  sortCombs = TRUE,
  combined = FALSE,
  cex.lab = 1.5,
  cex.axis = 0.6,
  cex.numbers = 5,
  gap = -0.2
)
##
## Variables sorted by number of missings:
## Variable Count
## gross 0.1752924846
## budget 0.0975609756
## aspect.ratio 0.0652389451
## content.rating 0.0600832838
## plot.keywords 0.0303390839
## title.year 0.0214158239
## director.name 0.0206226453
## director.fbook.likes 0.0206226453
## num.critic.review 0.0099147333
## actor.fbook.likes 0.0045607773
## actor3.names 0.0045607773
## num.of.users.for.review 0.0041641880
## color 0.0037675987
## duration 0.0029744200
## actor2.name 0.0025778307
## faces.in.poster 0.0025778307
## actor.fbookl.likes 0.0025778307
## language 0.0023795360
## actor1.fbook.likes 0.0013880627
## actor1.name 0.0013880627
## country 0.0009914733
## genres 0.0000000000
## movie.title 0.0000000000
## users.vote 0.0000000000
## cast.tot.fbook.likes 0.0000000000
## imdb.link 0.0000000000
## imdb.score 0.0000000000
## movie.fbook.likes 0.0000000000
We can see that most of the NAs come from gross, budget and aspect ratio (roughly 18%, 10% and 7% of rows, respectively). We can also see that only a few variables are fully observed (7 of 28); imdb.score is one of them. I am going to leave the NAs as they are and deal with them as I progress.
Movie production just exploded after year 1990. It could be due to advancement in technology and commercialisation of internet.
# One bar per release year; bar height is the number of titles in that year.
ggplot(data = data1, mapping = aes(x = title.year)) +
  geom_bar() +
  labs(
    title = "Histogram of Movie released",
    x = "Year movie was released",
    y = "Movie Count"
  )
We are given Revenue & Cost of movie production therefore, we can easily find Profit.
# Profit is gross revenue minus budget; it is NA whenever either input is NA.
data1 <- mutate(data1, Profit = gross - budget)

# Scatter of profit against budget (both in millions) for profitable titles
# only, with a fitted linear trend.
ggplot(
  data = filter(data1, Profit > 0),
  mapping = aes(x = budget / 1e6, y = Profit / 1e6)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    title = "Profit vs Budget",
    x = "Budget $M",
    y = "Profit $ M"
  )
Let’s transform the scales into Log10
# Same profit/budget scatter for profitable titles, drawn on log10 axes with a
# linear fit computed on the transformed scales.
ggplot(
  data = filter(data1, Profit > 0),
  mapping = aes(x = budget / 1e6, y = Profit / 1e6)
) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10() +
  geom_smooth(method = "lm")
I just found out from one of the Kagglers that revenue and cost for international movies are recorded in local currency, which means the Profit vs Budget plots above mix currencies and are not reliable.
Regardless of that let’s find top 10 profitable movies from the dataset.
# The ten most profitable titles: budget vs profit in millions, coloured by
# director and labelled with the movie title (labels nudged upwards by 5).
data1 %>%
  filter(Profit > 0) %>%
  top_n(n = 10, wt = Profit) %>%
  arrange(Profit) %>%
  ggplot(
    mapping = aes(
      x = budget / 1e6,
      y = Profit / 1e6,
      colour = factor(director.name)
    )
  ) +
  geom_point() +
  geom_text_repel(mapping = aes(label = movie.title), nudge_x = 0, nudge_y = 5) +
  labs(
    title = "Top 10 Profitable Movies",
    x = "Budget $M",
    y = "Profit $ M"
  )
There’s duplication of rank 6 in the dataset. Let’s find and remove it.
# Row positions whose Profit equals this exact value; two hits indicate the
# same movie is listed twice.
which(data1$Profit==403279547)
## [1] 18 795
# Drop the first of the two matching rows.
# NOTE(review): the index is hard-coded, so re-running this line removes
# whichever row is 18th at that point - confirm before re-executing.
data1<-data1[-18,]
We can also see a stray "Â" character attached to the end of some movie titles. Not sure what it is (it looks like a text-encoding artifact), but I don’t think it should be there. Let’s remove it along with any leading or trailing blank spaces.
# Inspect the first ten titles to see the trailing characters.
head(data1$movie.title,10)
## [1] "Avatar "
## [2] "Pirates of the Caribbean: At World's End "
## [3] "Spectre "
## [4] "The Dark Knight Rises "
## [5] "Star Wars: Episode VII - The Force Awakens "
## [6] "John Carter "
## [7] "Spider-Man 3Â "
## [8] "Tangled "
## [9] "Avengers: Age of Ultron "
## [10] "Harry Potter and the Half-Blood Prince "
# Remove the stray "Â" character from every title, then trim whitespace at both
# ends. gsub() is vectorised, so it is applied to the whole column at once;
# wrapping it in sapply() over a character vector works element by element and
# attaches the original strings as names (USE.NAMES) to the result.
data1$movie.title <- gsub("Â", "", data1$movie.title)
data1$movie.title <- str_trim(data1$movie.title, side = "both")
Now, let’s plot it again
# Redraw the ten most profitable titles: budget vs profit in millions,
# coloured by director, with the movie title as a repelled text label.
data1 %>%
  filter(Profit > 0) %>%
  top_n(n = 10, wt = Profit) %>%
  arrange(Profit) %>%
  ggplot(
    mapping = aes(
      x = budget / 1e6,
      y = Profit / 1e6,
      colour = factor(director.name)
    )
  ) +
  geom_point() +
  geom_text_repel(mapping = aes(label = movie.title), nudge_x = 0, nudge_y = 5) +
  labs(
    title = "Top 10 Profitable Movies",
    x = "Budget $M",
    y = "Profit $ M"
  )
From the plot it seems that all top 10 movies were produced in the USA. Let’s verify it.
# Keep the ten most profitable titles (ascending by profit) as a table, then
# list each one's director and country of production.
top10movies <- data1 %>%
  filter(Profit > 0) %>%
  top_n(n = 10, wt = Profit) %>%
  arrange(Profit)
head(data.frame(top10movies$director.name, top10movies$country), n = 10)
## top10movies.director.name top10movies.country
## 1 Gary Ross USA
## 2 Christopher Nolan USA
## 3 George Lucas USA
## 4 Roger Allers USA
## 5 Joss Whedon USA
## 6 Steven Spielberg USA
## 7 George Lucas USA
## 8 James Cameron USA
## 9 Colin Trevorrow USA
## 10 James Cameron USA
Since, all Top 10 profitable movies were made in USA we don’t have problem of exchange rate.
Let’s plot directors vs average IMDB ratings. The IMDB ratings are grouped by director and averaged. The plot only displays directors who have made more than 5 movies.
# Film count and mean IMDB score per director, restricted to directors with
# more than 5 films and sorted from highest to lowest average score.
# na.omit() drops the summary row for films whose director name is missing.
directors <- data1 %>%
  select(director.name, imdb.score) %>%
  group_by(director.name) %>%
  summarise(
    Tot.films = n(),
    avg.imdb.score = mean(imdb.score, na.rm = TRUE)
  ) %>%
  filter(Tot.films > 5) %>%
  na.omit() %>%
  arrange(desc(avg.imdb.score))

# `text` is not a geom_point aesthetic (ggplot2 warns about it); it is supplied
# so that ggplotly() can show the director name in the hover tooltip.
p <- ggplot(directors, aes(x = avg.imdb.score, y = Tot.films)) +
  geom_point(aes(text = director.name)) +
  labs(
    x = "Averaged IMDB Score",
    y = "Total films made by Directors",
    title = "Directors vs average IMDB ratings"
  )
ggplotly(p)
Let’s plot Actor vs average IMDB rating with standard error as well.
# Mean IMDB score and its standard error per lead actor, keeping actors with
# more than 20 scored films, sorted from highest to lowest mean.
# Rows missing the actor name or the score are removed once, up front, so the
# summaries below need no further NA handling.
rating.actors <- data1 %>%
  select(actor1.name, imdb.score) %>%
  filter(!is.na(actor1.name), !is.na(imdb.score)) %>%
  group_by(actor1.name) %>%
  summarise(
    M = mean(imdb.score),
    SE = sd(imdb.score) / sqrt(n()),
    Length = n()
  ) %>%
  filter(Length > 20) %>%
  arrange(desc(M))

# Turn the actor name into a factor whose levels are ordered by mean score, so
# a plot axis built from it is sorted by rating rather than alphabetically.
rating.actors$actor1.name <- reorder(
  factor(rating.actors$actor1.name),
  rating.actors$M
)
The plot displays the average IMDB rating of actors who have appeared in more than 20 movies.
# Mean IMDB score per actor with error bars of one standard error on each side,
# drawn horizontally (axes flipped) so actor names are readable.
# NOTE(review): the x variable is a factor, so geom_line() sees one point per
# group - confirm whether a connecting line is actually intended here.
ggplot(data = rating.actors, mapping = aes(x = actor1.name, y = M)) +
  geom_errorbar(mapping = aes(ymin = M - SE, ymax = M + SE)) +
  geom_line() +
  geom_point(size = 3, shape = 21, fill = "white") +
  coord_flip() +
  labs(
    title = "Actors vs average IMDB rating",
    x = "Actors",
    y = "Averaged Mean"
  )
Lastly, we’ll plot correlation matrix.
Let’s select all numeric variables.
# Correlation matrix of all numeric columns in data1, drawn as a lower triangle.
# Many of these columns contain NAs, and cor() defaults to use = "everything",
# which returns NA for every pair involving such a column. Using
# pairwise-complete observations keeps each coefficient defined.
# vapply() guarantees a logical selector regardless of the input's shape.
numeric <- vapply(data1, is.numeric, logical(1))
data1.numeric <- data1[, numeric, drop = FALSE]
data1.numeric <- cor(data1.numeric, use = "pairwise.complete.obs")
corrplot(
  data1.numeric,
  method = "circle",
  main = "Correlation Matrix",
  type = "lower",
  tl.cex = 0.8
)