Introduction

This dataset contains 28 variables about 5,043 movies spanning 100 years and 66 countries. There are 2,399 unique director names and thousands of actors and actresses. I am going to perform exploratory data analysis on this dataset and look for meaningful insights.

Data Import & Cleaning

library(tidyverse)
library(stringr)

# Location of the raw IMDB movie metadata CSV.
url <- "https://raw.githubusercontent.com/nitishghosal/IMDB-Data-Analysis/master/movie_metadata.csv"

# Read the CSV with base read.csv and convert it to a tibble.
# - as_tibble() replaces the deprecated as_data_frame().
# - na.strings is spelled out in full instead of relying on partial matching
#   of `na=`. Only the literal string "NA" is treated as missing, so blank
#   text fields stay as "".
movie <- as_tibble(read.csv(url, stringsAsFactors = FALSE, na.strings = "NA"))

#movie <- as_data_frame(read.csv('tmdb_5000_movies.csv',stringsAsFactors = FALSE,na="NA"))

#credits <- as_data_frame(read.csv('tmdb_5000_credits.csv',stringsAsFactors = FALSE,na="NA"))

# Quick structural overview: one line per column with its type and first values.
glimpse(movie)
## Observations: 5,043
## Variables: 28
## $ color                     <chr> "Color", "Color", "Color", "Color", ...
## $ director_name             <chr> "James Cameron", "Gore Verbinski", "...
## $ num_critic_for_reviews    <int> 723, 302, 602, 813, NA, 462, 392, 32...
## $ duration                  <int> 178, 169, 148, 164, NA, 132, 156, 10...
## $ director_facebook_likes   <int> 0, 563, 0, 22000, 131, 475, 0, 15, 0...
## $ actor_3_facebook_likes    <int> 855, 1000, 161, 23000, NA, 530, 4000...
## $ actor_2_name              <chr> "Joel David Moore", "Orlando Bloom",...
## $ actor_1_facebook_likes    <int> 1000, 40000, 11000, 27000, 131, 640,...
## $ gross                     <int> 760505847, 309404152, 200074175, 448...
## $ genres                    <chr> "Action|Adventure|Fantasy|Sci-Fi", "...
## $ actor_1_name              <chr> "CCH Pounder", "Johnny Depp", "Chris...
## $ movie_title               <chr> "Avatar ", "Pirates of the Caribbea...
## $ num_voted_users           <int> 886204, 471220, 275868, 1144337, 8, ...
## $ cast_total_facebook_likes <int> 4834, 48350, 11700, 106759, 143, 187...
## $ actor_3_name              <chr> "Wes Studi", "Jack Davenport", "Step...
## $ facenumber_in_poster      <int> 0, 0, 1, 0, 0, 1, 0, 1, 4, 3, 0, 0, ...
## $ plot_keywords             <chr> "avatar|future|marine|native|paraple...
## $ movie_imdb_link           <chr> "http://www.imdb.com/title/tt0499549...
## $ num_user_for_reviews      <int> 3054, 1238, 994, 2701, NA, 738, 1902...
## $ language                  <chr> "English", "English", "English", "En...
## $ country                   <chr> "USA", "USA", "UK", "USA", "", "USA"...
## $ content_rating            <chr> "PG-13", "PG-13", "PG-13", "PG-13", ...
## $ budget                    <dbl> 237000000, 300000000, 245000000, 250...
## $ title_year                <int> 2009, 2007, 2015, 2012, NA, 2012, 20...
## $ actor_2_facebook_likes    <int> 936, 5000, 393, 23000, 12, 632, 1100...
## $ imdb_score                <dbl> 7.9, 7.1, 6.8, 8.5, 7.1, 6.6, 6.2, 7...
## $ aspect_ratio              <dbl> 1.78, 2.35, 2.35, 2.35, NA, 2.35, 2....
## $ movie_facebook_likes      <int> 33000, 0, 85000, 164000, 0, 24000, 0...
# Print per-column summary statistics for the movie data; for numeric columns
# this includes the NA count, which shows where values are missing.
summary(movie)
##     color           director_name      num_critic_for_reviews
##  Length:5043        Length:5043        Min.   :  1.0         
##  Class :character   Class :character   1st Qu.: 50.0         
##  Mode  :character   Mode  :character   Median :110.0         
##                                        Mean   :140.2         
##                                        3rd Qu.:195.0         
##                                        Max.   :813.0         
##                                        NA's   :50            
##     duration     director_facebook_likes actor_3_facebook_likes
##  Min.   :  7.0   Min.   :    0.0         Min.   :    0.0       
##  1st Qu.: 93.0   1st Qu.:    7.0         1st Qu.:  133.0       
##  Median :103.0   Median :   49.0         Median :  371.5       
##  Mean   :107.2   Mean   :  686.5         Mean   :  645.0       
##  3rd Qu.:118.0   3rd Qu.:  194.5         3rd Qu.:  636.0       
##  Max.   :511.0   Max.   :23000.0         Max.   :23000.0       
##  NA's   :15      NA's   :104             NA's   :23            
##  actor_2_name       actor_1_facebook_likes     gross          
##  Length:5043        Min.   :     0         Min.   :      162  
##  Class :character   1st Qu.:   614         1st Qu.:  5340988  
##  Mode  :character   Median :   988         Median : 25517500  
##                     Mean   :  6560         Mean   : 48468408  
##                     3rd Qu.: 11000         3rd Qu.: 62309438  
##                     Max.   :640000         Max.   :760505847  
##                     NA's   :7              NA's   :884        
##     genres          actor_1_name       movie_title       
##  Length:5043        Length:5043        Length:5043       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  num_voted_users   cast_total_facebook_likes actor_3_name      
##  Min.   :      5   Min.   :     0            Length:5043       
##  1st Qu.:   8594   1st Qu.:  1411            Class :character  
##  Median :  34359   Median :  3090            Mode  :character  
##  Mean   :  83668   Mean   :  9699                              
##  3rd Qu.:  96309   3rd Qu.: 13756                              
##  Max.   :1689764   Max.   :656730                              
##                                                                
##  facenumber_in_poster plot_keywords      movie_imdb_link   
##  Min.   : 0.000       Length:5043        Length:5043       
##  1st Qu.: 0.000       Class :character   Class :character  
##  Median : 1.000       Mode  :character   Mode  :character  
##  Mean   : 1.371                                            
##  3rd Qu.: 2.000                                            
##  Max.   :43.000                                            
##  NA's   :13                                                
##  num_user_for_reviews   language           country         
##  Min.   :   1.0       Length:5043        Length:5043       
##  1st Qu.:  65.0       Class :character   Class :character  
##  Median : 156.0       Mode  :character   Mode  :character  
##  Mean   : 272.8                                            
##  3rd Qu.: 326.0                                            
##  Max.   :5060.0                                            
##  NA's   :21                                                
##  content_rating         budget            title_year  
##  Length:5043        Min.   :2.180e+02   Min.   :1916  
##  Class :character   1st Qu.:6.000e+06   1st Qu.:1999  
##  Mode  :character   Median :2.000e+07   Median :2005  
##                     Mean   :3.975e+07   Mean   :2002  
##                     3rd Qu.:4.500e+07   3rd Qu.:2011  
##                     Max.   :1.222e+10   Max.   :2016  
##                     NA's   :492         NA's   :108   
##  actor_2_facebook_likes   imdb_score     aspect_ratio  
##  Min.   :     0         Min.   :1.600   Min.   : 1.18  
##  1st Qu.:   281         1st Qu.:5.800   1st Qu.: 1.85  
##  Median :   595         Median :6.600   Median : 2.35  
##  Mean   :  1652         Mean   :6.442   Mean   : 2.22  
##  3rd Qu.:   918         3rd Qu.:7.200   3rd Qu.: 2.35  
##  Max.   :137000         Max.   :9.500   Max.   :16.00  
##  NA's   :13                             NA's   :329    
##  movie_facebook_likes
##  Min.   :     0      
##  1st Qu.:     0      
##  Median :   166      
##  Mean   :  7526      
##  3rd Qu.:  3000      
##  Max.   :349000      
## 
# Data Cleaning

# Drop the final character of every title.
# NOTE(review): this assumes each raw title ends with exactly one unwanted
# trailing character (the glimpse output shows "Avatar " with a trailing
# character) -- confirm this holds for all rows.
movie$movie_title <- substr(movie$movie_title, 1, nchar(movie$movie_title) - 1)

# Turn the pipe-delimited genre list into a space-delimited string so it can be
# tokenised later. gsub() is already vectorised, so no sapply() wrapper is needed.
movie$genres_2 <- gsub("\\|", " ", movie$genres)

# Keep only the first row for each movie title.
movie <- movie[!duplicated(movie$movie_title), ]

# 1 when gross exceeds budget, 0 otherwise; NA when either value is missing.
movie$profit_flag <- as.factor(ifelse(movie$gross > movie$budget, 1, 0))

dim(movie)
## [1] 4917   30

Genre Analysis

library(tm)
library(dplyr)
library(ggplot2)
library(wordcloud)
# Count how often each genre term occurs across all movies by treating the
# space-separated genre string of each movie as a small text document.
genre <- Corpus(VectorSource(movie$genres_2))
genre_dtm <- DocumentTermMatrix(genre)
genre_freq <- colSums(as.matrix(genre_dtm))
# Reuse genre_freq rather than rebuilding the dense matrix a second time.
freq <- sort(genre_freq, decreasing = TRUE)
genre_wf <- data.frame(word = names(genre_freq), freq = genre_freq)

# Bar chart of genre frequencies, most frequent genre first.
ggplot(genre_wf, aes(x = reorder(word, -freq), y = freq)) +
  geom_bar(stat = "identity") +
  theme(
    axis.text.x = element_text(angle = 90),
    plot.title = element_text(color = "Black", face = "bold"),
    legend.position = "none"
  ) +
  ggtitle("Distribution of Movies by Genre") +
  xlab("Genre") +
  ylab("No of Movies")

# Word cloud of the same frequencies; the seed fixes the random word layout.
set.seed(1)
pal2 <- brewer.pal(8, "Dark2")
# wordcloud() has no `title` argument: extra arguments are forwarded to text(),
# so a title passed there is never drawn. Add it with title() instead.
wordcloud(genre_wf$word, genre_wf$freq,
          random.order = TRUE,
          rot.per = .15, colors = pal2, scale = c(4, .9))
title(main = "Sentiment Analysis of Movie Genre")

library(plotly)

# Donut chart of the share of each genre across all movies.
genres <- movie$genres

# Split every pipe-delimited genre string and flatten the pieces into a single
# character vector. strsplit() is vectorised, so this replaces the index loop
# that grew `test` one element at a time.
test <- unlist(strsplit(genres, "|", fixed = TRUE))

# Frequency table of genres; data.frame() yields columns `test` and `Freq`,
# which the plot formulas below refer to.
ttable <- table(test)
newtest <- data.frame(ttable)
plot_ly(newtest, labels = ~test, textinfo = "label+percent", values = ~Freq) %>%
  add_pie(hole = 0.6)

Top 10 by different categories

# Top 10 highest grossing movies
# Sort by gross (largest first), keep the first ten rows and draw one bar per
# title, with bars ordered by gross.
movie %>%
  drop_na(movie_title) %>%
  arrange(desc(gross)) %>%
  head(10) %>%
  ggplot(aes(x = reorder(movie_title, gross), y = gross, fill = movie_title)) +
  geom_col() +
  scale_y_continuous(labels = scales::comma) +
  xlab("") +
  ylab("Total Gross in USD") +
  ggtitle("Top 10 highest grossing movies") +
  theme(
    axis.text.x = element_text(angle = 90),
    plot.title = element_text(color = "Black", face = "bold"),
    legend.position = "none"
  )

# Bottom 10 grossing movies
# Rows without a title or gross value are dropped first; after sorting from
# highest to lowest gross, the last ten rows are the lowest earners.
movie %>%
  drop_na(movie_title, gross) %>%
  arrange(desc(gross)) %>%
  tail(10) %>%
  ggplot(aes(x = reorder(movie_title, gross), y = gross, fill = movie_title)) +
  geom_col() +
  scale_y_continuous(labels = scales::comma) +
  xlab("") +
  ylab("Total Gross in USD") +
  ggtitle("Top 10 lowest grossing movies") +
  theme(
    axis.text.x = element_text(angle = 90),
    plot.title = element_text(color = "Black", face = "bold"),
    legend.position = "none"
  )

# Top 10 most profitable movies
# Profit is gross minus budget; it is NA whenever either input is missing.
movie[["profit"]] <- movie[["gross"]] - movie[["budget"]]

# Keep rows with a title and a computable profit, then plot the ten largest.
movie %>%
  drop_na(movie_title, profit) %>%
  arrange(desc(profit)) %>%
  head(10) %>%
  ggplot(aes(x = reorder(movie_title, profit), y = profit, fill = movie_title)) +
  geom_col() +
  scale_y_continuous(labels = scales::comma) +
  xlab("") +
  ylab("Total Profit in USD") +
  ggtitle("Top 10 most profitable movies") +
  theme(
    axis.text.x = element_text(angle = 90),
    plot.title = element_text(color = "Black", face = "bold"),
    legend.position = "none"
  )

# Top 10 least profitable movies
# Loss is budget minus gross, i.e. the negated profit.
# NOTE(review): the summary output reports a maximum budget of about 1.2e10,
# which presumably means some budgets are not in USD -- confirm before reading
# these bars as dollar losses.
movie[["loss"]] <- movie[["budget"]] - movie[["gross"]]

# Keep rows with a title and a computable loss, then plot the ten largest.
movie %>%
  drop_na(movie_title, loss) %>%
  arrange(desc(loss)) %>%
  head(10) %>%
  ggplot(aes(x = reorder(movie_title, loss), y = loss, fill = movie_title)) +
  geom_col() +
  scale_y_continuous(labels = scales::comma) +
  xlab("") +
  ylab("Total Loss in USD") +
  ggtitle("Top 10 least profitable movies") +
  theme(
    axis.text.x = element_text(angle = 90),
    plot.title = element_text(color = "Black", face = "bold"),
    legend.position = "none"
  )

# Top 10 most popular movies
# Popularity here is the number of users who voted for the movie.
movie %>%
  drop_na(movie_title, num_voted_users) %>%
  arrange(desc(num_voted_users)) %>%
  head(10) %>%
  ggplot(aes(
    x = reorder(movie_title, num_voted_users),
    y = num_voted_users,
    fill = movie_title
  )) +
  geom_col() +
  scale_y_continuous(labels = scales::comma) +
  xlab("") +
  ylab("Total Number of User Votes") +
  ggtitle("Top 10 most popular movies") +
  theme(
    axis.text.x = element_text(angle = 90),
    plot.title = element_text(color = "Black", face = "bold"),
    legend.position = "none"
  )

# Relation between IMDB Score, Revenue & Budget
# Interactive 3D scatter: IMDB score on x, budget and gross (both divided by
# one million) on y and z, coloured by profit_flag. Hovering over a point shows
# the title together with the raw gross, budget and score.
movie %>%
  plot_ly(
    x = ~imdb_score,
    y = ~budget/1000000,
    z = ~gross/1000000,
    color = ~profit_flag,
    size = I(3),
    hoverinfo = "text",
    text = ~paste(
      "Movie: ", movie_title,
      "</br></br> Gross: ", gross,
      "</br> Budget: ", budget,
      "</br> IMDB Score: ", imdb_score
    )
  ) %>%
  add_markers() %>%
  layout(
    title = "IMDB Score vs Revenue vs Budget",
    showlegend = FALSE,
    scene = list(
      xaxis = list(title = "IMDB Score"),
      yaxis = list(title = "Budget"),
      zaxis = list(title = "Revenue")
    )
  )

Analysis By Country

# Top 10 countries by average profit per movie
# `num` is the number of distinct titles per country; the average ignores
# movies whose profit is missing. na.rm must be the logical TRUE, not the
# string "true".
movie %>%
  group_by(country) %>%
  summarise(
    num = n_distinct(movie_title),
    average_profit = mean(profit, na.rm = TRUE)
  ) %>%
  arrange(-average_profit) %>%
  head(10) %>%
  ggplot(aes(reorder(country, average_profit), average_profit, fill = country)) +
  geom_bar(stat = "identity") +
  theme(
    axis.text.x = element_text(angle = 90),
    plot.title = element_text(color = "Black", face = "bold"),
    legend.position = "none"
  ) +
  scale_y_continuous(labels = scales::comma) +
  xlab("") +
  ylab("Average Profit per Movie in USD") +
  ggtitle("Top countries by average profit per film")

# Top 10 countries by average IMDB rating per movie
# `num` is the number of distinct titles per country. na.rm must be the
# logical TRUE, not the string "true".
movie %>%
  group_by(country) %>%
  summarise(
    num = n_distinct(movie_title),
    average_rating = mean(imdb_score, na.rm = TRUE)
  ) %>%
  arrange(-average_rating) %>%
  head(10) %>%
  ggplot(aes(reorder(country, average_rating), average_rating, fill = country)) +
  geom_bar(stat = "identity") +
  theme(
    axis.text.x = element_text(angle = 90),
    plot.title = element_text(color = "Black", face = "bold"),
    legend.position = "none"
  ) +
  xlab("") +
  ylab("Average IMDB rating") +
  ggtitle("Top countries by average IMDB rating of movies")

# Top 10 countries by average budget per film
# `num` is the number of distinct titles per country; the average ignores
# movies whose budget is missing. na.rm must be the logical TRUE, not the
# string "true".
movie %>%
  group_by(country) %>%
  summarise(
    num = n_distinct(movie_title),
    average_budget = mean(budget, na.rm = TRUE)
  ) %>%
  arrange(-average_budget) %>%
  head(10) %>%
  ggplot(aes(reorder(country, average_budget), average_budget, fill = country)) +
  geom_bar(stat = "identity") +
  theme(
    axis.text.x = element_text(angle = 90),
    plot.title = element_text(color = "Black", face = "bold"),
    legend.position = "none"
  ) +
  scale_y_continuous(labels = scales::comma) +
  xlab("") +
  ylab("Average Budget per Movie in USD") +
  ggtitle("Top countries by average budget per film")

Best Directors & Actors

# Per-director rating summary: mean IMDB score, number of movies, standard
# deviation, and an interval of mean +/- 2 standard errors. Rows with a blank
# director name are excluded, matching how blank lead-actor names are handled.
general_table <- movie %>%
  filter(director_name != "") %>%
  group_by(director_name) %>%
  summarise(
    mean_imdb = mean(imdb_score, na.rm = TRUE),
    total_movies = n(),
    standard_dev = sd(imdb_score),
    lower_bound = mean_imdb - 2 * standard_dev / sqrt(total_movies),
    upper_bound = mean_imdb + 2 * standard_dev / sqrt(total_movies)
  ) %>%
  arrange(desc(mean_imdb))

total_movies_mean <- mean(general_table$total_movies)

# na.omit() removes directors with a single movie, because sd() of one value
# is NA; the 30 best remaining directors are kept.
director_final <- general_table %>% na.omit()
director_final <- director_final %>% slice(1:30)

# Order the factor levels by mean rating so the best director is plotted on top.
director_final$director_name <- factor(
  director_final$director_name,
  levels = director_final$director_name[order(director_final$mean_imdb)]
)

# theme_bw() is a complete theme and replaces earlier theme() settings, so it
# has to come before the axis-text tweak for that tweak to take effect.
ggplot(director_final, aes(x = mean_imdb, y = director_name)) +
  geom_point() +
  geom_segment(aes(x = lower_bound, xend = upper_bound,
                   y = director_name, yend = director_name)) +
  xlab("Mean IMDB Rating") +
  ylab("Director") +
  ggtitle("Best Directors by Movie Rating") +
  theme_bw() +
  theme(axis.text = element_text(size = 8))

# Per-lead-actor rating summary: mean IMDB score, number of movies, standard
# deviation, and an interval of mean +/- 2 standard errors.
lead_actor_table <- movie %>%
  group_by(actor_1_name) %>%
  summarise(
    mean_imdb = mean(imdb_score, na.rm = TRUE),
    total_movies = n(),
    standard_dev = sd(imdb_score),
    lower_bound = mean_imdb - 2 * standard_dev / sqrt(total_movies),
    upper_bound = mean_imdb + 2 * standard_dev / sqrt(total_movies)
  ) %>%
  arrange(desc(mean_imdb))

# Drop the group of movies that have no lead actor recorded.
lead_actor_table <- lead_actor_table %>% filter(actor_1_name != "")

actor_mean_movies <- mean(lead_actor_table$total_movies)

# Require at least three movies so a single highly rated film cannot put an
# actor at the top of the ranking.
lead_actor_table <- lead_actor_table %>% filter(total_movies >= 3)

top_30_actors <- lead_actor_table %>% slice(1:30)

# Order the factor levels by mean rating so the best actor is plotted on top.
top_30_actors$actor_1_name <- factor(
  top_30_actors$actor_1_name,
  levels = top_30_actors$actor_1_name[order(top_30_actors$mean_imdb)]
)

# theme_bw() is a complete theme and replaces earlier theme() settings, so it
# has to come before the axis-text tweak for that tweak to take effect.
ggplot(top_30_actors, aes(x = mean_imdb, y = actor_1_name)) +
  geom_point() +
  geom_segment(aes(x = lower_bound, xend = upper_bound,
                   y = actor_1_name, yend = actor_1_name)) +
  xlab("Mean Movie Rating") +
  ylab("Lead Actor") +
  ggtitle("Best Actors by IMDB Movie Rating") +
  theme_bw() +
  theme(axis.text = element_text(size = 8))

Correlation Matrix for important variables

library(corrgram)

# Select the numeric variables of interest and draw their correlation matrix.
corrgram_data <- movie %>%
  dplyr::select(
    duration, num_critic_for_reviews, gross, num_voted_users,
    num_user_for_reviews, budget, title_year, imdb_score,
    movie_facebook_likes
  )

# TRUE rather than T: T is an ordinary variable that can be reassigned.
corrgram(corrgram_data, legend = TRUE)

k-Means Clustering

Apply k-means clustering to assign movies into 5 classes.

library(ggplot2)
library(rpart)
library(rattle)
library(rpart.plot)
library(RColorBrewer)
library(caret)
# Load the clustering data set and keep only complete rows.
mydata <- read.csv("movies.csv")
mydata <- na.omit(mydata)
set.seed(123)
# Generate movie clusters and plot.
# NOTE(review): columns 9:10 are presumably gross and imdb_score (the two
# plotted axes) -- confirm against movies.csv; selecting them by name would be
# safer. The columns are not scaled before clustering.
# mydata has no missing rows at this point, so no second na.omit() is needed.
mydataCluster <- kmeans(mydata[, 9:10], centers = 5, nstart = 20)
mydata$cluster <- as.factor(mydataCluster$cluster)
# Refer to the column by bare name inside aes() rather than via mydata$...,
# which bypasses the plot's data argument.
ggplot(mydata, aes(gross, imdb_score, color = cluster)) +
  geom_point() +
  scale_colour_manual(values = c("green", "blue", "orange", "purple", "red")) +
  xlab("Gross") +
  ylab("IMDB Score")

The movies are grouped into 5 clusters, as shown in the plot. The majority of movies fall in cluster 3, which has low box-office gross and high variation in IMDb scores, whereas movies in cluster 1 have relatively higher gross and scores than the other clusters.