This dataset contains 28 variables about 5,043 movies spanning 100 years and 66 countries. There are 2,399 unique director names and thousands of actors/actresses. I am going to perform exploratory data analysis on this dataset to find some meaningful insights.
library(tidyverse)
library(stringr)
# Load data ----
# Download the IMDB movie metadata CSV and hold it as a tibble.
# Strings are kept as character columns and only the literal "NA" is treated
# as missing, so blank character fields stay as "" (see `country` in the
# glimpse output below).
url <- "https://raw.githubusercontent.com/nitishghosal/IMDB-Data-Analysis/master/movie_metadata.csv"
# as_tibble() replaces the deprecated as_data_frame(); na.strings is spelled
# out in full instead of relying on partial matching of `na`.
movie <- as_tibble(read.csv(url, stringsAsFactors = FALSE, na.strings = "NA"))
#movie <- as_data_frame(read.csv('tmdb_5000_movies.csv',stringsAsFactors = FALSE,na="NA"))
#credits <- as_data_frame(read.csv('tmdb_5000_credits.csv',stringsAsFactors = FALSE,na="NA"))
# Structural overview: one line per column with its type and first values.
glimpse(movie)
## Observations: 5,043
## Variables: 28
## $ color <chr> "Color", "Color", "Color", "Color", ...
## $ director_name <chr> "James Cameron", "Gore Verbinski", "...
## $ num_critic_for_reviews <int> 723, 302, 602, 813, NA, 462, 392, 32...
## $ duration <int> 178, 169, 148, 164, NA, 132, 156, 10...
## $ director_facebook_likes <int> 0, 563, 0, 22000, 131, 475, 0, 15, 0...
## $ actor_3_facebook_likes <int> 855, 1000, 161, 23000, NA, 530, 4000...
## $ actor_2_name <chr> "Joel David Moore", "Orlando Bloom",...
## $ actor_1_facebook_likes <int> 1000, 40000, 11000, 27000, 131, 640,...
## $ gross <int> 760505847, 309404152, 200074175, 448...
## $ genres <chr> "Action|Adventure|Fantasy|Sci-Fi", "...
## $ actor_1_name <chr> "CCH Pounder", "Johnny Depp", "Chris...
## $ movie_title <chr> "Avatar ", "Pirates of the Caribbea...
## $ num_voted_users <int> 886204, 471220, 275868, 1144337, 8, ...
## $ cast_total_facebook_likes <int> 4834, 48350, 11700, 106759, 143, 187...
## $ actor_3_name <chr> "Wes Studi", "Jack Davenport", "Step...
## $ facenumber_in_poster <int> 0, 0, 1, 0, 0, 1, 0, 1, 4, 3, 0, 0, ...
## $ plot_keywords <chr> "avatar|future|marine|native|paraple...
## $ movie_imdb_link <chr> "http://www.imdb.com/title/tt0499549...
## $ num_user_for_reviews <int> 3054, 1238, 994, 2701, NA, 738, 1902...
## $ language <chr> "English", "English", "English", "En...
## $ country <chr> "USA", "USA", "UK", "USA", "", "USA"...
## $ content_rating <chr> "PG-13", "PG-13", "PG-13", "PG-13", ...
## $ budget <dbl> 237000000, 300000000, 245000000, 250...
## $ title_year <int> 2009, 2007, 2015, 2012, NA, 2012, 20...
## $ actor_2_facebook_likes <int> 936, 5000, 393, 23000, 12, 632, 1100...
## $ imdb_score <dbl> 7.9, 7.1, 6.8, 8.5, 7.1, 6.6, 6.2, 7...
## $ aspect_ratio <dbl> 1.78, 2.35, 2.35, 2.35, NA, 2.35, 2....
## $ movie_facebook_likes <int> 33000, 0, 85000, 164000, 0, 24000, 0...
# Per-column summary: quartiles, mean and NA counts for numeric columns;
# length, class and mode for character columns.
summary(movie)
## color director_name num_critic_for_reviews
## Length:5043 Length:5043 Min. : 1.0
## Class :character Class :character 1st Qu.: 50.0
## Mode :character Mode :character Median :110.0
## Mean :140.2
## 3rd Qu.:195.0
## Max. :813.0
## NA's :50
## duration director_facebook_likes actor_3_facebook_likes
## Min. : 7.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 93.0 1st Qu.: 7.0 1st Qu.: 133.0
## Median :103.0 Median : 49.0 Median : 371.5
## Mean :107.2 Mean : 686.5 Mean : 645.0
## 3rd Qu.:118.0 3rd Qu.: 194.5 3rd Qu.: 636.0
## Max. :511.0 Max. :23000.0 Max. :23000.0
## NA's :15 NA's :104 NA's :23
## actor_2_name actor_1_facebook_likes gross
## Length:5043 Min. : 0 Min. : 162
## Class :character 1st Qu.: 614 1st Qu.: 5340988
## Mode :character Median : 988 Median : 25517500
## Mean : 6560 Mean : 48468408
## 3rd Qu.: 11000 3rd Qu.: 62309438
## Max. :640000 Max. :760505847
## NA's :7 NA's :884
## genres actor_1_name movie_title
## Length:5043 Length:5043 Length:5043
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## num_voted_users cast_total_facebook_likes actor_3_name
## Min. : 5 Min. : 0 Length:5043
## 1st Qu.: 8594 1st Qu.: 1411 Class :character
## Median : 34359 Median : 3090 Mode :character
## Mean : 83668 Mean : 9699
## 3rd Qu.: 96309 3rd Qu.: 13756
## Max. :1689764 Max. :656730
##
## facenumber_in_poster plot_keywords movie_imdb_link
## Min. : 0.000 Length:5043 Length:5043
## 1st Qu.: 0.000 Class :character Class :character
## Median : 1.000 Mode :character Mode :character
## Mean : 1.371
## 3rd Qu.: 2.000
## Max. :43.000
## NA's :13
## num_user_for_reviews language country
## Min. : 1.0 Length:5043 Length:5043
## 1st Qu.: 65.0 Class :character Class :character
## Median : 156.0 Mode :character Mode :character
## Mean : 272.8
## 3rd Qu.: 326.0
## Max. :5060.0
## NA's :21
## content_rating budget title_year
## Length:5043 Min. :2.180e+02 Min. :1916
## Class :character 1st Qu.:6.000e+06 1st Qu.:1999
## Mode :character Median :2.000e+07 Median :2005
## Mean :3.975e+07 Mean :2002
## 3rd Qu.:4.500e+07 3rd Qu.:2011
## Max. :1.222e+10 Max. :2016
## NA's :492 NA's :108
## actor_2_facebook_likes imdb_score aspect_ratio
## Min. : 0 Min. :1.600 Min. : 1.18
## 1st Qu.: 281 1st Qu.:5.800 1st Qu.: 1.85
## Median : 595 Median :6.600 Median : 2.35
## Mean : 1652 Mean :6.442 Mean : 2.22
## 3rd Qu.: 918 3rd Qu.:7.200 3rd Qu.: 2.35
## Max. :137000 Max. :9.500 Max. :16.00
## NA's :13 NA's :329
## movie_facebook_likes
## Min. : 0
## 1st Qu.: 0
## Median : 166
## Mean : 7526
## 3rd Qu.: 3000
## Max. :349000
##
# Data cleaning ----
# Drop the final character of every title; the raw titles carry a trailing
# padding character (e.g. "Avatar " in the glimpse output).
# NOTE(review): the last character is removed unconditionally - confirm that
# every title in the file really ends with that padding character.
movie$movie_title <- substr(movie$movie_title, 1, nchar(movie$movie_title) - 1)
# Space-separated copy of the pipe-delimited genre string, used further down
# to build the genre corpus. gsub() is already vectorised, so it is applied to
# the whole column directly; the previous sapply() wrapper made one call per
# row and attached the raw genre strings as element names.
movie$genres_2 <- gsub("|", " ", movie$genres, fixed = TRUE)
# Keep only the first row for each (trimmed) title.
movie <- movie[!duplicated(movie$movie_title), ]
# Factor flag: 1 when gross exceeds budget, 0 otherwise, NA when either value
# is missing.
movie$profit_flag <- as.factor(ifelse(movie$gross > movie$budget, 1, 0))
dim(movie)
## [1] 4917 30
library(tm)
library(dplyr)
library(ggplot2)
library(wordcloud)
# Genre frequencies ----
# Treat each movie's space-separated genre string as one document and count
# how many documents contain each genre term.
genre <- Corpus(VectorSource(movie$genres_2))
genre_dtm <- DocumentTermMatrix(genre)
genre_freq <- colSums(as.matrix(genre_dtm))
# Sorted copy of the same totals; reuses genre_freq rather than converting the
# document-term matrix to a dense matrix a second time.
freq <- sort(genre_freq, decreasing = TRUE)
genre_wf <- data.frame(word = names(genre_freq), freq = genre_freq)
# Bar chart of genre counts, most frequent genre first.
ggplot(genre_wf, aes(x = reorder(word, -freq), y = freq)) +
  geom_bar(stat = "identity") +
  theme(
    axis.text.x = element_text(angle = 90),
    plot.title = element_text(color = "Black", face = "bold"),
    legend.position = "none"
  ) +
  ggtitle("Distribution of Movies by Genre") +
  xlab("Genre") +
  ylab("No of Movies")
# Word cloud of genre frequencies ----
# Fixed seed so the random word placement is reproducible.
set.seed(1)
pal2 <- brewer.pal(8, "Dark2")
# wordcloud() has no `title` parameter; per its documentation, extra arguments
# are passed on to the text-drawing calls, so the heading was never rendered.
# The heading is therefore added with title() once the cloud has been drawn.
wordcloud(
  genre_wf$word, genre_wf$freq,
  random.order = TRUE,
  rot.per = .15,
  colors = pal2,
  scale = c(4, .9)
)
title(main = "Sentiment Analysis of Movie Genre")
library(plotly)
# Genre share donut chart ----
genres <- movie$genres
# One element per (movie, genre) pair. strsplit() is vectorised, so the whole
# column is split in one call and flattened, instead of growing a vector with
# c() inside a 1:length() loop (quadratic copying, and an error on empty
# input). The vector has to be called `test`: table() names its dimension
# after the variable, and the plot formula below refers to `~test`.
test <- unlist(strsplit(genres, "|", fixed = TRUE))
ttable <- table(test)
# Two columns: `test` (genre label) and `Freq` (number of movies).
newtest <- data.frame(ttable)
plot_ly(newtest, labels = ~test, textinfo = "label+percent", values = ~Freq) %>%
  add_pie(hole = 0.6)
# Top 10 highest grossing movies ----
# Rows without a title are dropped, the rest are ordered by descending gross
# and the first ten are drawn as bars ordered by gross.
movie %>%
  drop_na(movie_title) %>%
  arrange(desc(gross)) %>%
  head(10) %>%
  ggplot(aes(x = reorder(movie_title, gross), y = gross, fill = movie_title)) +
  geom_col() +
  scale_y_continuous(labels = scales::comma) +
  labs(
    x = "",
    y = "Total Gross in USD",
    title = "Top 10 highest grossing movies"
  ) +
  theme(
    axis.text.x = element_text(angle = 90),
    plot.title = element_text(color = "Black", face = "bold"),
    legend.position = "none"
  )
# Bottom 10 grossing movies ----
# Rows missing a title or a gross figure are removed first, so the last ten
# rows of the descending sort are the ten smallest known gross values.
movie %>%
  drop_na(movie_title, gross) %>%
  arrange(desc(gross)) %>%
  tail(10) %>%
  ggplot(aes(x = reorder(movie_title, gross), y = gross, fill = movie_title)) +
  geom_col() +
  scale_y_continuous(labels = scales::comma) +
  labs(
    x = "",
    y = "Total Gross in USD",
    title = "Top 10 lowest grossing movies"
  ) +
  theme(
    axis.text.x = element_text(angle = 90),
    plot.title = element_text(color = "Black", face = "bold"),
    legend.position = "none"
  )
# Top 10 most profitable movies ----
# profit = gross - budget, stored as a new column on `movie`.
movie <- movie %>%
  mutate(profit = gross - budget)
# Ten largest profits among rows that have both a title and a profit value.
movie %>%
  drop_na(movie_title, profit) %>%
  arrange(desc(profit)) %>%
  head(10) %>%
  ggplot(aes(x = reorder(movie_title, profit), y = profit, fill = movie_title)) +
  geom_col() +
  scale_y_continuous(labels = scales::comma) +
  labs(
    x = "",
    y = "Total Profit in USD",
    title = "Top 10 most profitable movies"
  ) +
  theme(
    axis.text.x = element_text(angle = 90),
    plot.title = element_text(color = "Black", face = "bold"),
    legend.position = "none"
  )
# Top 10 least profitable movies ----
# loss = budget - gross, stored as a new column on `movie`.
movie <- movie %>%
  mutate(loss = budget - gross)
# Ten largest losses among rows that have both a title and a loss value.
movie %>%
  drop_na(movie_title, loss) %>%
  arrange(desc(loss)) %>%
  head(10) %>%
  ggplot(aes(x = reorder(movie_title, loss), y = loss, fill = movie_title)) +
  geom_col() +
  scale_y_continuous(labels = scales::comma) +
  labs(
    x = "",
    y = "Total Loss in USD",
    title = "Top 10 least profitable movies"
  ) +
  theme(
    axis.text.x = element_text(angle = 90),
    plot.title = element_text(color = "Black", face = "bold"),
    legend.position = "none"
  )
# Top 10 most popular movies ----
# Popularity is measured by the number of users who voted for the movie.
movie %>%
  drop_na(movie_title, num_voted_users) %>%
  arrange(desc(num_voted_users)) %>%
  head(10) %>%
  ggplot(aes(
    x = reorder(movie_title, num_voted_users),
    y = num_voted_users,
    fill = movie_title
  )) +
  geom_col() +
  scale_y_continuous(labels = scales::comma) +
  labs(
    x = "",
    y = "Total Number of User Votes",
    title = "Top 10 most popular movies"
  ) +
  theme(
    axis.text.x = element_text(angle = 90),
    plot.title = element_text(color = "Black", face = "bold"),
    legend.position = "none"
  )
# Relation between IMDB score, revenue and budget ----
# 3-D marker plot: x = IMDB score, y = budget and z = gross, both divided by
# one million. Markers are coloured by profit_flag and the hover box shows the
# title with the raw gross, budget and score.
plot_ly(
  movie,
  x = ~imdb_score,
  y = ~budget / 1000000,
  z = ~gross / 1000000,
  color = ~profit_flag,
  size = I(3),
  hoverinfo = "text",
  text = ~paste(
    "Movie: ", movie_title,
    "</br></br> Gross: ", gross,
    "</br> Budget: ", budget,
    "</br> IMDB Score: ", imdb_score
  )
) %>%
  add_markers() %>%
  layout(
    scene = list(
      xaxis = list(title = "IMDB Score"),
      yaxis = list(title = "Budget"),
      zaxis = list(title = "Revenue")
    ),
    title = "IMDB Score vs Revenue vs Budget",
    showlegend = FALSE
  )
# Top 10 countries by average profit per film ----
# Mean profit per country, ignoring movies with unknown profit. `na.rm` takes
# the logical TRUE; the previous string "true" only worked through implicit
# coercion.
# NOTE(review): movies with a blank country ("") form their own group and can
# appear in the chart - confirm whether that is wanted.
movie %>%
  group_by(country) %>%
  summarise(
    num = n_distinct(movie_title),
    average_profit = mean(profit, na.rm = TRUE)
  ) %>%
  arrange(-average_profit) %>%
  head(10) %>%
  ggplot(aes(reorder(country, average_profit), average_profit, fill = country)) +
  geom_bar(stat = "identity") +
  theme(
    axis.text.x = element_text(angle = 90),
    plot.title = element_text(color = "Black", face = "bold"),
    legend.position = "none"
  ) +
  scale_y_continuous(labels = scales::comma) +
  xlab("") +
  ylab("Average Profit per Movie in USD") +
  ggtitle("Top countries by average profit per film")
# Top 10 countries by average IMDB rating per movie ----
# Mean IMDB score per country. `na.rm` takes the logical TRUE; the previous
# string "true" only worked through implicit coercion.
# NOTE(review): movies with a blank country ("") form their own group and can
# appear in the chart - confirm whether that is wanted.
movie %>%
  group_by(country) %>%
  summarise(
    num = n_distinct(movie_title),
    average_rating = mean(imdb_score, na.rm = TRUE)
  ) %>%
  arrange(-average_rating) %>%
  head(10) %>%
  ggplot(aes(reorder(country, average_rating), average_rating, fill = country)) +
  geom_bar(stat = "identity") +
  theme(
    axis.text.x = element_text(angle = 90),
    plot.title = element_text(color = "Black", face = "bold"),
    legend.position = "none"
  ) +
  xlab("") +
  ylab("Average IMDB rating") +
  ggtitle("Top countries by average IMDB rating of movies")
# Top 10 countries by average budget per film ----
# Mean budget per country, ignoring movies with unknown budget. `na.rm` takes
# the logical TRUE; the previous string "true" only worked through implicit
# coercion.
# NOTE(review): budgets are averaged as stored; whether they share one
# currency cannot be told from this script - confirm before comparing
# countries.
movie %>%
  group_by(country) %>%
  summarise(
    num = n_distinct(movie_title),
    average_budget = mean(budget, na.rm = TRUE)
  ) %>%
  arrange(-average_budget) %>%
  head(10) %>%
  ggplot(aes(reorder(country, average_budget), average_budget, fill = country)) +
  geom_bar(stat = "identity") +
  theme(
    axis.text.x = element_text(angle = 90),
    plot.title = element_text(color = "Black", face = "bold"),
    legend.position = "none"
  ) +
  scale_y_continuous(labels = scales::comma) +
  xlab("") +
  ylab("Average Budget per Movie in USD") +
  ggtitle("Top countries by average budget per film")
# Directors ranked by mean IMDB score ----
# Per-director mean score, film count, standard deviation and an interval of
# +/- 2 standard errors around the mean. sd() now ignores missing scores the
# same way mean() does.
general_table <- movie %>%
  group_by(director_name) %>%
  summarise(
    mean_imdb = mean(imdb_score, na.rm = TRUE),
    total_movies = n(),
    standard_dev = sd(imdb_score, na.rm = TRUE),
    lower_bound = mean_imdb - 2 * standard_dev / sqrt(total_movies),
    upper_bound = mean_imdb + 2 * standard_dev / sqrt(total_movies)
  ) %>%
  arrange(desc(mean_imdb))
total_movies_mean <- mean(general_table$total_movies)
# na.omit() removes directors whose interval cannot be computed (sd is NA for
# a single film). Rows with a blank director name are removed as well,
# mirroring the blank-name exclusion applied to lead actors. The 30 highest
# means are kept.
director_final <- general_table %>%
  na.omit() %>%
  filter(director_name != "") %>%
  slice(1:30)
# Order the factor levels by mean score so the best director is drawn on top.
director_final$director_name <- factor(
  director_final$director_name,
  levels = director_final$director_name[order(director_final$mean_imdb)]
)
# theme_bw() is a complete theme and replaces earlier theme() settings, so it
# must come before the axis-text size tweak for that tweak to take effect.
ggplot(
  director_final,
  aes(x = mean_imdb, xmin = lower_bound, xmax = upper_bound, y = director_name)
) +
  geom_point() +
  geom_segment(aes(
    x = lower_bound, xend = upper_bound,
    y = director_name, yend = director_name
  )) +
  xlab("Mean IMDB Rating") +
  ylab("Director") +
  ggtitle("Best Directors by Movie Rating") +
  theme_bw() +
  theme(axis.text = element_text(size = 8))
# Lead actors ranked by mean IMDB score ----
# Per-actor mean score, film count, standard deviation and an interval of
# +/- 2 standard errors around the mean. sd() now ignores missing scores the
# same way mean() does.
lead_actor_table <- movie %>%
  group_by(actor_1_name) %>%
  summarise(
    mean_imdb = mean(imdb_score, na.rm = TRUE),
    total_movies = n(),
    standard_dev = sd(imdb_score, na.rm = TRUE),
    lower_bound = mean_imdb - 2 * standard_dev / sqrt(total_movies),
    upper_bound = mean_imdb + 2 * standard_dev / sqrt(total_movies)
  ) %>%
  arrange(desc(mean_imdb))
# Drop the group of movies with no lead actor recorded.
lead_actor_table <- lead_actor_table %>%
  filter(actor_1_name != "")
# Average number of films per named lead actor, taken before the minimum-film
# filter below.
actor_mean_movies <- mean(lead_actor_table$total_movies)
# Only actors with at least three films are ranked; the 30 highest means are
# kept.
lead_actor_table <- lead_actor_table %>%
  filter(total_movies >= 3)
top_30_actors <- lead_actor_table %>%
  slice(1:30)
# Order the factor levels by mean score so the best actor is drawn on top.
top_30_actors$actor_1_name <- factor(
  top_30_actors$actor_1_name,
  levels = top_30_actors$actor_1_name[order(top_30_actors$mean_imdb)]
)
# theme_bw() is a complete theme and replaces earlier theme() settings, so it
# must come before the axis-text size tweak for that tweak to take effect.
ggplot(
  top_30_actors,
  aes(x = mean_imdb, xmin = lower_bound, xmax = upper_bound, y = actor_1_name)
) +
  geom_point() +
  geom_segment(aes(
    x = lower_bound, xend = upper_bound,
    y = actor_1_name, yend = actor_1_name
  )) +
  xlab("Mean Movie Rating") +
  ylab("Lead Actor") +
  ggtitle("Best Actors by IMDB Movie Rating") +
  theme_bw() +
  theme(axis.text = element_text(size = 8))
library(corrgram)
# Correlogram of the main numeric variables ----
# Keep only the numeric columns of interest. The explicit `.` placeholder is
# unnecessary because the pipe already supplies the first argument.
corrgram_data <- movie %>%
  dplyr::select(
    duration, num_critic_for_reviews, gross, num_voted_users,
    num_user_for_reviews, budget, title_year, imdb_score,
    movie_facebook_likes
  )
# TRUE is spelled out; `T` is an ordinary variable that can be reassigned.
# NOTE(review): `legend` is passed through as in the original call - confirm it
# is an argument corrgram() actually uses.
corrgram(corrgram_data, legend = TRUE)
Apply k-means clustering to assign movies into 5 classes.
library(ggplot2)
library(rpart)
library(rattle)
library(rpart.plot)
library(RColorBrewer)
library(caret)
# K-means clustering ----
# NOTE(review): "movies.csv" is a separate local file, not the `movie` tibble
# built earlier; its schema is not visible in this script.
mydata <- read.csv("movies.csv")
# Drop every row that has a missing value in any column.
mydata <- na.omit(mydata)
# Fixed seed so the random starting centres are reproducible.
set.seed(123)
# Generate movie clusters and plot.
# NOTE(review): columns 9:10 are selected by position - presumably gross and
# imdb_score, the two variables plotted below; confirm against movies.csv.
# No second na.omit() is needed: the whole frame was already filtered above.
mydataCluster <- kmeans(mydata[, 9:10], 5, nstart = 20)
mydata$cluster <- as.factor(mydataCluster$cluster)
# The cluster column is referenced by bare name inside aes(); the legend title
# is therefore "cluster" rather than "mydata$cluster".
ggplot(mydata, aes(gross, imdb_score, color = cluster)) +
  geom_point() +
  scale_colour_manual(values = c("green", "blue", "orange", "purple", "red")) +
  xlab("Gross") +
  ylab("IMDB Score")
The movies are grouped into 5 clusters, as shown in the plot. The majority of movies fall in cluster 3, which has low box-office gross and high variation in IMDb scores, whereas movies in cluster 1 have relatively higher gross and scores than those in the other clusters.