Objective: My goal is to find following for my favorite actors
1. Average movie rating, Average number of votes, total number of movies
2. Actors with highest total gross worldwide and total gross USA
3. Top grossing movies within Unites States and Globally
4. How many movies come out every decade for these actors


Download data from IMDB that is available in zip format

rm(list=ls())
gc()
##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 481130 25.7    1053860 56.3   641594 34.3
## Vcells 906101  7.0    8388608 64.0  1752684 13.4
setwd('~/Data_Science/R/Projects/IMDB')
files = c("name.basics.tsv.gz",
         "title.basics.tsv.gz",
         "title.ratings.tsv.gz")

for(i in 1:length(files)) {
  url = paste("https://datasets.imdbws.com/", files[i],sep = '')
  download.file(url = url, destfile = files[i], method = "auto", mode="wb")

}


Unzip the downloaded files

files=$(ls *.gz)
for file in $files
do
gzip -df $file
done


Load downloaded data in data frames

library(data.table)
names = fread("name.basics.tsv", sep="\t", header = TRUE, quote = "")
basic = fread("title.basics.tsv", sep="\t", header = TRUE, quote = "")
ratings  = fread("title.ratings.tsv", sep="\t", header = TRUE, quote = "")


Load libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(rvest)
## Loading required package: xml2
library(ggplot2)
library(tidyr)
# This function scrapes movie names for a given actor from IMDB website
get_movies <- function(actor_id) {
url = paste("https://www.imdb.com/name/", actor_id ,"/?ref_=nv_sr_srsg_0", sep='')
html = read_html(url)

nodes <- html %>%
         html_node("body") %>%
         xml_find_all("//div[starts-with(@id,'actor-')]")

movies = xml_attr(nodes, "id")
return(str_replace(movies, "actor-",''))
}
convert_money_to_number <- function(x, begin, end){
  x = substr(x, begin, end)
  
  #Get Currency
  currency_index = str_locate(x, "\\d")[1]
  currency = str_trim(substr(x, 1, currency_index-1),side=c("left"))
  currency = ifelse(currency=='$', 'USD', currency)
  
  #Get numbers
  money = substr(x, currency_index, nchar(x))
  money = str_replace_all(money, ',','')
  money = paste(currency,money, sep='')
  return(money)
}
# This function scrapes movie details like budget, opening weekend gross, gross USA and gross worldwide for a given movie from IMDB website
get_movie_details <- function(movie_id){
  money = c()
  money[1] = movie_id
  url = paste("https://www.imdb.com/title/", movie_id, "/?ref_=fn_al_tt_1", sep = '')
  html = read_html(url)
  
  nodes <- html %>%
           html_node("body") %>%
           xml_find_all("//div[contains(@class,'txt-block')]") 
           
  a = grep('Budget:', nodes)
  b = grep('Opening Weekend USA:', nodes)
  c = grep('Gross USA:', nodes)
  d = grep('Cumulative Worldwide Gross:', nodes)
  
  #Get Budget
  if (length(a) >0 ){
  budget = xml_text(nodes[[a]])
  begin  = str_locate(budget, 'Budget:')[2] +1
  end    = str_locate_all(budget,"\n")[[1]][2,1] -1
  budget = convert_money_to_number(budget, begin, end)
  money[2] = budget
  }
  else
  {
    money[2]=0
  }
  
  #Opening Weekend USA
  if (length(b) >0 ){
  opening_weekend = xml_text(nodes[[b]])
  begin = str_locate(opening_weekend,"Opening Weekend USA:")[2] +1
  end   = str_locate_all(opening_weekend,"\n")[[1]][2,1] -1
  opening_weekend = convert_money_to_number(opening_weekend, begin, end)
  money[3] = opening_weekend
  }
  else
  {
    money[3]=0
  }
  #USA gross
  if (length(c) >0 ){
  gross_usa = str_trim(xml_text(nodes[[c]]),side =c("right"))
  begin     = str_locate(gross_usa, "Gross USA:")[2] +1
  end       = nchar(gross_usa)
  gross_usa = convert_money_to_number(gross_usa, begin, end)
  money[4]  = gross_usa
  }
  else
  {
    money[4]=0
  }
  
  #Worldwide gross
  if (length(d) >0 ){
  gross_worldwide = str_trim(xml_text(nodes[[d]]),side =c("right"))
  begin     = str_locate(gross_worldwide, "Cumulative Worldwide Gross:")[2] +1
  end       = nchar(gross_worldwide)
  gross_worldwide = convert_money_to_number(gross_worldwide, begin, end)
  money[5] = gross_worldwide
  }
  else
  {
    money[5]=0
  }
  
  return(money)
}
# Using a vector of 10 of my favorite actors and the rest of exploration is in relation to these actos unless otherwise stated
favorite_actors <- c(
                     'nm0000093',       #Brad Pitt
                     'nm0000375',       #Robert Downey Jr.
                     'nm0005458',       #Jason Statham
                     'nm0000354',       #Matt Damon
                     'nm0000129',       #Tom Cruise
                     'nm0000125',       #Sean Connery
                     'nm0000112',       #Pierce Brosnan
                     'nm0000226',       #Will Smith      
                     'nm0004874',       #Vin Diesel
                     'nm0000246'        #Bruce Wilis
                     )
favorite_actors <- names %>%
                   filter(nconst %in% favorite_actors) 
Sys.time()
## [1] "2020-01-20 15:17:56 CST"
actor_movie = data.frame(matrix(ncol=3, nrow=0))
movie_details = data.frame(matrix(ncol=5, nrow=0))

for(i in 1:length(favorite_actors$nconst)){
    movies = get_movies(favorite_actors$nconst[i])
    movies <- basic %>%
              filter(tconst %in% movies & titleType=="movie") %>%
              select(tconst)
    movies = movies$tconst
    for(j in 1:length(movies)){
        actor_movie = rbind(actor_movie, c(favorite_actors$nconst[i], favorite_actors$primaryName[i], movies[j]), stringsAsFactors = FALSE)
        movie_details = rbind(movie_details, get_movie_details(movies[j]), stringsAsFactors = FALSE)
    }
}
colnames(actor_movie) =c('actor_id','name','movie_id')
colnames(movie_details) =  c('movie_id','budget', 'opening_weekend','gross_usa','gross_worldwide' )
movie_details = unique(movie_details)
Sys.time()
## [1] "2020-01-20 15:27:41 CST"
table(substr(movie_details$budget,1,3))
## 
##   0 EUR GBP INR ITL USD 
## 116   2   6   1   1 451


There are four different currencies used in estimated budget information that need to be converted to USD

EUR_multiplier = 1.11
GBP_multiplier = 1.3
INR_multiplier = 0.014
ITL_multiplier = 0.000001  #Lira is outdated and has been replaced by EUR

movie_details$currency = ifelse(movie_details$budget=='0','USD', substr(movie_details$budget,1,3))
movie_details$budget    = ifelse(movie_details$budget=='0', '0' , substr(movie_details$budget,4,nchar(movie_details$budget)))  
movie_details$budget = as.numeric(movie_details$budget)

movie_details$budget[movie_details$currency=='EUR'] = EUR_multiplier*movie_details$budget[movie_details$currency=='EUR']
movie_details$budget[movie_details$currency=='INR'] = INR_multiplier*movie_details$budget[movie_details$currency=='INR']
movie_details$budget[movie_details$currency=='GBP'] = GBP_multiplier*movie_details$budget[movie_details$currency=='GBP']
movie_details$budget[movie_details$currency=='ITL'] = ITL_multiplier*movie_details$budget[movie_details$currency=='ITL']
movie_details$opening_weekend = ifelse(movie_details$opening_weekend=='0', '0', substr(movie_details$opening_weekend,4, nchar(movie_details$opening_weekend)))
movie_details$gross_usa = ifelse(movie_details$gross_usa=='0', '0', substr(movie_details$gross_usa,4, nchar(movie_details$gross_usa)))
movie_details$gross_worldwide = ifelse(movie_details$gross_worldwide=='0', '0', substr(movie_details$gross_worldwide,4, nchar(movie_details$gross_worldwide)))

movie_details$opening_weekend = as.numeric(movie_details$opening_weekend)
movie_details$gross_usa = as.numeric(movie_details$gross_usa)
movie_details$gross_worldwide = as.numeric(movie_details$gross_worldwide)
#Let check number of movies, their average rating and average number of votes for these actors
inner_join(movie_details, ratings, by = c("movie_id" = "tconst")) %>%
inner_join(actor_movie, by = c("movie_id" = "movie_id")) %>%
select(name, averageRating, numVotes) %>%
group_by(name) %>%
summarize(averageRating = round(mean(averageRating), digits = 2), averagenumvotes= round(mean(numVotes)/1e3, digits = 2), number_of_movies=n()) %>%
ggplot(aes(x=name, y=number_of_movies, fill=averagenumvotes)) + geom_bar(stat = "identity") +
labs(x='', y= "Number Of Movies", caption = "bar label represent average movie rating") +
geom_text(aes(label = averageRating), nudge_y = 4) +
scale_fill_continuous(name="Votes in 1000s") + 
coord_flip()


A little surprising to see here that Will Smith and Vin Diesel have the least number of regular movies. Bruce Willis has come out in most movies and Robert Downey Jr. takes the sceond spot with almost 75 movies. Average movie rating is 6.14 or higher which tell me that other people like these actors and their movies as well :-).

#Lets look into money matters. Cumulative budget, Opening Weekend Gross, Gross USA and Gross Worldwide for the movies these actors starred in 
merge(actor_movie, movie_details) %>%
group_by(name) %>%
summarize(Total_Budget = sum(budget)/1e6, Total_opening_Weekend= sum(opening_weekend)/1e6, Total_Gross_USA = sum(gross_usa)/1e6, Total_Gross_Worldwide = sum(gross_worldwide)/1e6) %>%
arrange(desc(Total_Gross_Worldwide)) %>%
gather("id","Total",2:5) %>%
ggplot(aes(x=name, y=Total, fill=factor(id))) + geom_bar(stat="identity",position = "dodge") +
theme(axis.text.x = element_text(angle = 90)) + 
labs(x = '', y= "In millions of USD", title="") +
scale_fill_discrete(name='', labels =c('Total Expected Budget', 'Total Gross USA', 'Total Gross Worldwide','Total Opening Weekend Gross'))


Another shock, Vin Diesel’s movies have made more worldwide than most of my favortie actors. Even within USA, he is pretty close to Rober Downey Jr. Let’s see if Fast and Furious franchise has helped Vin Diesel in earning this spot.

merge(actor_movie, movie_details) %>%
filter(name == 'Vin Diesel') %>%
inner_join( basic, by = c('movie_id' = 'tconst')) %>%
filter(gross_worldwide > 0) %>%
select(originalTitle, gross_worldwide) %>%
ggplot(aes(x=originalTitle, y=gross_worldwide/1e6)) + geom_bar(stat= "identity", fill="#56B4E9") +
theme(axis.text.x = element_text(angle=90)) +
labs(x='',y='Worldwide Gross in Millions USD', title = "Vin Diesel Movies") +
theme(title = element_text(size=8))  


The bar chart above explains surprising outcome of the previous plot. Vin Diesel also played a role in Avenger and Guardian Of the Galaxy movies. A quick google search revealed he gave voice to Groot.

abc <- inner_join(movie_details, basic, by = c("movie_id" = "tconst")) %>%
select(originalTitle, budget, opening_weekend, gross_usa, gross_worldwide ) %>%
gather("Money_Type","Value",2:5) %>%
select(Money_Type, Value, originalTitle) %>%
group_by(Money_Type) %>%
top_n(10, wt = Value)

ggplot(abc[abc$Money_Type=='budget',], aes(x=originalTitle, y= Value/1e6)) + geom_bar(stat='identity') + 
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x='',y = "Estimated Budget in Millions USD", title="Top 10 Highest Budget Movie") +
  theme(title = element_text(size=8))  

ggplot(abc[abc$Money_Type=='opening_weekend',], aes(x=originalTitle, y= Value/1e6)) + geom_bar(stat='identity') + 
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x='',y = "Opening Weekend in Millions USD", title="Top 10 Highest Grossing Opening Weekend Movie") +
  theme(title = element_text(size=8))  

ggplot(abc[abc$Money_Type=='gross_usa',], aes(x=originalTitle, y= Value/1e6)) + geom_bar(stat='identity') + 
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x='',y = "Gross USA in Millions USD", title="Top 10 Highest Grossing Movie in USA") +
  theme(title = element_text(size=8))  

ggplot(abc[abc$Money_Type=='gross_worldwide',], aes(x=originalTitle, y= Value/1e6)) + geom_bar(stat='identity') + 
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x='',y = "Worldwide Gross in Millions USD", title="Top 10 Highest Grossing Weekend Movie in the World") +
  theme(title = element_text(size=8))  


Avengers: End Game appears to be the most successful movie which has my favorite actors. This movie made more money on the opening weekend than it estimated budget :-)

#Lets look at average Gross USA and Gross Worldwide for these movies
inner_join(movie_details, actor_movie, by = c("movie_id" = "movie_id")) %>%
select(name, budget, opening_weekend, gross_usa, gross_worldwide ) %>%
group_by(name) %>%
summarize(average_gross_usa=mean(gross_usa)/1e6, average_worldwide_gross=mean(gross_worldwide)/1e6) %>%
gather("Type","Value",2:3) %>%
ggplot(aes(x=name, y=Value, fill=Type)) + geom_bar(stat = "identity", position="dodge") +
  theme(axis.text.x = element_text(angle = 90)) +
  scale_fill_discrete(name='',labels=c('Average USA Gross','Average Worldwide Gross')) +
  labs(x='',y= 'In Millions USD')


Vin Diesel takes the first spot in Average USA and worldwide gross. However, Will Smith takes the second spot in Average Gross beating Robert Downey Jr.

#Lets see if there is a relation between Number of Votes and Average rating for these movies. Using 1og of number of votes to get a better picture
inner_join(movie_details, ratings, by = c("movie_id" = "tconst")) %>%
inner_join(basic, by = c("movie_id" = "tconst")) %>%
select(originalTitle, averageRating, numVotes) %>%
ggplot(aes(x=log10(numVotes), y=averageRating)) + geom_point() + geom_smooth(method = "loess") +
labs(x= 'Number Of Votes (log)', y = 'Average Rating')


From the plot above it appears that there is a non-linear relationship. The movies with higher average rating also have higher number of votes. But this plot is based on a much smaller dataset tied to movies with my favorite actors. Lets look at the ratings information for all the movies on IMDB

#Lets look at ratings for all the movies on IMDB. This is not limited to movies with my favorite actos
ratings$cat = ifelse(ratings$averageRating > 7, "High", "Average")
ratings$cat = ifelse(ratings$averageRating < 4, "Low", ratings$cat)

ratings %>%
group_by(cat) %>%
summarize(average= mean(numVotes)) %>%
ggplot(aes(x=cat, y=average)) + geom_bar(stat='identity')  +
labs(x="Movie Rating Category", y= 'Average Number of Votes')  


This plot proves that earlier indication was correct that highly rated movies get more votes than the rest.

#Lets look at number of movies by decade for these actors. Used a large range from 1950 to 1970s as Sean Connery was the only active actor during this time. Robert Downey Jr. had a few movies in 1970s
inner_join(actor_movie, basic, by = c("movie_id" = "tconst")) %>%
filter(! startYear == '\\N') %>%
mutate(decade = case_when(
                          as.numeric(startYear)>=1950 & as.numeric(startYear) < 1980 ~ '1950 - 1970s',
                          as.numeric(startYear)>=1980 & as.numeric(startYear) < 1990 ~ '1980s',
                          as.numeric(startYear)>=1990 & as.numeric(startYear) < 2000 ~ '1990s',
                          as.numeric(startYear)>=2000 & as.numeric(startYear) < 2010 ~ '2000s',
                          as.numeric(startYear)>=2010 & as.numeric(startYear) < 2020 ~ '2010s',
                          as.numeric(startYear)>=2020 & as.numeric(startYear) < 2030 ~ '2020s'
                          )
       )  %>%
  group_by(name, decade) %>%
  tally() %>%
  ggplot(aes(x=name, y=n)) + geom_bar(stat='identity') + labs(x='',y= 'Number Of Movies')+coord_flip() + facet_wrap(~decade)


I used a larger range 1950 - 1979 because Sean Connery was the only active actor during that time. Robert Downey Jr. had a couple of movies in the 70’s. Most of my favorite actors have been active since 1990s. Bruce Willis appears to have had most number of movies since 1990s.