©Deepak Kumar G S

#Introduation 数据集来自TMDB(The Movie Database)上5000部电影数据，包括电影的详细信息和参与电影的人员信息

#Loading the data and summary

library(plyr)
library(tidyverse)

## -- Attaching packages ---------------- tidyverse 1.2.1 --

## √ ggplot2 3.1.1       √ purrr   0.3.2  
## √ tibble  2.1.1       √ dplyr   0.8.0.1
## √ tidyr   0.8.3       √ stringr 1.4.0  
## √ readr   1.3.1       √ forcats 0.4.0

## -- Conflicts ------------------- tidyverse_conflicts() --
## x dplyr::arrange()   masks plyr::arrange()
## x purrr::compact()   masks plyr::compact()
## x dplyr::count()     masks plyr::count()
## x dplyr::failwith()  masks plyr::failwith()
## x dplyr::filter()    masks stats::filter()
## x dplyr::id()        masks plyr::id()
## x dplyr::lag()       masks stats::lag()
## x dplyr::mutate()    masks plyr::mutate()
## x dplyr::rename()    masks plyr::rename()
## x dplyr::summarise() masks plyr::summarise()
## x dplyr::summarize() masks plyr::summarize()

library(formattable)
library(splitstackshape)
library(jsonlite)

## 
## Attaching package: 'jsonlite'

## The following object is masked from 'package:purrr':
## 
##     flatten

library(wordcloud)

## Loading required package: RColorBrewer

library(RColorBrewer)
library(ggthemes)
library(tm)

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

library(RSentiment)
library(zoo)

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(stringr)
movie=read_csv("D:/创新研究/tmdb_5000_movies.csv",col_names=TRUE,na="NA")

## Parsed with column specification:
## cols(
##   .default = col_character(),
##   budget = col_double(),
##   id = col_double(),
##   popularity = col_double(),
##   release_date = col_date(format = ""),
##   revenue = col_double(),
##   runtime = col_double(),
##   vote_average = col_double(),
##   vote_count = col_double()
## )

## See spec(...) for full column specifications.

credits=read_csv("D:/创新研究/tmdb_5000_credits.csv",col_names=TRUE,na="NA")

## Parsed with column specification:
## cols(
##   movie_id = col_double(),
##   title = col_character(),
##   cast = col_character(),
##   crew = col_character()
## )

Movie dataset

glimpse(movie)

## Observations: 4,803
## Variables: 20
## $ budget               <dbl> 2.37e+08, 3.00e+08, 2.45e+08, 2.50e+08, 2...
## $ genres               <chr> "[{\"id\": 28, \"name\": \"Action\"}, {\"...
## $ homepage             <chr> "http://www.avatarmovie.com/", "http://di...
## $ id                   <dbl> 19995, 285, 206647, 49026, 49529, 559, 38...
## $ keywords             <chr> "[{\"id\": 1463, \"name\": \"culture clas...
## $ original_language    <chr> "en", "en", "en", "en", "en", "en", "en",...
## $ original_title       <chr> "Avatar", "Pirates of the Caribbean: At W...
## $ overview             <chr> "In the 22nd century, a paraplegic Marine...
## $ popularity           <dbl> 150.43758, 139.08262, 107.37679, 112.3129...
## $ production_companies <chr> "[{\"name\": \"Ingenious Film Partners\",...
## $ production_countries <chr> "[{\"iso_3166_1\": \"US\", \"name\": \"Un...
## $ release_date         <date> 2009-12-10, 2007-05-19, 2015-10-26, 2012...
## $ revenue              <dbl> 2787965087, 961000000, 880674609, 1084939...
## $ runtime              <dbl> 162, 169, 148, 165, 132, 139, 100, 141, 1...
## $ spoken_languages     <chr> "[{\"iso_639_1\": \"en\", \"name\": \"Eng...
## $ status               <chr> "Released", "Released", "Released", "Rele...
## $ tagline              <chr> "Enter the World of Pandora.", "At the en...
## $ title                <chr> "Avatar", "Pirates of the Caribbean: At W...
## $ vote_average         <dbl> 7.2, 6.9, 6.3, 7.6, 6.1, 5.9, 7.4, 7.3, 7...
## $ vote_count           <dbl> 11800, 4500, 4466, 9106, 2124, 3576, 3330...

summary(movie)

##      budget             genres            homepage        
##  Min.   :        0   Length:4803        Length:4803       
##  1st Qu.:   790000   Class :character   Class :character  
##  Median : 15000000   Mode  :character   Mode  :character  
##  Mean   : 29045040                                        
##  3rd Qu.: 40000000                                        
##  Max.   :380000000                                        
##                                                           
##        id           keywords         original_language  original_title    
##  Min.   :     5   Length:4803        Length:4803        Length:4803       
##  1st Qu.:  9014   Class :character   Class :character   Class :character  
##  Median : 14629   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 57166                                                           
##  3rd Qu.: 58611                                                           
##  Max.   :459488                                                           
##                                                                           
##    overview           popularity      production_companies
##  Length:4803        Min.   :  0.000   Length:4803         
##  Class :character   1st Qu.:  4.668   Class :character    
##  Mode  :character   Median : 12.922   Mode  :character    
##                     Mean   : 21.492                       
##                     3rd Qu.: 28.314                       
##                     Max.   :875.581                       
##                                                           
##  production_countries  release_date           revenue         
##  Length:4803          Min.   :1916-09-04   Min.   :0.000e+00  
##  Class :character     1st Qu.:1999-07-14   1st Qu.:0.000e+00  
##  Mode  :character     Median :2005-10-03   Median :1.917e+07  
##                       Mean   :2002-12-27   Mean   :8.226e+07  
##                       3rd Qu.:2011-02-16   3rd Qu.:9.292e+07  
##                       Max.   :2017-02-03   Max.   :2.788e+09  
##                       NA's   :1                               
##     runtime      spoken_languages      status            tagline         
##  Min.   :  0.0   Length:4803        Length:4803        Length:4803       
##  1st Qu.: 94.0   Class :character   Class :character   Class :character  
##  Median :103.0   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :106.9                                                           
##  3rd Qu.:118.0                                                           
##  Max.   :338.0                                                           
##  NA's   :2                                                               
##     title            vote_average      vote_count     
##  Length:4803        Min.   : 0.000   Min.   :    0.0  
##  Class :character   1st Qu.: 5.600   1st Qu.:   54.0  
##  Mode  :character   Median : 6.200   Median :  235.0  
##                     Mean   : 6.092   Mean   :  690.2  
##                     3rd Qu.: 6.800   3rd Qu.:  737.0  
##                     Max.   :10.000   Max.   :13752.0  
##

Credits dataset

glimpse(credits)

## Observations: 4,803
## Variables: 4
## $ movie_id <dbl> 19995, 285, 206647, 49026, 49529, 559, 38757, 99861, ...
## $ title    <chr> "Avatar", "Pirates of the Caribbean: At World's End",...
## $ cast     <chr> "[{\"cast_id\": 242, \"character\": \"Jake Sully\", \...
## $ crew     <chr> "[{\"credit_id\": \"52fe48009251416c750aca23\", \"dep...

summary(credits)

##     movie_id         title               cast               crew          
##  Min.   :     5   Length:4803        Length:4803        Length:4803       
##  1st Qu.:  9014   Class :character   Class :character   Class :character  
##  Median : 14629   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 57166                                                           
##  3rd Qu.: 58611                                                           
##  Max.   :459488

#Cleaning the Movie Dataset

genredf=movie %>% filter(nchar(genres)>2) %>% mutate(js=lapply(genres,fromJSON)) %>% unnest(js) %>% select(id,title,genre=name) #Convert JSON format into data frame
slice(genredf)

## # A tibble: 12,160 x 3
##        id title                                    genre          
##     <dbl> <chr>                                    <chr>          
##  1  19995 Avatar                                   Action         
##  2  19995 Avatar                                   Adventure      
##  3  19995 Avatar                                   Fantasy        
##  4  19995 Avatar                                   Science Fiction
##  5    285 Pirates of the Caribbean: At World's End Adventure      
##  6    285 Pirates of the Caribbean: At World's End Fantasy        
##  7    285 Pirates of the Caribbean: At World's End Action         
##  8 206647 Spectre                                  Action         
##  9 206647 Spectre                                  Adventure      
## 10 206647 Spectre                                  Crime          
## # ... with 12,150 more rows

#Wordcloud of Genre Representation

temp=genredf %>% group_by(genre) %>% summarise(count=length(genre)) %>% arrange(desc(count))

wordcloud(words=temp$genre,freq=temp$count,min.freq=100,max.words = 20,random.order=FALSE,random.color=FALSE,rot.per=0.35,colors = brewer.pal(20,"Dark2"),scale=c(5,.2))

## Warning in brewer.pal(20, "Dark2"): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors

Comedy,Drama,Thriller and Action are most represented.

#Movies with highest budget

movie %>% select(original_title,budget) %>% drop_na(original_title)%>% arrange(desc(budget)) %>% head(10) %>%  ggplot(aes(reorder(original_title,budget),budget,fill=original_title))+geom_bar(stat="identity")+theme(axis.text.x = element_text(angle=90),plot.title=element_text(color="Red",face="italic"),legend.position="none")+scale_y_continuous(labels=scales::comma)+labs(x="",y="Total Budget in $",title="Most Expensive Movies -Top 10")

#Highest grossing movies

movie %>% select(original_title,revenue) %>% drop_na(original_title)%>% arrange(desc(revenue)) %>% head(10)  %>% ggplot(aes(reorder(original_title,revenue),revenue,fill=original_title))+geom_bar(stat="identity")+theme(axis.text.x = element_text(angle=90),plot.title=element_text(color="Red",face="italic"),legend.position="none")+scale_y_continuous(limits=c(0,3000000000),breaks=seq(0,3000000000,500000000),labels=scales::comma)+labs(x="",y="Total Revenue in $",title="Highest Grossing Movies -Top 10")

#Movies with highest popularity

movie %>% group_by(original_title) %>% arrange(desc(popularity)) %>% head(10) %>% ggplot(aes(factor(original_title,levels=original_title),popularity,fill=original_title))+geom_bar(stat="identity")+theme_few()+theme(axis.text.x=element_text(angle=90,hjust=0.5),plot.title=element_text(hjust=0.5,size=15,color="red"),legend.position="none")+labs(x="Title",y="Rating",title="Popularity of Movies")+scale_x_discrete(labels=function(x)str_wrap(x,width=15))

#Production companies

production=movie%>% filter(nchar(production_companies)>2)%>% mutate(js=lapply(production_companies,fromJSON)) %>% unnest(js)%>% select(budget,revenue,company=name)
lapply(production,class)

## $budget
## [1] "numeric"
## 
## $revenue
## [1] "numeric"
## 
## $company
## [1] "character"

temp=production %>% group_by(company) %>% summarise(count=n()) %>% arrange(desc(count)) 
wordcloud(words=temp$company,freq=temp$count,max.words = 25,color =rainbow(7),scale=c(2,0.2))

Warner Bros,Universal Pictures,Parmount Pictures,20th century fox are represented more in this dataset.

Exploring the tagline

corpus=Corpus(VectorSource(list(movie$tagline)))
corpus=tm_map(corpus,removePunctuation)

## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation
## drops documents

corpus=tm_map(corpus,content_transformer(tolower))

## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents

corpus=tm_map(corpus,stripWhitespace)

## Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation
## drops documents

corpus=tm_map(corpus,removeWords,stopwords("english"))

## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents

dtm_tag=DocumentTermMatrix(VCorpus(VectorSource(corpus[1]$content)))
freq_tag=colSums(as.matrix(dtm_tag))

sent_tag=calculate_sentiment(names(freq_tag)) %>% cbind(as.data.frame(freq_tag))

Now its time to calcuate the sentiment.

positive=sent_tag[sent_tag$sentiment=="Positive",]
negative=sent_tag[sent_tag$sentiment=="Negative",]
cat("Positive Sentiment:",sum(positive$freq_tag),"Negative Sentiment:",sum(negative$freq_tag),sep="\n")

## Positive Sentiment:
## 3184
## Negative Sentiment:
## 2523

Wordcloud:

par(mfrow=c(1,2))
wordcloud(positive$text,positive$freq_tag, min.freq=10,rot.per=0,vfont=c("sans serif","plain"))
text(x=0.5, y=1.05, "Positive",col=brewer.pal(4, "Reds"))
wordcloud(negative$text,negative$freq_tag, min.freq=7,rot.per=0,vfont=c("sans serif","plain"))
text(x=0.5, y=1.05, "Negative",col=brewer.pal(4, "Reds"))

#Release date,year,month

Which year has seen maximum release of movies ?

class(movie$release_date)

## [1] "Date"

movie$Year=as.factor(format(movie$release_date,"%Y"))
movie$Date=as.factor(format(movie$release_date,"%d"))
movie$month=month.abb[(as.factor(format(movie$release_date,"%m")))]
movie %>% group_by(month) %>% drop_na(month) %>% summarise(count=n()) %>% arrange(desc(month)) %>% ggplot(aes(reorder(month,count),count,fill=month))+geom_bar(stat="identity")+theme(plot.title=element_text(size=14,face="italic",colour="red"),axis.text.x = element_text(angle=90),legend.position="none")+labs(x="",y="Total number of movies released",title="Number of Movies Releases per month")+coord_flip()+geom_label(aes(label=count))

Does higher number correlate to higher quality content ?

movie %>% drop_na(month) %>% ggplot(aes(month,vote_average,fill=month))+geom_boxplot(outlier.colour = "red",na.rm=TRUE)+theme(plot.title=element_text(size=14,face="italic",colour="red"),axis.text.x = element_text(angle=90),legend.position="none")+labs(x="",y="Average Vote",title="Boxplot of Average votes received by month")+coord_flip()

movie$month=factor(movie$month,levels=month.abb,ordered=TRUE)
#temp=movie %>% group_by(month) %>% filter(vote_average!=0)%>%  #summarise(median=median(vote_average),low=min(vote_average),high=max(vote_average)) %>% mutate(score=paste(low,"/",high))
temp=movie %>% drop_na(vote_average,month) %>% filter(vote_average!=0) %>% ddply(.(month),function(x){c(mid=median(x$vote_average),low=min(x$vote_average),high=max(x$vote_average))})
temp=temp[order(temp$month),]
formattable(temp,align=c("l","c","c","c"),list(mid=color_tile('lightblue','white'),low=color_bar('orange'),high=color_bar('lightgreen')))

month	mid	low	high
Jan	6.0	0.5	10.0
Feb	6.1	3.0	8.3
Mar	6.0	2.7	8.4
Apr	6.1	2.0	8.2
May	6.3	3.0	10.0
Jun	6.3	2.0	10.0
Jul	6.3	3.5	10.0
Aug	6.0	1.9	9.3
Sep	6.4	1.0	8.5
Oct	6.3	3.5	8.3
Nov	6.4	1.0	8.3
Dec	6.5	2.3	8.3

credit=credits %>% filter(nchar(cast)>2) %>% mutate(js=lapply(cast,fromJSON)) %>% unnest(js)
cat("Before JS the columns are ",names(credit),sep='\n')

## Before JS the columns are 
## movie_id
## title
## cast
## crew
## cast_id
## character
## credit_id
## gender
## id
## name
## order

credit= credit%>% select(-c(crew,cast_id,credit_id,id))
cat("After removing id columns the names are",names(credit),sep='\n')

## After removing id columns the names are
## movie_id
## title
## cast
## character
## gender
## name
## order

#Artist with most movies: Let us see which artist has acted in more movies

credit %>% group_by(name) %>% tally() %>% arrange(desc(n)) %>% head(10) %>% ggplot(aes(factor(name,levels=name),n,fill=name))+geom_bar(stat="identity")+labs(x="Artist",y="Count",title="Top 10 artist with most movies")+theme_few()+theme(axis.text.x=element_text(angle=90),plot.title=element_text(hjust=0.5,color="red"),legend.position="none")

db=movie %>% left_join(credits,by=c("id"="movie_id"))
db_credit=db %>% filter(nchar(cast)>2) %>% mutate(js=lapply(cast,fromJSON)) %>% unnest(js)
revenue=function (df,col_name,x,y,title){
 temp_df=df %>% filter(name==col_name) %>% arrange(desc(revenue)) %>% head(10)
 df_plot= ggplot(temp_df,aes(reorder(original_title,revenue),revenue,fill=original_title))+geom_bar(stat="identity")+theme_few()+theme(axis.text.x = element_text(angle=90,vjust=0.5),plot.title=element_text(hjust=0.5,size=15),legend.position="none")+labs(x=x,y=y,title=title)+coord_flip()+scale_x_discrete(labels=function(x)str_wrap(x,width=15))+scale_y_continuous(labels=scales::comma)
 print(df_plot)
 
}

Artist with High Budget Movies :

Let us see which artists movies have been made at high budgets.

db_credit %>%  filter(order==0) %>% group_by(name) %>% summarise(total=sum(budget)) %>% arrange(desc(total)) %>% head(10) %>% ggplot(aes(factor(name,levels=name),total,fill=name))+geom_bar(stat="identity")+theme(legend.position="none",plot.title=element_text(size=10,hjust=0.5),axis.text.x=element_text(angle=90))+labs(x="Actor Name",y="Sum of the budget",title="Actor with Highest Budgets till date..")+scale_y_continuous(labels=scales::comma)

Does high budget movie necessarily mean high popularity among viewers?

Let us correlate budget and popularity and see if there exist any correlation.

get_cor <- function(df){
    m <- cor(df$x,df$y, use="pairwise.complete.obs");
    eq <- substitute(expr=r==cor,env=list(cor=format(m, digits = 4)))
    return(as.character(as.expression(eq) ))                
}

temp=db_credit %>%  select(budget,popularity) %>% distinct() 
ggplot(temp,aes(budget,popularity))+stat_bin_hex(bins=15)+scale_fill_distiller(palette="Spectral")+stat_smooth(method="lm",color="orchid",size=2)+scale_x_continuous(labels=scales::comma)

get_cor(data.frame(x=temp$budget,y=temp$popularity))

## [1] "r == \"0.5033\""

For Tom Cruise

temp=db_credit %>% filter(name=="Tom Cruise") %>% subset(!(duplicated(original_title))) %>% arrange(desc(budget)) %>% head(20)  

ggplot(temp,aes(budget,as.integer(popularity),col=factor(original_title),size=popularity))+geom_point()+stat_smooth(method="lm",color="orchid",size=2)+theme(legend.position="bottom",plot.title=element_text(hjust=0.5,size=10),axis.text.x = element_text(angle = 90,hjust=0.5))+labs(x="Revenue",y="Popularity",title="Tom Cruise-Correlation between popularity and budget",subtitle="Filter by Top 20 Higest budget movies")+scale_x_continuous(labels=scales::comma)

The Movie Database -Analysis

4444

2019/5/30