©Deepak Kumar G S
#Introduation 数据集来自TMDB(The Movie Database)上5000部电影数据,包括电影的详细信息和参与电影的人员信息
#Loading the data and summary
library(plyr)
library(tidyverse)
## -- Attaching packages ---------------- tidyverse 1.2.1 --
## √ ggplot2 3.1.1 √ purrr 0.3.2
## √ tibble 2.1.1 √ dplyr 0.8.0.1
## √ tidyr 0.8.3 √ stringr 1.4.0
## √ readr 1.3.1 √ forcats 0.4.0
## -- Conflicts ------------------- tidyverse_conflicts() --
## x dplyr::arrange() masks plyr::arrange()
## x purrr::compact() masks plyr::compact()
## x dplyr::count() masks plyr::count()
## x dplyr::failwith() masks plyr::failwith()
## x dplyr::filter() masks stats::filter()
## x dplyr::id() masks plyr::id()
## x dplyr::lag() masks stats::lag()
## x dplyr::mutate() masks plyr::mutate()
## x dplyr::rename() masks plyr::rename()
## x dplyr::summarise() masks plyr::summarise()
## x dplyr::summarize() masks plyr::summarize()
library(formattable)
library(splitstackshape)
library(jsonlite)
##
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
##
## flatten
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
library(ggthemes)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(RSentiment)
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(stringr)
movie=read_csv("D:/创新研究/tmdb_5000_movies.csv",col_names=TRUE,na="NA")
## Parsed with column specification:
## cols(
## .default = col_character(),
## budget = col_double(),
## id = col_double(),
## popularity = col_double(),
## release_date = col_date(format = ""),
## revenue = col_double(),
## runtime = col_double(),
## vote_average = col_double(),
## vote_count = col_double()
## )
## See spec(...) for full column specifications.
credits=read_csv("D:/创新研究/tmdb_5000_credits.csv",col_names=TRUE,na="NA")
## Parsed with column specification:
## cols(
## movie_id = col_double(),
## title = col_character(),
## cast = col_character(),
## crew = col_character()
## )
glimpse(movie)
## Observations: 4,803
## Variables: 20
## $ budget <dbl> 2.37e+08, 3.00e+08, 2.45e+08, 2.50e+08, 2...
## $ genres <chr> "[{\"id\": 28, \"name\": \"Action\"}, {\"...
## $ homepage <chr> "http://www.avatarmovie.com/", "http://di...
## $ id <dbl> 19995, 285, 206647, 49026, 49529, 559, 38...
## $ keywords <chr> "[{\"id\": 1463, \"name\": \"culture clas...
## $ original_language <chr> "en", "en", "en", "en", "en", "en", "en",...
## $ original_title <chr> "Avatar", "Pirates of the Caribbean: At W...
## $ overview <chr> "In the 22nd century, a paraplegic Marine...
## $ popularity <dbl> 150.43758, 139.08262, 107.37679, 112.3129...
## $ production_companies <chr> "[{\"name\": \"Ingenious Film Partners\",...
## $ production_countries <chr> "[{\"iso_3166_1\": \"US\", \"name\": \"Un...
## $ release_date <date> 2009-12-10, 2007-05-19, 2015-10-26, 2012...
## $ revenue <dbl> 2787965087, 961000000, 880674609, 1084939...
## $ runtime <dbl> 162, 169, 148, 165, 132, 139, 100, 141, 1...
## $ spoken_languages <chr> "[{\"iso_639_1\": \"en\", \"name\": \"Eng...
## $ status <chr> "Released", "Released", "Released", "Rele...
## $ tagline <chr> "Enter the World of Pandora.", "At the en...
## $ title <chr> "Avatar", "Pirates of the Caribbean: At W...
## $ vote_average <dbl> 7.2, 6.9, 6.3, 7.6, 6.1, 5.9, 7.4, 7.3, 7...
## $ vote_count <dbl> 11800, 4500, 4466, 9106, 2124, 3576, 3330...
summary(movie)
## budget genres homepage
## Min. : 0 Length:4803 Length:4803
## 1st Qu.: 790000 Class :character Class :character
## Median : 15000000 Mode :character Mode :character
## Mean : 29045040
## 3rd Qu.: 40000000
## Max. :380000000
##
## id keywords original_language original_title
## Min. : 5 Length:4803 Length:4803 Length:4803
## 1st Qu.: 9014 Class :character Class :character Class :character
## Median : 14629 Mode :character Mode :character Mode :character
## Mean : 57166
## 3rd Qu.: 58611
## Max. :459488
##
## overview popularity production_companies
## Length:4803 Min. : 0.000 Length:4803
## Class :character 1st Qu.: 4.668 Class :character
## Mode :character Median : 12.922 Mode :character
## Mean : 21.492
## 3rd Qu.: 28.314
## Max. :875.581
##
## production_countries release_date revenue
## Length:4803 Min. :1916-09-04 Min. :0.000e+00
## Class :character 1st Qu.:1999-07-14 1st Qu.:0.000e+00
## Mode :character Median :2005-10-03 Median :1.917e+07
## Mean :2002-12-27 Mean :8.226e+07
## 3rd Qu.:2011-02-16 3rd Qu.:9.292e+07
## Max. :2017-02-03 Max. :2.788e+09
## NA's :1
## runtime spoken_languages status tagline
## Min. : 0.0 Length:4803 Length:4803 Length:4803
## 1st Qu.: 94.0 Class :character Class :character Class :character
## Median :103.0 Mode :character Mode :character Mode :character
## Mean :106.9
## 3rd Qu.:118.0
## Max. :338.0
## NA's :2
## title vote_average vote_count
## Length:4803 Min. : 0.000 Min. : 0.0
## Class :character 1st Qu.: 5.600 1st Qu.: 54.0
## Mode :character Median : 6.200 Median : 235.0
## Mean : 6.092 Mean : 690.2
## 3rd Qu.: 6.800 3rd Qu.: 737.0
## Max. :10.000 Max. :13752.0
##
glimpse(credits)
## Observations: 4,803
## Variables: 4
## $ movie_id <dbl> 19995, 285, 206647, 49026, 49529, 559, 38757, 99861, ...
## $ title <chr> "Avatar", "Pirates of the Caribbean: At World's End",...
## $ cast <chr> "[{\"cast_id\": 242, \"character\": \"Jake Sully\", \...
## $ crew <chr> "[{\"credit_id\": \"52fe48009251416c750aca23\", \"dep...
summary(credits)
## movie_id title cast crew
## Min. : 5 Length:4803 Length:4803 Length:4803
## 1st Qu.: 9014 Class :character Class :character Class :character
## Median : 14629 Mode :character Mode :character Mode :character
## Mean : 57166
## 3rd Qu.: 58611
## Max. :459488
#Cleaning the Movie Dataset
genredf=movie %>% filter(nchar(genres)>2) %>% mutate(js=lapply(genres,fromJSON)) %>% unnest(js) %>% select(id,title,genre=name) #Convert JSON format into data frame
slice(genredf)
## # A tibble: 12,160 x 3
## id title genre
## <dbl> <chr> <chr>
## 1 19995 Avatar Action
## 2 19995 Avatar Adventure
## 3 19995 Avatar Fantasy
## 4 19995 Avatar Science Fiction
## 5 285 Pirates of the Caribbean: At World's End Adventure
## 6 285 Pirates of the Caribbean: At World's End Fantasy
## 7 285 Pirates of the Caribbean: At World's End Action
## 8 206647 Spectre Action
## 9 206647 Spectre Adventure
## 10 206647 Spectre Crime
## # ... with 12,150 more rows
#Wordcloud of Genre Representation
temp=genredf %>% group_by(genre) %>% summarise(count=length(genre)) %>% arrange(desc(count))
wordcloud(words=temp$genre,freq=temp$count,min.freq=100,max.words = 20,random.order=FALSE,random.color=FALSE,rot.per=0.35,colors = brewer.pal(20,"Dark2"),scale=c(5,.2))
## Warning in brewer.pal(20, "Dark2"): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors
Comedy,Drama,Thriller and Action are most represented.
#Movies with highest budget
movie %>% select(original_title,budget) %>% drop_na(original_title)%>% arrange(desc(budget)) %>% head(10) %>% ggplot(aes(reorder(original_title,budget),budget,fill=original_title))+geom_bar(stat="identity")+theme(axis.text.x = element_text(angle=90),plot.title=element_text(color="Red",face="italic"),legend.position="none")+scale_y_continuous(labels=scales::comma)+labs(x="",y="Total Budget in $",title="Most Expensive Movies -Top 10")
#Highest grossing movies
movie %>% select(original_title,revenue) %>% drop_na(original_title)%>% arrange(desc(revenue)) %>% head(10) %>% ggplot(aes(reorder(original_title,revenue),revenue,fill=original_title))+geom_bar(stat="identity")+theme(axis.text.x = element_text(angle=90),plot.title=element_text(color="Red",face="italic"),legend.position="none")+scale_y_continuous(limits=c(0,3000000000),breaks=seq(0,3000000000,500000000),labels=scales::comma)+labs(x="",y="Total Revenue in $",title="Highest Grossing Movies -Top 10")
#Movies with highest popularity
movie %>% group_by(original_title) %>% arrange(desc(popularity)) %>% head(10) %>% ggplot(aes(factor(original_title,levels=original_title),popularity,fill=original_title))+geom_bar(stat="identity")+theme_few()+theme(axis.text.x=element_text(angle=90,hjust=0.5),plot.title=element_text(hjust=0.5,size=15,color="red"),legend.position="none")+labs(x="Title",y="Rating",title="Popularity of Movies")+scale_x_discrete(labels=function(x)str_wrap(x,width=15))
#Production companies
production=movie%>% filter(nchar(production_companies)>2)%>% mutate(js=lapply(production_companies,fromJSON)) %>% unnest(js)%>% select(budget,revenue,company=name)
lapply(production,class)
## $budget
## [1] "numeric"
##
## $revenue
## [1] "numeric"
##
## $company
## [1] "character"
temp=production %>% group_by(company) %>% summarise(count=n()) %>% arrange(desc(count))
wordcloud(words=temp$company,freq=temp$count,max.words = 25,color =rainbow(7),scale=c(2,0.2))
Warner Bros,Universal Pictures,Parmount Pictures,20th century fox are represented more in this dataset.
corpus=Corpus(VectorSource(list(movie$tagline)))
corpus=tm_map(corpus,removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation
## drops documents
corpus=tm_map(corpus,content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
corpus=tm_map(corpus,stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation
## drops documents
corpus=tm_map(corpus,removeWords,stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
dtm_tag=DocumentTermMatrix(VCorpus(VectorSource(corpus[1]$content)))
freq_tag=colSums(as.matrix(dtm_tag))
sent_tag=calculate_sentiment(names(freq_tag)) %>% cbind(as.data.frame(freq_tag))
Now its time to calcuate the sentiment.
positive=sent_tag[sent_tag$sentiment=="Positive",]
negative=sent_tag[sent_tag$sentiment=="Negative",]
cat("Positive Sentiment:",sum(positive$freq_tag),"Negative Sentiment:",sum(negative$freq_tag),sep="\n")
## Positive Sentiment:
## 3184
## Negative Sentiment:
## 2523
par(mfrow=c(1,2))
wordcloud(positive$text,positive$freq_tag, min.freq=10,rot.per=0,vfont=c("sans serif","plain"))
text(x=0.5, y=1.05, "Positive",col=brewer.pal(4, "Reds"))
wordcloud(negative$text,negative$freq_tag, min.freq=7,rot.per=0,vfont=c("sans serif","plain"))
text(x=0.5, y=1.05, "Negative",col=brewer.pal(4, "Reds"))
#Release date,year,month
class(movie$release_date)
## [1] "Date"
movie$Year=as.factor(format(movie$release_date,"%Y"))
movie$Date=as.factor(format(movie$release_date,"%d"))
movie$month=month.abb[(as.factor(format(movie$release_date,"%m")))]
movie %>% group_by(month) %>% drop_na(month) %>% summarise(count=n()) %>% arrange(desc(month)) %>% ggplot(aes(reorder(month,count),count,fill=month))+geom_bar(stat="identity")+theme(plot.title=element_text(size=14,face="italic",colour="red"),axis.text.x = element_text(angle=90),legend.position="none")+labs(x="",y="Total number of movies released",title="Number of Movies Releases per month")+coord_flip()+geom_label(aes(label=count))
movie %>% drop_na(month) %>% ggplot(aes(month,vote_average,fill=month))+geom_boxplot(outlier.colour = "red",na.rm=TRUE)+theme(plot.title=element_text(size=14,face="italic",colour="red"),axis.text.x = element_text(angle=90),legend.position="none")+labs(x="",y="Average Vote",title="Boxplot of Average votes received by month")+coord_flip()
movie$month=factor(movie$month,levels=month.abb,ordered=TRUE)
#temp=movie %>% group_by(month) %>% filter(vote_average!=0)%>% #summarise(median=median(vote_average),low=min(vote_average),high=max(vote_average)) %>% mutate(score=paste(low,"/",high))
temp=movie %>% drop_na(vote_average,month) %>% filter(vote_average!=0) %>% ddply(.(month),function(x){c(mid=median(x$vote_average),low=min(x$vote_average),high=max(x$vote_average))})
temp=temp[order(temp$month),]
formattable(temp,align=c("l","c","c","c"),list(mid=color_tile('lightblue','white'),low=color_bar('orange'),high=color_bar('lightgreen')))
| month | mid | low | high |
|---|---|---|---|
| Jan | 6.0 | 0.5 | 10.0 |
| Feb | 6.1 | 3.0 | 8.3 |
| Mar | 6.0 | 2.7 | 8.4 |
| Apr | 6.1 | 2.0 | 8.2 |
| May | 6.3 | 3.0 | 10.0 |
| Jun | 6.3 | 2.0 | 10.0 |
| Jul | 6.3 | 3.5 | 10.0 |
| Aug | 6.0 | 1.9 | 9.3 |
| Sep | 6.4 | 1.0 | 8.5 |
| Oct | 6.3 | 3.5 | 8.3 |
| Nov | 6.4 | 1.0 | 8.3 |
| Dec | 6.5 | 2.3 | 8.3 |
credit=credits %>% filter(nchar(cast)>2) %>% mutate(js=lapply(cast,fromJSON)) %>% unnest(js)
cat("Before JS the columns are ",names(credit),sep='\n')
## Before JS the columns are
## movie_id
## title
## cast
## crew
## cast_id
## character
## credit_id
## gender
## id
## name
## order
credit= credit%>% select(-c(crew,cast_id,credit_id,id))
cat("After removing id columns the names are",names(credit),sep='\n')
## After removing id columns the names are
## movie_id
## title
## cast
## character
## gender
## name
## order
#Artist with most movies: Let us see which artist has acted in more movies
credit %>% group_by(name) %>% tally() %>% arrange(desc(n)) %>% head(10) %>% ggplot(aes(factor(name,levels=name),n,fill=name))+geom_bar(stat="identity")+labs(x="Artist",y="Count",title="Top 10 artist with most movies")+theme_few()+theme(axis.text.x=element_text(angle=90),plot.title=element_text(hjust=0.5,color="red"),legend.position="none")
db=movie %>% left_join(credits,by=c("id"="movie_id"))
db_credit=db %>% filter(nchar(cast)>2) %>% mutate(js=lapply(cast,fromJSON)) %>% unnest(js)
revenue=function (df,col_name,x,y,title){
temp_df=df %>% filter(name==col_name) %>% arrange(desc(revenue)) %>% head(10)
df_plot= ggplot(temp_df,aes(reorder(original_title,revenue),revenue,fill=original_title))+geom_bar(stat="identity")+theme_few()+theme(axis.text.x = element_text(angle=90,vjust=0.5),plot.title=element_text(hjust=0.5,size=15),legend.position="none")+labs(x=x,y=y,title=title)+coord_flip()+scale_x_discrete(labels=function(x)str_wrap(x,width=15))+scale_y_continuous(labels=scales::comma)
print(df_plot)
}
Let us see which artists movies have been made at high budgets.
db_credit %>% filter(order==0) %>% group_by(name) %>% summarise(total=sum(budget)) %>% arrange(desc(total)) %>% head(10) %>% ggplot(aes(factor(name,levels=name),total,fill=name))+geom_bar(stat="identity")+theme(legend.position="none",plot.title=element_text(size=10,hjust=0.5),axis.text.x=element_text(angle=90))+labs(x="Actor Name",y="Sum of the budget",title="Actor with Highest Budgets till date..")+scale_y_continuous(labels=scales::comma)
Let us correlate budget and popularity and see if there exist any correlation.
get_cor <- function(df){
m <- cor(df$x,df$y, use="pairwise.complete.obs");
eq <- substitute(expr=r==cor,env=list(cor=format(m, digits = 4)))
return(as.character(as.expression(eq) ))
}
temp=db_credit %>% select(budget,popularity) %>% distinct()
ggplot(temp,aes(budget,popularity))+stat_bin_hex(bins=15)+scale_fill_distiller(palette="Spectral")+stat_smooth(method="lm",color="orchid",size=2)+scale_x_continuous(labels=scales::comma)
get_cor(data.frame(x=temp$budget,y=temp$popularity))
## [1] "r == \"0.5033\""
temp=db_credit %>% filter(name=="Tom Cruise") %>% subset(!(duplicated(original_title))) %>% arrange(desc(budget)) %>% head(20)
ggplot(temp,aes(budget,as.integer(popularity),col=factor(original_title),size=popularity))+geom_point()+stat_smooth(method="lm",color="orchid",size=2)+theme(legend.position="bottom",plot.title=element_text(hjust=0.5,size=10),axis.text.x = element_text(angle = 90,hjust=0.5))+labs(x="Revenue",y="Popularity",title="Tom Cruise-Correlation between popularity and budget",subtitle="Filter by Top 20 Higest budget movies")+scale_x_continuous(labels=scales::comma)