options(width=100)
knitr::opts_chunk$set(out.width='1000px',dpi=200,message=FALSE,warning=FALSE)
#load packages and csv file
library(ggplot2)
library(dplyr)
library(gridExtra)
library(grid)
library(ggthemes)
library(knitr)
library(ggdendro)
library(factoextra)
library(NbClust)
Following 538’s analysis on the Four types of Scarlett Johansson
, I had an updated look at this data (box office vs. RottenTomatoes score) in light of the release of her new movie : Ghost in the Shell
This kernel simply :
GITS
Data are taken from the same source of the article ; for GITS
box office and RottenTomatoes (Google)
df<-read.csv('scarlett_johansson_3.csv',sep=',')
head(df)
## X date Movie Role Domestic International
## 1 1 3/31/17 Ghost in the Shell The Major 33304297 96942362
## 2 2 12/21/16 Sing Ash 269943555 343600000
## 3 3 5/6/16 Captain America: Civil War Natasha Romanoff / Black Widow 408084349 743600000
## 4 4 4/15/16 The Jungle Book Kaa 364001123 599900000
## 5 5 2/5/16 Hail, Caesar! DeeAnna Moran 30080225 34091194
## 6 6 5/1/15 Avengers: Age of Ultron Natasha Romanoff/Black Widow 459005868 945700000
## Worldwide RottenTomatoes CAT DATE year
## 1 130246659 46 OTHER 2017-03-31 2017
## 2 613543555 73 AVENGERS 2016-12-21 2016
## 3 1151684349 90 AVENGERS 2016-05-06 2016
## 4 963901123 95 AVENGERS 2016-04-15 2016
## 5 64171419 86 HER 2016-02-05 2016
## 6 1404705868 75 AVENGERS 2015-05-01 2015
538
The categories define here as the same as W. Hickey in his article. Even by eye we can see that GITS may fit in the Peers
category : low RottenTomatoes score and low revenues at the box office.
g1<-ggplot() +
geom_point(data=filter(df,CAT=='AVENGERS'),aes(x=RottenTomatoes,y=Domestic,color='Avengers'),size=3) +
geom_point(data=filter(df,CAT=='HER'),aes(x=RottenTomatoes,y=Domestic,color='Her'),size=3) +
geom_point(data=filter(df,CAT=='PEERS'),aes(x=RottenTomatoes,y=Domestic,color='Peers'),size=3) +
geom_point(data=filter(df,CAT=='PRESTIGIOUS'),aes(x=RottenTomatoes,y=Domestic,color='Prestigious'),size=3) +
geom_point(data=filter(df,CAT=='OTHER'),aes(x=RottenTomatoes,y=Domestic,color='GITS'),size=3) +
scale_color_manual(name="",values = c(Avengers="orange",Her="#46ACC8",Peers="pink",Prestigious="#0B775E",GITS="#F21A00")) + theme_fivethirtyeight() + xlim(0,100) + theme(legend.position="top") + xlab('Rotten Tomatoes') + ylab('Domestic box office') + ggtitle("Domestic box office ($US) vs. RottenTomatoes score")
g1
Taking the yearly average of the RottenTomatoes score of the movies reeased in a given year, we can see when Johansson was hype
or less hype
.
g2<-df %>%
group_by(year) %>%
select(year,RottenTomatoes) %>%
summarise(meanRT = mean(RottenTomatoes)) %>%
ggplot(aes(x=year,y=meanRT)) +
geom_line(size=1.5,color='#E2D200') +
geom_point(aes(color=ifelse(year==2017,'gits','old')),size=2) +
ylim(0,100) +
scale_color_manual(name="",values = c(gits="#F21A00",old="black")) +
theme_fivethirtyeight() +
theme(legend.position='none') + xlab('year') + ylab('average Rotten Tomatoes') +
ggtitle("RottenTomatoes movies score vs. Year")
g2
#grid.arrange(g1,g2,ncol=2)
The goal here was to check/find new clusters ##Dendogram
tt <-data.frame(df %>% select(Movie,Domestic,RottenTomatoes))
rownames(tt)<-tt$Movie
tt$Movie<-NULL
dd <- dist(scale(tt), method = "euclidean")
hc <- hclust(dd, method = "ward.D2")
ggdendrogram(hc, rotate = FALSE, theme_dendro = FALSE) + xlab('') + ylab('')
The optimal cut is based on the Within cluster sum of squares
to find meaningful clusters.
nb <- NbClust(scale(tt), distance = "euclidean", min.nc = 2,max.nc = 10, method = "ward.D2", index ="all")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 2 proposed 2 as the best number of clusters
## * 12 proposed 3 as the best number of clusters
## * 5 proposed 4 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 3 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
fviz_nbclust(nb)
## Among all indices:
## ===================
## * 2 proposed 0 as the best number of clusters
## * 1 proposed 1 as the best number of clusters
## * 2 proposed 2 as the best number of clusters
## * 12 proposed 3 as the best number of clusters
## * 5 proposed 4 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 3 proposed 10 as the best number of clusters
##
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is 3 .
grp<-cutree(hc, k=3)
fviz_cluster(list(data = tt, cluster = grp)) + ggtitle('')
tt$group<-grp
ggplot() + geom_point(data=tt,aes(x=RottenTomatoes,y=Domestic,color=factor(group)),size=3) +
scale_color_manual(name="",values = c("#46ACC8","#0B775E","#F21A00")) +
theme_fivethirtyeight() + xlim(0,100) +
theme(legend.position="top") + xlab('Rotten Tomatoes') + ylab('Domestic box office') + ggtitle("Domestic box office ($US) vs. RottenTomatoes score")
NbClust
wasn’t able to differentiate Her
and Prestigious
clusters from the 538 analysis using hte optimal number of clusters,GITS
may be categorized in the Peers
categoryHistory :