Data analysis with non-hierarchical clustering (algorithm k-means) on data about movie lines distribution. The analysis was made on the dataset Polygraph’s Film Dialogue. Information about the dataset and how it was generated can be found in its original repository.
readr::read_csv(here("data/character_list5.csv"),
progress = FALSE,
col_types = cols(
script_id = col_integer(),
imdb_character_name = col_character(),
words = col_integer(),
gender = col_character(),
age = col_character()
)) %>%
mutate(age = as.numeric(age)) -> characters_list
readr::read_csv(here("data/meta_data7.csv"),
progress = FALSE,
col_types = cols(
script_id = col_integer(),
imdb_id = col_character(),
title = col_character(),
year = col_integer(),
gross = col_integer(),
lines_data = col_character()
)) %>%
mutate(title = iconv(title,"latin1", "UTF-8")) -> meta_data
left_join(characters_list,
meta_data,
by=c("script_id")) %>%
group_by(title, year) %>%
ungroup() -> scripts_data
scripts_data %>%
glimpse()
## Observations: 23,048
## Variables: 10
## $ script_id <int> 280, 280, 280, 280, 280, 280, 280, 623, 62...
## $ imdb_character_name <chr> "betty", "carolyn johnson", "eleanor", "fr...
## $ words <int> 311, 873, 138, 2251, 190, 723, 1908, 328, ...
## $ gender <chr> "f", "f", "f", "f", "f", "m", "m", "m", "f...
## $ age <dbl> 35, NA, NA, 46, 46, 38, 65, NA, 28, NA, 58...
## $ imdb_id <chr> "tt0112579", "tt0112579", "tt0112579", "tt...
## $ title <chr> "The Bridges of Madison County", "The Brid...
## $ year <int> 1995, 1995, 1995, 1995, 1995, 1995, 1995, ...
## $ gross <int> 142, 142, 142, 142, 142, 142, 142, 37, 37,...
## $ lines_data <chr> "43320234343434432034334343344334343434344...
scripts_data %>%
mutate(fem_words = ifelse(gender == "f",words,0),
man_words = ifelse(gender == "m",words,0)) %>%
group_by(title, year) %>%
mutate(total_fem_words = sum(fem_words),
total_man_words = sum(man_words)) %>%
filter(total_fem_words != 0) %>%
filter(total_man_words != 0) %>%
mutate(f_m_ratio = sum(gender == "f")/sum(gender == "m"),
mean_fem_words = ifelse(sum(gender == "f") == 0, 0, sum(fem_words)/sum(gender == "f")),
f_m_wordratio = total_fem_words/total_man_words) %>%
ungroup() %>%
drop_na() -> scripts_data
scripts_data %>%
select(title,
year,
f_m_ratio,
f_m_wordratio,
mean_fem_words) %>%
sample_n(10)
scripts_data %>%
group_by(title,year) %>%
slice(1) %>%
unique() %>%
ggplot(aes(x=f_m_wordratio,
y=(..count..)/sum(..count..))) +
geom_histogram(binwidth = 0.1,
boundary = 0,
fill = "grey",
color = "black") +
labs(y="Relative Frequency",
x="female/male wordratio")
scripts_data %>%
group_by(title,year) %>%
slice(1) %>%
unique() %>%
filter(f_m_wordratio < 10) %>%
ggplot(aes(x=f_m_wordratio,
y=(..count..)/sum(..count..))) +
geom_histogram(binwidth = 0.1,
fill = "grey",
color = "black") +
labs(y="Relative Frequency",
x="female/male wordratio")
scripts_data %>%
group_by(title,year) %>%
slice(1) %>%
unique() %>%
ggplot(aes(x="",
y=f_m_wordratio)) +
geom_violin(fill="grey",
width=0.5) +
labs(y="female/male wordratio") +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
scripts_data %>%
group_by(title,year) %>%
slice(1) %>%
unique() %>%
ggplot(aes(x=f_m_ratio,
y=(..count..)/sum(..count..))) +
geom_histogram(binwidth = 0.1,
boundary = 0,
fill = "grey",
color = "black") +
scale_x_continuous(breaks = seq(0,10,0.5)) +
labs(y="Relative Frequency",
x="(female chars / male chars) ratio")
scripts_data %>%
group_by(title,year) %>%
slice(1) %>%
unique() %>%
ggplot(aes(x="",
y=f_m_ratio)) +
geom_violin(fill="grey",
width=0.5) +
labs(y="(female chars / male chars) ratio") +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
scripts_data %>%
group_by(title,year) %>%
unique() %>%
filter(!mean_fem_words == 0) %>%
ggplot(aes(x=mean_fem_words,
y=(..count..)/sum(..count..))) +
geom_histogram(binwidth = 250,
boundary = 0,
fill = "grey",
color = "black") +
labs(y="Relative Frequency",
x="Mean of female words") +
scale_x_continuous(breaks = seq(0,7000,500))
scripts_data %>%
group_by(title,year) %>%
unique() %>%
filter(!mean_fem_words == 0) %>%
ggplot(aes(x="",
y=mean_fem_words)) +
geom_violin(fill="grey",
width=0.5) +
labs(y="Mean of female words") +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
scripts_data %>%
group_by(title,year) %>%
slice(1) %>%
unique() %>%
ggplot(aes(x=year)) +
geom_bar(fill = "grey",
color = "black") +
labs(y="Absolute Frequency",
x="Year of release")
scripts_data %>%
group_by(title,year) %>%
slice(1) %>%
unique() %>%
ggplot(aes(x="",
y=year)) +
geom_violin(fill="grey",
width=0.5) +
labs(y="Year of release") +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
scripts_data %>%
group_by(title,year) %>%
slice(1) %>%
unique() %>%
ggplot(aes(x=gross,
y=(..count..)/sum(..count..))) +
geom_histogram(binwidth = 50,
boundary = 0,
fill = "grey",
color = "black") +
labs(y="Relative Frequency", x="Gross")
scripts_data %>%
group_by(title,year) %>%
slice(1) %>%
unique() %>%
ggplot(aes(x="",
y=gross)) +
geom_violin(fill="grey",
width=0.5) +
labs(y="Gross") +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
scripts_data %>%
group_by(title) %>%
slice(1) %>%
unique() %>%
ungroup() %>%
select(title,
gross,
f_m_ratio,
f_m_wordratio,
mean_fem_words) %>%
na.omit() -> data
select(data, -title) %>%
mutate_all(funs(scale)) %>%
na.omit() -> scaled_data
scaled_data %>%
sample_n(10)
The GAP Statistic compares the grouping results with each available k in a data-set where there isn’t grouping structure.
plot_clusgap = function(clusgap, title="Gap Statistic calculation results"){
require("ggplot2")
gstab = data.frame(clusgap$Tab, k=1:nrow(clusgap$Tab))
p = ggplot(gstab, aes(k, gap)) + geom_line() + geom_point(size=5)
p = p + geom_errorbar(aes(ymax=gap+SE.sim, ymin=gap-SE.sim), width = .2)
p = p + ggtitle(title)
return(p)
}
plot_clusgap(gaps)
set.seed(23)
# Compute and plot wss for k = 2 to k = 15.
k.max <- 15
wss <- sapply(1:k.max,
function(k){kmeans(scaled_data, k, nstart=50,iter.max = 15 )$tot.withinss})
plot(1:k.max, wss,
type="b", pch = 19, frame = FALSE,
xlab="Number of clusters K",
ylab="Total within-clusters sum of squares")
set.seed(23)
nb <- NbClust(scaled_data, diss=NULL, distance = "euclidean",
min.nc=2, max.nc=5, method = "kmeans",
index = "all", alphaBeale = 0.1)
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 3 proposed 2 as the best number of clusters
## * 5 proposed 3 as the best number of clusters
## * 2 proposed 4 as the best number of clusters
## * 13 proposed 5 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 5
##
##
## *******************************************************************
hist(nb$Best.nc[1,], breaks = max(na.omit(nb$Best.nc[1,])))
We’ll choose 3 groups (K=3) because the majority of the employed tests point 3 groups as the best solution, and in terms of explanation (in terms of a human agent) K=3 gave space for a better explanation than K=5.
set.seed(23)
n_clusters = 3
scaled_data %>%
kmeans(n_clusters, iter.max = 100, nstart = 20) -> km
p <- autoplot(km, data=scaled_data, frame = TRUE)
ggplotly(p)
set.seed(23)
row.names(scaled_data) <- data$title
toclust <- scaled_data %>%
rownames_to_column(var = "title")
km <- toclust %>%
select(-title) %>%
kmeans(centers = n_clusters, iter.max = 100, nstart = 20)
km %>%
augment(toclust) %>%
gather(key = "variável", value = "valor", -title, -.cluster) %>%
ggplot(aes(x = `variável`, y = valor, group = title, colour = .cluster)) +
geom_point(alpha = 0.2) +
geom_line(alpha = .5) +
facet_wrap(~ .cluster) +
labs(y="Value",
x="Dimension") +
coord_flip()
\(\color{red}{\text{Grupo 1}}\) - It’s A Man’s Man’s Man’s World
It’s A Man’s Man’s Man’s World is the group of movies of smallest female representativity, be it in the proportion of female characters or in the proportion of dialogue spoken by female characters. Curiously this is also the group with the Highest-grossing movies among all movies analyzed. This suggests an unfortunate association between lack of female representativity and profitable movies.
The name of the group refers to James Brown's famous song, which was written by his then girlfriend Betty Jean Newsome and comments on the dysfunctional dynamic between genders.
\(\color{green}{\text{Grupo 2}}\) - We Can Do It!
We Can Do It! is the group with the most female representativity, be it in the proportion of female characters or in the proportion of dialogue spoken by female characters. With a decent grossing which is better than that of the third group but nowhere near that of the first group.
The name of the group refers to the famous poster from J. Howard Miller published in 1943 motivating women to participate in the war effort in the factories.
\(\color{blue}{\text{Grupo 3}}\) - On the fence
The name of the group refers to the expression about not taking a side.
set.seed(23)
dists = scaled_data %>%
dist()
scaled_data %>%
kmeans(3, iter.max = 100, nstart = 20) -> km
silhouette(km$cluster, dists) %>%
plot(col = RColorBrewer::brewer.pal(4, "Set2"),
main="",
border=NA)