# load the Netflix titles dataset
netflix_dataset <- read.csv("/Users/macbookpro/Desktop/netflix_titles.csv")
# packages for data wrangling and plotting
library(ggplot2)
library(dplyr)
library(tidyr)
library(tidyverse)
library(forcats)
# packages specific to text mining
library(tidytext)
library(tidylo)
library(scales)
theme_set(theme_light())
This section explores and analyses the relationship patterns in the descriptions and titles of Netflix releases.
# tokenize descriptions into single words and remove common stop words
word_unnested <- netflix_dataset %>%
unnest_tokens(word, description) %>%
anti_join(stop_words, by = "word") %>%
select(type, title, word, listed_in)
head(word_unnested)
## type title word listed_in
## 1 TV Show 3% future International TV Shows, TV Dramas, TV Sci-Fi & Fantasy
## 2 TV Show 3% elite International TV Shows, TV Dramas, TV Sci-Fi & Fantasy
## 3 TV Show 3% inhabit International TV Shows, TV Dramas, TV Sci-Fi & Fantasy
## 4 TV Show 3% island International TV Shows, TV Dramas, TV Sci-Fi & Fantasy
## 5 TV Show 3% paradise International TV Shows, TV Dramas, TV Sci-Fi & Fantasy
## 6 TV Show 3% crowded International TV Shows, TV Dramas, TV Sci-Fi & Fantasy
In the plot below, words close to the diagonal line have similar frequencies in movie and TV show descriptions, while words far from the line occur much more often in one type than the other.
# word frequencies as a share of all words, per type (Movie vs TV Show)
freq_table <- word_unnested %>%
count(type, word) %>%
group_by(type) %>%
mutate(percent = n / sum(n)) %>%
select(-n) %>%
pivot_wider(names_from = type, values_from = percent)
freq_table %>%
filter(!is.na(`TV Show`), !is.na(Movie)) %>%
ggplot(aes(Movie,`TV Show`,color = abs(Movie - `TV Show`)))+
geom_jitter(alpha = .1, size = 2.5, width = .3, height = .3)+
scale_x_log10(labels = percent_format())+
scale_y_log10(labels = percent_format())+
scale_color_gradient(limits = c(0,0.001),
low = "darkslategray4",
high = "gray75")+
geom_abline(color = "gray40", lty = 2)+
geom_text(aes(label = word),
check_overlap = TRUE,
vjust = 1.5)+
theme(legend.position = "none")+
ggtitle("Comparison of word frequencies in description")
The similarity between the two frequency distributions can be further quantified with a correlation test.
cor.test(freq_table$Movie,freq_table$`TV Show`)
##
## Pearson's product-moment correlation
##
## data: freq_table$Movie and freq_table$`TV Show`
## t = 109.82, df = 6153, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8051235 0.8220107
## sample estimates:
## cor
## 0.8137388
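Beyond the overall correlation, the tidylo package loaded above can rank the words most distinctive of each type via weighted log odds. A minimal sketch, not part of the original analysis:
# weighted log odds of each word by type; higher values mean more distinctive of that type
word_unnested %>%
  count(type, word, sort = TRUE) %>%
  bind_log_odds(type, word, n) %>%
  group_by(type) %>%
  slice_max(log_odds_weighted, n = 10)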
Sentiment analysis classifies the emotional tone of a piece of text as positive, negative, or neutral.
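As a minimal illustration (not part of the pipeline below), individual words can be scored by joining them against a sentiment lexicon such as "bing", introduced shortly; the words here are hypothetical examples:
# label a few example words with the bing lexicon;
# words absent from the lexicon (effectively neutral) are dropped by the inner join
tibble(word = c("love", "terrible", "island")) %>%
  inner_join(get_sentiments("bing"), by = "word")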
## prepare data: one row per movie title word, with its genre and release year
movie_title <- netflix_dataset %>%
filter(type == "Movie") %>%
separate_rows(listed_in, sep = ", ") %>%
mutate(genre = fct_lump(listed_in, 9)) %>% # keep the 9 most common genres, lump the rest
select(title, genre, release_year) %>%
unnest_tokens(word, title) %>%
mutate(word = str_extract(word, "[a-z]+")) %>%
## keep only the alphabetic part of each token, dropping numbers and surrounding symbols
filter(!is.na(word)) %>%
filter(!word %in% stop_words$word)
head(movie_title)
## # A tibble: 6 × 3
## genre release_year word
## <fct> <int> <chr>
## 1 Dramas 2014 oct
## 2 International Movies 2014 oct
## 3 Thrillers 2014 oct
## 4 Dramas 2018 jul
## 5 Thrillers 2018 jul
## 6 Comedies 2019 aug
Here the “bing” lexicon is used, which categorises words in a binary fashion as either positive or negative.
get_sentiments("bing")
## # A tibble: 6,786 × 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ℹ 6,776 more rows
# net sentiment of movie title words per genre and release year
movietit_senti <- movie_title %>%
inner_join(get_sentiments("bing")) %>%
count(genre, release_year, sentiment) %>%
pivot_wider(names_from = sentiment,
values_from = n,
values_fill = 0) %>%
mutate(sentiment = positive - negative) %>%
group_by(genre)
## Joining with `by = join_by(word)`
head(movietit_senti)
## # A tibble: 6 × 5
## # Groups: genre [1]
## genre release_year negative positive sentiment
## <fct> <int> <int> <int> <int>
## 1 Action & Adventure 1956 1 0 -1
## 2 Action & Adventure 1975 0 1 1
## 3 Action & Adventure 1976 1 0 -1
## 4 Action & Adventure 1979 1 0 -1
## 5 Action & Adventure 1980 0 1 1
## 6 Action & Adventure 1981 1 0 -1
movietit_senti %>%
ggplot(aes(release_year,sentiment,fill = genre))+
geom_col()+
facet_wrap(~genre,ncol = 2, scales = "free")+
theme(legend.position = "none")+
ggtitle("Sentiment trend over years of Netflix movie titles")
# set up datasets for processing: separated bigrams and united bigrams after filtering
movietit_bigram_sep <- netflix_dataset %>%
unnest_tokens(bigram, description, token = "ngrams", n = 2) %>%
filter(!is.na(bigram),type == "Movie") %>%
select(bigram) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!word1 %in% stop_words$word,
!word2 %in% stop_words$word) %>%
mutate(word1 = str_extract(word1,"[a-z]+"),
word2 = str_extract(word2,"[a-z]+")) %>%
filter(!is.na(word1), !is.na(word2))
head(movietit_bigram_sep)
## word1 word2
## 1 devastating earthquake
## 2 earthquake hits
## 3 hits mexico
## 4 mexico city
## 5 city trapped
## 6 trapped survivors
# the united data will be used in a later frequency analysis (see Netflix dataset [3])
movietit_bigram_united <- movietit_bigram_sep %>%
unite(bigram, word1, word2, sep = " ")
head(movietit_bigram_united)
## bigram
## 1 devastating earthquake
## 2 earthquake hits
## 3 hits mexico
## 4 mexico city
## 5 city trapped
## 6 trapped survivors
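As a quick illustration (a sketch, not the later analysis referenced above), the united bigrams can be counted directly to see the most frequent word pairs:
# most common bigrams in movie descriptions
movietit_bigram_united %>%
  count(bigram, sort = TRUE) %>%
  head(10)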
Here, the goal is to investigate words most often combined with negative qualifiers. The sentiment lexicon used is “afinn”, which scores words on a scale from -5 to 5, with negative values indicating negative words.
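For reference, the afinn lexicon can be previewed in the same way as the bing lexicon above; it pairs each word with a numeric value column:
# preview the afinn lexicon (columns: word, value)
get_sentiments("afinn") %>%
  head()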
# bigrams whose first word is negative (per bing), scored by the afinn value of the second word
negative_prewords <- movietit_bigram_sep %>%
inner_join(get_sentiments("bing"), by = c(word1 = "word")) %>%
filter(sentiment == "negative") %>%
group_by(word1) %>%
count(word1, word2) %>%
arrange(desc(n)) %>%
inner_join(get_sentiments("afinn"), by = c(word2 = "word")) %>%
mutate(contribution = n * value) %>%
arrange(desc(abs(contribution))) %>%
select(-n, -value)
head(negative_prewords)
## # A tibble: 6 × 3
## # Groups: word1 [6]
## word1 word2 contribution
## <chr> <chr> <dbl>
## 1 lost love 12
## 2 petty criminals -9
## 3 evil spirit 8
## 4 unexpected romance 8
## 5 wreak havoc -8
## 6 wickedly funny 8
negative_prewords %>%
head(20) %>%
ggplot(aes(contribution, fct_reorder(word2, contribution), fill = contribution > 0))+
geom_col()+
theme(legend.position = "none")+
labs(x = "Sentimental value x number of occurrences",
y = "Words preceded by negative prewords")
Note that “victim”, “crime”, and “criminals” are preceded by multiple negative words.
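To check which negative words precede a particular term, the table can be filtered directly; a sketch, using “criminals” (seen in the output above) as the example term:
# negative first words preceding "criminals"
negative_prewords %>%
  filter(word2 == "criminals")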
library(widyr)
library(tidygraph)
library(ggraph)
set.seed(2025)
# correlation network of words that co-occur across titles
word_unnested %>%
distinct(type, title, word) %>%
add_count(word, name = "word_total") %>%
arrange(desc(word_total)) %>%
filter(word_total >= 50) %>% # keep reasonably common words only
pairwise_cor(word, title, sort = TRUE) %>% # correlation of word co-occurrence across titles
filter(correlation >= .1) %>%
igraph::graph_from_data_frame() %>%
ggraph(layout = "fr")+
geom_edge_link(aes(alpha = correlation)) +
geom_node_point()+
geom_node_text(aes(label = name),
repel = TRUE)+
theme(legend.position = "none")
Here, the darker an edge is, the higher the correlation between the corresponding pair of words.
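To inspect the neighbours of a single word numerically rather than visually, the same correlations can be stored and filtered; a sketch, where “family” is only an illustrative choice of word:
# store the pairwise correlations instead of piping them straight into the graph
word_cors <- word_unnested %>%
  distinct(type, title, word) %>%
  add_count(word, name = "word_total") %>%
  filter(word_total >= 50) %>%
  pairwise_cor(word, title, sort = TRUE)
# strongest partners of a chosen word (hypothetical example)
word_cors %>%
  filter(item1 == "family") %>%
  head(10)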