netflix_dataset <- read.csv("/Users/macbookpro/Desktop/netflix_titles.csv")
library(dplyr)
library(ggplot2)
library(tidytext)
library(tidyr)
library(forcats)
library(stringr)
library(scales)
tf-idf (term frequency–inverse document frequency) is a statistical measure used in text analysis to quantify how important a word is for describing one document within a corpus of documents. [A good introductory video: https://www.youtube.com/watch?v=C3V2Lf1Y9Qk]
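As a quick illustration before turning to the Netflix descriptions, here is a minimal sketch of what bind_tf_idf() returns for a made-up two-document corpus (the documents, words and counts below are invented purely for illustration):
# toy corpus: "the" is shared by both documents, "cat" and "dog" are unique to one each
toy_corpus <- tribble(
  ~doc,   ~word, ~n,
  "doc1", "cat", 10,
  "doc1", "the", 20,
  "doc2", "dog", 12,
  "doc2", "the", 18
)
toy_corpus %>%
  bind_tf_idf(word, doc, n)
Because "the" occurs in every document, its idf (and therefore its tf_idf) is 0, while "cat" and "dog" each get the top tf_idf score for their own document.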
## look at descriptive words by genre (UK and US releases)
descrip_genre <- netflix_dataset %>%
  filter(country %in% c("United Kingdom", "United States")) %>%
  unnest_tokens(word, description) %>%            # one row per word of each description
  filter(!word %in% stop_words$word,
         !is.na(word),
         !is.na(listed_in)) %>%
  mutate(word = str_extract(word, "[a-z]+")) %>%  # strip digits/punctuation from tokens
  count(word, listed_in) %>%
  separate_rows(listed_in, sep = ", ") %>%        # one row per genre a title is listed in
  mutate(genre = fct_lump(listed_in, 7)) %>%      # keep the 7 most common genres, lump the rest
  select(-listed_in)
head(descrip_genre)
## # A tibble: 6 × 3
## word n genre
## <chr> <int> <fct>
## 1 aang 1 Other
## 2 aang 1 Other
## 3 aang 1 Other
## 4 aardman 1 Documentaries
## 5 aaron 1 Comedies
## 6 aaron 1 Other
descrip_tf_idf <- descrip_genre %>%
  bind_tf_idf(word, genre, n)
descrip_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  group_by(genre) %>%
  slice_max(tf_idf, n = 5) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = genre)) +
  geom_col() +
  facet_wrap(~genre, ncol = 4, scales = "free") +
  labs(x = "tf_idf", y = NULL) +
  theme(axis.ticks.x = element_blank(),
        axis.text.x = element_blank(),
        legend.position = "none")
Here, the tf-idf scores highlight the words that are most characteristic of each genre's descriptions.
# bigrams extracted from the descriptions (also see [Netflix dataset(2)])
movietit_bigram_united <- netflix_dataset %>%
  unnest_tokens(bigram, description, token = "ngrams", n = 2) %>%  # consecutive word pairs
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word) %>%
  mutate(word1 = str_extract(word1, "[a-z]+"),
         word2 = str_extract(word2, "[a-z]+")) %>%
  filter(!is.na(word1), !is.na(word2)) %>%
  unite(bigram, word1, word2, sep = " ") %>%
  select(bigram, listed_in) %>%
  separate_rows(listed_in, sep = ", ") %>%
  mutate(genre = fct_lump(listed_in, 5)) %>%  # keep the 5 most common genres, lump the rest
  select(-listed_in)
head(movietit_bigram_united)
## # A tibble: 6 × 2
## bigram genre
## <chr> <fct>
## 1 elite inhabit International TV Shows
## 2 elite inhabit Other
## 3 elite inhabit Other
## 4 island paradise International TV Shows
## 5 island paradise Other
## 6 island paradise Other
bigram_tf_idf <- movietit_bigram_united %>%
  count(genre, bigram) %>%
  bind_tf_idf(bigram, genre, n) %>%
  arrange(desc(tf_idf))
bigram_tf_idf %>%
  group_by(genre) %>%
  slice_max(tf_idf, n = 5) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, fct_reorder(bigram, tf_idf), fill = genre)) +
  geom_col() +
  facet_wrap(~genre, ncol = 2, scales = "free") +
  theme(legend.position = "none")
Here, the bigram tf-idf scores highlight the word pairs that are most characteristic of each genre's descriptions.
A weakness of tf-idf is that a word receives a very low score whenever every genre uses it, even if some genres use it far more than others. Weighted log odds, as implemented in the tidylo package, handles this case better.
library(tidylo)
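To see the difference on a tiny made-up example (the genres, words and counts below are invented, not taken from the Netflix data), compare what the two measures do with a word that occurs in every genre but is heavily skewed towards one of them:
# hypothetical counts: "zombie" occurs in both genres but mostly in A,
# "family" is used about equally in both
toy_counts <- tribble(
  ~genre, ~word,    ~n,
  "A",    "zombie", 50,
  "B",    "zombie",  2,
  "A",    "family", 30,
  "B",    "family", 35
)
toy_counts %>%
  bind_tf_idf(word, genre, n)    # idf = 0 for both words, since each occurs in every genre
toy_counts %>%
  bind_log_odds(genre, word, n)  # "zombie" still scores clearly higher for genre A than "family" does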
genre_logodds <- netflix_dataset %>%
  unnest_tokens(word, description) %>%
  anti_join(stop_words, by = "word") %>%
  distinct(type, title, word, genre = listed_in) %>%  # count each word at most once per title
  add_count(word, name = "word_total") %>%
  filter(word_total >= 25) %>%                        # keep words used at least 25 times overall
  separate_rows(genre, sep = ", ") %>%
  filter(fct_lump(genre, 9) != "Other") %>%           # keep the 9 most common genres
  count(genre, word) %>%
  bind_log_odds(genre, word, n) %>%
  arrange(desc(log_odds_weighted))
head(genre_logodds)
## # A tibble: 6 × 4
## genre word n log_odds_weighted
## <chr> <chr> <int> <dbl>
## 1 Documentaries documentary 267 14.8
## 2 Documentaries interviews 35 9.73
## 3 Documentaries examines 28 7.51
## 4 Documentaries footage 34 7.15
## 5 Documentaries chronicles 23 6.78
## 6 International TV Shows docuseries 11 6.42
genre_logodds %>%
  group_by(genre) %>%
  slice_max(log_odds_weighted, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, log_odds_weighted, genre)) %>%  # order bars within each facet
  ggplot(aes(log_odds_weighted, word, fill = genre)) +
  geom_col() +
  facet_wrap(~genre, scales = "free_y") +
  scale_y_reordered() +  # strip reorder_within()'s suffix from the axis labels
  theme_bw() +
  theme(legend.position = "none") +
  ylab("") +
  ggtitle("Descriptive words most relevant to each genre")
library(wordcloud)
library(reshape2)
# Example: most common descriptive words in the movie genre "Independent Movies"
wordcloud_sample <- netflix_dataset %>%
  filter(type == "Movie") %>%
  separate_rows(listed_in, sep = ", ") %>%
  filter(listed_in == "Independent Movies") %>%
  distinct(title, description, listed_in) %>%  # one description per title
  select(-listed_in) %>%
  unnest_tokens(word, description) %>%
  filter(!word %in% stop_words$word) %>%
  count(word)
wordcloud_sample %>%
  with(wordcloud(word, n, max.words = 100))
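The reshape2 package is not used in the word cloud above; it typically comes into play when a cloud should compare groups, because comparison.cloud() from wordcloud expects a word-by-group matrix rather than a tidy data frame, and acast() can build that matrix from tidy counts. A minimal sketch contrasting, say, "Independent Movies" with "Documentaries" (output not shown):
# cast the word counts for two genres into a matrix with reshape2::acast(),
# then draw a comparison cloud (one colour per genre)
netflix_dataset %>%
  filter(type == "Movie") %>%
  separate_rows(listed_in, sep = ", ") %>%
  filter(listed_in %in% c("Independent Movies", "Documentaries")) %>%
  unnest_tokens(word, description) %>%
  filter(!word %in% stop_words$word) %>%
  count(word, listed_in) %>%
  acast(word ~ listed_in, value.var = "n", fill = 0) %>%
  comparison.cloud(max.words = 100)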