netflix_dataset <- read.csv("/Users/macbookpro/Desktop/netflix_titles.csv")
library(dplyr)
library(ggplot2)
library(tidytext)
library(tidyr)
library(forcats)
library(stringr)
library(scales)
tf-idf (term frequency–inverse document frequency) is a statistical measure used in text analysis to quantify how important a word is for describing one document within a corpus of documents. [A good introductory video: https://www.youtube.com/watch?v=C3V2Lf1Y9Qk]
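As a quick illustration before turning to the Netflix descriptions, here is a minimal sketch of what bind_tf_idf() returns for a made-up two-document corpus (the documents, words and counts below are invented purely for illustration):
# toy corpus: "the" is shared by both documents, "cat" and "dog" are unique to one each
toy_corpus <- tribble(
  ~doc,   ~word, ~n,
  "doc1", "cat", 10,
  "doc1", "the", 20,
  "doc2", "dog", 12,
  "doc2", "the", 18
)
toy_corpus %>%
  bind_tf_idf(word, doc, n)
Because "the" occurs in every document, its idf (and therefore its tf_idf) is 0, while "cat" and "dog" each get the top tf_idf score for their own document.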
## look at descriptive words by genre (UK and US releases)
descrip_genre <- netflix_dataset %>%
  filter(country %in% c("United Kingdom", "United States")) %>%
  unnest_tokens(word, description) %>%            # one row per word of each description
  filter(!word %in% stop_words$word,
         !is.na(word),
         !is.na(listed_in)) %>%
  mutate(word = str_extract(word, "[a-z]+")) %>%  # strip digits/punctuation from tokens
  count(word, listed_in) %>%
  separate_rows(listed_in, sep = ", ") %>%        # one row per genre a title is listed in
  mutate(genre = fct_lump(listed_in, 7)) %>%      # keep the 7 most common genres, lump the rest
  select(-listed_in)
head(descrip_genre)
## # A tibble: 6 × 3
## word n genre
## <chr> <int> <fct>
## 1 aang 1 Other
## 2 aang 1 Other
## 3 aang 1 Other
## 4 aardman 1 Documentaries
## 5 aaron 1 Comedies
## 6 aaron 1 Other
descrip_tf_idf <- descrip_genre %>%
  bind_tf_idf(word, genre, n)
descrip_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  group_by(genre) %>%
  slice_max(tf_idf, n = 5) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = genre)) +
  geom_col() +
  facet_wrap(~genre, ncol = 4, scales = "free") +
  labs(x = "tf_idf", y = NULL) +
  theme(axis.ticks.x = element_blank(),
        axis.text.x = element_blank(),
        legend.position = "none")
Here, the tf-idf scores highlight the words that are most characteristic of each genre's descriptions.
# bigrams extracted from the descriptions (also see [Netflix dataset(2)])
movietit_bigram_united <- netflix_dataset %>%
  unnest_tokens(bigram, description, token = "ngrams", n = 2) %>%  # consecutive word pairs
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word) %>%
  mutate(word1 = str_extract(word1, "[a-z]+"),
         word2 = str_extract(word2, "[a-z]+")) %>%
  filter(!is.na(word1), !is.na(word2)) %>%
  unite(bigram, word1, word2, sep = " ") %>%
  select(bigram, listed_in) %>%
  separate_rows(listed_in, sep = ", ") %>%
  mutate(genre = fct_lump(listed_in, 5)) %>%  # keep the 5 most common genres, lump the rest
  select(-listed_in)
head(movietit_bigram_united)
## # A tibble: 6 × 2
## bigram genre
## <chr> <fct>
## 1 elite inhabit International TV Shows
## 2 elite inhabit Other
## 3 elite inhabit Other
## 4 island paradise International TV Shows
## 5 island paradise Other
## 6 island paradise Other
bigram_tf_idf <- movietit_bigram_united %>%
  count(genre, bigram) %>%
  bind_tf_idf(bigram, genre, n) %>%
  arrange(desc(tf_idf))
bigram_tf_idf %>%
  group_by(genre) %>%
  slice_max(tf_idf, n = 5) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, fct_reorder(bigram, tf_idf), fill = genre)) +
  geom_col() +
  facet_wrap(~genre, ncol = 2, scales = "free") +
  theme(legend.position = "none")
Here, the bigram tf-idf scores highlight the word pairs that are most characteristic of each genre's descriptions.
A weakness of tf-idf is that a word receives a very low score whenever every genre uses it, even if some genres use it far more than others. Weighted log odds, as implemented in the tidylo package, handles this case better.
library(tidylo)
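To see the difference on a tiny made-up example (the genres, words and counts below are invented, not taken from the Netflix data), compare what the two measures do with a word that occurs in every genre but is heavily skewed towards one of them:
# hypothetical counts: "zombie" occurs in both genres but mostly in A,
# "family" is used about equally in both
toy_counts <- tribble(
  ~genre, ~word,    ~n,
  "A",    "zombie", 50,
  "B",    "zombie",  2,
  "A",    "family", 30,
  "B",    "family", 35
)
toy_counts %>%
  bind_tf_idf(word, genre, n)    # idf = 0 for both words, since each occurs in every genre
toy_counts %>%
  bind_log_odds(genre, word, n)  # "zombie" still scores clearly higher for genre A than "family" does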
genre_logodds <- netflix_dataset %>%
  unnest_tokens(word, description) %>%
  anti_join(stop_words, by = "word") %>%
  distinct(type, title, word, genre = listed_in) %>%  # count each word at most once per title
  add_count(word, name = "word_total") %>%
  filter(word_total >= 25) %>%                        # keep words used at least 25 times overall
  separate_rows(genre, sep = ", ") %>%
  filter(fct_lump(genre, 9) != "Other") %>%           # keep the 9 most common genres
  count(genre, word) %>%
  bind_log_odds(genre, word, n) %>%
  arrange(desc(log_odds_weighted))
head(genre_logodds)
## # A tibble: 6 × 4
## genre word n log_odds_weighted
## <chr> <chr> <int> <dbl>
## 1 Documentaries documentary 267 14.8
## 2 Documentaries interviews 35 9.73
## 3 Documentaries examines 28 7.51
## 4 Documentaries footage 34 7.15
## 5 Documentaries chronicles 23 6.78
## 6 International TV Shows docuseries 11 6.42
genre_logodds %>%
  group_by(genre) %>%
  slice_max(log_odds_weighted, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, log_odds_weighted, genre)) %>%  # order bars within each facet
  ggplot(aes(log_odds_weighted, word, fill = genre)) +
  geom_col() +
  facet_wrap(~genre, scales = "free_y") +
  scale_y_reordered() +  # strip reorder_within()'s suffix from the axis labels
  theme_bw() +
  theme(legend.position = "none") +
  ylab("") +
  ggtitle("Descriptive words most relevant to each genre")
library(wordcloud)
library(reshape2)
# Example: most common descriptive words in the movie genre "Independent Movies"
wordcloud_sample <- netflix_dataset %>%
  filter(type == "Movie") %>%
  separate_rows(listed_in, sep = ", ") %>%
  filter(listed_in == "Independent Movies") %>%
  distinct(title, description, listed_in) %>%  # one description per title
  select(-listed_in) %>%
  unnest_tokens(word, description) %>%
  filter(!word %in% stop_words$word) %>%
  count(word)
wordcloud_sample %>%
  with(wordcloud(word, n, max.words = 100))
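The reshape2 package is not used in the word cloud above; it typically comes into play when a cloud should compare groups, because comparison.cloud() from wordcloud expects a word-by-group matrix rather than a tidy data frame, and acast() can build that matrix from tidy counts. A minimal sketch contrasting, say, "Independent Movies" with "Documentaries" (output not shown):
# cast the word counts for two genres into a matrix with reshape2::acast(),
# then draw a comparison cloud (one colour per genre)
netflix_dataset %>%
  filter(type == "Movie") %>%
  separate_rows(listed_in, sep = ", ") %>%
  filter(listed_in %in% c("Independent Movies", "Documentaries")) %>%
  unnest_tokens(word, description) %>%
  filter(!word %in% stop_words$word) %>%
  count(word, listed_in) %>%
  acast(word ~ listed_in, value.var = "n", fill = 0) %>%
  comparison.cloud(max.words = 100)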