# load the Netflix titles dataset
netflix_dataset <- read.csv("/Users/macbookpro/Desktop/netflix_titles.csv")
# packages for data wrangling and plotting
library(ggplot2)
library(dplyr)
library(tidyr)
library(tidyverse)
library(forcats)
# packages specific to text mining
library(tidytext)
library(tidylo)
library(scales)
theme_set(theme_light())
This section explores and analyses the relationship patterns in the descriptions and titles of Netflix releases.
# tokenize descriptions into single words and remove common stop words
word_unnested <- netflix_dataset %>%
unnest_tokens(word, description) %>%
anti_join(stop_words, by = "word") %>%
select(type, title, word, listed_in)
head(word_unnested)
## type title word listed_in
## 1 TV Show 3% future International TV Shows, TV Dramas, TV Sci-Fi & Fantasy
## 2 TV Show 3% elite International TV Shows, TV Dramas, TV Sci-Fi & Fantasy
## 3 TV Show 3% inhabit International TV Shows, TV Dramas, TV Sci-Fi & Fantasy
## 4 TV Show 3% island International TV Shows, TV Dramas, TV Sci-Fi & Fantasy
## 5 TV Show 3% paradise International TV Shows, TV Dramas, TV Sci-Fi & Fantasy
## 6 TV Show 3% crowded International TV Shows, TV Dramas, TV Sci-Fi & Fantasy
In the plot below, words close to the diagonal line have similar frequencies in movie and TV show descriptions, while words far from the line occur much more often in one type than the other.
# word frequencies as a share of all words, per type (Movie vs TV Show)
freq_table <- word_unnested %>%
count(type, word) %>%
group_by(type) %>%
mutate(percent = n / sum(n)) %>%
select(-n) %>%
pivot_wider(names_from = type, values_from = percent)
freq_table %>%
filter(!is.na(`TV Show`), !is.na(Movie)) %>%
ggplot(aes(Movie,`TV Show`,color = abs(Movie - `TV Show`)))+
geom_jitter(alpha = .1, size = 2.5, width = .3, height = .3)+
scale_x_log10(labels = percent_format())+
scale_y_log10(labels = percent_format())+
scale_color_gradient(limits = c(0,0.001),
low = "darkslategray4",
high = "gray75")+
geom_abline(color = "gray40", lty = 2)+
geom_text(aes(label = word),
check_overlap = TRUE,
vjust = 1.5)+
theme(legend.position = "none")+
ggtitle("Comparison of word frequencies in description")
The similarity between the two frequency distributions can be further quantified with a correlation test.
cor.test(freq_table$Movie,freq_table$`TV Show`)
##
## Pearson's product-moment correlation
##
## data: freq_table$Movie and freq_table$`TV Show`
## t = 109.82, df = 6153, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8051235 0.8220107
## sample estimates:
## cor
## 0.8137388
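Beyond the overall correlation, the tidylo package loaded above can rank the words most distinctive of each type via weighted log odds. A minimal sketch, not part of the original analysis:
# weighted log odds of each word by type; higher values mean more distinctive of that type
word_unnested %>%
  count(type, word, sort = TRUE) %>%
  bind_log_odds(type, word, n) %>%
  group_by(type) %>%
  slice_max(log_odds_weighted, n = 10)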
Sentiment analysis classifies the emotional tone of a piece of text as positive, negative, or neutral.
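As a minimal illustration (not part of the pipeline below), individual words can be scored by joining them against a sentiment lexicon such as "bing", introduced shortly; the words here are hypothetical examples:
# label a few example words with the bing lexicon;
# words absent from the lexicon (effectively neutral) are dropped by the inner join
tibble(word = c("love", "terrible", "island")) %>%
  inner_join(get_sentiments("bing"), by = "word")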
## prepare data: one row per movie title word, with its genre and release year
movie_title <- netflix_dataset %>%
filter(type == "Movie") %>%
separate_rows(listed_in, sep = ", ") %>%
mutate(genre = fct_lump(listed_in, 9)) %>% # keep the 9 most common genres, lump the rest
select(title, genre, release_year) %>%
unnest_tokens(word, title) %>%
mutate(word = str_extract(word, "[a-z]+")) %>%
## keep only the alphabetic part of each token, dropping numbers and surrounding symbols
filter(!is.na(word)) %>%
filter(!word %in% stop_words$word)
head(movie_title)
## # A tibble: 6 × 3
## genre release_year word
## <fct> <int> <chr>
## 1 Dramas 2014 oct
## 2 International Movies 2014 oct
## 3 Thrillers 2014 oct
## 4 Dramas 2018 jul
## 5 Thrillers 2018 jul
## 6 Comedies 2019 aug
Here the “bing” lexicon is used, which categorises words in a binary fashion as either positive or negative.
get_sentiments("bing")
## # A tibble: 6,786 × 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ℹ 6,776 more rows
# net sentiment of movie title words per genre and release year
movietit_senti <- movie_title %>%
inner_join(get_sentiments("bing")) %>%
count(genre, release_year, sentiment) %>%
pivot_wider(names_from = sentiment,
values_from = n,
values_fill = 0) %>%
mutate(sentiment = positive - negative) %>%
group_by(genre)
## Joining with `by = join_by(word)`
head(movietit_senti)
## # A tibble: 6 × 5
## # Groups: genre [1]
## genre release_year negative positive sentiment
## <fct> <int> <int> <int> <int>
## 1 Action & Adventure 1956 1 0 -1
## 2 Action & Adventure 1975 0 1 1
## 3 Action & Adventure 1976 1 0 -1
## 4 Action & Adventure 1979 1 0 -1
## 5 Action & Adventure 1980 0 1 1
## 6 Action & Adventure 1981 1 0 -1
movietit_senti %>%
ggplot(aes(release_year,sentiment,fill = genre))+
geom_col()+
facet_wrap(~genre,ncol = 2, scales = "free")+
theme(legend.position = "none")+
ggtitle("Sentiment trend over years of Netflix movie titles")
# set up datasets for processing: separated bigrams and united bigrams after filtering
movietit_bigram_sep <- netflix_dataset %>%
unnest_tokens(bigram, description, token = "ngrams", n = 2) %>%
filter(!is.na(bigram),type == "Movie") %>%
select(bigram) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!word1 %in% stop_words$word,
!word2 %in% stop_words$word) %>%
mutate(word1 = str_extract(word1,"[a-z]+"),
word2 = str_extract(word2,"[a-z]+")) %>%
filter(!is.na(word1), !is.na(word2))
head(movietit_bigram_sep)
## word1 word2
## 1 devastating earthquake
## 2 earthquake hits
## 3 hits mexico
## 4 mexico city
## 5 city trapped
## 6 trapped survivors
# the united data will be used in a later frequency analysis (see Netflix dataset [3])
movietit_bigram_united <- movietit_bigram_sep %>%
unite(bigram, word1, word2, sep = " ")
head(movietit_bigram_united)
## bigram
## 1 devastating earthquake
## 2 earthquake hits
## 3 hits mexico
## 4 mexico city
## 5 city trapped
## 6 trapped survivors
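As a quick illustration (a sketch, not the later analysis referenced above), the united bigrams can be counted directly to see the most frequent word pairs:
# most common bigrams in movie descriptions
movietit_bigram_united %>%
  count(bigram, sort = TRUE) %>%
  head(10)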
Here, the goal is to investigate words most often combined with negative qualifiers. The sentiment lexicon used is “afinn”, which scores words on a scale from -5 to 5, with negative values indicating negative words.
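For reference, the afinn lexicon can be previewed in the same way as the bing lexicon above; it pairs each word with a numeric value column:
# preview the afinn lexicon (columns: word, value)
get_sentiments("afinn") %>%
  head()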
# bigrams whose first word is negative (per bing), scored by the afinn value of the second word
negative_prewords <- movietit_bigram_sep %>%
inner_join(get_sentiments("bing"), by = c(word1 = "word")) %>%
filter(sentiment == "negative") %>%
group_by(word1) %>%
count(word1, word2) %>%
arrange(desc(n)) %>%
inner_join(get_sentiments("afinn"), by = c(word2 = "word")) %>%
mutate(contribution = n * value) %>%
arrange(desc(abs(contribution))) %>%
select(-n, -value)
head(negative_prewords)
## # A tibble: 6 × 3
## # Groups: word1 [6]
## word1 word2 contribution
## <chr> <chr> <dbl>
## 1 lost love 12
## 2 petty criminals -9
## 3 evil spirit 8
## 4 unexpected romance 8
## 5 wreak havoc -8
## 6 wickedly funny 8
negative_prewords %>%
head(20) %>%
ggplot(aes(contribution, fct_reorder(word2, contribution), fill = contribution > 0))+
geom_col()+
theme(legend.position = "none")+
labs(x = "Sentimental value x number of occurrences",
y = "Words preceded by negative prewords")
Note that “victim”, “crime”, and “criminals” are preceded by multiple negative words.
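To check which negative words precede a particular term, the table can be filtered directly; a sketch, using “criminals” (seen in the output above) as the example term:
# negative first words preceding "criminals"
negative_prewords %>%
  filter(word2 == "criminals")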
library(widyr)
library(tidygraph)
library(ggraph)
set.seed(2025)
# correlation network of words that co-occur across titles
word_unnested %>%
distinct(type, title, word) %>%
add_count(word, name = "word_total") %>%
arrange(desc(word_total)) %>%
filter(word_total >= 50) %>% # keep reasonably common words only
pairwise_cor(word, title, sort = TRUE) %>% # correlation of word co-occurrence across titles
filter(correlation >= .1) %>%
igraph::graph_from_data_frame() %>%
ggraph(layout = "fr")+
geom_edge_link(aes(alpha = correlation)) +
geom_node_point()+
geom_node_text(aes(label = name),
repel = TRUE)+
theme(legend.position = "none")
Here, the darker an edge is, the higher the correlation between the corresponding pair of words.
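To inspect the neighbours of a single word numerically rather than visually, the same correlations can be stored and filtered; a sketch, where “family” is only an illustrative choice of word:
# store the pairwise correlations instead of piping them straight into the graph
word_cors <- word_unnested %>%
  distinct(type, title, word) %>%
  add_count(word, name = "word_total") %>%
  filter(word_total >= 50) %>%
  pairwise_cor(word, title, sort = TRUE)
# strongest partners of a chosen word (hypothetical example)
word_cors %>%
  filter(item1 == "family") %>%
  head(10)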