Netflix dataset text analysis
Introduction
In this example we are going to do some text and sentiment analysis
with the Netflix dataset. The dataset has a lot of information, so you
may run many different analyses depending on your research question and
target variable.
Data source: https://www.kaggle.com/datasets/shivamb/netflix-shows/download?datasetVersionNumber=5
The main libraries we will use are:
library(dplyr)
library(tidytext)
library(ggplot2)
library(ggthemes)
library(stringr)
library(tidyr)
We will work on the description variable, so we will select only this
column as a start.
# readr supplies read_csv(); the CSV is read from the working directory.
library(readr)
netflix_titles <- read_csv(file = "netflix_titles.csv")
Rows: 8807 Columns: 12── Column specification ───────────────────────────────────────────────────
Delimiter: ","
chr (11): show_id, type, title, director, cast, country, date_added, ra...
dbl (1): release_year
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the imported table in the interactive data viewer for a quick look.
View(netflix_titles)
Dealing with stopwords helps a lot, especially when
our text information is not clear and official. Below we may see some of
the stop-words used in main libraries, but you may also create your own
stop-word vector and remove unnecessary words from your text. For more
on this refer to the work done here: https://github.com/EGjika/Text-Analysis-NASA-dataset
# Load the stop-word table and preview its first rows.
data("stop_words")
head(stop_words)
# Stop words that also occur in the `sentiments` lexicon (first 20 shown).
head(stop_words$word[stop_words$word %in% sentiments$word], 20)
[1] "appreciate" "appropriate" "available" "awfully"
[5] "best" "better" "clearly" "enough"
[9] "like" "liked" "reasonably" "right"
[13] "sensible" "sorry" "thank" "unfortunately"
[17] "unlikely" "useful" "welcome" "well"
Word cloud
For a start we will use a word cloud to get a better view of our text.
We may apply it to “description”, or use it to see which cast members
and directors appear most often.
### For description
# Packages needed for the word cloud
library(wordcloud)
library(SnowballC)
# Fix the seed before drawing, then plot at most 100 words from the descriptions
set.seed(1234)
wordcloud(
  words = netflix_titles$description,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.40,
  colors = brewer.pal(8, "Dark2")
)
Warning: transformation drops documentsWarning: transformation drops documents

For Cast and director
# Same word cloud settings, applied to the cast column.
set.seed(1234)
wordcloud(
  words = netflix_titles$cast,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.40,
  colors = brewer.pal(8, "Dark2")
)
Warning: transformation drops documentsWarning: transformation drops documents

# Same word cloud settings, applied to the director column.
set.seed(1234)
wordcloud(
  words = netflix_titles$director,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.40,
  colors = brewer.pal(8, "Dark2")
)
Warning: transformation drops documentsWarning: transformation drops documents

Word Association
We may also use correlation as a statistical measure to demonstrate
whether, and how strongly, pairs of variables are related. In this case
we are looking at which words occur most often in association with the
most frequently occurring words. This script shows which words are most
frequently associated with the terms
“good”, “work”, “health”, “love”, “comedy” (corlimit = 0.3 is the lower
limit/threshold set; you can set it lower to see more words, or higher
to see fewer). You can also change the list of words based on their
frequency (more frequent words work better).
library(tm)
# Term-document matrix built from the description text
Netflx_dtm <- TermDocumentMatrix(netflix_titles$description)
# Terms correlated with each listed word, using 0.3 as the lower limit
findAssocs(
  Netflx_dtm,
  terms = c("good", "work", "health", "love", "comedy"),
  corlimit = 0.3
)
$good
numeric(0)
$work
numeric(0)
$health
mental
0.37
$love
numeric(0)
$comedy
numeric(0)
# Find associations for terms that occur at least 1000 times (lowfreq = 1000)
findAssocs(Netflx_dtm, terms = findFreqTerms(Netflx_dtm, lowfreq = 1000), corlimit = 0.3)
$and
numeric(0)
$her
numeric(0)
$his
numeric(0)
$the
numeric(0)
$from
numeric(0)
$this
documentary
0.39
$their
numeric(0)
$with
numeric(0)
$when
numeric(0)
$`for`
numeric(0)
Tokenization
Here we will start the tokenization process for our “description”
column.
library(dplyr)
# Keep only the title identifier and its free-text description.
tidy_netflix_description <- netflix_titles %>%
  select(show_id, description)
# Preview the object just created; the previous call used
# `tidy_netflix_title`, which is never defined in this notebook.
head(tidy_netflix_description)
# Split each description into one row per word; drop = FALSE keeps the
# original description column alongside every token.
Netflx <- unnest_tokens(
  tidy_netflix_description,
  output = word,
  input = description,
  token = "words",
  drop = FALSE
)
Netflx <- ungroup(Netflx)
# Show the first 24 word rows
head(Netflx, n = 24)
NA
Let’s create a frequency table of words used in “description”:
# Build the word-frequency table, most frequent words first.
# NOTE(review): Netflx already holds one row per word with `description`
# kept (drop = FALSE in the tokenization step), so tokenizing `description`
# again here looks like it repeats every description once per word it
# contains and inflates the counts. The thresholds used further down
# (n > 3000, n > 2000, ...) appear to rely on these inflated values, so
# confirm and adjust them together before switching to a plain count(word).
Netflx<- Netflx %>%
unnest_tokens(word, description) %>%
count(word, sort = TRUE)
Netflx
Now, you can remove the stop words from your data frame:
# Drop every row whose word appears in the stop-word table.
Netflx <- anti_join(Netflx, stop_words)
Joining, by = "word"
Netflx # after removing stop_words
Next, run the following to plot the words in the Netflix descriptions
that appear more than 50 times. With so many words the plot is almost
unreadable, so why not filter on a higher frequency? (Try changing the
threshold to other values, for example 3000.)
# Horizontal bar chart of all words counted more than 50 times,
# ordered by their count.
Netflx %>%
  filter(n > 50) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col(fill = "darkred") +
  coord_flip() +
  labs(
    x = NULL,
    y = "Word Count",
    title = "Word Usage in Netflix description more than 50 times"
  ) +
  theme_fivethirtyeight()

# Same bar chart with a much higher threshold so only the most frequent
# words remain. The title now states the threshold actually applied
# (it previously still said "more than 50 times").
Netflx %>%
  filter(n > 3000) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col(fill = "darkred") +
  theme_fivethirtyeight() +
  xlab(NULL) +
  ylab("Word Count") +
  coord_flip() +
  ggtitle("Word Usage in Netflix description more than 3000 times")

Why not group by show_id? Let’s do it!
# Tokenize again from the untouched description table, because `Netflx`
# has been overwritten with the frequency table by this point.
Netflx_s <- tidy_netflix_description %>%
  unnest_tokens(
    output = word,
    input = description, # our column of interest
    token = "words",
    drop = FALSE
  )
# Total AFINN score per title: join each word to its numeric `value`
# and sum within show_id/description. The join key and the grouping of the
# result are stated explicitly instead of relying on dplyr's defaults.
netflix_sentiment <- Netflx_s %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(show_id, description) %>%
  dplyr::summarise(sentiment = sum(value), .groups = "drop")
Joining, by = "word"`summarise()` has grouped output by 'show_id'. You can override using the `.groups` argument.
netflix_sentiment
Next! We may filter those show_id which are in a given range of
sentiment.
# Titles whose summed AFINN score is at least 12. The labels now describe
# what is plotted: the y value is a sentiment score (not a word count) and
# the score comes from the AFINN lexicon (not NRC).
netflix_sentiment %>%
  filter(sentiment >= 12) %>%
  ggplot(aes(show_id, sentiment)) +
  theme_fivethirtyeight() +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  ylab("Sentiment score") +
  ggtitle(
    "Sentiment of at least 12 in Netflix show_id",
    subtitle = "Sentiment Analysis Using AFINN"
  )

Sentiment Lexicons
There are three commonly used lexicons in R:
# Print each of the three lexicons for inspection.
get_sentiments("bing")
get_sentiments("afinn")
get_sentiments("nrc")
Now we will try BING lexicon on description.
# Keep only the counted words that appear in the Bing lexicon and attach
# the lexicon's columns to them.
Netflx_bing <- inner_join(Netflx, get_sentiments("bing"))
Joining, by = "word"
Netflx_bing
Library “yarrr”
We may create some graphs here. They work better if we also have a
time variable to use. Why not try? (An exercise for you.)
library(yarrr)
Loading required package: jpeg
Loading required package: BayesFactor
Loading required package: coda
Loading required package: Matrix
Attaching package: ‘Matrix’
The following objects are masked from ‘package:tidyr’:
expand, pack, unpack
************
Welcome to BayesFactor 0.9.12-4.4. If you have questions, please contact Richard Morey (richarddmorey@gmail.com).
Type BFManual() to open the manual.
************
Loading required package: circlize
========================================
circlize version 0.4.15
CRAN page: https://cran.r-project.org/package=circlize
Github page: https://github.com/jokergoo/circlize
Documentation: https://jokergoo.github.io/circlize_book/book/
If you use it in published research, please cite:
Gu, Z. circlize implements and enhances circular visualization
in R. Bioinformatics 2014.
This message can be suppressed by:
suppressPackageStartupMessages(library(circlize))
========================================
yarrr v0.1.5. Citation info at citation('yarrr'). Package guide at yarrr.guide()
Email me at Nathaniel.D.Phillips.is@gmail.com
Attaching package: ‘yarrr’
The following object is masked from ‘package:ggplot2’:
diamonds
# Pirate plot of word counts (n) by word and Bing sentiment.
pirateplot(formula = n ~ word + sentiment, #Formula
data = Netflx_bing, #Data frame
xlab = NULL, ylab = "Word Count", #Axis labels
main = "Lexical Diversity Netflix", #Plot title
pal = "google", #Color scheme
point.o = .2, #Point opacity
avg.line.o = 1, #Turn on the Average/Mean line
theme = 0, #Theme
point.pch = 16, #Point `pch` type
point.cex = 1.5, #Point size
jitter.val = .1, #Turn on jitter to see overlapping points better
cex.lab = .9, cex.names = .7) #Axis label size

To obtain a better view we may filter some of the words with a given
number of frequency:
# Same pirate plot, restricted to words counted at least 1500 times.
pirateplot(
  formula = n ~ word + sentiment,        # count by word and sentiment
  data = filter(Netflx_bing, n >= 1500), # frequent words only
  xlab = NULL,
  ylab = "Word Count",
  main = "Lexical Diversity Netflix",
  pal = "google",
  point.o = .2,
  avg.line.o = 1,                        # show the average line
  theme = 0,
  point.pch = 16,
  point.cex = 1.5,
  jitter.val = .1,                       # spread overlapping points
  cex.lab = .9,
  cex.names = .7
)

For more reference: https://www.datacamp.com/tutorial/sentiment-analysis-R
# Keep only the counted words that appear in the NRC lexicon and attach
# the lexicon's columns to them.
Netflx_nrc <- inner_join(Netflx, get_sentiments("nrc"))
Joining, by = "word"
Netflx_nrc
# Words counted more than 2000 times, bars coloured by NRC sentiment.
Netflx_nrc %>%
  filter(n > 2000) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col() +
  coord_flip() +
  labs(
    x = NULL,
    y = "Word Count",
    title = "Word Usage in Netflix description",
    subtitle = "Sentiment Analysis Using NRC"
  ) +
  theme_fivethirtyeight()

Produce a horizontal bar chart showing positive and negative word
usage in Netflix description using the Bing et al. sentiment
lexicon.(try to change the value 2000 and observe the words)
# Words counted more than 2000 times, bars coloured by Bing sentiment.
# The subtitle is now a single-line string; it was previously split across
# two source lines, which put a line break inside the subtitle.
Netflx_bing %>%
  filter(n > 2000) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  theme_fivethirtyeight() +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  ylab("Word Count") +
  ggtitle(
    "Word Usage in Netflix description",
    subtitle = "Sentiment Analysis Using Bing et al."
  )

# NRC version of the chart: words counted more than 2000 times,
# bars coloured by sentiment.
Netflx_nrc %>%
  filter(n > 2000) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col() +
  coord_flip() +
  labs(
    x = NULL,
    y = "Word Count",
    title = "Word Usage in Netflix description",
    subtitle = "Sentiment Analysis Using nrc"
  ) +
  theme_fivethirtyeight()

Try to filter some of the sentiments as in the examples below. I am
interested in knowing which words are related to the sentiment
“surprise” and then “love” in the description of the Netflix Movies or
TV Shows.
# NRC entries tagged "surprise", then the counted words that match them.
Net_nrc_surprise <- filter(get_sentiments("nrc"), sentiment == "surprise")
Net_nrc_surprise
inner_join(Netflx, Net_nrc_surprise)
Joining, by = "word"
# NRC entries tagged "love", then the counted words that match them.
# NOTE(review): the NRC categories (anger, anticipation, disgust, fear, joy,
# sadness, surprise, trust, positive, negative) do not appear to include
# "love", so this filter likely returns zero rows; "joy" may be the intended
# category — confirm against the lexicon.
Net_nrc_love <- get_sentiments("nrc") %>%
filter(sentiment == "love")
Net_nrc_love
Netflx %>%
inner_join(Net_nrc_love)
Joining, by = "word"
Try another word and explore some relationships!! We will see how NRC
result shows for some of the sentiments and top words.
# Top words per NRC sentiment among words counted more than 2500 times.
Netflx_nrc %>%
  filter(n > 2500) %>%
  # Weight by the existing frequency column. An unweighted count() would
  # replace `n` with the number of rows per word/sentiment pair (always 1),
  # making every bar the same height.
  count(word, sentiment, wt = n, sort = TRUE, name = "n") %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 5) %>%
  ungroup() %>%
  ggplot(aes(x = reorder(word, n), y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free") +
  coord_flip()
Selecting by n

Comparing Sentiment Lexicons
library(stringr)
library(tidyr)
Add more filtering such as “type” because we want more info also on
the type of the title if it was a Movie or a TV show.
# Keep the identifier, the Movie/TV Show type and the description text.
Netflx_type <- select(netflix_titles, show_id, type, description)
head(Netflx_type)
# One row per description word, with stop words removed.
Netflx_type <- anti_join(
  unnest_tokens(Netflx_type, word, description),
  stop_words
)
Joining, by = "word"
Netflx_type
NA
Let’s use AFINN lexicon to observe the sentiment of Movie and TV
shows.
# Total AFINN score per type (Movie / TV Show), labelled with the method.
Netflx_type_afinn <- Netflx_type %>%
  inner_join(get_sentiments("afinn")) %>%
  mutate(index = type) %>%
  group_by(index) %>%
  summarize(sentiment = sum(value)) %>% # try to get also a mean
  mutate(method = "AFINN")
Joining, by = "word"
Netflx_type_afinn
# Words matched against the Bing lexicon, tagged with their type and the
# method name. No aggregation happens here, so the rows are no longer
# grouped; the earlier group_by() only left the result grouped. The
# commented-out summarise was dropped because it referenced a numeric
# `value` column that this join result does not contain.
Netflx_type_bing <- Netflx_type %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  mutate(index = type, method = "BING")
Joining, by = "word"
Netflx_type_bing
# Top words per Bing sentiment among words counted more than 2500 times.
Netflx_bing %>%
  filter(n > 2500) %>%
  # Weight by the existing frequency column. An unweighted count() would
  # replace `n` with the number of rows per word/sentiment pair (always 1),
  # making every bar the same height.
  count(word, sentiment, wt = n, sort = TRUE, name = "n") %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  ggplot(aes(x = reorder(word, n), y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free") +
  coord_flip()
Selecting by n

Let’s do a comparison between sentiments and lexicons for the “type”
of the Netflix title: TV Show or Movie.
# Positive/negative word counts per lexicon and type, plus their difference.
# Uses pivot_wider() in place of the superseded spread(), and names the
# join key explicitly.
Netflx_bingnrc <- bind_rows(
  Netflx_type %>%
    inner_join(get_sentiments("bing"), by = "word") %>%
    mutate(method = "Bing et al."),
  Netflx_type %>%
    inner_join(
      get_sentiments("nrc") %>%
        filter(sentiment %in% c("positive", "negative")),
      by = "word"
    ) %>%
    mutate(method = "NRC")
) %>%
  count(method, index = type, sentiment) %>%
  # Missing positive/negative combinations become 0 rather than NA.
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = list(n = 0L)
  ) %>%
  mutate(sentiment = positive - negative)
Joining, by = "word"Joining, by = "word"
Netflx_bingnrc
# Same comparison indexed by word instead of type (this overwrites the
# previous Netflx_bingnrc). Uses pivot_wider() in place of the superseded
# spread(), and names the join key explicitly.
Netflx_bingnrc <- bind_rows(
  Netflx_type %>%
    inner_join(get_sentiments("bing"), by = "word") %>%
    mutate(method = "Bing et al."),
  Netflx_type %>%
    inner_join(
      get_sentiments("nrc") %>%
        filter(sentiment %in% c("positive", "negative")),
      by = "word"
    ) %>%
    mutate(method = "NRC")
) %>%
  count(method, index = word, sentiment) %>% # try to change index "word" or "type"
  # Missing positive/negative combinations become 0 rather than NA.
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = list(n = 0L)
  ) %>%
  mutate(sentiment = positive - negative)
Joining, by = "word"Joining, by = "word"
Netflx_bingnrc
How much each word contributes to the overall sentiment of the
descriptions.
# Frequency of each word per Bing sentiment, most frequent first.
# Netflx_bing already carries the Bing sentiment, so it is not joined again.
# wt = n keeps the real word frequencies; an unweighted count() would
# report 1 for every word because each word/sentiment pair occupies a
# single row.
Netflix_bingcounts <- Netflx_bing %>%
  count(word, sentiment, wt = n, sort = TRUE, name = "n")
Joining, by = c("word", "sentiment")
Netflix_bingcounts
# Contribution of each word to the Bing sentiment, one facet per sentiment.
# (To show only the leading words, add group_by(sentiment) %>% top_n(2) %>%
# ungroup() before the mutate.)
Netflix_bingcounts %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  coord_flip() +
  theme_fivethirtyeight() +
  labs(
    title = "Words' Contribution to Sentiment in Netflix description",
    subtitle = "Using the Bing et. al Lexicon"
  )

# Frequency of each word for the NRC positive/negative sentiments, most
# frequent first. Netflx_nrc already carries the NRC sentiment, so it is not
# joined again. wt = n keeps the real word frequencies; an unweighted
# count() would report 1 for every word/sentiment pair.
Netflx_nrccounts <- Netflx_nrc %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(word, sentiment, wt = n, sort = TRUE, name = "n")
Joining, by = c("word", "sentiment")
Netflx_nrccounts
# Top 20 words per NRC sentiment (positive / negative), one facet each.
Netflx_nrccounts %>%
  group_by(sentiment) %>%
  top_n(20) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  coord_flip() +
  theme_fivethirtyeight() +
  labs(
    title = "Words' Contribution to Sentiment in Netflix",
    subtitle = "Using the NRC Lexicon"
  )
Selecting by n

More info: https://github.com/EGjika
