The data is collected from Kaggle as raw text scripts, one txt file per episode, so it needs a fair amount of cleaning to make it tidy. First we read in every script, using purrr's map_dfr(), which maps a reader function over the files and row-binds the results (similar to bind_rows()).
require(tidyverse)
require(wordcloud)
require(tidytext)
txt_files <- fs::dir_ls("C:/Users/User/Desktop/Friends NLP/", regexp = "\\.txt$")
txt_files %>% head() # List of all txt files in the folder
## C:/Users/User/Desktop/Friends NLP/S01E01 Monica Gets A Roommate.txt
## C:/Users/User/Desktop/Friends NLP/S01E02 The Sonogram At The End.txt
## C:/Users/User/Desktop/Friends NLP/S01E03 The Thumb.txt
## C:/Users/User/Desktop/Friends NLP/S01E04 George Stephanopoulos.txt
## C:/Users/User/Desktop/Friends NLP/S01E05 The East German Laundry Detergent.txt
## C:/Users/User/Desktop/Friends NLP/S01E06 The Butt.txt
dat <- txt_files %>% # map_dfr() reads each file and row-binds the results; read_tsv() reads tab-separated text
  map_dfr(~read_tsv(. , col_names = FALSE, skip_empty_rows = T), .id = "source") # .id keeps the file path in a "source" column
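For clarity, map_dfr() is just shorthand for mapping over the files and then row-binding; an explicit equivalent of the call above (a sketch only, not re-run here) would be:
txt_files %>%
  map(~ read_tsv(.x, col_names = FALSE, skip_empty_rows = TRUE)) %>% # one tibble per episode file
  bind_rows(.id = "source") # stack them; .id keeps the file path in a "source" column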
# Cleaning uses string manipulation, regular expressions, joins, and a fair amount of trial and error
# Clean the episode names and season numbers
dat_clean <- dat %>% mutate( episode = str_split_fixed(source, " ", n = 3), # Split the path on spaces: folder, season/episode code, episode name
  Season = str_remove(episode[, 2], "^NLP/"), # Drop the folder name
  Season = str_remove(Season, "^\\d-S\\d\\d"), # \\d matches a digit; drop a leading "<digit>-Sxx" prefix
  Season = str_remove(Season, "^\\d-s\\d\\d"), # Same, for a lower-case special case
  Season = str_remove(Season, "E\\d\\d*"), # Drop the episode part
  Season = str_remove(Season, "-S\\d\\dE\\d\\d*"), # For double episodes (e.g. s01e01-s01e02), drop the second part
  Season = str_remove(Season, "^S|^s"), # Drop the leading S or s to keep the season number only
  Episode = str_remove(episode[ , 2], "^NLP/S\\d\\dE|^NLP/s\\d\\dE"), # Keep only the episode number
  Episode = str_remove(Episode, "-S\\d\\dE\\d\\d*"), # For double episodes, drop the second part
  Episode_Name = episode[, 3], # Episode name taken from the file name
  Episode_Name = str_remove(Episode_Name, "\\.txt$"), # Drop the .txt extension
  Episode_Name = str_trim(Episode_Name) # Trim any leading/trailing white space
  ) %>%
# Cleaning the Dialogues
# Remove the numeric prefixes from the episode-title lines
mutate(X1 = str_remove(X1, "^[0-9]+ - [0-9]+ - |^[0-9]+ - |^[0-9]+-[0-9]+ - |^[0-9]+- |^[0-9]+|^[0-9]+ - ")) %>%
# 915 - 916 - The one in Barbados, 123 - , 1024-1023 - , 291-
# Special case: "TOW Phoebe's Rat"
mutate(X1 = str_replace(X1, "^TOW ", "The One With ")) %>%
# Remove lines that are not dialogue (credits, titles, scene notes, etc.)
filter(!str_detect(X1, "closing credit|Closing Credit|^commercial break|^Commercial Break|^Part|^Dedicated|^NOTE|^Written|^Directed|^Hosted|^Produced|^Teleplay|^Friends|^FRIENDS|^opening|^Opening|^The One|^The one|^The Last|^ The One|THE ONE|^End|^The|^Originally|^Final|^Opening|^Story|^part|^Part|^Transcribed|^Lisa|^Matt|^Jennifer|OPENING")) %>%
# Remove lines starting with ( or [ or {, or containing <
filter(!str_detect(X1, "^\\(|^\\{+|^\\[+|\\<+")) %>%
# Convert everything to lower case
mutate(X1 = str_to_lower(X1)) %>%
# Split each line into character and dialogue at the first ":"
mutate(`Character&Dialogue` = str_split_fixed(X1, ":", n = 2)) %>%
mutate(Character = `Character&Dialogue`[, 1],
Dialogue = `Character&Dialogue`[ , 2],
Dialogue = str_trim(Dialogue, side = "both")) %>% # Trim white space around each dialogue
# Filter out the special episode (S07E24 only)
filter(!(Season == "07" & Episode == "24")) %>%
mutate(Character = if_else(Character == "rach", "rachel", Character)) %>%
# Remove Unnecessary Columns
select(-source, -X1, -episode, -`Character&Dialogue`)
dat_clean %>% head()
## # A tibble: 6 x 5
## Season Episode Episode_Name Character Dialogue
## <chr> <chr> <chr> <chr> <chr>
## 1 01 01 Monica Gets A R~ monica there's nothing to tell! he's just ~
## 2 01 01 Monica Gets A R~ joey c'mon, you're going out with the gu~
## 3 01 01 Monica Gets A R~ chandler all right joey, be nice. so does h~
## 4 01 01 Monica Gets A R~ phoebe wait, does he eat chalk?
## 5 01 01 Monica Gets A R~ phoebe just, 'cause, i don't want her to g~
## 6 01 01 Monica Gets A R~ monica okay, everybody relax. this is not ~
# Final check of the clean data
# Look at the first and last line of each episode
dat_clean %>%
group_by(Season, Episode) %>%
filter(row_number() == 1 | row_number() == n()) %>%
select(Dialogue) %>% head()
## # A tibble: 6 x 3
## # Groups: Season, Episode [3]
## Season Episode Dialogue
## <chr> <chr> <chr>
## 1 01 01 there's nothing to tell! he's just some guy i work with!
## 2 01 01 okay, so, i'm in las vegas... i'm liza minelli-
## 3 01 02 what you guys don't understand is, for us, kissing is as impor~
## 4 01 02 (on phone) hi, mindy. hi, it-it's rachel. yeah, i'm fine. i-i ~
## 5 01 03 (entering) hi guys!
## 6 01 03 (returns) yeah, alright.
Looks all okay to me.
dat_clean %>%
count(Character, sort = T) %>%
head(n = 25) %>%
ggplot(aes(n, reorder(Character, n))) +
geom_col() +
theme_minimal()
dat_clean %>%
  filter(Character %in% c("rachel", "joey", "phoebe", "ross", "monica", "chandler")) %>%
  group_by(Character, Season) %>%
  summarise(n = n()) %>%
  left_join(dat_clean %>%
              filter(Character %in% c("rachel", "joey", "phoebe", "ross", "monica", "chandler")) %>%
              group_by(Character) %>%
              summarise(sn = n()), by = "Character") %>%
  ggplot(aes(x = reorder(Character, n), y = n, fill = Season)) +
  geom_col() +
  geom_text(aes(label = n, group = Season), position = position_stack(vjust = .5), size = 3.5) +
  geom_text(aes(Character, sn, label = sn), nudge_y = 500) +
  labs(x = "Character", y = "Dialogues") +
  theme_minimal()
Rachel talks a lot. Phoebe is the quietest of the six.
dat_clean %>%
  filter(Character %in% c("rachel", "joey", "phoebe", "ross", "monica", "chandler")) %>%
  group_by(Season) %>%
  summarise(n = n()) %>%
  ggplot(aes(x = reorder(Season, n), y = n, fill = Season)) +
  geom_col() +
  theme_minimal()
### Unnest tokens
### Convert dialogues into words
dat_word <- dat_clean %>%
unnest_tokens(Word, Dialogue, drop = T)
dat_word %>% head()
## # A tibble: 6 x 5
## Season Episode Episode_Name Character Word
## <chr> <chr> <chr> <chr> <chr>
## 1 01 01 Monica Gets A Roommate monica there's
## 2 01 01 Monica Gets A Roommate monica nothing
## 3 01 01 Monica Gets A Roommate monica to
## 4 01 01 Monica Gets A Roommate monica tell
## 5 01 01 Monica Gets A Roommate monica he's
## 6 01 01 Monica Gets A Roommate monica just
The Word column contains individual words from each dialogue.
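Note that unnest_tokens() lowercases and strips punctuation by default, so the tokens come out tidy. A tiny illustration on a toy tibble (not part of the data):
tibble(Dialogue = "We were on a BREAK!") %>%
  unnest_tokens(Word, Dialogue) # returns the words: we, were, on, a, break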
dat_word %>%
filter(Character %in% c("rachel", "joey", "phoebe", "ross", "monica", "chandler")) %>%
count(Character, Word, sort = T) %>%
group_by(Character) %>%
top_n(30) %>%
ggplot(aes(n, reorder_within(Word, n, Character), fill = Character)) +
geom_col(show.legend = F) +
scale_y_reordered() +
facet_wrap(~Character, scales = "free") +
theme_minimal()
They all share the same common words: i, you, yeah, know, etc.
Let's remove the stop words and try again.
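stop_words ships with tidytext and combines a few standard stop-word lexicons; anti_join() below simply drops every row whose Word appears in it. A quick look at the lexicon (assuming the stock tidytext version):
stop_words %>% count(lexicon) # the onix, SMART and snowball lists combined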
dat_word %>%
anti_join(stop_words, by = c("Word" = "word")) %>% ### Remove stopwords
filter(Character %in% c("rachel", "joey", "phoebe", "ross", "monica", "chandler")) %>%
count(Character, Word, sort = T) %>%
group_by(Character) %>%
top_n(30) %>%
ggplot(aes(n, reorder_within(Word, n, Character), fill = Character)) +
geom_col(show.legend = F) +
scale_y_reordered() +
facet_wrap(~Character, scales = "free") +
theme_minimal()
It still looks fairly similar. But the number of times they call each other by name is interesting.
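A quick way to check that name-calling pattern directly (just a sketch; the six first names already appear as tokens in dat_word):
friends <- c("rachel", "joey", "phoebe", "ross", "monica", "chandler")
dat_word %>%
  filter(Character %in% friends, Word %in% friends, Word != Character) %>% # who says whose name
  count(Character, Word, sort = TRUE) %>%
  head(10)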
dat_word %>%
anti_join(stop_words, by = c("Word" = "word")) %>%
filter(Character %in% c("rachel", "joey", "phoebe", "ross", "monica", "chandler")) %>%
count(Character, Word, sort = T) %>%
group_by(Character) %>%
mutate(Total = sum(n)) %>% # each character's total (non-stop-word) count
ggplot(aes(n/Total, fill = Character)) +
geom_histogram(show.legend = FALSE) +
xlim(NA, 0.0005) +
facet_wrap(~Character, ncol = 3, scales = "free_y") +
theme_minimal()
As expected, the most common words account for most of the usage. It's the words in the right tail that separate the characters from each other.
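To put a rough number on how top-heavy each vocabulary is, we can compute the share of each character's non-stop-word usage taken up by their ten most frequent words (a quick check, not part of the original plots):
dat_word %>%
  anti_join(stop_words, by = c("Word" = "word")) %>%
  filter(Character %in% c("rachel", "joey", "phoebe", "ross", "monica", "chandler")) %>%
  count(Character, Word, sort = TRUE) %>%
  group_by(Character) %>%
  summarise(top10_share = sum(n[1:10]) / sum(n)) # n is sorted, so n[1:10] are each character's top 10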
dat_word %>%
anti_join(stop_words, by = c("Word" = "word")) %>%
filter(Character == "monica") %>%
count(Word, sort = T) %>%
with(wordcloud(words = Word, freq = n, random.order = T, max.words = 100 ))
Bathroom, alright, guys, baby, rach… sounds pretty Monica to me!
dat_word %>%
anti_join(stop_words, by = c("Word" = "word")) %>%
filter(Character == "phoebe") %>%
count(Word, sort = T) %>%
with(wordcloud(words = Word, freq = n, random.order = T, max.words = 100 ))
Cat, hey, singing, remember, massage, totally… that's Phoebe!
library(scales)
dat_word %>%
anti_join(stop_words, by = c("Word" = "word")) %>%
filter(Character %in% c("rachel", "ross")) %>%
mutate(Word = str_extract(Word, "[a-z']+")) %>%
count(Character, Word) %>%
group_by(Character) %>%
mutate(proportion = n / sum(n)) %>%
select(-n) %>%
pivot_wider(names_from = Character, values_from = proportion) %>%
ggplot(aes(x = rachel, y = ross, color = abs(rachel - ross) )) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.05, size = 1, width = 0.1, height = 0.1) +
geom_text(aes(x = rachel, y = ross, label = Word), check_overlap = T, vjust = 0.5, size = 3.5) +
scale_x_log10() +
scale_y_log10() +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
theme_minimal() +
theme(legend.position="none")
Words above the line are used more by Ross, and words below it more by Rachel!
dat_word %>%
anti_join(stop_words, by = c("Word" = "word")) %>%
filter(Character %in% c("chandler", "monica")) %>%
mutate(Word = str_extract(Word, "[a-z']+")) %>%
count(Character, Word) %>%
group_by(Character) %>%
mutate(proportion = n / sum(n)) %>%
select(-n) %>%
pivot_wider(names_from = Character, values_from = proportion) %>%
ggplot(aes(x = chandler, y = monica, color = abs(chandler - monica) )) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.05, size = 1, width = 0.1, height = 0.1) +
geom_text(aes(x = chandler, y = monica, label = Word), check_overlap = T, vjust = 0.5, size = 3.5) +
scale_x_log10() +
scale_y_log10() +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
theme_minimal() +
theme(legend.position="none")
dat_word %>%
anti_join(stop_words, by = c("Word" = "word")) %>%
filter(Character %in% c("joey", "phoebe")) %>%
mutate(Word = str_extract(Word, "[a-z']+")) %>%
count(Character, Word) %>%
group_by(Character) %>%
mutate(proportion = n / sum(n)) %>%
select(-n) %>%
pivot_wider(names_from = Character, values_from = proportion) %>%
ggplot(aes(x = joey, y = phoebe, color = abs(joey - phoebe) )) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.05, size = 1, width = 0.1, height = 0.1) +
geom_text(aes(x = joey, y = phoebe, label = Word), check_overlap = T, vjust = 0.5, size = 3.5) +
scale_x_log10() +
scale_y_log10() +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
theme_minimal() +
theme(legend.position="none")
The next method, tf-idf, extracts the words that are most exclusive to each character.
dat_tf <- dat_clean %>%
filter(Character %in% c("rachel", "joey", "phoebe", "ross", "monica", "chandler")) %>%
unnest_tokens(Word, Dialogue, drop = F) %>%
anti_join(stop_words, by = c("Word" = "word")) %>%
count(Character, Word, sort = TRUE)
dat_tf %>% head()
## # A tibble: 6 x 3
## Character Word n
## <chr> <chr> <int>
## 1 joey hey 1078
## 2 joey yeah 960
## 3 ross yeah 828
## 4 rachel yeah 785
## 5 phoebe yeah 768
## 6 ross hey 750
dat_tfidf <- dat_tf %>%
bind_tf_idf( term = Word, document = Character, n = n) %>%
arrange(desc(tf_idf))
dat_tfidf %>% head()
## # A tibble: 6 x 6
## Character Word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 phoebe minsk 12 0.000414 1.79 0.000742
## 2 rachel gavin 20 0.000602 1.10 0.000662
## 3 joey wayne 12 0.000363 1.79 0.000650
## 4 chandler tulsa 25 0.000824 0.693 0.000571
## 5 phoebe ree 9 0.000310 1.79 0.000556
## 6 phoebe sergei 9 0.000310 1.79 0.000556
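The idf values above can be sanity-checked by hand: with the six friends treated as "documents", idf = ln(6 / number of characters who use the word). So a word used by only one of the six (like "minsk") gets ln(6) ≈ 1.79, one shared by two characters gets ln(3) ≈ 1.10, and one shared by three gets ln(2) ≈ 0.69, matching the idf column:
log(6 / 1) # ~1.79: used by one character only
log(6 / 2) # ~1.10: used by two characters
log(6 / 3) # ~0.69: used by three characters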
dat_tfidf %>%
group_by(Character) %>%
slice_max(tf_idf, n = 30) %>%
ggplot(aes(tf_idf, reorder_within(Word, tf_idf, Character), fill = Character)) +
geom_col(show.legend = F) +
scale_y_reordered() +
facet_wrap(~Character, scales = "free") +
theme_minimal()
dat_tfidf %>%
filter(Character == "monica") %>%
arrange(desc(tf)) %>%
with(wordcloud(words = Word, freq = tf, random.order = F, max.words = 200 ))
dat_word
## # A tibble: 691,544 x 5
## Season Episode Episode_Name Character Word
## <chr> <chr> <chr> <chr> <chr>
## 1 01 01 Monica Gets A Roommate monica there's
## 2 01 01 Monica Gets A Roommate monica nothing
## 3 01 01 Monica Gets A Roommate monica to
## 4 01 01 Monica Gets A Roommate monica tell
## 5 01 01 Monica Gets A Roommate monica he's
## 6 01 01 Monica Gets A Roommate monica just
## 7 01 01 Monica Gets A Roommate monica some
## 8 01 01 Monica Gets A Roommate monica guy
## 9 01 01 Monica Gets A Roommate monica i
## 10 01 01 Monica Gets A Roommate monica work
## # ... with 691,534 more rows
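Next, let's score the sentiment of each episode. The Bing lexicon (shipped with tidytext) labels a few thousand English words as positive or negative, and an inner join keeps only the dialogue words that appear in it. A quick peek at the lexicon:
get_sentiments("bing") %>% count(sentiment) # negative entries far outnumber positive ones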
library(tidyr)
dat_sentiment <- dat_clean %>%
group_by(Season, Episode) %>%
unnest_tokens(output = Word, input = Dialogue, drop = T) %>%
inner_join(get_sentiments("bing"), by = c("Word" = "word")) %>% ### Get sentiments from bing
count(Season, Episode, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative,
Season = paste0("Season ", Season)) %>%
group_by(Season) %>%
mutate(Episode = 1:n()) %>%
ungroup()
dat_sentiment
## # A tibble: 223 x 5
## Season Episode negative positive sentiment
## <chr> <int> <int> <int> <int>
## 1 Season 01 1 71 143 72
## 2 Season 01 2 66 96 30
## 3 Season 01 3 64 114 50
## 4 Season 01 4 70 114 44
## 5 Season 01 5 71 141 70
## 6 Season 01 6 61 98 37
## 7 Season 01 7 62 81 19
## 8 Season 01 8 48 103 55
## 9 Season 01 9 63 90 27
## 10 Season 01 10 61 100 39
## # ... with 213 more rows
dat_sentiment %>%
ggplot(aes(factor(Episode), sentiment, fill = Season)) +
geom_col(show.legend = FALSE) +
xlab("Episode") +
facet_wrap(~Season, ncol = 2, scales = "free_x") +
theme_minimal()
Friends is all about positive vibes, but some episodes score noticeably lower. For example S05E08, where everyone reminisces about their worst Thanksgivings.
S09E17 is the one where Chandler and Ross joke around on their college website, Ross is reported as dead, and it turns out he was not popular in college; Joey is unwilling to give his favourite stuffed animal, Hugsy, to Emma; and Monica helps Phoebe resist calling Mike.
S10E06 is the one where Ross applies for a palaeontology grant but discovers that Dr Hobart, who allocates the funding, used to go out with his girlfriend Charlie. Meanwhile, Mike wants Phoebe to get rid of one of her bizarre paintings, but neither Rachel nor Monica is keen to take it, and Chandler lies to Joey about watching his audition tape.
So we can say the low-scoring episodes involve some sort of quarrelling among our favourite characters!
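Those examples can be found directly by listing the lowest-scoring episodes (a quick check; remember the Episode column was renumbered within each season):
dat_sentiment %>%
  slice_min(sentiment, n = 5) # the five episodes with the lowest net sentiment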
dat_rating <- read_csv("friends_episodes_v2.csv") # Episode ratings data
dat_rating <- dat_rating %>%
  group_by(Season) %>%
  mutate(Episode = 1:n(), # Renumber episodes within each season, to match dat_sentiment
         Season = paste0("Season 0", Season), # Format as "Season 01" ... "Season 09"
         Season = ifelse(Season == "Season 010", "Season 10", Season)) # Fix the padding for season 10
dat_sentiment <- dat_sentiment %>%
left_join(dat_rating,
by = c("Season" = "Season", "Episode" = "Episode")
)
p1 <- dat_sentiment %>%
ggplot(aes(sentiment, Stars)) +
geom_jitter(aes(color = Season), alpha = 0.8) +
geom_smooth(method = 'lm', show.legend = F, se = F) +
theme_minimal()
p2 <- dat_sentiment %>%
ggplot(aes(sentiment, Stars)) +
geom_jitter(aes(color = Season), alpha = 0.8, show.legend = F) +
geom_smooth(method = 'lm', show.legend = F, se = F) +
theme_minimal() +
facet_wrap(~Season)
require(patchwork)
p1 / p2
There isn't much of a relationship between sentiment and episode rating; I think the rating is more about the story. Season by season the relationship does change, though, which is interesting!
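One way to back this up numerically is the overall correlation between the sentiment score and the star rating (a rough check; Stars comes from the ratings file joined above):
cor(dat_sentiment$sentiment, dat_sentiment$Stars, use = "complete.obs") # overall linear association between sentiment and rating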
DT::datatable ( dat_rating %>%
arrange(desc(Stars)) %>%
select( Season, Episode, Year_of_prod, everything()) %>%
select(-Duration, -Summary, -Director), rownames = F)
Silge, Julia, and David Robinson. 2016. "tidytext: Text Mining and Analysis Using Tidy Data Principles in R." JOSS 1 (3). https://doi.org/10.21105/joss.00037.
Silge, Julia, and David Robinson. 2017. Text Mining with R: A Tidy Approach. 1st ed. O'Reilly Media.