Data Source

The data is collected from Kaggle. The scripts are messy raw text and need a lot of cleaning before they are tidy. They are also split across one txt file per episode, so the first step is to read all the scripts in.

Data Load

We use the purrr package's map_dfr(), which maps a reading function over the files and row-binds the results (roughly bind_rows(lapply(files, f)), with .id adding a column recording which file each row came from).

require(tidyverse)
require(wordcloud)
require(tidytext)

txt_files <- fs::dir_ls("C:/Users/User/Desktop/Friends NLP/", regexp = "\\.txt$")
txt_files %>% head() # List of all txt files in the folder
## C:/Users/User/Desktop/Friends NLP/S01E01 Monica Gets A Roommate.txt
## C:/Users/User/Desktop/Friends NLP/S01E02 The Sonogram At The End.txt
## C:/Users/User/Desktop/Friends NLP/S01E03 The Thumb.txt
## C:/Users/User/Desktop/Friends NLP/S01E04 George Stephanopoulos.txt
## C:/Users/User/Desktop/Friends NLP/S01E05 The East German Laundry Detergent.txt
## C:/Users/User/Desktop/Friends NLP/S01E06 The Butt.txt
dat <- txt_files %>% # map_dfr() reads every file and row-binds the results; read_tsv() reads tab-separated values
  map_dfr(~ read_tsv(.x, col_names = FALSE, skip_empty_rows = TRUE), .id = "source")
# With col_names = FALSE the single text column is named X1, and
# .id = "source" stores the file path each row came from

Cleaning Data

# Tools used: string manipulation, regular expressions, joining, merging, trial and error, number formatting...

# Cleaning the Episode Names & Season number
dat_clean <- dat %>% mutate( episode =  str_split_fixed(source, " ", n = 3), # split the file path into three pieces on spaces
                             Season = str_remove(episode[, 2], "^NLP/"), # drop the folder-name fragment
                             Season = str_remove(Season, "^\\d-S\\d\\d"), # \\d matches a digit; drop a leading "1-S01"-style prefix
                             Season = str_remove(Season, "^\\d-s\\d\\d"), # same, for the lower-case "s" variant
                             Season = str_remove(Season, "E\\d\\d*"), # drop the episode part
                             Season = str_remove(Season, "-S\\d\\dE\\d\\d*"), # for double episodes (s1e1-s1e2), drop the second code
                             Season = str_remove(Season, "^S|^s"), # drop the leading S or s to keep the season number only
                             Episode = str_remove(episode[, 2], "^NLP/S\\d\\dE|^NLP/s\\d\\dE"), # keep only the episode number
                             Episode = str_remove(Episode, "-S\\d\\dE\\d\\d*"), # for double episodes, drop the second code
                             Episode_Name = episode[, 3], # episode name from the file name
                             Episode_Name = str_remove(Episode_Name, "\\.txt$"), # drop the .txt extension
                             Episode_Name = str_trim(Episode_Name) # trim whitespace from start and end, if any
                             ) %>%   
  
  # Cleaning the Dialogues
  # Remove numbered episode labels at the start of the first line
  mutate(X1 = str_remove(X1, "^[0-9]+ - [0-9]+ - |^[0-9]+ - |^[0-9]+-[0-9]+ - |^[0-9]+- |^[0-9]+")) %>% 
  # e.g. "915 - 916 - The One In Barbados", "123 - ", "1024-1023 - ", "291-"
  
  # Special case: "TOW" shorthand, e.g. "TOW Phoebe's Rat"
  mutate(X1 = str_replace(X1, "^TOW ", "The One With ")) %>%
  
  # Remove lines that are not dialogue (credits, scene notes, titles, etc.)
  filter(!str_detect(X1, "closing credit|Closing Credit|^commercial break|^Commercial Break|^Part|^Dedicated|^NOTE|^Written|^Directed|^Hosted|^Produced|^Teleplay|^Friends|^FRIENDS|^opening|^Opening|^The One|^The one|^The Last|^ The One|THE ONE|^End|^The|^Originally|^Final|^Opening|^Story|^part|^Part|^Transcribed|^Lisa|^Matt|^Jennifer|OPENING")) %>% 
  
  # Remove lines starting with (, { or [, or containing <
  filter(!str_detect(X1, "^\\(|^\\{+|^\\[+|\\<+")) %>% 
  
  # Convert everything to lower case
  mutate(X1 = str_to_lower(X1)) %>% 
  
  # Split each line into Character and Dialogue at the first ":"
  mutate(`Character&Dialogue` = str_split_fixed(X1, ":", n = 2)) %>% 
  mutate(Character = `Character&Dialogue`[, 1],
         Dialogue = `Character&Dialogue`[, 2],
         Dialogue = str_trim(Dialogue, side = "both")) %>%  # trim whitespace at both ends
  
  # Filter out the special episode (S07E24)
  filter(!(Season == "07" & Episode == "24")) %>% 
  mutate(Character = if_else(Character == "rach", "rachel", Character)) %>%  # normalise "rach" to "rachel"
  
  # Remove Unnecessary Columns
  select(-source, -X1, -episode, -`Character&Dialogue`) 
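
To see why the pipeline strips a leading "NLP/", here is how str_split_fixed() carves up the first file path: the space inside the folder name "Friends NLP" is the first split point.

str_split_fixed("C:/Users/User/Desktop/Friends NLP/S01E01 Monica Gets A Roommate.txt", " ", n = 3)
# piece 1: "C:/Users/User/Desktop/Friends"  (path up to the space in the folder name)
# piece 2: "NLP/S01E01"                     (folder remainder + season/episode code)
# piece 3: "Monica Gets A Roommate.txt"     (episode title)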

Tidy Data

dat_clean %>% head()
## # A tibble: 6 x 5
##   Season Episode Episode_Name     Character Dialogue                            
##   <chr>  <chr>   <chr>            <chr>     <chr>                               
## 1 01     01      Monica Gets A R~ monica    there's nothing to tell! he's just ~
## 2 01     01      Monica Gets A R~ joey      c'mon, you're going out with the gu~
## 3 01     01      Monica Gets A R~ chandler  all right joey, be nice.  so does h~
## 4 01     01      Monica Gets A R~ phoebe    wait, does he eat chalk?            
## 5 01     01      Monica Gets A R~ phoebe    just, 'cause, i don't want her to g~
## 6 01     01      Monica Gets A R~ monica    okay, everybody relax. this is not ~
# Final check of the clean data:
# look at the first and the last line of each episode
dat_clean %>%
  group_by(Season, Episode) %>%
  filter(row_number() == 1 | row_number() == n()) %>% 
  select(Dialogue) %>% head()
## # A tibble: 6 x 3
## # Groups:   Season, Episode [3]
##   Season Episode Dialogue                                                       
##   <chr>  <chr>   <chr>                                                          
## 1 01     01      there's nothing to tell! he's just some guy i work with!       
## 2 01     01      okay, so, i'm in las vegas... i'm liza minelli-                
## 3 01     02      what you guys don't understand is, for us, kissing is as impor~
## 4 01     02      (on phone) hi, mindy. hi, it-it's rachel. yeah, i'm fine. i-i ~
## 5 01     03      (entering) hi guys!                                            
## 6 01     03      (returns) yeah, alright.

Looks all okay to me.

Text Analysis

Only the dialogues of the six main characters are important.

dat_clean %>% 
  count(Character, sort = T) %>% 
  head(n = 25) %>%
  ggplot(aes(n, reorder(Character, n))) +
  geom_col() +
  theme_minimal()

Dialogues Per Character

dat_clean %>% 
  filter(Character %in% c("rachel", "joey", "phoebe", "ross", "monica", "chandler")) %>% 
  group_by(Character, Season) %>% 
  summarise(n = n()) %>%
  left_join(dat_clean %>% 
              filter(Character %in% c("rachel", "joey", "phoebe", "ross", "monica", "chandler")) %>% 
              group_by(Character) %>% 
              summarise(sn = n()), by = "Character") %>% 
  ggplot(aes(x = reorder(Character, n), y = n, fill = Season)) + 
  geom_col() +
  geom_text(aes(label = n, group = Season), position = position_stack(vjust = .5), size = 3.5) +
  geom_text(aes(Character, sn, label = sn), nudge_y = 500) +
  labs(x = "Character", y = "Dialogues") +
  theme_minimal()

Rachel talks a lot. Phoebe is the quietest of the six.

Dialogues Per Season

dat_clean %>% 
  filter(Character %in% c("rachel", "joey", "phoebe", "ross", "monica", "chandler")) %>% 
  group_by(Season) %>% 
  summarise(n = n()) %>% 
  ggplot(aes(x = reorder(Season, n), y = n, fill = Season)) + 
  geom_col() +
  theme_minimal()

Word Analysis

Unnest words

### Unnest tokens:
### convert dialogues into one word per row
dat_word <- dat_clean %>%
  unnest_tokens(Word, Dialogue, drop = TRUE)

dat_word %>% head()
## # A tibble: 6 x 5
##   Season Episode Episode_Name           Character Word   
##   <chr>  <chr>   <chr>                  <chr>     <chr>  
## 1 01     01      Monica Gets A Roommate monica    there's
## 2 01     01      Monica Gets A Roommate monica    nothing
## 3 01     01      Monica Gets A Roommate monica    to     
## 4 01     01      Monica Gets A Roommate monica    tell   
## 5 01     01      Monica Gets A Roommate monica    he's   
## 6 01     01      Monica Gets A Roommate monica    just

The Word column contains individual words from each dialogue.
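
To see exactly what unnest_tokens() does, here is a tiny sketch on a made-up line (my own test input): by default it lowercases the text and strips punctuation.

tibble(Dialogue = "We were on a BREAK!") %>%
  unnest_tokens(Word, Dialogue)
# one row per word: we, were, on, a, break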

30 Most Common Words by Character

dat_word %>% 
  filter(Character %in% c("rachel", "joey", "phoebe", "ross", "monica", "chandler")) %>% 
  count(Character, Word, sort = TRUE) %>% 
  group_by(Character) %>% 
  slice_max(n, n = 30) %>% 
  ggplot(aes(n, reorder_within(Word, n, Character), fill = Character)) +
  geom_col(show.legend = FALSE) +
  scale_y_reordered() +
  facet_wrap(~Character, scales = "free") +
  theme_minimal()

They all share similar common words: I, you, yeah, know, etc.

Let's remove the stop words and try again.

Most Common Words by Character, Without Stopwords

dat_word %>% 
  anti_join(stop_words, by = c("Word" = "word")) %>%  ### remove stopwords
  filter(Character %in% c("rachel", "joey", "phoebe", "ross", "monica", "chandler")) %>% 
  count(Character, Word, sort = TRUE) %>% 
  group_by(Character) %>% 
  slice_max(n, n = 30) %>% 
  ggplot(aes(n, reorder_within(Word, n, Character), fill = Character)) +
  geom_col(show.legend = FALSE) +
  scale_y_reordered() +
  facet_wrap(~Character, scales = "free") +
  theme_minimal()

It still looks fairly similar, but the number of times they call each other by name is interesting.
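
We can look at the name-calling directly with a quick sketch that counts how often each main character says another main character's name:

dat_word %>%
  filter(Character %in% c("rachel", "joey", "phoebe", "ross", "monica", "chandler"),
         Word %in% c("rachel", "joey", "phoebe", "ross", "monica", "chandler")) %>%
  count(Character, Word, sort = TRUE)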

Word Frequency and Histogram

dat_word %>% 
  anti_join(stop_words, by = c("Word" = "word")) %>% 
  filter(Character %in% c("rachel", "joey", "phoebe", "ross", "monica", "chandler")) %>% 
  count(Character, Word, sort = T) %>% 
  group_by(Character) %>% 
  mutate(Total = sum(n)) %>%  # total words per character, for normalising
  
  ggplot(aes(n/Total, fill = Character)) +
  geom_histogram(show.legend = FALSE) +
  xlim(NA, 0.0005) +
  facet_wrap(~Character, ncol = 3, scales = "free_y") +
  theme_minimal()

As expected, a handful of common words account for most of the usage. It's the words in the right tail that separate the characters from each other.

Wordcloud for Monica

dat_word %>% 
  anti_join(stop_words, by = c("Word" = "word")) %>% 
  filter(Character == "monica") %>% 
  count(Word, sort = T) %>%
  with(wordcloud(words = Word, freq = n, random.order = T, max.words = 100 ))

Bathroom, alright, guys, baby, rach… sounds pretty Monica to me!

Wordcloud for Phoebe

dat_word %>% 
  anti_join(stop_words, by = c("Word" = "word")) %>% 
  filter(Character == "phoebe") %>% 
  count(Word, sort = T) %>%
  with(wordcloud(words = Word, freq = n, random.order = T, max.words = 100 ))

Cat, hey, singing, remember, massage, totally… that's Phoebe!

Ross & Rachel Common Words

library(scales)

dat_word %>% 
  anti_join(stop_words, by = c("Word" = "word")) %>% 
  filter(Character %in% c("rachel", "ross")) %>% 
  mutate(Word = str_extract(Word, "[a-z']+")) %>% 
  count(Character, Word) %>%
  group_by(Character) %>% 
  mutate(proportion = n / sum(n)) %>%  # share of each character's words
  select(-n) %>% 
  pivot_wider(names_from = Character, values_from = proportion) %>% 
  
  ggplot(aes(x = rachel, y = ross, color = abs(rachel - ross) )) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.05, size = 1, width = 0.1, height = 0.1) +
  geom_text(aes(x = rachel, y = ross, label = Word), check_overlap = T, vjust = 0.5, size = 3.5) + 
  scale_x_log10() +
  scale_y_log10() +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  theme_minimal() +
  theme(legend.position="none")

Words above the line are used more by Ross, and words below it more by Rach!

Chandler and Monica Words

dat_word %>% 
  anti_join(stop_words, by = c("Word" = "word")) %>% 
  filter(Character %in% c("chandler", "monica")) %>% 
  mutate(Word = str_extract(Word, "[a-z']+")) %>% 
  count(Character, Word) %>%
  group_by(Character) %>% 
  mutate(proportion = n / sum(n)) %>%  # share of each character's words
  select(-n) %>% 
  pivot_wider(names_from = Character, values_from = proportion) %>% 
  
  ggplot(aes(x = chandler, y = monica, color = abs(chandler - monica) )) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.05, size = 1, width = 0.1, height = 0.1) +
  geom_text(aes(x = chandler, y = monica, label = Word), check_overlap = T, vjust = 0.5, size = 3.5) + 
  scale_x_log10() +
  scale_y_log10() +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  theme_minimal() +
  theme(legend.position="none") 

Joey and Pheebs Words

dat_word %>% 
  anti_join(stop_words, by = c("Word" = "word")) %>% 
  filter(Character %in% c("joey", "phoebe")) %>% 
  mutate(Word = str_extract(Word, "[a-z']+")) %>% 
  count(Character, Word) %>%
  group_by(Character) %>% 
  mutate(proportion = n / sum(n)) %>%  # share of each character's words
  select(-n) %>% 
  pivot_wider(names_from = Character, values_from = proportion) %>% 
  
  ggplot(aes(x = joey, y = phoebe, color = abs(joey - phoebe) )) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.05, size = 1, width = 0.1, height = 0.1) +
  geom_text(aes(x = joey, y = phoebe, label = Word), check_overlap = T, vjust = 0.5, size = 3.5) + 
  scale_x_log10() +
  scale_y_log10() +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  theme_minimal() +
  theme(legend.position="none")
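
The three comparison plots above differ only in the pair of characters, so the whole pipeline could be wrapped in a small helper. compare_words() below is a name of my own, a sketch rather than anything used elsewhere in this post:

compare_words <- function(words, char_x, char_y) {
  words %>%
    anti_join(stop_words, by = c("Word" = "word")) %>%
    filter(Character %in% c(char_x, char_y)) %>%
    mutate(Word = str_extract(Word, "[a-z']+")) %>%
    count(Character, Word) %>%
    group_by(Character) %>%
    mutate(proportion = n / sum(n)) %>%
    select(-n) %>%
    pivot_wider(names_from = Character, values_from = proportion) %>%
    # .data[[...]] lets the column names come from the function arguments
    ggplot(aes(x = .data[[char_x]], y = .data[[char_y]],
               color = abs(.data[[char_x]] - .data[[char_y]]))) +
    geom_abline(color = "gray40", lty = 2) +
    geom_jitter(alpha = 0.05, size = 1, width = 0.1, height = 0.1) +
    geom_text(aes(label = Word), check_overlap = TRUE, size = 3.5) +
    scale_x_log10() +
    scale_y_log10() +
    scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
    theme_minimal() +
    theme(legend.position = "none")
}

compare_words(dat_word, "joey", "phoebe")  # same plot as above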

TF-IDF (Term Frequency and Inverse Document Frequency)

This method surfaces the words most distinctive to each character: a word's frequency for one character (tf) is weighted up when few of the other characters use it (idf).

dat_tf <- dat_clean %>% 
  filter(Character %in% c("rachel", "joey", "phoebe", "ross", "monica", "chandler")) %>% 
  unnest_tokens(Word, Dialogue, drop = F) %>%
  anti_join(stop_words, by = c("Word" = "word")) %>% 
  count(Character, Word, sort = TRUE)

dat_tf %>% head()
## # A tibble: 6 x 3
##   Character Word      n
##   <chr>     <chr> <int>
## 1 joey      hey    1078
## 2 joey      yeah    960
## 3 ross      yeah    828
## 4 rachel    yeah    785
## 5 phoebe    yeah    768
## 6 ross      hey     750
dat_tfidf <- dat_tf %>% 
  bind_tf_idf( term =  Word, document = Character, n = n) %>% 
  arrange(desc(tf_idf))

dat_tfidf %>% head()
## # A tibble: 6 x 6
##   Character Word       n       tf   idf   tf_idf
##   <chr>     <chr>  <int>    <dbl> <dbl>    <dbl>
## 1 phoebe    minsk     12 0.000414 1.79  0.000742
## 2 rachel    gavin     20 0.000602 1.10  0.000662
## 3 joey      wayne     12 0.000363 1.79  0.000650
## 4 chandler  tulsa     25 0.000824 0.693 0.000571
## 5 phoebe    ree        9 0.000310 1.79  0.000556
## 6 phoebe    sergei     9 0.000310 1.79  0.000556
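As a sanity check, the same numbers can be computed by hand; this sketch reproduces what bind_tf_idf() does. For example, the idf for "minsk" is log(6/1) ≈ 1.79, since only Phoebe says it.

dat_tf %>%
  group_by(Character) %>%
  mutate(tf = n / sum(n)) %>%      # term frequency within each character
  group_by(Word) %>%
  mutate(idf = log(6 / n())) %>%   # 6 characters; n() = how many of them use the word
  ungroup() %>%
  mutate(tf_idf = tf * idf) %>%
  arrange(desc(tf_idf))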
dat_tfidf %>% 
  group_by(Character) %>% 
  slice_max(tf_idf, n = 30) %>% 
  ggplot(aes(tf_idf, reorder_within(Word, tf_idf, Character), fill = Character)) +
  geom_col(show.legend = F) +
  scale_y_reordered() +
  facet_wrap(~Character, scales = "free") +
  theme_minimal()

Wordcloud for Monica with term frequency

dat_tfidf %>% 
  filter(Character == "monica") %>% 
  arrange(desc(tf)) %>% 
  with(wordcloud(words = Word, freq = tf, random.order = F, max.words = 200 ))

Sentiment Analysis by Episode

dat_word
## # A tibble: 691,544 x 5
##    Season Episode Episode_Name           Character Word   
##    <chr>  <chr>   <chr>                  <chr>     <chr>  
##  1 01     01      Monica Gets A Roommate monica    there's
##  2 01     01      Monica Gets A Roommate monica    nothing
##  3 01     01      Monica Gets A Roommate monica    to     
##  4 01     01      Monica Gets A Roommate monica    tell   
##  5 01     01      Monica Gets A Roommate monica    he's   
##  6 01     01      Monica Gets A Roommate monica    just   
##  7 01     01      Monica Gets A Roommate monica    some   
##  8 01     01      Monica Gets A Roommate monica    guy    
##  9 01     01      Monica Gets A Roommate monica    i      
## 10 01     01      Monica Gets A Roommate monica    work   
## # ... with 691,534 more rows
library(tidyr)
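
The bing lexicon simply labels each word positive or negative, so an episode's sentiment score below is just its positive word count minus its negative one. A quick peek at the lexicon:

get_sentiments("bing") %>% head()
# a two-column tibble: word, sentiment ("positive" or "negative")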

dat_sentiment <- dat_clean %>%
  group_by(Season, Episode) %>% 
  unnest_tokens(output = Word, input = Dialogue, drop = T) %>% 
  inner_join(get_sentiments("bing"), by = c("Word" = "word")) %>% ### Get sentiments from bing
  count(Season, Episode, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative,
         Season = paste0("Season ", Season)) %>% 
  group_by(Season) %>% 
  mutate(Episode = 1:n()) %>% 
  ungroup()

dat_sentiment
## # A tibble: 223 x 5
##    Season    Episode negative positive sentiment
##    <chr>       <int>    <int>    <int>     <int>
##  1 Season 01       1       71      143        72
##  2 Season 01       2       66       96        30
##  3 Season 01       3       64      114        50
##  4 Season 01       4       70      114        44
##  5 Season 01       5       71      141        70
##  6 Season 01       6       61       98        37
##  7 Season 01       7       62       81        19
##  8 Season 01       8       48      103        55
##  9 Season 01       9       63       90        27
## 10 Season 01      10       61      100        39
## # ... with 213 more rows
dat_sentiment %>% 
  ggplot(aes(factor(Episode), sentiment, fill = Season)) +
  geom_col(show.legend = FALSE) +
  xlab("Episode") +
  facet_wrap(~Season, ncol = 2, scales = "free_x") +
  theme_minimal()

Friends is all about positive vibes, but some episodes have less of them. For example, S05E08, where everyone reminisces about their worst Thanksgiving.

S09E17 is when Chandler and Ross joke around on their college website: Ross is rumoured to be dead, which exposes the fact that he was not popular in college. Joey is unwilling to give his favourite stuffed animal, Hugsy, to Emma, and Monica helps Phoebe resist calling Mike.

S10E06 is where Ross applies for a palaeontology grant but discovers that Dr Hobart, who allocates the funding, used to go out with his girlfriend Charlie. Meanwhile, Mike wants Phoebe to get rid of one of her bizarre paintings, but neither Rachel nor Monica is keen to take it, and Chandler lies to Joey about watching his audition tape.

So we can say the low-scoring episodes contain some sort of quarrelling among our favourite characters!
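
The low scorers can also be pulled out directly, as a quick check on the episodes called out above:

dat_sentiment %>% 
  arrange(sentiment) %>% 
  head(n = 5)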

Does Sentiment Drive the Rating of an Episode?

dat_rating <- read_csv("friends_episodes_v2.csv")

dat_rating <- dat_rating %>% 
  group_by(Season) %>% 
  mutate(Episode = row_number()) %>%  # renumber episodes within each season
  ungroup() %>%                       # ungroup before relabelling the grouping column
  mutate(Season = sprintf("Season %02d", Season)) # "Season 01" ... "Season 10"

dat_sentiment <- dat_sentiment %>% 
  left_join(dat_rating, by = c("Season", "Episode"))

p1 <- dat_sentiment %>%
  ggplot(aes(sentiment, Stars)) +
  geom_jitter(aes(color = Season), alpha = 0.8) +
  geom_smooth(method = 'lm', show.legend = F, se = F) +
  theme_minimal()

p2 <- dat_sentiment %>%
  ggplot(aes(sentiment, Stars)) +
  geom_jitter(aes(color = Season), alpha = 0.8, show.legend = F) +
  geom_smooth(method = 'lm', show.legend = F, se = F) +
  theme_minimal() +
  facet_wrap(~Season)

require(patchwork)
p1 / p2

There is not much of a relation between sentiment and episode rating; I think it's more about the story. The way the relationship shifts from season to season is interesting, though!
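
A quick numeric check of that impression; given the scatter, I would expect a correlation near zero:

cor(dat_sentiment$sentiment, dat_sentiment$Stars, use = "complete.obs")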

Top Rated Episodes

DT::datatable(
  dat_rating %>% 
    arrange(desc(Stars)) %>% 
    select(Season, Episode, Year_of_prod, everything()) %>% 
    select(-Duration, -Summary, -Director),
  rownames = FALSE
)

References