Friends Data Set Play

Which packages do we need?

library(dplyr)
library(ggplot2)
library(friends)
library(RColorBrewer)
# Preview the RColorBrewer palettes in the plot pane so one can be picked for
# the charts further down.
display.brewer.all()

Always check your data set

# List the datasets that ship with the friends package, then print a
# per-column overview of the main `friends` table.
data(package = "friends")
friends |>
  summary()
     text             speaker              season          episode     
 Length:67373       Length:67373       Min.   : 1.000   Min.   : 1.00  
 Class :character   Class :character   1st Qu.: 3.000   1st Qu.: 7.00  
 Mode  :character   Mode  :character   Median : 5.000   Median :13.00  
                                       Mean   : 5.438   Mean   :12.61  
                                       3rd Qu.: 8.000   3rd Qu.:19.00  
                                       Max.   :10.000   Max.   :25.00  
     scene          utterance     
 Min.   : 1.000   Min.   :  0.00  
 1st Qu.: 3.000   1st Qu.:  6.00  
 Median : 6.000   Median : 12.00  
 Mean   : 6.859   Mean   : 18.08  
 3rd Qu.:10.000   3rd Qu.: 22.00  
 Max.   :29.000   Max.   :255.00  
# View() opens a spreadsheet-style viewer window, which is only useful in an
# interactive session; skip it when the file is run or knitted in batch mode.
if (interactive()) {
  View(friends_emotions)
  View(friends)
}

Try grouping your data to see whether any patterns emerge.

How about seeing who has the most lines?

# Tally the number of rows per speaker; count() returns an ungrouped tibble
# with one row per distinct speaker and the tally in `count`.
by_speaker <- count(friends, speaker, name = "count")

# Show the speakers with the most lines first.
arrange(by_speaker, desc(count))
# A tibble: 700 × 2
   speaker          count
   <chr>            <int>
 1 Rachel Green      9312
 2 Ross Geller       9157
 3 Chandler Bing     8465
 4 Monica Geller     8441
 5 Joey Tribbiani    8171
 6 Phoebe Buffay     7501
 7 Scene Directions  6063
 8 #ALL#              347
 9 Mike Hannigan      330
10 Richard Burke      281
# ℹ 690 more rows
# A tibble: 56,920 × 2
   text   count
   <chr>  <int>
 1 Hey!     661
 2 What?    539
 3 Yeah.    430
 4 Okay.    417
 5 Hey.     358
 6 Hi!      251
 7 Hi.      221
 8 What?!   208
 9 No.      201
10 Yeah!    156
# ℹ 56,910 more rows

Who has the most lines in each season?

# A tibble: 700 × 2
   speaker          count
   <chr>            <int>
 1 Rachel Green      9312
 2 Ross Geller       9157
 3 Chandler Bing     8465
 4 Monica Geller     8441
 5 Joey Tribbiani    8171
 6 Phoebe Buffay     7501
 7 Scene Directions  6063
 8 #ALL#              347
 9 Mike Hannigan      330
10 Richard Burke      281
# ℹ 690 more rows

We also have emotion data for each line. Let's check.

# View() opens a spreadsheet-style viewer window, which is only useful in an
# interactive session; skip it when the file is run or knitted in batch mode.
if (interactive()) {
  View(friends_emotions)
}

# Attach the emotion label to each line of dialogue. An inner join keeps only
# the rows of `friends` that have a match in `friends_emotions` on all four
# key columns.
friends_with_emotion <- inner_join(
  friends,
  friends_emotions,
  by = c("season", "episode", "scene", "utterance")
)
friends_with_emotion
# A tibble: 12,606 × 7
   text                           speaker season episode scene utterance emotion
   <chr>                          <chr>    <int>   <int> <int>     <int> <chr>  
 1 I'm supposed to attach a brac… Ross G…      1       1     4         1 Mad    
 2 I'm thinking we've got a book… Joey T…      1       1     4         3 Neutral
 3 It's a beautiful thing.        Chandl…      1       1     4         4 Joyful 
 4 What's this?                   Joey T…      1       1     4         5 Neutral
 5 I would have to say that is a… Chandl…      1       1     4         6 Neutral
 6 Which goes where?              Joey T…      1       1     4         7 Neutral
 7 I have no idea.                Chandl…      1       1     4         8 Scared 
 8 Done with the bookcase!        Joey T…      1       1     4        10 Joyful 
 9 All finished!                  Chandl…      1       1     4        11 Joyful 
10 This was Carol's favorite bee… Ross G…      1       1     4        12 Sad    
# ℹ 12,596 more rows

Do different speakers have different emotion proportions?

# Count the labelled lines for every speaker/emotion pair, rank the pairs from
# most to least frequent, and keep only those seen more than 80 times.
by_speaker_with_emo <- friends_with_emotion |>
  count(speaker, emotion, name = "count") |>
  arrange(desc(count)) |>
  filter(count > 80)

by_speaker_with_emo
# A tibble: 41 × 3
   speaker        emotion count
   <chr>          <chr>   <int>
 1 Chandler Bing  Neutral   537
 2 Ross Geller    Neutral   506
 3 Monica Geller  Neutral   476
 4 Joey Tribbiani Neutral   466
 5 Phoebe Buffay  Neutral   446
 6 Rachel Green   Neutral   417
 7 Chandler Bing  Joyful    387
 8 Phoebe Buffay  Joyful    386
 9 Joey Tribbiani Joyful    368
10 Monica Geller  Joyful    326
# ℹ 31 more rows

How about showing the numbers in a chart?

# Stacked bar chart: one bar per speaker, one segment per emotion, with the
# emotion name printed in the middle of each segment.
by_speaker_with_emo |>
  ggplot(aes(x = speaker, y = count, fill = emotion)) +
  geom_col(colour = "white") +
  geom_text(
    aes(label = emotion),
    position = position_stack(vjust = 0.5),
    size = 3
  ) +
  scale_fill_brewer(palette = "Set2")

How about the same sentence spoken with different emotions?

library(tidyr)

# Count how often each (emotion, text, speaker) combination occurs among the
# labelled lines, most frequent first.
by_text_with_emo <- friends_with_emotion |>
  count(emotion, text, speaker, name = "count") |>
  arrange(desc(count))

by_text_with_emo
# A tibble: 12,094 × 4
   emotion text  speaker        count
   <chr>   <chr> <chr>          <int>
 1 Neutral Hey.  Joey Tribbiani    15
 2 Neutral Hi.   Monica Geller     14
 3 Neutral Yeah. Ross Geller       12
 4 Neutral Hey.  Phoebe Buffay     11
 5 Neutral Yeah. Joey Tribbiani    11
 6 Neutral Yeah. Rachel Green      11
 7 Scared  What? Rachel Green      11
 8 Joyful  Hey!  Joey Tribbiani    10
 9 Joyful  Hey!  Ross Geller       10
10 Neutral Hey.  Chandler Bing     10
# ℹ 12,084 more rows
# Keep the rows whose text contains "oh my god" (case-insensitive), then add
# up their counts to get the total number of such labelled lines.
ohmygod <- by_text_with_emo |>
  filter(grepl("oh my god", text, ignore.case = TRUE))
sum(ohmygod[["count"]])
[1] 170