FRIENDS EDA

Major credits to the Star Wars Script Analysis by Xavier on Kaggle (https://www.kaggle.com/xvivancos/analyzing-star-wars-movie-scripts) for providing a fantastic template to initiate this EDA process for sentiment analysis. Also to Heather for being the best and diving into our star wars data here (https://rpubs.com/hsmalley/starWars), providing another fantastic EDA template for sentiment analysis.

Packages and data

library(tidyverse) # data manipulation
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.5     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tm) # text mining
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(wordcloud) # word cloud generator
## Loading required package: RColorBrewer
library(wordcloud2) # word cloud generator
library(tidytext) # text mining for word processing and sentiment analysis
library(reshape2) # reshapes a data frame
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(radarchart) # drawing the radar chart from a data frame
library(RWeka) # data mining tasks
# script data
episode1 <- read.csv("/Users/nikohellman/Desktop/friends scripts/friendsEP1.1.csv")
episode4<- read.csv("/Users/nikohellman/Desktop/friends scripts/friendsEP1.4.csv")
episode8<- read.csv("/Users/nikohellman/Desktop/friends scripts/friendsEP1.8.csv")

str(episode1)
## 'data.frame':    300 obs. of  3 variables:
##  $ character: chr  "Monica" "Joey" "Chandler" "Phoebe" ...
##  $ line     : chr  " There's nothing to tell! He's just some guy I work with!" " C'mon, you're going out with the guy! There's gotta be something wrong with him!" " All right Joey, be nice.  So does he have a hump? A hump and a hairpiece?" " Wait, does he eat chalk?" ...
##  $ X        : chr  "" "" "" "" ...
head(episode1)
##   character
## 1    Monica
## 2      Joey
## 3  Chandler
## 4    Phoebe
## 5    Phoebe
## 6    Monica
##                                                                                                             line
## 1                                                       There's nothing to tell! He's just some guy I work with!
## 2                               C'mon, you're going out with the guy! There's gotta be something wrong with him!
## 3                                      All right Joey, be nice.  So does he have a hump? A hump and a hairpiece?
## 4                                                                                       Wait, does he eat chalk?
## 5                                Just, 'cause, I don't want her to go through what I went through with Carl- oh!
## 6  Okay, everybody relax. This is not even a date. It's just two people going out to dinner and- not having sex.
##   X
## 1  
## 2  
## 3  
## 4  
## 5  
## 6
episode1<- episode1%>%
  select(1:2)
head(episode1)
##   character
## 1    Monica
## 2      Joey
## 3  Chandler
## 4    Phoebe
## 5    Phoebe
## 6    Monica
##                                                                                                             line
## 1                                                       There's nothing to tell! He's just some guy I work with!
## 2                               C'mon, you're going out with the guy! There's gotta be something wrong with him!
## 3                                      All right Joey, be nice.  So does he have a hump? A hump and a hairpiece?
## 4                                                                                       Wait, does he eat chalk?
## 5                                Just, 'cause, I don't want her to go through what I went through with Carl- oh!
## 6  Okay, everybody relax. This is not even a date. It's just two people going out to dinner and- not having sex.
str(episode8)
## 'data.frame':    234 obs. of  2 variables:
##  $ character: chr  "Shelley" "Chandler" "Shelley" "Chandler" ...
##  $ line     : chr  " Hey gorgeous, how's it going?" " Dehydrated Japanese noodles under fluorescent lights... does it get better than this?" " Question. You're not dating anybody, are you, because I met somebody who would be perfect for you." " Ah, y'see, perfect might be a problem. Had you said 'co-dependent', or 'self-destructive'..." ...
# lexicon data
bing <- read_csv("/Users/nikohellman/Desktop/sentiment analysis sets/Bing.csv")
## Rows: 6786 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): word, sentiment
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nrc <- read_csv("/Users/nikohellman/Desktop/sentiment analysis sets/NRC.csv")
## Rows: 13901 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): word, sentiment
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
afinn <- read_csv("/Users/nikohellman/Desktop/sentiment analysis sets/Afinn.csv")
## Rows: 2477 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): word
## dbl (1): value
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Functions

Thanks again to Xavier on Kaggle–

# Text transformations
cleanCorpus <- function(corpus){
  corpus.tmp <- tm_map(corpus, removePunctuation)
  corpus.tmp <- tm_map(corpus.tmp, stripWhitespace)
  corpus.tmp <- tm_map(corpus.tmp, content_transformer(tolower))
  v_stopwords <- c(stopwords("english"), c("thats","weve","hes","theres","ive","im",
                                           "will","can","cant","dont","youve","us",
                                           "youre","youll","theyre","whats","didnt"))
  corpus.tmp <- tm_map(corpus.tmp, removeWords, v_stopwords)
  corpus.tmp <- tm_map(corpus.tmp, removeNumbers)
  return(corpus.tmp)
}

# Most frequent terms 
frequentTerms <- function(text){
  s.cor <- Corpus(VectorSource(text))
  s.cor.cl <- cleanCorpus(s.cor)
  s.tdm <- TermDocumentMatrix(s.cor.cl)
  s.tdm <- removeSparseTerms(s.tdm, 0.999)
  m <- as.matrix(s.tdm)
  word_freqs <- sort(rowSums(m), decreasing=TRUE)
  dm <- data.frame(word=names(word_freqs), freq=word_freqs)
  return(dm)
}

# Define bigram tokenizer 
tokenizer  <- function(x){
  NGramTokenizer(x, Weka_control(min=2, max=2))
}

# Most frequent bigrams 
frequentBigrams <- function(text){
  s.cor <- VCorpus(VectorSource(text))
  s.cor.cl <- cleanCorpus(s.cor)
  s.tdm <- TermDocumentMatrix(s.cor.cl, control=list(tokenize=tokenizer))
  s.tdm <- removeSparseTerms(s.tdm, 0.999)
  m <- as.matrix(s.tdm)
  word_freqs <- sort(rowSums(m), decreasing=TRUE)
  dm <- data.frame(word=names(word_freqs), freq=word_freqs)
  return(dm)
}

EPISODE 1: “The one where Monica gets a new roommate”

# How many dialogues?
length(episode1$line)
## [1] 300
# How many characters?
length(levels(as.factor(episode1$character)))
## [1] 14
# Top 20 characters with more dialogues 
top.episode1.chars <- as.data.frame(sort(table(episode1$character), decreasing=TRUE))[1:20,]

ggplot(data=top.episode1.chars, aes(x=Var1, y=Freq)) +
  geom_bar(stat="identity", fill="#56B4E9", colour="black") +
  theme(axis.text.x=element_text(angle=45, hjust=1)) +
  labs(x="Character", y="Number of lines")
## Warning: Removed 6 rows containing missing values (position_stack).

#We want to highlight Monica since she is highest
try<-top.episode1.chars%>%
  mutate(topspeak=ifelse(Var1=="Monica", "1", "0"))

ggplot(data=try, aes(x=Var1, y=Freq, fill=topspeak)) +
  scale_fill_manual( values = c( "1"="red", "0"="darkgray" ), guide = "none" )+
  geom_bar(stat="identity", colour="black") +
  theme_classic()+
  theme(axis.text.x=element_text(angle=45, hjust=1)) +
  labs(x="Character(s)", y="Number of lines")
## Warning: Removed 6 rows containing missing values (position_stack).

# Most frequent bigrams
episode1.bigrams <- frequentBigrams(episode1$line)[1:20,]
ggplot(data=episode1.bigrams, aes(x=reorder(word, -freq), y=freq)) +
  geom_bar(stat="identity", fill="chocolate2", colour="black") +
  labs(x="Bigram", y="Frequency")+
  theme_classic()+
  theme(axis.text.x=element_text(angle=45, hjust=1))+
  geom_text(x=7, y=11, label="Cut cut in episode 1 is in reference to Rachel \n cutting up her dad's credit cards as a sign of independence")

#### CUMMULATIVE LINES

episode1$linenum<-1:300
episode1$one<-rep(1, 300)

### Rachel
rachel<-episode1%>%
  filter(character=="Rachel")%>%
  select(-line)

rachel$count<-cumsum(rachel$one)

### Monica
monica<-episode1%>%
  filter(character=="Monica")%>%
  select(-line)

monica$count<-cumsum(monica$one)

### Phoebe
phoebe<-episode1%>%
  filter(character=="Phoebe")%>%
  select(-line)

phoebe$count<-cumsum(phoebe$one)


### Ross
ross<-episode1%>%
  filter(character=="Ross")%>%
  select(-line)

ross$count<-cumsum(ross$one)

### Chandler
chandler<-episode1%>%
  filter(character=="Chandler")%>%
  select(-line)

chandler$count<-cumsum(chandler$one)

### Joey
joey<-episode1%>%
  filter(character=="Joey")%>%
  select(-line)

joey$count<-cumsum(joey$one)

###### RBIND

timeline<-rbind(rachel, monica, phoebe,
                ross, chandler, joey)

ggplot(timeline, aes(x=linenum, y=count, color=character))+
  geom_line()

EPISODE 4: “The one with George Stephanopoulos”

# How many dialogues?
length(episode4$line)
## [1] 253
# How many characters?
length(levels(as.factor(episode4$character)))
## [1] 17
# Top 20 characters with more dialogues 
top.episode4.chars <- as.data.frame(sort(table(episode4$character), decreasing=TRUE))[1:20,]

ggplot(data=top.episode4.chars, aes(x=Var1, y=Freq)) +
  geom_bar(stat="identity", fill="#56B4E9", colour="black") +
  theme(axis.text.x=element_text(angle=45, hjust=1)) +
  labs(x="Character", y="Number of lines")
## Warning: Removed 3 rows containing missing values (position_stack).

# Most frequent bigrams
episode4.bigrams <- frequentBigrams(episode4$line)[1:20,]
ggplot(data=episode4.bigrams, aes(x=reorder(word, -freq), y=freq)) +  
  geom_bar(stat="identity", fill="chocolate2", colour="black") +
  theme(axis.text.x=element_text(angle=45, hjust=1)) +
  labs(x="Bigram", y="Frequency")

#### CUMMULATIVE LINES

episode4$linenum<-1:253
episode4$one<-rep(1, 253)

### Rachel
rachel4<-episode4%>%
  filter(character=="Rachel")%>%
  select(-line)

rachel4$count<-cumsum(rachel4$one)

### Monica
monica4<-episode4%>%
  filter(character=="Monica")%>%
  select(-line)

monica4$count<-cumsum(monica4$one)

### Phoebe
phoebe4<-episode4%>%
  filter(character=="Phoebe")%>%
  select(-line)

phoebe4$count<-cumsum(phoebe4$one)


### Ross
ross4<-episode4%>%
  filter(character=="Ross")%>%
  select(-line)

ross4$count<-cumsum(ross4$one)

### Chandler
chandler4<-episode4%>%
  filter(character=="Chandler")%>%
  select(-line)

chandler4$count<-cumsum(chandler4$one)

### Joey
joey4<-episode4%>%
  filter(character=="Joey")%>%
  select(-line)

joey4$count<-cumsum(joey4$one)

###### RBIND

timeline<-rbind(rachel4, monica4, phoebe4,
                ross4, chandler4, joey4)

ggplot(timeline, aes(x=linenum, y=count, color=character))+
  geom_line()

EPISODE 8: “The one where Nana dies twice”

# How many dialogues?
length(episode8$line)
## [1] 234
# How many characters?
length(levels(as.factor(episode8$character)))
## [1] 14
# Top 20 characters with more dialogues 
top.episode8.chars <- as.data.frame(sort(table(episode8$character), decreasing=TRUE))[1:20,]

ggplot(data=top.episode8.chars, aes(x=Var1, y=Freq)) +
  geom_bar(stat="identity", fill="#56B4E9", colour="black") +
  theme(axis.text.x=element_text(angle=45, hjust=1)) +
  labs(x="Character", y="Number of lines")
## Warning: Removed 6 rows containing missing values (position_stack).

# Most frequent bigrams
episode8.bigrams <- frequentBigrams(episode8$line)[1:20,]
ggplot(data=episode8.bigrams, aes(x=reorder(word, -freq), y=freq)) +  
  geom_bar(stat="identity", fill="chocolate2", colour="black") +
  theme(axis.text.x=element_text(angle=45, hjust=1)) +
  labs(x="Bigram", y="Frequency")

#### CUMMULATIVE LINES

episode8$linenum<-1:234
episode8$one<-rep(1, 234)

### Rachel
rachel8<-episode8%>%
  filter(character=="Rachel")%>%
  select(-line)

rachel8$count<-cumsum(rachel8$one)

### Monica
monica8<-episode8%>%
  filter(character=="Monica")%>%
  select(-line)

monica8$count<-cumsum(monica8$one)

### Phoebe
phoebe8<-episode8%>%
  filter(character=="Phoebe")%>%
  select(-line)

phoebe8$count<-cumsum(phoebe8$one)


### Ross
ross8<-episode8%>%
  filter(character=="Ross")%>%
  select(-line)

ross8$count<-cumsum(ross8$one)

### Chandler
chandler8<-episode8%>%
  filter(character=="Chandler")%>%
  select(-line)

chandler8$count<-cumsum(chandler8$one)

### Joey
joey8<-episode8%>%
  filter(character=="Joey")%>%
  select(-line)

joey8$count<-cumsum(joey8$one)

###### RBIND

timeline<-rbind(rachel8, monica8, phoebe8,
                ross8, chandler8, joey8)

ggplot(timeline, aes(x=linenum, y=count, color=character))+
  geom_line()

ALL THREE EPISODES

# all 3 episodes dialogues 
epset <- rbind(episode1, episode4, episode8)

# How many dialogues?
length(epset$line)
## [1] 787
# How many characters?
length(levels(as.factor(epset$character)))
## [1] 31
# Top 20 characters with more dialogues 
top.epset.chars <- as.data.frame(sort(table(epset$character), decreasing=TRUE))[1:20,]

# Visualization 
ggplot(data=top.epset.chars, aes(x=Var1, y=Freq)) +
  geom_bar(stat="identity", fill="#56B4E9", colour="black") +
  theme(axis.text.x=element_text(angle=45, hjust=1)) +
  labs(x="Character", y="Number of lines")

# Most frequent bigrams
epset.bigrams <- frequentBigrams(epset$line)[1:20,]
ggplot(data=epset.bigrams, aes(x=reorder(word, -freq), y=freq)) +  
  geom_bar(stat="identity", fill="chocolate2", colour="black") +
  theme(axis.text.x=element_text(angle=45, hjust=1)) +
  labs(x="Bigram", y="Frequency")

#ggsave("all3epbigram.pdf",height=4,width=8)

SENTIMENT ANALYSIS!!!

# Transform the text to a tidy data structure with one token per row
ftokens <- epset %>%  
  mutate(line=as.character(epset$line)) %>%
  unnest_tokens(word, line)

# Positive and negative words - bing
ftokens %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort=TRUE) %>%
  acast(word ~ sentiment, value.var="n", fill=0) %>%
  comparison.cloud(colors=c("#F8766D", "#00BFC4"), max.words=100)
## Joining, by = "word"

# Sentiments and frequency associated with each word - nrc
fsentiments <- ftokens %>% 
  inner_join(nrc, "word") %>%
  count(word, sentiment, sort=TRUE) 

# Frequency of each sentiment
ggplot(data=fsentiments, aes(x=reorder(sentiment, -n, sum), y=n)) + 
  geom_bar(stat="identity", aes(fill=sentiment), show.legend=FALSE) +
  labs(x="Sentiment", y="Frequency")+
  theme(axis.text.x = element_text(angle=45, hjust=1))

# Top 10 terms for each sentiment
fsentiments %>%
  group_by(sentiment) %>%
  arrange(desc(n)) %>%
  slice(1:10) %>%
  ggplot(aes(x=reorder(word, n), y=n)) +
  geom_col(aes(fill=sentiment), show.legend=FALSE) +
  facet_wrap(~sentiment, scales="free_y") +
  labs(y="Frequency", x="Terms") +
  coord_flip() 

#ggsave("sentimentfreq.pdf",height=4,width=8)

# Sentiment analysis for the main characters
ftokens %>%
  filter(character %in% c("Monica","Rachel","Phoebe","Ross","Chandler",
                          "Joey")) %>%
  inner_join(nrc, "word") %>%
  count(character, sentiment, sort=TRUE) %>%
  ggplot(aes(x=sentiment, y=n)) +
  geom_col(aes(fill=sentiment), show.legend=FALSE) +
  facet_wrap(~character, scales="free_x") +
  labs(x="Sentiment", y="Frequency") +
  coord_flip() 

#ggsave("sentimentsperchar.pdf",height=4,width=8)

# Stopwords
mystopwords <- tibble(word=c(stopwords("english"), 
                                 c("thats","weve","hes","theres","ive","im",
                                   "will","can","cant","dont","youve","us",
                                   "youre","youll","theyre","whats","didnt")))

# Tokens without stopwords
top.chars.ftokens <- epset %>%
  mutate(line=as.character(epset$line)) %>%
  filter(character %in% c("Monica","Rachel","Phoebe","Ross","Chandler",
                          "Joey")) %>%
  unnest_tokens(word, line) %>%
  anti_join(mystopwords, by="word")

# Most frequent words for each character
top.chars.ftokens %>%
  count(character, word) %>%
  group_by(character) %>% 
  arrange(desc(n)) %>%
  slice(1:10) %>%
  ungroup() %>%
  mutate(word2=factor(paste(word, character, sep="__"), 
                       levels=rev(paste(word, character, sep="__"))))%>%
  ggplot(aes(x=word2, y=n)) +
  geom_col(aes(fill=character), show.legend=FALSE) +
  facet_wrap(~character, scales="free_y") +
  labs(x="Sentiment", y="Frequency") +
  scale_x_discrete(labels=function(x) gsub("__.+$", "", x)) +
  coord_flip() 

# Most relevant words for each character
top.chars.ftokens %>%
  count(character, word) %>%
  bind_tf_idf(word, character, n) %>%
  group_by(character) %>% 
  arrange(desc(tf_idf)) %>%
  slice(1:10) %>%
  ungroup() %>%
  mutate(word2=factor(paste(word, character, sep="__"), 
                       levels=rev(paste(word, character, sep="__"))))%>%
  ggplot(aes(x=word2, y=tf_idf)) +
  geom_col(aes(fill=character), show.legend=FALSE) +
  facet_wrap(~character, scales="free_y") +
  theme(axis.text.x=element_text(angle=45, hjust=1)) +
  labs(y="tf–idf", x="Sentiment") +
  scale_x_discrete(labels=function(x) gsub("__.+$", "", x)) +
  coord_flip() 

#ggsave("wordfreqperchar.pdf",height=4,width=8)