Major credit to the Star Wars Script Analysis by Xavier on Kaggle (https://www.kaggle.com/xvivancos/analyzing-star-wars-movie-scripts) for providing a fantastic template for kicking off this EDA and sentiment analysis. Thanks also to Heather for being the best and diving into our Star Wars data here (https://rpubs.com/hsmalley/starWars), providing another fantastic EDA template for sentiment analysis.
library(tidyverse) # data manipulation
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.5 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tm) # text mining
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(wordcloud) # word cloud generator
## Loading required package: RColorBrewer
library(wordcloud2) # word cloud generator
library(tidytext) # text mining for word processing and sentiment analysis
library(reshape2) # reshapes a data frame
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(radarchart) # drawing the radar chart from a data frame
library(RWeka) # data mining tasks
# script data
episode1 <- read.csv("/Users/nikohellman/Desktop/friends scripts/friendsEP1.1.csv")
episode4<- read.csv("/Users/nikohellman/Desktop/friends scripts/friendsEP1.4.csv")
episode8<- read.csv("/Users/nikohellman/Desktop/friends scripts/friendsEP1.8.csv")
str(episode1)
## 'data.frame': 300 obs. of 3 variables:
## $ character: chr "Monica" "Joey" "Chandler" "Phoebe" ...
## $ line : chr " There's nothing to tell! He's just some guy I work with!" " C'mon, you're going out with the guy! There's gotta be something wrong with him!" " All right Joey, be nice. So does he have a hump? A hump and a hairpiece?" " Wait, does he eat chalk?" ...
## $ X : chr "" "" "" "" ...
head(episode1)
## character
## 1 Monica
## 2 Joey
## 3 Chandler
## 4 Phoebe
## 5 Phoebe
## 6 Monica
## line
## 1 There's nothing to tell! He's just some guy I work with!
## 2 C'mon, you're going out with the guy! There's gotta be something wrong with him!
## 3 All right Joey, be nice. So does he have a hump? A hump and a hairpiece?
## 4 Wait, does he eat chalk?
## 5 Just, 'cause, I don't want her to go through what I went through with Carl- oh!
## 6 Okay, everybody relax. This is not even a date. It's just two people going out to dinner and- not having sex.
## X
## 1
## 2
## 3
## 4
## 5
## 6
episode1<- episode1%>%
select(1:2)
head(episode1)
## character
## 1 Monica
## 2 Joey
## 3 Chandler
## 4 Phoebe
## 5 Phoebe
## 6 Monica
## line
## 1 There's nothing to tell! He's just some guy I work with!
## 2 C'mon, you're going out with the guy! There's gotta be something wrong with him!
## 3 All right Joey, be nice. So does he have a hump? A hump and a hairpiece?
## 4 Wait, does he eat chalk?
## 5 Just, 'cause, I don't want her to go through what I went through with Carl- oh!
## 6 Okay, everybody relax. This is not even a date. It's just two people going out to dinner and- not having sex.
str(episode8)
## 'data.frame': 234 obs. of 2 variables:
## $ character: chr "Shelley" "Chandler" "Shelley" "Chandler" ...
## $ line : chr " Hey gorgeous, how's it going?" " Dehydrated Japanese noodles under fluorescent lights... does it get better than this?" " Question. You're not dating anybody, are you, because I met somebody who would be perfect for you." " Ah, y'see, perfect might be a problem. Had you said 'co-dependent', or 'self-destructive'..." ...
# lexicon data
bing <- read_csv("/Users/nikohellman/Desktop/sentiment analysis sets/Bing.csv")
## Rows: 6786 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): word, sentiment
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nrc <- read_csv("/Users/nikohellman/Desktop/sentiment analysis sets/NRC.csv")
## Rows: 13901 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): word, sentiment
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
afinn <- read_csv("/Users/nikohellman/Desktop/sentiment analysis sets/Afinn.csv")
## Rows: 2477 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): word
## dbl (1): value
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Thanks again to Xavier on Kaggle for the text-cleaning and frequent-term helper functions below.
# Text transformations
cleanCorpus <- function(corpus){
corpus.tmp <- tm_map(corpus, removePunctuation)
corpus.tmp <- tm_map(corpus.tmp, stripWhitespace)
corpus.tmp <- tm_map(corpus.tmp, content_transformer(tolower))
v_stopwords <- c(stopwords("english"), c("thats","weve","hes","theres","ive","im",
"will","can","cant","dont","youve","us",
"youre","youll","theyre","whats","didnt"))
corpus.tmp <- tm_map(corpus.tmp, removeWords, v_stopwords)
corpus.tmp <- tm_map(corpus.tmp, removeNumbers)
return(corpus.tmp)
}
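A quick sanity check of the cleaning steps (a sketch, not part of the original analysis): cleanCorpus() strips punctuation, numbers, and extra whitespace, lower-cases everything, and removes the stopwords listed above.
# Sketch: cleanCorpus() applied to a single sample line
sample.cor <- cleanCorpus(VCorpus(VectorSource("There's NOTHING to tell! He's just some guy I work with!")))
content(sample.cor[[1]])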
# Most frequent terms
frequentTerms <- function(text){
s.cor <- Corpus(VectorSource(text))
s.cor.cl <- cleanCorpus(s.cor)
s.tdm <- TermDocumentMatrix(s.cor.cl)
s.tdm <- removeSparseTerms(s.tdm, 0.999)
m <- as.matrix(s.tdm)
word_freqs <- sort(rowSums(m), decreasing=TRUE)
dm <- data.frame(word=names(word_freqs), freq=word_freqs)
return(dm)
}
# Define bigram tokenizer
tokenizer <- function(x){
NGramTokenizer(x, Weka_control(min=2, max=2))
}
# Most frequent bigrams
frequentBigrams <- function(text){
s.cor <- VCorpus(VectorSource(text))
s.cor.cl <- cleanCorpus(s.cor)
s.tdm <- TermDocumentMatrix(s.cor.cl, control=list(tokenize=tokenizer))
s.tdm <- removeSparseTerms(s.tdm, 0.999)
m <- as.matrix(s.tdm)
word_freqs <- sort(rowSums(m), decreasing=TRUE)
dm <- data.frame(word=names(word_freqs), freq=word_freqs)
return(dm)
}
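As a quick aside (a sketch, not one of the figures in this write-up), frequentTerms() returns a word/freq data frame that can be passed straight to wordcloud2(), which is loaded above:
# Sketch: word cloud of the 100 most frequent cleaned terms in episode 1
ep1.freq <- frequentTerms(episode1$line)
wordcloud2(head(ep1.freq, 100), size=0.5)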
# How many lines of dialogue?
length(episode1$line)
## [1] 300
# How many characters?
length(levels(as.factor(episode1$character)))
## [1] 14
# Top 20 characters by number of lines (episode 1 has only 14 speakers, hence the warning below about removed rows)
top.episode1.chars <- as.data.frame(sort(table(episode1$character), decreasing=TRUE))[1:20,]
ggplot(data=top.episode1.chars, aes(x=Var1, y=Freq)) +
geom_bar(stat="identity", fill="#56B4E9", colour="black") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Character", y="Number of lines")
## Warning: Removed 6 rows containing missing values (position_stack).
# We want to highlight Monica since she has the most lines
try<-top.episode1.chars%>%
mutate(topspeak=ifelse(Var1=="Monica", "1", "0"))
ggplot(data=try, aes(x=Var1, y=Freq, fill=topspeak)) +
scale_fill_manual( values = c( "1"="red", "0"="darkgray" ), guide = "none" )+
geom_bar(stat="identity", colour="black") +
theme_classic()+
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Character(s)", y="Number of lines")
## Warning: Removed 6 rows containing missing values (position_stack).
# Most frequent bigrams
episode1.bigrams <- frequentBigrams(episode1$line)[1:20,]
ggplot(data=episode1.bigrams, aes(x=reorder(word, -freq), y=freq)) +
geom_bar(stat="identity", fill="chocolate2", colour="black") +
labs(x="Bigram", y="Frequency")+
theme_classic()+
theme(axis.text.x=element_text(angle=45, hjust=1))+
geom_text(x=7, y=11, label="Cut cut in episode 1 is in reference to Rachel \n cutting up her dad's credit cards as a sign of independence")
#### CUMULATIVE LINES
episode1$linenum<-1:300
episode1$one<-rep(1, 300)
### Rachel
rachel<-episode1%>%
filter(character=="Rachel")%>%
select(-line)
rachel$count<-cumsum(rachel$one)
### Monica
monica<-episode1%>%
filter(character=="Monica")%>%
select(-line)
monica$count<-cumsum(monica$one)
### Phoebe
phoebe<-episode1%>%
filter(character=="Phoebe")%>%
select(-line)
phoebe$count<-cumsum(phoebe$one)
### Ross
ross<-episode1%>%
filter(character=="Ross")%>%
select(-line)
ross$count<-cumsum(ross$one)
### Chandler
chandler<-episode1%>%
filter(character=="Chandler")%>%
select(-line)
chandler$count<-cumsum(chandler$one)
### Joey
joey<-episode1%>%
filter(character=="Joey")%>%
select(-line)
joey$count<-cumsum(joey$one)
###### RBIND
timeline<-rbind(rachel, monica, phoebe,
ross, chandler, joey)
ggplot(timeline, aes(x=linenum, y=count, color=character))+
geom_line()
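For reference, the per-character blocks above can be collapsed into a single group_by()/cumsum() pass; a minimal sketch (reusing the linenum and one columns added above) that draws the same timeline:
# Sketch: cumulative line counts for the six main characters in one pipeline
main6 <- c("Rachel", "Monica", "Phoebe", "Ross", "Chandler", "Joey")
timeline_alt <- episode1 %>%
filter(character %in% main6) %>%
group_by(character) %>%
mutate(count=cumsum(one)) %>%
ungroup()
ggplot(timeline_alt, aes(x=linenum, y=count, color=character)) +
geom_line()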
# How many lines of dialogue?
length(episode4$line)
## [1] 253
# How many characters?
length(levels(as.factor(episode4$character)))
## [1] 17
# Top 20 characters by number of lines
top.episode4.chars <- as.data.frame(sort(table(episode4$character), decreasing=TRUE))[1:20,]
ggplot(data=top.episode4.chars, aes(x=Var1, y=Freq)) +
geom_bar(stat="identity", fill="#56B4E9", colour="black") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Character", y="Number of lines")
## Warning: Removed 3 rows containing missing values (position_stack).
# Most frequent bigrams
episode4.bigrams <- frequentBigrams(episode4$line)[1:20,]
ggplot(data=episode4.bigrams, aes(x=reorder(word, -freq), y=freq)) +
geom_bar(stat="identity", fill="chocolate2", colour="black") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Bigram", y="Frequency")
#### CUMULATIVE LINES
episode4$linenum<-1:253
episode4$one<-rep(1, 253)
### Rachel
rachel4<-episode4%>%
filter(character=="Rachel")%>%
select(-line)
rachel4$count<-cumsum(rachel4$one)
### Monica
monica4<-episode4%>%
filter(character=="Monica")%>%
select(-line)
monica4$count<-cumsum(monica4$one)
### Phoebe
phoebe4<-episode4%>%
filter(character=="Phoebe")%>%
select(-line)
phoebe4$count<-cumsum(phoebe4$one)
### Ross
ross4<-episode4%>%
filter(character=="Ross")%>%
select(-line)
ross4$count<-cumsum(ross4$one)
### Chandler
chandler4<-episode4%>%
filter(character=="Chandler")%>%
select(-line)
chandler4$count<-cumsum(chandler4$one)
### Joey
joey4<-episode4%>%
filter(character=="Joey")%>%
select(-line)
joey4$count<-cumsum(joey4$one)
###### RBIND
timeline<-rbind(rachel4, monica4, phoebe4,
ross4, chandler4, joey4)
ggplot(timeline, aes(x=linenum, y=count, color=character))+
geom_line()
# How many lines of dialogue?
length(episode8$line)
## [1] 234
# How many characters?
length(levels(as.factor(episode8$character)))
## [1] 14
# Top 20 characters by number of lines
top.episode8.chars <- as.data.frame(sort(table(episode8$character), decreasing=TRUE))[1:20,]
ggplot(data=top.episode8.chars, aes(x=Var1, y=Freq)) +
geom_bar(stat="identity", fill="#56B4E9", colour="black") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Character", y="Number of lines")
## Warning: Removed 6 rows containing missing values (position_stack).
# Most frequent bigrams
episode8.bigrams <- frequentBigrams(episode8$line)[1:20,]
ggplot(data=episode8.bigrams, aes(x=reorder(word, -freq), y=freq)) +
geom_bar(stat="identity", fill="chocolate2", colour="black") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Bigram", y="Frequency")
#### CUMULATIVE LINES
episode8$linenum<-1:234
episode8$one<-rep(1, 234)
### Rachel
rachel8<-episode8%>%
filter(character=="Rachel")%>%
select(-line)
rachel8$count<-cumsum(rachel8$one)
### Monica
monica8<-episode8%>%
filter(character=="Monica")%>%
select(-line)
monica8$count<-cumsum(monica8$one)
### Phoebe
phoebe8<-episode8%>%
filter(character=="Phoebe")%>%
select(-line)
phoebe8$count<-cumsum(phoebe8$one)
### Ross
ross8<-episode8%>%
filter(character=="Ross")%>%
select(-line)
ross8$count<-cumsum(ross8$one)
### Chandler
chandler8<-episode8%>%
filter(character=="Chandler")%>%
select(-line)
chandler8$count<-cumsum(chandler8$one)
### Joey
joey8<-episode8%>%
filter(character=="Joey")%>%
select(-line)
joey8$count<-cumsum(joey8$one)
###### RBIND
timeline<-rbind(rachel8, monica8, phoebe8,
ross8, chandler8, joey8)
ggplot(timeline, aes(x=linenum, y=count, color=character))+
geom_line()
# all 3 episodes dialogues
epset <- rbind(episode1, episode4, episode8)
# How many lines of dialogue?
length(epset$line)
## [1] 787
# How many characters?
length(levels(as.factor(epset$character)))
## [1] 31
# Top 20 characters by number of lines
top.epset.chars <- as.data.frame(sort(table(epset$character), decreasing=TRUE))[1:20,]
# Visualization
ggplot(data=top.epset.chars, aes(x=Var1, y=Freq)) +
geom_bar(stat="identity", fill="#56B4E9", colour="black") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Character", y="Number of lines")
# Most frequent bigrams
epset.bigrams <- frequentBigrams(epset$line)[1:20,]
ggplot(data=epset.bigrams, aes(x=reorder(word, -freq), y=freq)) +
geom_bar(stat="identity", fill="chocolate2", colour="black") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Bigram", y="Frequency")
#ggsave("all3epbigram.pdf",height=4,width=8)
# Transform the text to a tidy data structure with one token per row
ftokens <- epset %>%
mutate(line=as.character(line)) %>%
unnest_tokens(word, line)
# Positive and negative words - bing
# (get_sentiments("bing") from tidytext is the same Bing word/sentiment lexicon loaded above)
ftokens %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort=TRUE) %>%
# acast() reshapes the counts into a word x sentiment matrix, which comparison.cloud() expects
acast(word ~ sentiment, value.var="n", fill=0) %>%
comparison.cloud(colors=c("#F8766D", "#00BFC4"), max.words=100)
## Joining, by = "word"
# Sentiments and frequency associated with each word - nrc
fsentiments <- ftokens %>%
inner_join(nrc, "word") %>%
count(word, sentiment, sort=TRUE)
# Frequency of each sentiment
ggplot(data=fsentiments, aes(x=reorder(sentiment, -n, sum), y=n)) +
geom_bar(stat="identity", aes(fill=sentiment), show.legend=FALSE) +
labs(x="Sentiment", y="Frequency")+
theme(axis.text.x = element_text(angle=45, hjust=1))
# Top 10 terms for each sentiment
fsentiments %>%
group_by(sentiment) %>%
arrange(desc(n)) %>%
slice(1:10) %>%
ggplot(aes(x=reorder(word, n), y=n)) +
geom_col(aes(fill=sentiment), show.legend=FALSE) +
facet_wrap(~sentiment, scales="free_y") +
labs(y="Frequency", x="Terms") +
coord_flip()
#ggsave("sentimentfreq.pdf",height=4,width=8)
# Sentiment analysis for the main characters
ftokens %>%
filter(character %in% c("Monica","Rachel","Phoebe","Ross","Chandler",
"Joey")) %>%
inner_join(nrc, "word") %>%
count(character, sentiment, sort=TRUE) %>%
ggplot(aes(x=sentiment, y=n)) +
geom_col(aes(fill=sentiment), show.legend=FALSE) +
facet_wrap(~character, scales="free_x") +
labs(x="Sentiment", y="Frequency") +
coord_flip()
#ggsave("sentimentsperchar.pdf",height=4,width=8)
# Stopwords
mystopwords <- tibble(word=c(stopwords("english"),
c("thats","weve","hes","theres","ive","im",
"will","can","cant","dont","youve","us",
"youre","youll","theyre","whats","didnt")))
# Tokens without stopwords
top.chars.ftokens <- epset %>%
mutate(line=as.character(line)) %>%
filter(character %in% c("Monica","Rachel","Phoebe","Ross","Chandler",
"Joey")) %>%
unnest_tokens(word, line) %>%
anti_join(mystopwords, by="word")
# Most frequent words for each character
top.chars.ftokens %>%
count(character, word) %>%
group_by(character) %>%
arrange(desc(n)) %>%
slice(1:10) %>%
ungroup() %>%
mutate(word2=factor(paste(word, character, sep="__"),
levels=rev(paste(word, character, sep="__"))))%>%
ggplot(aes(x=word2, y=n)) +
geom_col(aes(fill=character), show.legend=FALSE) +
facet_wrap(~character, scales="free_y") +
labs(x="Sentiment", y="Frequency") +
scale_x_discrete(labels=function(x) gsub("__.+$", "", x)) +
coord_flip()
# Most relevant words for each character
top.chars.ftokens %>%
count(character, word) %>%
bind_tf_idf(word, character, n) %>%
group_by(character) %>%
arrange(desc(tf_idf)) %>%
slice(1:10) %>%
ungroup() %>%
mutate(word2=factor(paste(word, character, sep="__"),
levels=rev(paste(word, character, sep="__"))))%>%
ggplot(aes(x=word2, y=tf_idf)) +
geom_col(aes(fill=character), show.legend=FALSE) +
facet_wrap(~character, scales="free_y") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(y="tf–idf", x="Sentiment") +
scale_x_discrete(labels=function(x) gsub("__.+$", "", x)) +
coord_flip()
#ggsave("wordfreqperchar.pdf",height=4,width=8)