library(tidyverse) # data manipulation
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.5 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tm) # text mining
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(wordcloud) # word cloud generator
## Loading required package: RColorBrewer
library(wordcloud2) # interactive (HTML widget) word cloud generator
library(tidytext) # tidy text mining: tokenization and sentiment lexicons
library(reshape2) # reshapes a data frame
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(radarchart) # drawing the radar chart from a data frame
library(RWeka) # machine learning toolkit, used here for n-gram tokenization
# script data
# season 1 scripts
episode1 <- read.csv("/Users/nikohellman/Desktop/friends scripts/FriendsEp1withScene.csv")
episode2<- read.csv("/Users/nikohellman/Desktop/friends scripts/FriendsEp2withScene.csv")
episode3<- read.csv("/Users/nikohellman/Desktop/friends scripts/FriendsEp3withScene.csv")
episode4<- read.csv("/Users/nikohellman/Desktop/friends scripts/FriendsEp4withScene.csv")
episode5<- read.csv("/Users/nikohellman/Desktop/friends scripts/FriendsEp5withScene.csv")
episode6<- read.csv("/Users/nikohellman/Desktop/friends scripts/FriendsEp6withScene.csv")
episode7<- read.csv("/Users/nikohellman/Desktop/friends scripts/FriendsEp7withScene.csv")
episode8<- read.csv("/Users/nikohellman/Desktop/friends scripts/FriendsEp8withScene.csv")
episode9<- read.csv("/Users/nikohellman/Desktop/friends scripts/FriendsEp9withScene.csv")
episode10<- read.csv("/Users/nikohellman/Desktop/friends scripts/FriendsEp10withScene.csv")
episode11<- read.csv("/Users/nikohellman/Desktop/friends scripts/FriendsEp11withScene.csv")
episode12<- read.csv("/Users/nikohellman/Desktop/friends scripts/FriendsEp12withScene.csv")
episode13<- read.csv("/Users/nikohellman/Desktop/friends scripts/FriendsEp13withScene.csv")
episode14<- read.csv("/Users/nikohellman/Desktop/friends scripts/FriendsEp14withScene.csv")
episode15<- read.csv("/Users/nikohellman/Desktop/friends scripts/FriendsEp15withScene.csv")
episode16<- read.csv("/Users/nikohellman/Desktop/friends scripts/FriendsEp16withScene.csv")
episode17<- read.csv("/Users/nikohellman/Desktop/friends scripts/FriendsEp17withScene.csv")
episode18<- read.csv("/Users/nikohellman/Desktop/friends scripts/FriendsEp18withScene.csv")
episode19<- read.csv("/Users/nikohellman/Desktop/friends scripts/FriendsEp19withScene.csv")
episode20<- read.csv("/Users/nikohellman/Desktop/friends scripts/FriendsEp20withScene.csv")
episode21<- read.csv("/Users/nikohellman/Desktop/friends scripts/FriendsEp21withScene.csv")
episode22<- read.csv("/Users/nikohellman/Desktop/friends scripts/FriendsEp22withScene.csv")
episode23<- read.csv("/Users/nikohellman/Desktop/friends scripts/FriendsEp23withScene.csv")
episode24<- read.csv("/Users/nikohellman/Desktop/friends scripts/FriendsEp24withScene.csv")
# season 10 scripts
s10episode1 <- read.csv("/Users/nikohellman/Desktop/friends scripts/S10FriendsEp1withScene.csv")
s10episode2<- read.csv("/Users/nikohellman/Desktop/friends scripts/S10FriendsEp2withScene.csv")
s10episode3<- read.csv("/Users/nikohellman/Desktop/friends scripts/S10FriendsEp3withScene.csv")
s10episode4<- read.csv("/Users/nikohellman/Desktop/friends scripts/S10FriendsEp4withScene.csv")
s10episode5<- read.csv("/Users/nikohellman/Desktop/friends scripts/S10FriendsEp5withScene.csv")
s10episode6<- read.csv("/Users/nikohellman/Desktop/friends scripts/S10FriendsEp6withScene.csv")
s10episode7<- read.csv("/Users/nikohellman/Desktop/friends scripts/S10FriendsEp7withScene.csv")
s10episode8<- read.csv("/Users/nikohellman/Desktop/friends scripts/S10FriendsEp8withScene.csv")
s10episode9<- read.csv("/Users/nikohellman/Desktop/friends scripts/S10FriendsEp9withScene.csv")
s10episode10<- read.csv("/Users/nikohellman/Desktop/friends scripts/S10FriendsEp10withScene.csv")
s10episode11<- read.csv("/Users/nikohellman/Desktop/friends scripts/S10FriendsEp11withScene.csv")
s10episode12<- read.csv("/Users/nikohellman/Desktop/friends scripts/S10FriendsEp12withScene.csv")
s10episode13<- read.csv("/Users/nikohellman/Desktop/friends scripts/S10FriendsEp13withScene.csv")
s10episode14<- read.csv("/Users/nikohellman/Desktop/friends scripts/S10FriendsEp14withScene.csv")
s10episode15<- read.csv("/Users/nikohellman/Desktop/friends scripts/S10FriendsEp15withScene.csv")
s10episode16<- read.csv("/Users/nikohellman/Desktop/friends scripts/S10FriendsEp16withScene.csv")
s10episode17<- read.csv("/Users/nikohellman/Desktop/friends scripts/S10FriendsEp17withScene.csv")
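The per-episode read.csv() calls above work, but since every file follows the same naming pattern, the same objects can be built in a few lines. A sketch, assuming the paths above (purrr is attached with the tidyverse):
# Sketch: read all season 1 episodes into a named list in one pass
s1.files <- sprintf("/Users/nikohellman/Desktop/friends scripts/FriendsEp%dwithScene.csv", 1:24)
s1.episodes <- map(s1.files, read.csv)
names(s1.episodes) <- paste0("episode", 1:24)
list2env(s1.episodes, envir = .GlobalEnv)  # optional: recreate episode1 ... episode24
Keeping the list around also simplifies things later: bind_rows(s1.episodes) stacks every episode at once.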
# Fix a small error discovered when combining the season 10 episodes:
# this file's scene column was read in under the name "X"
s10episode11 <- s10episode11 %>%
  mutate(Scene=X) %>%
  select(Character, Line, Scene)
# lexicon data sets
bing <- read_csv("/Users/nikohellman/Desktop/sentiment analysis sets/Bing.csv")
## Rows: 6786 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): word, sentiment
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nrc <- read_csv("/Users/nikohellman/Desktop/sentiment analysis sets/NRC.csv")
## Rows: 13901 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): word, sentiment
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
afinn <- read_csv("/Users/nikohellman/Desktop/sentiment analysis sets/Afinn.csv")
## Rows: 2477 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): word
## dbl (1): value
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
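Of the three lexicon CSVs, only nrc is joined directly in the code below; the comparison clouds call tidytext's get_sentiments("bing") rather than the CSV copy, and afinn goes unused in this section.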
Thanks again to Xavier on Kaggle and their Star Wars Script Analysis (https://www.kaggle.com/xvivancos/analyzing-star-wars-movie-scripts) for these functions!
# Text transformations
cleanCorpus <- function(corpus){
  corpus.tmp <- tm_map(corpus, removePunctuation)
  corpus.tmp <- tm_map(corpus.tmp, stripWhitespace)
  corpus.tmp <- tm_map(corpus.tmp, content_transformer(tolower))
  v_stopwords <- c(stopwords("english"), c("thats","weve","hes","theres","ive","im",
                                           "will","can","cant","dont","youve","us",
                                           "youre","youll","theyre","whats","didnt"))
  corpus.tmp <- tm_map(corpus.tmp, removeWords, v_stopwords)
  corpus.tmp <- tm_map(corpus.tmp, removeNumbers)
  return(corpus.tmp)
}
# Most frequent terms
frequentTerms <- function(text){
  s.cor <- Corpus(VectorSource(text))
  s.cor.cl <- cleanCorpus(s.cor)
  s.tdm <- TermDocumentMatrix(s.cor.cl)
  s.tdm <- removeSparseTerms(s.tdm, 0.999)
  m <- as.matrix(s.tdm)
  word_freqs <- sort(rowSums(m), decreasing=TRUE)
  dm <- data.frame(word=names(word_freqs), freq=word_freqs)
  return(dm)
}
# Define bigram tokenizer
tokenizer <- function(x){
  NGramTokenizer(x, Weka_control(min=2, max=2))
}
# Most frequent bigrams
frequentBigrams <- function(text){
  s.cor <- VCorpus(VectorSource(text))
  s.cor.cl <- cleanCorpus(s.cor)
  s.tdm <- TermDocumentMatrix(s.cor.cl, control=list(tokenize=tokenizer))
  s.tdm <- removeSparseTerms(s.tdm, 0.999)
  m <- as.matrix(s.tdm)
  word_freqs <- sort(rowSums(m), decreasing=TRUE)
  dm <- data.frame(word=names(word_freqs), freq=word_freqs)
  return(dm)
}
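frequentTerms() is never called below, but it pairs naturally with the word cloud packages loaded above. A quick sketch on the pilot's dialogue (wordcloud2() expects exactly this word/freq data frame):
# Sketch: single-word frequencies for episode 1, rendered as an HTML word cloud
ep1.terms <- frequentTerms(episode1$Line)
head(ep1.terms)
wordcloud2(ep1.terms, size=0.5)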
#### Season 1, episode 1 (the pilot)
# How many lines of dialogue?
length(episode1$Line)
## [1] 320
# How many distinct characters?
length(levels(as.factor(episode1$Character)))
## [1] 21
# Top 20 characters by number of lines
top.episode1.chars <- as.data.frame(sort(table(episode1$Character), decreasing=TRUE))[1:20,]
# drop bracketed stage-direction entries picked up as speakers
top.episode1.chars <- top.episode1.chars %>%
  filter(!(str_detect(Var1, "\\[")))
ggplot(data=top.episode1.chars, aes(x=Var1, y=Freq)) +
geom_bar(stat="identity", fill="#56B4E9", colour="black") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Character", y="Number of lines")
# Highlight Monica, who has the most lines in this episode
ep1.highlight <- top.episode1.chars %>%
  mutate(topspeak=ifelse(Var1=="Monica", "1", "0"))
ggplot(data=ep1.highlight, aes(x=Var1, y=Freq, fill=topspeak)) +
scale_fill_manual( values = c( "1"="red", "0"="darkgray" ), guide = "none" )+
geom_bar(stat="identity", colour="black") +
theme_classic()+
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Character(s)", y="Number of lines")
# Most frequent bigrams
episode1.bigrams <- frequentBigrams(episode1$Line)[1:20,]
ggplot(data=episode1.bigrams, aes(x=reorder(word, -freq), y=freq)) +
geom_bar(stat="identity", fill="chocolate2", colour="black") +
labs(x="Bigram", y="Frequency")+
theme_classic()+
theme(axis.text.x=element_text(angle=45, hjust=1))+
geom_text(x=5, y=13, label="\"Cut cut\" \n is in reference to Rachel \n cutting up her dad's credit cards \n as a sign of independence", fontface='italic', size=3)
#### CUMULATIVE LINES
episode1char<- episode1
episode1char$linenum<-1:length(episode1$Line)
episode1char$one<-rep(1, length(episode1$Line))
### Rachel
rachel<-episode1char%>%
filter(Character=="Rachel")%>%
select(-Line)
rachel$count<-cumsum(rachel$one)
### Monica
monica<-episode1char%>%
filter(Character=="Monica")%>%
select(-Line)
monica$count<-cumsum(monica$one)
### Phoebe
phoebe<-episode1char%>%
filter(Character=="Phoebe")%>%
select(-Line)
phoebe$count<-cumsum(phoebe$one)
### Ross
ross<-episode1char%>%
filter(Character=="Ross")%>%
select(-Line)
ross$count<-cumsum(ross$one)
### Chandler
chandler<-episode1char%>%
filter(Character=="Chandler")%>%
select(-Line)
chandler$count<-cumsum(chandler$one)
### Joey
joey<-episode1char%>%
filter(Character=="Joey")%>%
select(-Line)
joey$count<-cumsum(joey$one)
###### RBIND
timeline<-rbind(rachel, monica, phoebe,
ross, chandler, joey)
ggplot(timeline, aes(x=linenum, y=count, color=Character))+
geom_line()
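The per-character filter/cumsum blocks above can be collapsed into a single pipeline with group_by() and row_number(); a sketch that should reproduce timeline:
# Sketch: cumulative line counts for the six leads in one pass
timeline.alt <- episode1char %>%
  filter(Character %in% c("Rachel","Monica","Phoebe","Ross","Chandler","Joey")) %>%
  select(-Line) %>%
  group_by(Character) %>%
  mutate(count=row_number()) %>%
  ungroup()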
#### Season 1, episode 24 (season finale): The One Where Rachel Finds Out
# How many lines of dialogue?
length(episode24$Line)
## [1] 273
# How many distinct characters?
length(levels(as.factor(episode24$Character)))
## [1] 19
# Top 20 characters by number of lines
top.episode24.chars <- as.data.frame(sort(table(episode24$Character), decreasing=TRUE))[1:20,]
# drop bracketed stage-direction entries picked up as speakers
top.episode24.chars <- top.episode24.chars %>%
  filter(!(str_detect(Var1, "\\[")))
ggplot(data=top.episode24.chars, aes(x=Var1, y=Freq)) +
geom_bar(stat="identity", fill="#56B4E9", colour="black") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Character", y="Number of lines")
# Highlight Monica, who again has the most lines
ep24.highlight <- top.episode24.chars %>%
  mutate(topspeak=ifelse(Var1=="Monica", "1", "0"))
ggplot(data=ep24.highlight, aes(x=Var1, y=Freq, fill=topspeak)) +
scale_fill_manual( values = c( "1"="red", "0"="darkgray" ), guide = "none" )+
geom_bar(stat="identity", colour="black") +
theme_classic()+
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Character(s)", y="Number of lines")
# Most frequent bigrams
episode24.bigrams <- frequentBigrams(episode24$Line)[1:20,]
ggplot(data=episode24.bigrams, aes(x=reorder(word, -freq), y=freq)) +
geom_bar(stat="identity", fill="chocolate2", colour="black") +
labs(x="Bigram", y="Frequency")+
theme_classic()+
theme(axis.text.x=element_text(angle=45, hjust=1))
#### CUMULATIVE LINES
episode24char<- episode24
episode24char$linenum<-1:length(episode24char$Line)
episode24char$one<-rep(1, length(episode24char$Line))
### Rachel
rachel24<-episode24char%>%
filter(Character=="Rachel")%>%
select(-Line)
rachel24$count<-cumsum(rachel24$one)
### Monica
monica24<-episode24char%>%
filter(Character=="Monica")%>%
select(-Line)
monica24$count<-cumsum(monica24$one)
### Phoebe
phoebe24<-episode24char%>%
filter(Character=="Phoebe")%>%
select(-Line)
phoebe24$count<-cumsum(phoebe24$one)
### Ross
ross24<-episode24char%>%
filter(Character=="Ross")%>%
select(-Line)
ross24$count<-cumsum(ross24$one)
### Chandler
chandler24<-episode24char%>%
filter(Character=="Chandler")%>%
select(-Line)
chandler24$count<-cumsum(chandler24$one)
### Joey
joey24<-episode24char%>%
filter(Character=="Joey")%>%
select(-Line)
joey24$count<-cumsum(joey24$one)
###### RBIND
timeline24<-rbind(rachel24, monica24, phoebe24,
ross24, chandler24, joey24)
ggplot(timeline24, aes(x=linenum, y=count, color=Character))+
geom_line()
# combine all season 1 episodes into one data frame
epset <- rbind(episode1, episode2, episode3, episode4, episode5, episode6, episode7,
episode8, episode9, episode10, episode11, episode12, episode13, episode14,
episode15, episode16, episode17, episode18, episode19, episode20, episode21,
episode22, episode23, episode24)
# How many lines of dialogue?
length(epset$Line)
## [1] 6369
# How many distinct characters?
length(levels(as.factor(epset$Character)))
## [1] 183
# Top 20 characters by number of lines
top.epset.chars <- as.data.frame(sort(table(epset$Character), decreasing=TRUE))[1:20,]
# drop bracketed stage-direction entries picked up as speakers
top.epset.chars <- top.epset.chars %>%
  filter(!(str_detect(Var1, "\\[")))
# Visualization
ggplot(data=top.epset.chars, aes(x=Var1, y=Freq)) +
geom_bar(stat="identity", fill="#56B4E9", colour="black") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Character", y="Number of lines")
# Most frequent bigrams
epset.bigrams <- frequentBigrams(epset$Line)[1:20,]
ggplot(data=epset.bigrams, aes(x=reorder(word, -freq), y=freq)) +
geom_bar(stat="identity", fill="chocolate2", colour="black") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Bigram", y="Frequency")
# Transform the text to a tidy data structure with one token per row
ftokens <- epset %>%
  mutate(line=as.character(Line)) %>%
  unnest_tokens(word, line)
# Positive and negative words - bing
ftokens %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort=TRUE) %>%
acast(word ~ sentiment, value.var="n", fill=0) %>%
comparison.cloud(colors=c("#F8766D", "#00BFC4"), max.words=100)
## Joining, by = "word"
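(comparison.cloud() sizes each word by how far its rate in one sentiment column departs from its average rate across columns, so the largest words are the most lopsidedly positive or negative rather than simply the most frequent.)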
# Sentiments and frequency associated with each word - nrc
fsentiments <- ftokens %>%
inner_join(nrc, "word") %>%
count(word, sentiment, sort=TRUE)
# Frequency of each sentiment
ggplot(data=fsentiments, aes(x=reorder(sentiment, -n, sum), y=n)) +
geom_bar(stat="identity", aes(fill=sentiment), show.legend=FALSE) +
labs(x="Sentiment", y="Frequency")+
theme(axis.text.x = element_text(angle=45, hjust=1))
# Top 10 terms for each sentiment
fsentiments %>%
group_by(sentiment) %>%
arrange(desc(n)) %>%
slice(1:10) %>%
ggplot(aes(x=reorder(word, n), y=n)) +
geom_col(aes(fill=sentiment), show.legend=FALSE) +
facet_wrap(~sentiment, scales="free_y") +
labs(y="Frequency", x="Terms") +
coord_flip()
# Sentiment analysis for the main characters
ftokens %>%
filter(Character %in% c("Monica","Rachel","Phoebe","Ross","Chandler",
"Joey")) %>%
inner_join(nrc, "word") %>%
count(Character, sentiment, sort=TRUE) %>%
ggplot(aes(x=sentiment, y=n)) +
geom_col(aes(fill=sentiment), show.legend=FALSE) +
facet_wrap(~Character, scales="free_x") +
labs(x="Sentiment", y="Frequency") +
coord_flip()
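radarchart is attached above but never used; the per-character NRC counts are a natural fit for it. A sketch, assuming chartJSRadar() may take its axis labels from the first column of scores:
# Sketch: NRC sentiment profile of the six leads as an interactive radar chart
radar.scores <- ftokens %>%
  filter(Character %in% c("Monica","Rachel","Phoebe","Ross","Chandler","Joey")) %>%
  inner_join(nrc, "word") %>%
  count(Character, sentiment) %>%
  pivot_wider(names_from=Character, values_from=n, values_fill=0)
chartJSRadar(scores=radar.scores, showToolTipLabel=TRUE)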
# Stopwords
mystopwords <- tibble(word=c(stopwords("english"),
c("thats","weve","hes","theres","ive","im",
"will","can","cant","dont","youve","us",
"youre","youll","theyre","whats","didnt")))
# Tokens without stopwords
top.chars.ftokens <- epset %>%
  mutate(line=as.character(Line)) %>%
  filter(Character %in% c("Monica","Rachel","Phoebe","Ross","Chandler",
                          "Joey")) %>%
  unnest_tokens(word, line) %>%
  anti_join(mystopwords, by="word")
# Most frequent words for each character
top.chars.ftokens %>%
count(Character, word) %>%
group_by(Character) %>%
arrange(desc(n)) %>%
slice(1:10) %>%
ungroup() %>%
mutate(word2=factor(paste(word, Character, sep="__"),
levels=rev(paste(word, Character, sep="__"))))%>%
ggplot(aes(x=word2, y=n)) +
geom_col(aes(fill=Character), show.legend=FALSE) +
facet_wrap(~Character, scales="free_y") +
labs(x="Sentiment", y="Frequency") +
scale_x_discrete(labels=function(x) gsub("__.+$", "", x)) +
coord_flip()
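A note on the next chunk: bind_tf_idf() weights each word's share of a character's lines (term frequency) by how few of the six characters use it (inverse document frequency, ln(6/n)), so words distinctive to one speaker outrank words everyone says.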
# Most relevant words for each character
top.chars.ftokens %>%
count(Character, word) %>%
bind_tf_idf(word, Character, n) %>%
group_by(Character) %>%
arrange(desc(tf_idf)) %>%
slice(1:10) %>%
ungroup() %>%
mutate(word2=factor(paste(word, Character, sep="__"),
levels=rev(paste(word, Character, sep="__"))))%>%
ggplot(aes(x=word2, y=tf_idf)) +
geom_col(aes(fill=Character), show.legend=FALSE) +
facet_wrap(~Character, scales="free_y") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(y="tf–idf", x="Sentiment") +
scale_x_discrete(labels=function(x) gsub("__.+$", "", x)) +
coord_flip()
#### Season 10, episode 1
# How many lines of dialogue?
length(s10episode1$Line)
## [1] 358
# How many distinct characters?
length(levels(as.factor(s10episode1$Character)))
## [1] 14
# Top 20 characters by number of lines
top.s10episode1.chars <- as.data.frame(sort(table(s10episode1$Character), decreasing=TRUE))[1:20,]
# drop bracketed stage-direction entries picked up as speakers
top.s10episode1.chars <- top.s10episode1.chars %>%
  filter(!(str_detect(Var1, "\\[")))
ggplot(data=top.s10episode1.chars, aes(x=Var1, y=Freq)) +
geom_bar(stat="identity", fill="#56B4E9", colour="black") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Character", y="Number of lines")
# Most frequent bigrams
s10episode1.bigrams <- frequentBigrams(s10episode1$Line)[1:20,]
ggplot(data=s10episode1.bigrams, aes(x=reorder(word, -freq), y=freq)) +
geom_bar(stat="identity", fill="chocolate2", colour="black") +
labs(x="Bigram", y="Frequency")+
theme_classic()+
theme(axis.text.x=element_text(angle=45, hjust=1))
#### CUMULATIVE LINES
s10episode1char<- s10episode1
s10episode1char$linenum<-1:length(s10episode1$Line)
s10episode1char$one<-rep(1, length(s10episode1$Line))
### Rachel
s10rachel<-s10episode1char%>%
filter(Character=="Rachel")%>%
select(-Line)
s10rachel$count<-cumsum(s10rachel$one)
### Monica
s10monica<-s10episode1char%>%
filter(Character=="Monica")%>%
select(-Line)
s10monica$count<-cumsum(s10monica$one)
### Phoebe
s10phoebe<-s10episode1char%>%
filter(Character=="Phoebe")%>%
select(-Line)
s10phoebe$count<-cumsum(s10phoebe$one)
### Ross
s10ross<-s10episode1char%>%
filter(Character=="Ross")%>%
select(-Line)
s10ross$count<-cumsum(s10ross$one)
### Chandler
s10chandler<-s10episode1char%>%
filter(Character=="Chandler")%>%
select(-Line)
s10chandler$count<-cumsum(s10chandler$one)
### Joey
s10joey<-s10episode1char%>%
filter(Character=="Joey")%>%
select(-Line)
s10joey$count<-cumsum(s10joey$one)
###### RBIND
s10timeline<-rbind(s10rachel, s10monica, s10phoebe,
s10ross, s10chandler, s10joey)
ggplot(s10timeline, aes(x=linenum, y=count, color=Character))+
geom_line()
#### Season 10, episode 17 (series finale)
# How many lines of dialogue?
length(s10episode17$Line)
## [1] 609
# How many distinct characters?
length(levels(as.factor(s10episode17$Character)))
## [1] 22
# Top 20 characters by number of lines
top.s10episode17.chars <- as.data.frame(sort(table(s10episode17$Character), decreasing=TRUE))[1:20,]
# drop bracketed stage-direction entries picked up as speakers
top.s10episode17.chars <- top.s10episode17.chars %>%
  filter(!(str_detect(Var1, "\\[")))
ggplot(data=top.s10episode17.chars, aes(x=Var1, y=Freq)) +
geom_bar(stat="identity", fill="#56B4E9", colour="black") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Character", y="Number of lines")
# Most frequent bigrams
s10episode17.bigrams <- frequentBigrams(s10episode17$Line)[1:20,]
ggplot(data=s10episode17.bigrams, aes(x=reorder(word, -freq), y=freq)) +
geom_bar(stat="identity", fill="chocolate2", colour="black") +
labs(x="Bigram", y="Frequency")+
theme_classic()+
theme(axis.text.x=element_text(angle=45, hjust=1))
#### CUMULATIVE LINES
s10episode17char<- s10episode17
s10episode17char$linenum<-1:length(s10episode17$Line)
s10episode17char$one<-rep(1, length(s10episode17$Line))
### Rachel
s10rachel17<-s10episode17char%>%
filter(Character=="Rachel")%>%
select(-Line)
s10rachel17$count<-cumsum(s10rachel17$one)
### Monica
s10monica17<-s10episode17char%>%
filter(Character=="Monica")%>%
select(-Line)
s10monica17$count<-cumsum(s10monica17$one)
### Phoebe
s10phoebe17<-s10episode17char%>%
filter(Character=="Phoebe")%>%
select(-Line)
s10phoebe17$count<-cumsum(s10phoebe17$one)
### Ross
s10ross17<-s10episode17char%>%
filter(Character=="Ross")%>%
select(-Line)
s10ross17$count<-cumsum(s10ross17$one)
### Chandler
s10chandler17<-s10episode17char%>%
filter(Character=="Chandler")%>%
select(-Line)
s10chandler17$count<-cumsum(s10chandler17$one)
### Joey
s10joey17<-s10episode17char%>%
filter(Character=="Joey")%>%
select(-Line)
s10joey17$count<-cumsum(s10joey17$one)
###### RBIND
s10timeline17<-rbind(s10rachel17, s10monica17, s10phoebe17,
s10ross17, s10chandler17, s10joey17)
ggplot(s10timeline17, aes(x=linenum, y=count, color=Character))+
geom_line()
# combine all season 10 episodes into one data frame
s10epset <- rbind(s10episode1, s10episode2, s10episode3, s10episode4, s10episode5, s10episode6,
s10episode7, s10episode8, s10episode9, s10episode10, s10episode11, s10episode12,
s10episode13, s10episode14, s10episode15, s10episode16, s10episode17)
# How many lines of dialogue?
length(s10epset$Line)
## [1] 5566
# How many distinct characters?
length(levels(as.factor(s10epset$Character)))
## [1] 139
# Top 20 characters by number of lines
top.s10epset.chars <- as.data.frame(sort(table(s10epset$Character), decreasing=TRUE))[1:20,]
# drop bracketed stage-direction entries picked up as speakers
top.s10epset.chars <- top.s10epset.chars %>%
  filter(!(str_detect(Var1, "\\[")))
# Visualization
ggplot(data=top.s10epset.chars, aes(x=Var1, y=Freq)) +
geom_bar(stat="identity", fill="#56B4E9", colour="black") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Character", y="Number of lines")
# Most frequent bigrams
s10epset.bigrams <- frequentBigrams(s10epset$Line)[1:20,]
ggplot(data=s10epset.bigrams, aes(x=reorder(word, -freq), y=freq)) +
geom_bar(stat="identity", fill="chocolate2", colour="black") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Bigram", y="Frequency")
# Transform the text to a tidy data structure with one token per row
s10ftokens <- s10epset %>%
  mutate(line=as.character(Line)) %>%
  unnest_tokens(word, line)
# Positive and negative words - bing
s10ftokens %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort=TRUE) %>%
acast(word ~ sentiment, value.var="n", fill=0) %>%
comparison.cloud(colors=c("#F8766D", "#00BFC4"), max.words=100)
## Joining, by = "word"
# Sentiments and frequency associated with each word - nrc
s10fsentiments <- s10ftokens %>%
inner_join(nrc, "word") %>%
count(word, sentiment, sort=TRUE)
# Frequency of each sentiment
ggplot(data=s10fsentiments, aes(x=reorder(sentiment, -n, sum), y=n)) +
geom_bar(stat="identity", aes(fill=sentiment), show.legend=FALSE) +
labs(x="Sentiment", y="Frequency")+
theme(axis.text.x = element_text(angle=45, hjust=1))
# Top 10 terms for each sentiment
s10fsentiments %>%
group_by(sentiment) %>%
arrange(desc(n)) %>%
slice(1:10) %>%
ggplot(aes(x=reorder(word, n), y=n)) +
geom_col(aes(fill=sentiment), show.legend=FALSE) +
facet_wrap(~sentiment, scales="free_y") +
labs(y="Frequency", x="Terms") +
coord_flip()
# Sentiment analysis for the main characters
s10ftokens %>%
filter(Character %in% c("Monica","Rachel","Phoebe","Ross","Chandler",
"Joey")) %>%
inner_join(nrc, "word") %>%
count(Character, sentiment, sort=TRUE) %>%
ggplot(aes(x=sentiment, y=n)) +
geom_col(aes(fill=sentiment), show.legend=FALSE) +
facet_wrap(~Character, scales="free_x") +
labs(x="Sentiment", y="Frequency") +
coord_flip()
# Stopwords
mystopwords <- tibble(word=c(stopwords("english"),
c("thats","weve","hes","theres","ive","im",
"will","can","cant","dont","youve","us",
"youre","youll","theyre","whats","didnt")))
# Tokens without stopwords
s10top.chars.ftokens <- s10epset %>%
  mutate(line=as.character(Line)) %>%
  filter(Character %in% c("Monica","Rachel","Phoebe","Ross","Chandler",
                          "Joey")) %>%
  unnest_tokens(word, line) %>%
  anti_join(mystopwords, by="word")
# Most frequent words for each character
s10top.chars.ftokens %>%
count(Character, word) %>%
group_by(Character) %>%
arrange(desc(n)) %>%
slice(1:10) %>%
ungroup() %>%
mutate(word2=factor(paste(word, Character, sep="__"),
levels=rev(paste(word, Character, sep="__"))))%>%
ggplot(aes(x=word2, y=n)) +
geom_col(aes(fill=Character), show.legend=FALSE) +
facet_wrap(~Character, scales="free_y") +
labs(x="Sentiment", y="Frequency") +
scale_x_discrete(labels=function(x) gsub("__.+$", "", x)) +
coord_flip()
# Most relevant words for each character
s10top.chars.ftokens %>%
count(Character, word) %>%
bind_tf_idf(word, Character, n) %>%
group_by(Character) %>%
arrange(desc(tf_idf)) %>%
slice(1:10) %>%
ungroup() %>%
mutate(word2=factor(paste(word, Character, sep="__"),
levels=rev(paste(word, Character, sep="__"))))%>%
ggplot(aes(x=word2, y=tf_idf)) +
geom_col(aes(fill=Character), show.legend=FALSE) +
facet_wrap(~Character, scales="free_y") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(y="tf–idf", x="Sentiment") +
scale_x_discrete(labels=function(x) gsub("__.+$", "", x)) +
coord_flip()
#season 1
girls<- epset%>%
filter(Character %in% c("Monica","Rachel","Phoebe"))
#how many lines for lead girls in s1?
length(girls$Line)
## [1] 2359
boys<- epset%>%
filter(Character %in% c("Joey", "Chandler", "Ross"))
#how many lines for lead boys in s1?
length(boys$Line)
## [1] 2410
#girls most frequent bigrams
girls.bigrams <- frequentBigrams(girls$Line)[1:20,]
ggplot(data=girls.bigrams, aes(x=reorder(word, -freq), y=freq)) +
geom_bar(stat="identity", fill="chocolate2", colour="black") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Bigram", y="Frequency")
#boys most freq bigrams
boys.bigrams <- frequentBigrams(boys$Line)[1:20,]
ggplot(data=boys.bigrams, aes(x=reorder(word, -freq), y=freq)) +
geom_bar(stat="identity", fill="chocolate2", colour="black") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Bigram", y="Frequency")
#girls positive/negative word cloud
girlstokens <- girls %>%
  mutate(line=as.character(Line)) %>%
  unnest_tokens(word, line)
girlstokens %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort=TRUE) %>%
acast(word ~ sentiment, value.var="n", fill=0) %>%
comparison.cloud(colors=c("#F8766D", "#00BFC4"), max.words=100)
## Joining, by = "word"
# boys positive/negative word cloud
boystokens <- boys %>%
  mutate(line=as.character(Line)) %>%
  unnest_tokens(word, line)
boystokens %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort=TRUE) %>%
acast(word ~ sentiment, value.var="n", fill=0) %>%
comparison.cloud(colors=c("#F8766D", "#00BFC4"), max.words=100)
## Joining, by = "word"
# girls sentiments frequency
girlssentiments <- girlstokens %>%
inner_join(nrc, "word") %>%
count(word, sentiment, sort=TRUE)
ggplot(data=girlssentiments, aes(x=reorder(sentiment, -n, sum), y=n)) +
geom_bar(stat="identity", aes(fill=sentiment), show.legend=FALSE) +
labs(x="Sentiment", y="Frequency")+
theme(axis.text.x = element_text(angle=45, hjust=1))
#boys sentiments frequency
boyssentiments <- boystokens %>%
inner_join(nrc, "word") %>%
count(word, sentiment, sort=TRUE)
ggplot(data=boyssentiments, aes(x=reorder(sentiment, -n, sum), y=n)) +
geom_bar(stat="identity", aes(fill=sentiment), show.legend=FALSE) +
labs(x="Sentiment", y="Frequency")+
theme(axis.text.x = element_text(angle=45, hjust=1))
#girls top words associated w each sentiment
girlssentiments %>%
group_by(sentiment) %>%
arrange(desc(n)) %>%
slice(1:10) %>%
ggplot(aes(x=reorder(word, n), y=n)) +
geom_col(aes(fill=sentiment), show.legend=FALSE) +
facet_wrap(~sentiment, scales="free_y") +
labs(y="Frequency", x="Terms") +
coord_flip()
#boys top words associated w each sentiment
boyssentiments %>%
group_by(sentiment) %>%
arrange(desc(n)) %>%
slice(1:10) %>%
ggplot(aes(x=reorder(word, n), y=n)) +
geom_col(aes(fill=sentiment), show.legend=FALSE) +
facet_wrap(~sentiment, scales="free_y") +
labs(y="Frequency", x="Terms") +
coord_flip()
#season 10
s10girls<- s10epset%>%
filter(Character %in% c("Monica","Rachel","Phoebe"))
# how many lines for lead girls in s10?
length(s10girls$Line)
## [1] 2157
s10boys<- s10epset%>%
filter(Character %in% c("Joey", "Chandler", "Ross"))
# how many lines for lead boys in s10?
length(s10boys$Line)
## [1] 2295
#girls most frequent bigrams
s10girls.bigrams <- frequentBigrams(s10girls$Line)[1:20,]
ggplot(data=s10girls.bigrams, aes(x=reorder(word, -freq), y=freq)) +
geom_bar(stat="identity", fill="chocolate2", colour="black") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Bigram", y="Frequency")
#boys most freq bigrams
s10boys.bigrams <- frequentBigrams(s10boys$Line)[1:20,]
ggplot(data=s10boys.bigrams, aes(x=reorder(word, -freq), y=freq)) +
geom_bar(stat="identity", fill="chocolate2", colour="black") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Bigram", y="Frequency")
#girls positive/negative word cloud
s10girlstokens <- s10girls %>%
  mutate(line=as.character(Line)) %>%
  unnest_tokens(word, line)
s10girlstokens %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort=TRUE) %>%
acast(word ~ sentiment, value.var="n", fill=0) %>%
comparison.cloud(colors=c("#F8766D", "#00BFC4"), max.words=100)
## Joining, by = "word"
# boys positive/negative word cloud
s10boystokens <- s10boys %>%
  mutate(line=as.character(Line)) %>%
  unnest_tokens(word, line)
s10boystokens %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort=TRUE) %>%
acast(word ~ sentiment, value.var="n", fill=0) %>%
comparison.cloud(colors=c("#F8766D", "#00BFC4"), max.words=100)
## Joining, by = "word"
# girls sentiments frequency
s10girlssentiments <- s10girlstokens %>%
inner_join(nrc, "word") %>%
count(word, sentiment, sort=TRUE)
ggplot(data=s10girlssentiments, aes(x=reorder(sentiment, -n, sum), y=n)) +
geom_bar(stat="identity", aes(fill=sentiment), show.legend=FALSE) +
labs(x="Sentiment", y="Frequency")+
theme(axis.text.x = element_text(angle=45, hjust=1))
#boys sentiments frequency
s10boyssentiments <- s10boystokens %>%
inner_join(nrc, "word") %>%
count(word, sentiment, sort=TRUE)
ggplot(data=s10boyssentiments, aes(x=reorder(sentiment, -n, sum), y=n)) +
geom_bar(stat="identity", aes(fill=sentiment), show.legend=FALSE) +
labs(x="Sentiment", y="Frequency")+
theme(axis.text.x = element_text(angle=45, hjust=1))
#girls top words associated w each sentiment
s10girlssentiments %>%
group_by(sentiment) %>%
arrange(desc(n)) %>%
slice(1:10) %>%
ggplot(aes(x=reorder(word, n), y=n)) +
geom_col(aes(fill=sentiment), show.legend=FALSE) +
facet_wrap(~sentiment, scales="free_y") +
labs(y="Frequency", x="Terms") +
coord_flip()
#boys top words associated w each sentiment
s10boyssentiments %>%
group_by(sentiment) %>%
arrange(desc(n)) %>%
slice(1:10) %>%
ggplot(aes(x=reorder(word, n), y=n)) +
geom_col(aes(fill=sentiment), show.legend=FALSE) +
facet_wrap(~sentiment, scales="free_y") +
labs(y="Frequency", x="Terms") +
coord_flip()
#season 1 bigrams for masculine nouns
mascbigrams <- epset %>%
  unnest_tokens(bigram, Line, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  # unnest_tokens() lowercases by default, so lowercase forms are sufficient
  filter(word2 %in% c("boy","boys","man","men","guy","guys")) %>%
  unite(bigram, word1, word2, sep = " ") %>%
  count(bigram, sort = TRUE)
head(mascbigrams)
## bigram n
## 1 naked guy 7
## 2 nice guy 6
## 3 wine guy 5
## 4 coma guy 3
## 5 hey guys 3
## 6 kinda guy 3
#season 1 bigrams for feminine nouns
fembigrams <- epset %>%
  unnest_tokens(bigram, Line, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  filter(word2 %in% c("girl","girls","woman","women","lady","ladies")) %>%
  unite(bigram, word1, word2, sep = " ") %>%
  count(bigram, sort = TRUE)
head(fembigrams)
## bigram n
## 1 beautiful woman 5
## 2 bug lady 2
## 3 likes women 2
## 4 weird girl 2
## 5 apartment girl 1
## 6 beautiful women 1
# season 10 bigrams for masculine nouns
s10mascbigrams <- s10epset %>%
  unnest_tokens(bigram, Line, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  filter(word2 %in% c("boy","boys","man","men","guy","guys")) %>%
  unite(bigram, word1, word2, sep = " ") %>%
  count(bigram, sort = TRUE)
head(s10mascbigrams)
## bigram n
## 1 hey guys 10
## 2 cool guy 2
## 3 aged guy 1
## 4 agency guy 1
## 5 amazing guy 1
## 6 bad boy 1
#season 10 bigrams for feminine nouns
s10fembigrams <- s10epset %>%
  unnest_tokens(bigram, Line, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  filter(word2 %in% c("girl","girls","woman","women","lady","ladies")) %>%
  unite(bigram, word1, word2, sep = " ") %>%
  count(bigram, sort = TRUE)
head(s10fembigrams)
## bigram n
## 1 adoption lady 3
## 2 beautiful woman 2
## 3 white ladies 2
## 4 birthday girl 1
## 5 blonde woman 1
## 6 body girl 1
#VISUALIZE
# season 1
topmascbigrams <- mascbigrams[1:20,]
ggplot(data=topmascbigrams, aes(x=reorder(bigram,n), y=n)) +
geom_bar(stat="identity", fill="chocolate2", colour="black") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Bigram", y="Frequency")+
coord_flip()
topfembigrams <- fembigrams[1:20,]
ggplot(data=topfembigrams, aes(x=reorder(bigram,n), y=n)) +
geom_bar(stat="identity", fill="chocolate2", colour="black") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Bigram", y="Frequency")+
coord_flip()
# season 10
s10topmascbigrams <- s10mascbigrams[1:20,]
ggplot(data=s10topmascbigrams, aes(x=reorder(bigram,n), y=n)) +
geom_bar(stat="identity", fill="chocolate2", colour="black") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Bigram", y="Frequency")+
coord_flip()
s10topfembigrams <- s10fembigrams[1:20,]%>%
filter(!is.na(bigram))
ggplot(data=s10topfembigrams, aes(x=reorder(bigram,n), y=n)) +
geom_bar(stat="identity", fill="chocolate2", colour="black") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Bigram", y="Frequency")+
coord_flip()
sgdata <- tibble(
  Gender = c("Gals", "Guys", "Gals", "Guys"),
  Season = c("Season 1", "Season 1", "Season 10", "Season 10"),
  LineCount = c(length(girls$Line), length(boys$Line),
                length(s10girls$Line), length(s10boys$Line))
)
ggplot(sgdata) +
aes(x = as.factor(Season), y = LineCount,
group = Gender, color=Gender) +
geom_line(size=2)+
geom_label(aes(label=LineCount), size=5)+
theme(axis.line = element_blank(),
axis.ticks = element_blank(),
axis.text.y = element_blank(),
axis.text.x = element_text(size=10, face='bold'),
plot.background = element_blank(),
panel.background = element_blank(),
legend.position = 'none',
plot.title = element_text(hjust = 0.5),
plot.caption = element_text(hjust=0.35, vjust=50, face='italic')) +
scale_color_manual(values=c("indianred","steelblue3"))+
ylab("")+
xlab("")+
labs(title="How many lines do the guys and gals have?",
     caption="Our data shows that the female leads \n have fewer lines than their male counterparts, \n with the gap widening from season 1 to season 10.")