https://www.kaggle.com/xvivancos/analyzing-star-wars-movie-scripts
# Copied and pasted from link above
#install.packages("RWeka") # install RWeka package
#install.packages("tm")
#install.packages("wordcloud")
#install.packages("wordcloud2")
#install.packages("tidytext")
#install.packages("reshape2")
#install.packages("radarchart")
library(tidyverse) # data manipulation
library(tm) # text mining
library(wordcloud) # word cloud generator
library(wordcloud2) # word cloud generator
library(tidytext) # text mining for word processing and sentiment analysis
library(reshape2) # reshapes a data frame
library(radarchart) # drawing the radar chart from a data frame
library(RWeka) # data mining tasks
“RWeka” is the R interface to Java’s Weka, a collection of machine learning algorithms for data mining tasks including data pre-processing, classification, regression, clustering, association rules, and visualization. (Source: CRAN)
“tm” is a widely used text mining framework for R.
“wordcloud” and “wordcloud2” both generate word cloud visuals in R, something that is otherwise fairly tricky to do. Similarly, “radarchart” produces one specific kind of visual, the radar/spider chart. I am not sure how necessary that package is; based on my research, ggplot2 can also produce radar-style plots. TBD.
“tidytext” contains functions and supporting data sets to convert text to and from tidy formats, and to switch seamlessly between tidy tools and existing text mining packages. (Source: CRAN)
“reshape2” flexibly restructures and aggregates data using just two functions: melt() and dcast() (or acast() for matrices/arrays). (Source: CRAN)
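To make the melt/dcast idea concrete, here is a minimal sketch on made-up data (not from the scripts): melt() reshapes wide to long, and dcast() pivots back.
# Toy data, illustrative only
wide <- data.frame(character=c("LUKE", "HAN"), ep4=c(10, 8), ep5=c(12, 9))
long <- melt(wide, id.vars="character", variable.name="episode", value.name="lines")
long # one row per character/episode pair
dcast(long, character ~ episode, value.var="lines") # back to the wide layout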
# script data
ep4 <- read.table("/Users/nikohellman/Desktop/star wars scripts/SW_EpisodeIV.txt")
ep5 <- read.table("/Users/nikohellman/Desktop/star wars scripts/SW_EpisodeV.txt")
ep6 <- read.table("/Users/nikohellman/Desktop/star wars scripts/SW_EpisodeVI.txt")
str(ep4)
## 'data.frame': 1010 obs. of 2 variables:
## $ character: chr "THREEPIO" "THREEPIO" "THREEPIO" "THREEPIO" ...
## $ dialogue : chr "Did you hear that? They've shut down the main reactor. We'll be destroyed for sure. This is madness!" "We're doomed!" "There'll be no escape for the Princess this time." "What's that?" ...
str(ep5)
## 'data.frame': 839 obs. of 2 variables:
## $ character: chr "LUKE" "HAN" "LUKE" "HAN" ...
## $ dialogue : chr "Echo Three to Echo Seven. Han, old buddy, do you read me?" "Loud and clear, kid. What's up?" "Well, I finished my circle. I don't pick up any life readings." "There isn't enough life on this ice cube to fill a space cruiser. The sensors are placed. I'm going back." ...
str(ep6)
## 'data.frame': 674 obs. of 2 variables:
## $ character: chr "SHUTTLE CAPTAIN" "DEATH STAR CONTROLLER" "SHUTTLE CAPTAIN" "OFFICER" ...
## $ dialogue : chr "Command station, this is ST 321. Code Clearance Blue. We're starting our approach. Deactivate the security shield." "The security deflector shield will be deactivated when we have confirmation of your code transmission. Stand by"| __truncated__ "We're starting our approach." "Inform the commander that Lord Vader's shuttle has arrived." ...
# lexicon data
bing <- read_csv("/Users/nikohellman/Desktop/sentiment analysis sets/Bing.csv")
## Rows: 6786 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): word, sentiment
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nrc <- read_csv("/Users/nikohellman/Desktop/sentiment analysis sets/NRC.csv")
## Rows: 13901 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): word, sentiment
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
afinn <- read_csv("/Users/nikohellman/Desktop/sentiment analysis sets/Afinn.csv")
## Rows: 2477 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): word
## dbl (1): value
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Text transformations
cleanCorpus <- function(corpus){
  corpus.tmp <- tm_map(corpus, removePunctuation)
  corpus.tmp <- tm_map(corpus.tmp, stripWhitespace)
  corpus.tmp <- tm_map(corpus.tmp, content_transformer(tolower))
  v_stopwords <- c(stopwords("english"), c("thats","weve","hes","theres","ive","im",
                                           "will","can","cant","dont","youve","us",
                                           "youre","youll","theyre","whats","didnt"))
  corpus.tmp <- tm_map(corpus.tmp, removeWords, v_stopwords)
  corpus.tmp <- tm_map(corpus.tmp, removeNumbers)
  return(corpus.tmp)
}
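# A rough sanity check of cleanCorpus() on a made-up line (illustrative only):
# punctuation and numbers are stripped, text is lower-cased, and standard English
# stopwords plus the custom ones above ("dont", "im", ...) are removed.
test.corpus <- VCorpus(VectorSource("I don't like sand! It's coarse and irritating."))
content(cleanCorpus(test.corpus)[[1]])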
# Most frequent terms
frequentTerms <- function(text){
  s.cor <- Corpus(VectorSource(text))
  s.cor.cl <- cleanCorpus(s.cor)
  s.tdm <- TermDocumentMatrix(s.cor.cl)
  s.tdm <- removeSparseTerms(s.tdm, 0.999)
  m <- as.matrix(s.tdm)
  word_freqs <- sort(rowSums(m), decreasing=TRUE)
  dm <- data.frame(word=names(word_freqs), freq=word_freqs)
  return(dm)
}
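# Rough usage sketch on toy input (not the real scripts): after cleaning,
# "force" should rank first because it appears in both lines. tm may emit
# harmless warnings about transformations on a SimpleCorpus here.
frequentTerms(c("The Force is strong", "Use the Force, Luke"))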
# Define bigram tokenizer
tokenizer <- function(x){
  NGramTokenizer(x, Weka_control(min=2, max=2))
}
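# Quick illustration on a made-up string: with min=2 and max=2 the tokenizer
# should return every consecutive word pair, e.g.
# "may the" "the force" "force be" "be with" "with you"
tokenizer("may the force be with you")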
# Most frequent bigrams
frequentBigrams <- function(text){
  s.cor <- VCorpus(VectorSource(text))
  s.cor.cl <- cleanCorpus(s.cor)
  s.tdm <- TermDocumentMatrix(s.cor.cl, control=list(tokenize=tokenizer))
  s.tdm <- removeSparseTerms(s.tdm, 0.999)
  m <- as.matrix(s.tdm)
  word_freqs <- sort(rowSums(m), decreasing=TRUE)
  dm <- data.frame(word=names(word_freqs), freq=word_freqs)
  return(dm)
}
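# Same idea for bigrams, on toy input (illustrative only). Note that cleaning runs
# before tokenization, so stopwords never appear inside the bigrams; "death star"
# should rank first because it occurs in both lines.
frequentBigrams(c("The Death Star plans are not in the main computer",
                  "The Death Star will be completed on schedule"))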
# How many dialogues?
length(ep4$dialogue)
## [1] 1010
# How many characters?
length(levels(as.factor(ep4$character)))
## [1] 60
# Top 20 characters with the most dialogues
top.ep4.chars <- as.data.frame(sort(table(ep4$character), decreasing=TRUE))[1:20,]
ggplot(data=top.ep4.chars, aes(x=Var1, y=Freq)) +
geom_bar(stat="identity", fill="#56B4E9", colour="black") +
theme_bw() +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Character", y="Number of dialogues")
# Wordcloud for Episode IV -- the original author couldn't render this in Kaggle, so he did it separately.
# We will work on this later by figuring out how to use the wordcloud2 package.
# Most frequent bigrams
ep4.bigrams <- frequentBigrams(ep4$dialogue)[1:20,]
ggplot(data=ep4.bigrams, aes(x=reorder(word, -freq), y=freq)) +
geom_bar(stat="identity", fill="chocolate2", colour="black") +
theme_bw() +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Bigram", y="Frequency")
# How many dialogues?
length(ep5$dialogue)
## [1] 839
# How many characters?
length(levels(as.factor(ep5$character)))
## [1] 49
# Top 20 characters with the most dialogues
top.ep5.chars <- as.data.frame(sort(table(ep5$character), decreasing=TRUE))[1:20,]
# Visualization
ggplot(data=top.ep5.chars, aes(x=Var1, y=Freq)) +
geom_bar(stat="identity", fill="#56B4E9", colour="black") +
theme_bw() +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Character", y="Number of dialogues")
# Wordcloud for Episode V -- same problem as previous
# Most frequent bigrams
ep5.bigrams <- frequentBigrams(ep5$dialogue)[1:20,]
ggplot(data=ep5.bigrams, aes(x=reorder(word, -freq), y=freq)) +
geom_bar(stat="identity", fill="chocolate2", colour="black") +
theme_bw() +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Bigram", y="Frequency")
# How many dialogues?
length(ep6$dialogue)
## [1] 674
# How many characters?
length(levels(as.factor(ep6$character)))
## [1] 53
# Top 20 characters with the most dialogues
top.ep6.chars <- as.data.frame(sort(table(ep6$character), decreasing=TRUE))[1:20,]
# Visualization
ggplot(data=top.ep6.chars, aes(x=Var1, y=Freq)) +
geom_bar(stat="identity", fill="#56B4E9", colour="black") +
theme_bw() +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Character", y="Number of dialogues")
# Wordcloud for Episode VI -- same problem
# Most frequent bigrams
ep6.bigrams <- frequentBigrams(ep6$dialogue)[1:20,]
ggplot(data=ep6.bigrams, aes(x=reorder(word, -freq), y=freq)) +
geom_bar(stat="identity", fill="chocolate2", colour="black") +
theme_bw() +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Bigram", y="Frequency")
# The Original Trilogy dialogues
trilogy <- rbind(ep4, ep5, ep6)
# How many dialogues?
length(trilogy$dialogue)
## [1] 2523
# How many characters?
length(levels(as.factor(trilogy$character)))
## [1] 129
# Top 20 characters with the most dialogues
top.trilogy.chars <- as.data.frame(sort(table(trilogy$character), decreasing=TRUE))[1:20,]
# Visualization
ggplot(data=top.trilogy.chars, aes(x=Var1, y=Freq)) +
geom_bar(stat="identity", fill="#56B4E9", colour="black") +
theme_bw() +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Character", y="Number of dialogues")
# Wordcloud for The Original Trilogy -- same problem
# Most frequent bigrams
trilogy.bigrams <- frequentBigrams(trilogy$dialogue)[1:20,]
ggplot(data=trilogy.bigrams, aes(x=reorder(word, -freq), y=freq)) +
geom_bar(stat="identity", fill="chocolate2", colour="black") +
theme_bw() +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x="Bigram", y="Frequency")
# Transform the text to a tidy data structure with one token per row
tokens <- trilogy %>%
mutate(dialogue=as.character(dialogue)) %>%
unnest_tokens(word, dialogue)
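# What unnest_tokens() does, on a made-up one-row example (illustrative only):
# each dialogue line is split into lower-cased single-word tokens, one per row,
# with the character column carried along.
tibble(character="YODA", dialogue="Do or do not.") %>%
unnest_tokens(word, dialogue)
# expected rows: "do", "or", "do", "not", each paired with character = "YODA"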
# Positive and negative words - bing
tokens %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort=TRUE) %>%
acast(word ~ sentiment, value.var="n", fill=0) %>%
comparison.cloud(colors=c("#F8766D", "#00BFC4"), max.words=100)
## Joining, by = "word"
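# The acast() step above pivots the long word/sentiment counts into the
# word-by-sentiment matrix that comparison.cloud() expects. Toy illustration:
acast(data.frame(word=c("good", "bad"), sentiment=c("positive", "negative"), n=c(3, 2)),
      word ~ sentiment, value.var="n", fill=0)
# expected: a 2 x 2 matrix with rows "bad"/"good" and columns "negative"/"positive"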
# Sentiments and frequency associated with each word - nrc
sentiments <- tokens %>%
inner_join(nrc, "word") %>%
count(word, sentiment, sort=TRUE)
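# Note: a single word can carry several NRC categories (e.g. both "negative" and
# "fear"), so one token may contribute to more than one bar in the plots below.
# Quick check of how many lexicon words have multiple tags (illustrative only):
nrc %>% count(word) %>% filter(n > 1) %>% nrow()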
# Frequency of each sentiment
ggplot(data=sentiments, aes(x=reorder(sentiment, -n, sum), y=n)) +
geom_bar(stat="identity", aes(fill=sentiment), show.legend=FALSE) +
labs(x="Sentiment", y="Frequency") +
theme_bw()
# Top 10 terms for each sentiment
sentiments %>%
group_by(sentiment) %>%
arrange(desc(n)) %>%
slice(1:10) %>%
ggplot(aes(x=reorder(word, n), y=n)) +
geom_col(aes(fill=sentiment), show.legend=FALSE) +
facet_wrap(~sentiment, scales="free_y") +
labs(y="Frequency", x="Terms") +
coord_flip() +
theme_bw()
# Sentiment analysis for the top 10 characters with the most dialogues
tokens %>%
filter(character %in% c("LUKE","HAN","THREEPIO","LEIA","VADER",
"BEN","LANDO","YODA","EMPEROR","RED LEADER")) %>%
inner_join(nrc, "word") %>%
count(character, sentiment, sort=TRUE) %>%
ggplot(aes(x=sentiment, y=n)) +
geom_col(aes(fill=sentiment), show.legend=FALSE) +
facet_wrap(~character, scales="free_x") +
labs(x="Sentiment", y="Frequency") +
coord_flip() +
theme_bw()
# Stopwords
mystopwords <- tibble(word=c(stopwords("english"),
c("thats","weve","hes","theres","ive","im",
"will","can","cant","dont","youve","us",
"youre","youll","theyre","whats","didnt")))
# Tokens without stopwords
top.chars.tokens <- trilogy %>%
mutate(dialogue=as.character(dialogue)) %>%
filter(character %in% c("LUKE","HAN","THREEPIO","LEIA","VADER",
"BEN","LANDO","YODA","EMPEROR","RED LEADER")) %>%
unnest_tokens(word, dialogue) %>%
anti_join(mystopwords, by="word")
# Most frequent words for each character
top.chars.tokens %>%
count(character, word) %>%
group_by(character) %>%
arrange(desc(n)) %>%
slice(1:10) %>%
ungroup() %>%
mutate(word2=factor(paste(word, character, sep="__"),
levels=rev(paste(word, character, sep="__"))))%>%
ggplot(aes(x=word2, y=n)) +
geom_col(aes(fill=character), show.legend=FALSE) +
facet_wrap(~character, scales="free_y") +
labs(x="Sentiment", y="Frequency") +
scale_x_discrete(labels=function(x) gsub("__.+$", "", x)) +
coord_flip() +
theme_bw()
# Note: the facets appear in a different order than in the Kaggle notebook, but the information is the same.
# Most relevant words for each character
top.chars.tokens %>%
count(character, word) %>%
bind_tf_idf(word, character, n) %>%
group_by(character) %>%
arrange(desc(tf_idf)) %>%
slice(1:10) %>%
ungroup() %>%
mutate(word2=factor(paste(word, character, sep="__"),
levels=rev(paste(word, character, sep="__"))))%>%
ggplot(aes(x=word2, y=tf_idf)) +
geom_col(aes(fill=character), show.legend=FALSE) +
facet_wrap(~character, scales="free_y") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(y="tf–idf", x="Sentiment") +
scale_x_discrete(labels=function(x) gsub("__.+$", "", x)) +
coord_flip() +
theme_bw()
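# What bind_tf_idf() computes above, in brief: tf is a word's share of a character's
# tokens, idf is log(number of characters / number of characters using the word),
# and tf-idf = tf * idf, so words every character uses get idf = 0 and drop out,
# while character-specific vocabulary rises to the top. A tiny hand-check (toy data):
toy <- data.frame(character=c("A", "A", "B"),
                  word=c("force", "blaster", "force"),
                  n=c(2, 1, 3))
bind_tf_idf(toy, word, character, n)
# "force" is used by both characters, so idf = log(2/2) = 0 and tf_idf = 0;
# "blaster" is unique to A, so idf = log(2/1) and tf_idf = (1/3) * log(2)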