# Temporarily silence warnings while loading packages, then restore the default
defaultW <- getOption("warn")
options(warn = -1)
library(tidyverse)     # includes readr, ggplot2, dplyr, and stringr
library(tm)
library(wordcloud)
library(wordcloud2)
library(tidytext)
library(grid)
library(radarchart)
library(textmineR)
library(RWeka)         # provides NGramTokenizer()/Weka_control() used in the n-gram tokenizers below
library(RColorBrewer)
library(reshape2)
options(warn = defaultW)
Word Cloud - Most Frequent Words
# Custom color palettes
my_colors <- c("#05A4C0", "#85CEDA", "#D2A7D8", "#A67BC5", "#BB1C8B", "#8D266E")
cols <- brewer.pal(n = 3, name = "Set2")  # shadows readr::cols(), but the cols() calls below still resolve to the function
# Load the data frame
data <- read_csv("cleaned_tweets_sentiments.csv",
                 col_types = cols(Tweet = col_character(), Sentiment = col_character()))
# Create a corpus from the tweet text
tweet_words <- data$Tweet
word <- Corpus(VectorSource(tweet_words))
word <- word %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace)
## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops documents
## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents
word <- tm_map(word, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(word, content_transformer(tolower)):
## transformation drops documents
word <- tm_map(word, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(word, removeWords, stopwords("english")):
## transformation drops documents
dtm <- TermDocumentMatrix(word)
matrix <- as.matrix(dtm)
words <- sort(rowSums(matrix), decreasing = TRUE)
# Create a data frame with the frequency of each word
df_word <- data.frame(word = names(words), freq = words)
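# Optional sanity check: the ten most frequent terms (output depends on the data)
head(df_word, 10)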
# Plot word cloud
set.seed(888)
wordcloud2(data = df_word, size = 1.6, color = "random-light",
           backgroundColor = "black", shape = "diamond", fontFamily = "HersheySymbol")
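# wordcloud2 renders an interactive HTML widget, so it may not display in static
# (e.g. PDF) output; a possible static fallback with the already-loaded wordcloud
# package (parameter values here are illustrative):
set.seed(888)
wordcloud(words = df_word$word, freq = df_word$freq,
          min.freq = 5, max.words = 200, random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))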
Comparison Cloud - Positive vs Negative Sentiments
# Create a function to clean a corpus
cleanCorpus <- function(Tweet){
  # Remove punctuation and numbers, collapse whitespace, convert to lowercase
  text.tmp <- tm_map(Tweet, removePunctuation)
  text.tmp <- tm_map(text.tmp, stripWhitespace)
  text.tmp <- tm_map(text.tmp, content_transformer(tolower))
  text.tmp <- tm_map(text.tmp, removeNumbers)
  # Remove English stopwords plus contractions left over once apostrophes are stripped
  stopwords_remove <- c(stopwords("en"),
                        "thats", "weve", "hes", "theres", "ive", "im",
                        "will", "can", "cant", "dont", "youve", "us",
                        "youre", "youll", "theyre", "whats", "didnt")
  text.tmp <- tm_map(text.tmp, removeWords, stopwords_remove)
  return(text.tmp)
}
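# Illustrative example of cleanCorpus on a tiny hand-made corpus: punctuation,
# numbers, case, and stopwords are stripped from the document text
sample_corpus <- VCorpus(VectorSource("Flying at 30,000 feet is GREAT, isn't it?!"))
content(cleanCorpus(sample_corpus)[[1]])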
# --- UNIGRAM ---
# Unigram frequency table
frequentTerms <- function(Tweet){
  s.cor <- VCorpus(VectorSource(Tweet))
  s.cor.cl <- cleanCorpus(s.cor)
  s.tdm <- TermDocumentMatrix(s.cor.cl)
  s.tdm <- removeSparseTerms(s.tdm, 0.999)
  m <- as.matrix(s.tdm)
  word_freqs <- sort(rowSums(m), decreasing = TRUE)
  dm <- data.frame(word = names(word_freqs), freq = word_freqs)
  return(dm)
}
# --- BIGRAM ---
# Bigram tokenizer (requires RWeka)
tokenizer_2 <- function(x){
  NGramTokenizer(x, Weka_control(min = 2, max = 2))
}
# Bigram frequency table
frequentBigrams <- function(Tweet){
  s.cor <- VCorpus(VectorSource(Tweet))
  s.cor.cl <- cleanCorpus(s.cor)
  s.tdm <- TermDocumentMatrix(s.cor.cl, control = list(tokenize = tokenizer_2))
  s.tdm <- removeSparseTerms(s.tdm, 0.999)
  m <- as.matrix(s.tdm)
  word_freqs <- sort(rowSums(m), decreasing = TRUE)
  dm <- data.frame(word = names(word_freqs), freq = word_freqs)
  return(dm)
}
# --- TRIGRAM ---
# Trigram tokenizer (requires RWeka)
tokenizer_3 <- function(x){
  NGramTokenizer(x, Weka_control(min = 3, max = 3))
}
# Trigram frequency table
frequentTrigrams <- function(Tweet){
  s.cor <- VCorpus(VectorSource(Tweet))
  s.cor.cl <- cleanCorpus(s.cor)
  s.tdm <- TermDocumentMatrix(s.cor.cl, control = list(tokenize = tokenizer_3))
  s.tdm <- removeSparseTerms(s.tdm, 0.999)
  m <- as.matrix(s.tdm)
  word_freqs <- sort(rowSums(m), decreasing = TRUE)
  dm <- data.frame(word = names(word_freqs), freq = word_freqs)
  return(dm)
}
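# Hypothetical usage of the n-gram helpers (the "positive" label value is an
# assumption about the Sentiment column):
top_bigrams <- frequentBigrams(data$Tweet[data$Sentiment == "positive"])
head(top_bigrams, 10)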
# Load the three sentiment lexicons
afinn <- read_csv("Afinn.csv",
                  col_types = cols(word = col_character(), value = col_double()))
bing <- read_csv("Bing.csv",
                 col_types = cols(word = col_character(), sentiment = col_character()))
nrc <- read_csv("NRC.csv",
                col_types = cols(word = col_character(), sentiment = col_character()))
# Tokenize tweets into one word per row
unnest_tweets <- data %>%
  mutate(text = as.character(Tweet)) %>%
  unnest_tokens(word, text)
# Join with the Bing lexicon and cast to a word-by-sentiment count matrix
unnest_tweets <- unnest_tweets %>%
  inner_join(bing, by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0)
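# The cast result is a numeric matrix with one row per word and one column per
# sentiment, the input shape comparison.cloud() expects
head(unnest_tweets)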
# Plot comparison cloud
comparison.cloud(unnest_tweets, colors = my_colors[c(5, 1)],
                 max.words = 500, title.size = 2, scale = c(5, 0.5))
