# Temporarily silence start-up warnings while loading packages
defaultW <- getOption("warn")
options(warn = -1)
library(readr)
library(ggplot2)
library(tidyverse)
library(tm)
library(wordcloud)
library(wordcloud2)
library(tidytext)
library(grid)
library(radarchart)
library(stringr)
library(textmineR)
library(dplyr)
library(RColorBrewer)
library(reshape2)
library(RWeka)        # needed for NGramTokenizer() and Weka_control() below
options(warn = defaultW)

WordCloud - Most Frequent Words

# Custom color palettes
my_colors <- c("#05A4C0", "#85CEDA", "#D2A7D8", "#A67BC5", "#BB1C8B", "#8D266E")
cols <- brewer.pal(n = 3, name = 'Set2')  # note: masks readr::cols() as an object; the cols() call below still resolves to the readr function

# Load the dataframe
data <- read_csv("cleaned_tweets_sentiments.csv", 
                 col_types = cols('Tweet' = col_character(), 'Sentiment' = col_character()))

# Create a corpus 
tweet_words <- data$Tweet
word <- Corpus(VectorSource(tweet_words))
word <- word %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace)
## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops documents
## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents
word <- tm_map(word, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(word, content_transformer(tolower)):
## transformation drops documents
word <- tm_map(word, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(word, removeWords, stopwords("english")):
## transformation drops documents

These "transformation drops documents" warnings are a known quirk of tm's SimpleCorpus and can be safely ignored; no documents are actually removed.
dtm <- TermDocumentMatrix(word)
tdm_matrix <- as.matrix(dtm)  # avoid masking base::matrix
words <- sort(rowSums(tdm_matrix), decreasing = TRUE)

# Create df with the frequency of each word
df_word <- data.frame(word = names(words), freq = words)
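A quick look at the top of the frequency table helps verify the cleaning worked (a sanity check, not part of the original pipeline):

# Inspect the ten most frequent terms
head(df_word, 10)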

# Plot WordCloud
set.seed(888)
wordcloud2(data = df_word, size = 1.6, color = 'random-light', backgroundColor = "black",
           shape = "diamond", fontFamily = "HersheySymbol")
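Note that wordcloud2 produces an HTML widget, which may not render in static output. A rough static fallback with the already-loaded wordcloud package (parameter values here are illustrative) could look like:

# Static fallback: base wordcloud() on the same frequency table
set.seed(888)  # wordcloud() places words randomly, so seeding makes the layout reproducible
wordcloud(words = df_word$word, freq = df_word$freq,
          min.freq = 5, max.words = 200, random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))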

Comparison Cloud - Positive vs Negative Sentiments

# Create a function to clean a corpus
cleanCorpus <- function(corpus){
  # Remove punctuation, whitespace, lowercase, numbers
  text.tmp <- tm_map(corpus, removePunctuation)
  text.tmp <- tm_map(text.tmp, stripWhitespace)
  text.tmp <- tm_map(text.tmp, content_transformer(tolower))
  text.tmp <- tm_map(text.tmp, removeNumbers)
  
  # Remove stopwords
  stopwords_remove <- c(stopwords("en"), c("thats","weve","hes","theres","ive","im",
                                           "will","can","cant","dont","youve","us",
                                           "youre","youll","theyre","whats","didnt"))
  text.tmp <- tm_map(text.tmp, removeWords, stopwords_remove)
  
  return(text.tmp)
}
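A small illustration of what cleanCorpus() does to raw text (the sample strings are invented):

# Toy corpus: punctuation, numbers, casing, and stopwords are all stripped
toy <- VCorpus(VectorSource(c("I CAN'T believe it's 2021!!", "That's   great, really great")))
toy_clean <- cleanCorpus(toy)
content(toy_clean[[1]])  # inspect the first cleaned document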

# --- UNIGRAM ---
# Unigram function
frequentTerms <- function(tweets){
  s.cor <- VCorpus(VectorSource(tweets))
  s.cor.cl <- cleanCorpus(s.cor)
  s.tdm <- TermDocumentMatrix(s.cor.cl)
  s.tdm <- removeSparseTerms(s.tdm, 0.999)
  m <- as.matrix(s.tdm)
  word_freqs <- sort(rowSums(m), decreasing = TRUE)
  dm <- data.frame(word = names(word_freqs), freq = word_freqs)

  return(dm)
}
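Applied to the tweets, the helper returns a word/freq data frame ready for plotting:

# Top unigrams across all tweets
head(frequentTerms(data$Tweet), 10)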


# --- BIGRAM ---
# Bigram tokenizer (NGramTokenizer and Weka_control come from RWeka)
tokenizer_2 <- function(x){
  NGramTokenizer(x, Weka_control(min = 2, max = 2))
}

# Bigram function
frequentBigrams <- function(tweets){
  s.cor <- VCorpus(VectorSource(tweets))
  s.cor.cl <- cleanCorpus(s.cor)
  s.tdm <- TermDocumentMatrix(s.cor.cl, control = list(tokenize = tokenizer_2))
  s.tdm <- removeSparseTerms(s.tdm, 0.999)
  m <- as.matrix(s.tdm)
  word_freqs <- sort(rowSums(m), decreasing = TRUE)
  dm <- data.frame(word = names(word_freqs), freq = word_freqs)

  return(dm)
}

# --- TRIGRAM ---
# Trigram tokenizer
tokenizer_3 <- function(x){
  NGramTokenizer(x, Weka_control(min = 3, max = 3))
}

# Trigram function
frequentTrigrams <- function(tweets){
  s.cor <- VCorpus(VectorSource(tweets))
  s.cor.cl <- cleanCorpus(s.cor)
  s.tdm <- TermDocumentMatrix(s.cor.cl, control = list(tokenize = tokenizer_3))
  s.tdm <- removeSparseTerms(s.tdm, 0.999)
  m <- as.matrix(s.tdm)
  word_freqs <- sort(rowSums(m), decreasing = TRUE)
  dm <- data.frame(word = names(word_freqs), freq = word_freqs)

  return(dm)
}
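These helpers feed naturally into a bar chart; a minimal sketch using the loaded ggplot2 and the my_colors palette defined earlier:

# Plot the ten most frequent bigrams
top_bigrams <- head(frequentBigrams(data$Tweet), 10)
ggplot(top_bigrams, aes(x = reorder(word, freq), y = freq)) +
  geom_col(fill = my_colors[1]) +
  coord_flip() +
  labs(x = NULL, y = "Frequency", title = "Top 10 Bigrams")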

# Load the three sentiment lexicons (AFINN, Bing, NRC)
afinn <- read_csv("Afinn.csv",
                  col_types = cols(word = col_character(), value = col_double()))
bing <- read_csv("Bing.csv",
                 col_types = cols(word = col_character(), sentiment = col_character()))
nrc <- read_csv("NRC.csv",
                col_types = cols(word = col_character(), sentiment = col_character()))

unnest_tweets <- data %>%
  mutate(text = as.character(Tweet)) %>%  # refer to the column directly inside mutate()
  unnest_tokens(word, text) # split each tweet into one word per row
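A quick count of the tokens confirms the split looks sensible before joining any lexicon:

# Most common tokens after splitting
unnest_tweets %>%
  count(word, sort = TRUE) %>%
  head(10)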


# Count Bing positive/negative words and reshape into a word-by-sentiment matrix
sentiment_matrix <- unnest_tweets %>%
  inner_join(bing, by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0)


# Plot comparison cloud
comparison.cloud(sentiment_matrix, colors = my_colors[c(5, 1)],
                 max.words = 500, title.size = 2, scale = c(5, 0.5))
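comparison.cloud() draws to the active graphics device; to keep a copy on disk, one option (the file name is arbitrary) is:

# Save the comparison cloud as a PNG
png("comparison_cloud.png", width = 800, height = 800)
comparison.cloud(sentiment_matrix, colors = my_colors[c(5, 1)],
                 max.words = 500, title.size = 2, scale = c(5, 0.5))
dev.off()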