This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

Install and load the necessary packages

# install.packages('tm')
# install.packages('RColorBrewer')
# install.packages('wordcloud')
library('tm')
## Loading required package: NLP
library('RColorBrewer')
library('wordcloud')

Process data

# Import data from a CSV file into a data frame
trump <- read.csv("Trump.csv", comment.char="#")
# Subset the tweets by user gender
maleTw <- subset(trump, USER_GENDER == "male")
femaleTw <- subset(trump, USER_GENDER == "female")
# Subset the tweets from users in Washington
WSTw <- subset(trump, USER_CITY == "WASHINGTON")
# Subset the tweets from users in New York City
NYTw <- subset(trump, USER_CITY == "New York City")
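
Note that the two city filters above use different letter casing ("WASHINGTON" vs. "New York City"), which suggests USER_CITY values may not be stored consistently. If so, normalizing case before comparing is safer; a minimal sketch:

# Sketch: normalize case before matching, in case USER_CITY casing varies
WSTw <- subset(trump, toupper(USER_CITY) == "WASHINGTON")
NYTw <- subset(trump, toupper(USER_CITY) == "NEW YORK CITY")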

# Save data to and read it back from an R data object (RDS file)
saveRDS(maleTw, "maleTw.RDS")
maleTw <- readRDS("maleTw.RDS")

Mtweets <- maleTw$MESSAGE_BODY
Ftweets <- femaleTw$MESSAGE_BODY
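
On R versions before 4.0, read.csv() defaults to stringsAsFactors = TRUE, so MESSAGE_BODY may arrive as a factor rather than character text. Coercing it explicitly avoids surprises in the string cleaning below:

# Coerce to character in case read.csv() returned factors (R < 4.0 default)
Mtweets <- as.character(Mtweets)
Ftweets <- as.character(Ftweets)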

# Function to clean tweets
clean.text = function(x)
{
  # convert to lower case
  x = tolower(x)
  # remove links (before stripping punctuation, while URLs are still intact)
  x = gsub("http\\S+", "", x)
  # remove @mentions
  x = gsub("@\\w+", "", x)
  # remove the retweet marker "rt" (word boundaries, so "rt" inside other
  # words is left alone)
  x = gsub("\\brt\\b", "", x)
  # remove punctuation
  x = gsub("[[:punct:]]", "", x)
  # remove numbers
  x = gsub("[[:digit:]]", "", x)
  # collapse runs of spaces and tabs into a single space
  x = gsub("[ \t]{2,}", " ", x)
  # remove blank spaces at the beginning and end
  x = gsub("^ +| +$", "", x)
  return(x)
}
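
A quick sanity check on a made-up tweet (the example text below is hypothetical) shows what the cleaner does:

clean.text("RT @user: Debate night!! 2016 https://t.co/abc123")
## [1] "debate night"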

# clean tweets
Mtweets = clean.text(Mtweets)
Ftweets = clean.text(Ftweets)

Create a word cloud of tweets from male users

corpus = Corpus(VectorSource(Mtweets))
# create term-document matrix
tdm = TermDocumentMatrix(
  corpus,
  control = list(
    wordLengths = c(3, 20),
    removePunctuation = TRUE,
    stopwords = stopwords("english"),  # "the" and "a" are already in this list
    removeNumbers = TRUE,
    tolower = TRUE))

# convert the TDM to a plain matrix
tdm = as.matrix(tdm)
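
Converting the TDM to a dense matrix works at this scale, but it can exhaust memory on a larger corpus. In that case, pruning very sparse terms before densifying keeps the matrix manageable; a sketch, where the 0.99 sparsity threshold is illustrative and not part of the original analysis:

# Sketch: prune very sparse terms before densifying a large TDM
# (illustrative threshold; this would replace the as.matrix() step above)
# tdm = removeSparseTerms(tdm, sparse = 0.99)
# tdm = as.matrix(tdm)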

# get word counts in decreasing order
word_freqs = sort(rowSums(tdm), decreasing=TRUE) 

# Drop the 9 most frequent words, which dominate the counts without adding
# insight; index 1 is the most frequent word in the sorted list
word_freqs = word_freqs[-(1:9)]
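
Dropping words by position is fragile, since the cutoff depends on this particular data set. A more robust alternative (a sketch; the word list below is illustrative, not from the original analysis) filters by name:

# Sketch: drop uninformative words by name instead of by position
boring = c("amp", "can", "just", "get", "will")
word_freqs_named = word_freqs[!names(word_freqs) %in% boring]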

# create a data frame with words and their frequencies
dm = data.frame(word=names(word_freqs), freq=word_freqs)

# Plot the most frequent words as a colored word cloud; requires the RColorBrewer package

wordcloud(head(dm$word, 50), head(dm$freq, 50), random.order=FALSE, colors=brewer.pal(8, "Dark2"))
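
The word cloud layout has a random component even with random.order=FALSE, so the plot can differ between runs. Fixing the RNG seed before plotting makes it reproducible (the seed value is arbitrary):

# Fix the seed so the word cloud layout is reproducible across runs
set.seed(1234)
wordcloud(head(dm$word, 50), head(dm$freq, 50), random.order=FALSE,
          colors=brewer.pal(8, "Dark2"))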

# Check the 20 most frequent words
head(word_freqs, 20)
##                new          president                one 
##                 53                 53                 53 
##               like                win                amp 
##                 52                 50                 44 
##             debate           politics               next 
##                 40                 38                 35 
##               take               dont          something 
##                 33                 32                 32 
##           neverump electiontrumptrain              great 
##                 31                 30                 30 
##                can             people               says 
##                 30                 29                 29 
##               just                get 
##                 29                 28

Create a word cloud of tweets from female users
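
The code for this section is not shown in the original; presumably it mirrors the male-user pipeline, applied to Ftweets. A sketch under that assumption:

# Sketch: same pipeline as above, applied to the female users' tweets
corpusF = Corpus(VectorSource(Ftweets))
tdmF = TermDocumentMatrix(
  corpusF,
  control = list(
    wordLengths = c(3, 20),
    removePunctuation = TRUE,
    stopwords = stopwords("english"),
    removeNumbers = TRUE,
    tolower = TRUE))
word_freqsF = sort(rowSums(as.matrix(tdmF)), decreasing=TRUE)
word_freqsF = word_freqsF[-(1:9)]
dmF = data.frame(word=names(word_freqsF), freq=word_freqsF)
wordcloud(head(dmF$word, 50), head(dmF$freq, 50), random.order=FALSE,
          colors=brewer.pal(8, "Dark2"))

# Check the 20 most frequent words
head(word_freqsF, 20)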

##          woman            one    republicans      gopdebate            amp 
##             29             28             27             26             26 
##       neverump            new           just            win butterelection 
##             25             25             25             24             24 
##         claims         seesin      president           tell         debate 
##             24             24             24             24             23 
##         matter         pledge        thewont            see           said 
##             23             22             22             20             19