Week 3 R Notebook

This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

Install (if needed) and load the necessary packages

# install.packages('tm')
# install.packages('RColorBrewer')
# install.packages('wordcloud')
library('tm')

## Warning: package 'tm' was built under R version 3.4.4

## Loading required package: NLP

library('RColorBrewer')
library('wordcloud')

## Warning: package 'wordcloud' was built under R version 3.4.4

Process data

# Read the tweet export into a data frame; "#" starts a comment in the CSV
trump <- read.csv(file = "Trump.csv", comment.char = "#")

# Washington users: keep their rows, then pull out the tweet text
WashingtonTw <- subset(x = trump, subset = USER_CITY == "WASHINGTON")
Wtweets <- WashingtonTw$MESSAGE_BODY

# New York City users: keep their rows
NewYorkCityTw <- subset(x = trump, subset = USER_CITY == "New York City")

# Round-trip the NYC subset through an .RDS file (write it, then load it back)
saveRDS(object = NewYorkCityTw, file = "NewYorkCityTw.RDS")
NewYorkCityTw <- readRDS(file = "NewYorkCityTw.RDS")

# Tweet text of the reloaded NYC subset
NYCtweets <- NewYorkCityTw$MESSAGE_BODY

# Function to clean tweets
# Normalise raw tweet text for term counting.
#
# Args:
#   x: character vector of tweets (anything tolower() can coerce to
#      character, e.g. a factor column, also works).
#
# Returns:
#   A character vector the same length as x, lower-cased, with links,
#   @mentions, the standalone retweet marker "rt", punctuation and digits
#   removed, runs of spaces/tabs collapsed to a single space, and leading
#   and trailing whitespace trimmed.
clean.text <- function(x) {
  x <- tolower(x)
  # Remove links first, while "://", "." and "/" still hold the URL together;
  # once punctuation and digits are stripped the URL can no longer be
  # recognised reliably.
  x <- gsub("http\\S+", "", x)
  # Remove @mentions
  x <- gsub("@\\w+", "", x)
  # Remove the retweet marker only when it is a whole word; a bare "rt"
  # pattern also eats the "rt" inside words such as "party" or "smart".
  x <- gsub("\\brt\\b", "", x)
  # Remove punctuation
  x <- gsub("[[:punct:]]", "", x)
  # Remove numbers
  x <- gsub("[[:digit:]]", "", x)
  # Collapse runs of spaces/tabs to ONE space; replacing them with "" would
  # glue the neighbouring words together ("again  trump" -> "againtrump").
  x <- gsub("[ \t]{2,}", " ", x)
  # Trim all leading and trailing spaces/tabs, not just a single space
  x <- gsub("^[ \t]+|[ \t]+$", "", x)
  x
}

# Run both cities' tweet text through clean.text(), overwriting the raw vectors
NYCtweets <- clean.text(NYCtweets)
Wtweets <- clean.text(Wtweets)

Create word cloud of tweets of New York City users

# Build a corpus from the cleaned NYC tweets
corpus <- Corpus(VectorSource(NYCtweets))
# corpus = Corpus(VectorSource(cmail))

# Create the term-document matrix. The control options are listed in the
# same order as before on purpose -- do not reorder them.
tdm <- TermDocumentMatrix(
  corpus,
  control = list(
    wordLengths = c(3, 20),
    removePunctuation = TRUE,
    stopwords = c("the", "a", stopwords("english")),
    removeNumbers = TRUE,
    tolower = TRUE
  )
)

# Convert to a plain matrix so rowSums() can total each term
tdm <- as.matrix(tdm)

# Word counts, most frequent first
word_freqs <- sort(rowSums(tdm), decreasing = TRUE)

# Remove the top words which don't generate insights such as "the", "a",
# "and", etc. Index 1 is the first word in the sorted list; the nine most
# frequent words are dropped here.
word_freqs <- word_freqs[-seq_len(9)]

# Data frame of the remaining words and their frequencies
dm <- data.frame(word = names(word_freqs), freq = word_freqs)

# Plot the 50 most frequent remaining words as a colored word cloud;
# brewer.pal() needs the RColorBrewer package
wordcloud(
  words = head(dm$word, 50),
  freq = head(dm$freq, 50),
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)

# Check the 20 most mentioned words
head(word_freqs, n = 20)

##           think             gop      againtrump   politicianshe 
##               3               2               2               2 
##            smas            tcot         trumped     racistmania 
##               2               2               2               2 
##     donaldtrump          skills        cruzâs         goldman 
##               2               2               2               2 
##         problem realdonaldtrump           sachs             ted 
##               2               2               2               2 
##      trumptrain            just            like       endracism 
##               2               2               2               2

Create word cloud of tweets of Washington users

##          amp        point   republican       answer          can 
##            3            3            3            2            2 
##          new       theyre     michigan          win         beat 
##            2            2            2            2            2 
##     amiright        tough    nhprimary       trumps         good 
##            2            2            2            2            2 
##  donaldtrump planelection          tax     campaign    thousands 
##            2            2            2            2            2