Wk3 Homework assignment

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Collect tweets from Twitter API

#install.packages("ROAuth")
#install.packages("twitteR")
library("ROAuth")
library("twitteR")
# install.packages('tm')
# install.packages('RColorBrewer')
# install.packages('wordcloud')
library('tm')

## Loading required package: NLP

library('RColorBrewer')
library('wordcloud')

#*****************************
# Create your own Twitter key
# https://developer.twitter.com/en/docs/basics/getting-started#get-started-app

## [1] "Using direct authentication"

## Warning in doRppAPICall("search/tweets", n, params = params,
## retryOnRateLimit = retryOnRateLimit, : 1000 tweets were requested but the
## API can only return 144

# Normalise raw tweet text for word-frequency analysis.
#
# Args:
#   x: character vector of tweets (one element per tweet).
#
# Returns:
#   A character vector of the same length as `x`, lower-cased, with links,
#   the standalone retweet marker "rt", @mentions, punctuation and digits
#   removed, whitespace runs collapsed to a single space, and no leading or
#   trailing whitespace.
clean.text <- function(x) {
  x <- tolower(x)
  # Drop links while they are still intact; doing this before punctuation
  # stripping means the whole URL goes, not just its "http" prefix.
  x <- gsub("http\\S+", "", x)
  # Drop the retweet marker only when it is a whole word, so that words
  # merely containing "rt" (e.g. "start", "art") are left untouched.
  x <- gsub("\\brt\\b", "", x)
  # Drop @mentions.
  x <- gsub("@\\w+", "", x)
  # Drop punctuation and digits.
  x <- gsub("[[:punct:]]", "", x)
  x <- gsub("[[:digit:]]", "", x)
  # Collapse whitespace runs (spaces, tabs, newlines) to ONE space; replacing
  # them with nothing would fuse the neighbouring words together.
  x <- gsub("[[:space:]]+", " ", x)
  # Strip whatever single space is left at either end.
  gsub("^ | $", "", x)
}

# Run the collected tweets through clean.text() and keep the cleaned text
# under the same name.
# NOTE(review): liuBK is created outside the code shown here; presumably it
# already holds the tweet text as a character vector -- confirm.
liuBK <- clean.text(liuBK)

Create word cloud of tweets

# Wrap the cleaned tweets in a tm corpus, one document per tweet.
corpus <- Corpus(VectorSource(liuBK))
# corpus = Corpus(VectorSource(cmail))

# Build the term-document matrix. The control list keeps terms of 3 to 20
# characters, lower-cases them, and strips punctuation, numbers and English
# stop words (plus "the" and "a").
tdm <- TermDocumentMatrix(
  corpus,
  control = list(
    wordLengths = c(3, 20),
    removePunctuation = TRUE,
    stopwords = c("the", "a", stopwords("english")),
    removeNumbers = TRUE,
    tolower = TRUE
  )
)

# Convert to a plain matrix (terms in rows, documents in columns).
tdm <- as.matrix(tdm)

# Total count of each term across all tweets, most frequent first.
word_freqs <- sort(rowSums(tdm), decreasing = TRUE)

# Discard the nine most frequent terms, which the author judged to carry no
# insight; positions 1 to 9 are the entries removed from the sorted list.
word_freqs <- word_freqs[-seq_len(9)]

# Data frame pairing each remaining word with its frequency.
dm <- data.frame(word = names(word_freqs), freq = word_freqs)

# Plot the 100 most frequent remaining words as a coloured word cloud
# (the palette comes from the RColorBrewer package).
wordcloud(
  head(dm$word, 100),
  head(dm$freq, 100),
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)

# Show the 20 most mentioned remaining words.
head(word_freqs, 20)

##                stud   eduaubdedubuamark            brooklyn 
##                  91                  86                  14 
##             clienta                 web              center 
##                  12                  12                  10 
##      neduaubdedubud         liubrooklyn            androida 
##                  10                   9                   9 
##                 big                 day           steinberg 
##                   8                   7                   7 
##             francis                game     neduaubdedububa 
##                   6                   6                   5 
##           nufufefpm             onepack               saint 
##                   5                   5                   5 
## wracnneduaubcedubfu               litea 
##                   5                   5

Wk3 Homework assignment

Manasi Gore

2/8/2018

R Markdown