Week 3-Homework CS695

This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

Install necessary packages

# install.packages('tm')
# install.packages('RColorBrewer')
# install.packages('wordcloud')
library('tm')

## Loading required package: NLP

library('RColorBrewer')
library('wordcloud')

Collect tweets from Twitter API

## [1] "Using direct authentication"

Process data

# Load the exported tweets from CSV into a data frame.
# comment.char = "#" makes read.csv treat "#" as a comment marker.
liuBK <- read.csv(file = "liuBKData.csv", comment.char = "#")
# subset() is called without a condition, so every row and column is kept.
LIUTweets <- subset(liuBK)

# Round-trip the data frame through an RDS file so it can be reloaded later.
saveRDS(object = LIUTweets, file = "LIUTweets.RDS")
LIUTweets <- readRDS(file = "LIUTweets.RDS")

# Extract the tweet text column; this is the vector that gets cleaned below.
MLIUTweets <- LIUTweets$text

# Normalise raw tweet text for term counting.
#
# Args:
#   x: character vector of tweets (anything tolower() accepts).
#
# Returns:
#   A character vector the same length as `x`, lower-cased, with retweet
#   markers, @mentions, links, punctuation and digits removed, internal
#   whitespace collapsed to single spaces, and no leading/trailing whitespace.
clean.text <- function(x) {
  x <- tolower(x)
  # Remove the retweet marker only when it is a standalone token; a bare "rt"
  # pattern would also eat the letters inside words like "article".
  x <- gsub("\\brt\\b", "", x, perl = TRUE)
  # Remove @mentions.
  x <- gsub("@\\w+", "", x)
  # Remove links while the URL is still intact (before punctuation is
  # stripped), so the whole address disappears.
  x <- gsub("http\\S+", "", x)
  # Remove punctuation and digits.
  x <- gsub("[[:punct:]]", "", x)
  x <- gsub("[[:digit:]]", "", x)
  # Collapse whitespace runs (spaces, tabs, newlines) into ONE space. Replacing
  # them with "" would glue the neighbouring words together.
  x <- gsub("\\s+", " ", x)
  # Trim all leading and trailing whitespace.
  x <- gsub("^\\s+|\\s+$", "", x)
  x
}

# clean tweets
MLIUTweets = clean.text(MLIUTweets)

Create word cloud of tweets from LIU-Brooklyn

# Build a tm corpus with one document per cleaned tweet.
corpus <- Corpus(VectorSource(MLIUTweets))
# corpus = Corpus(VectorSource(cmail))

# Term-document matrix: keep terms of 3-20 characters, and drop punctuation,
# numbers and English stop words.
tdm_control <- list(
  wordLengths = c(3, 20),
  removePunctuation = TRUE,
  stopwords = c("the", "a", stopwords("english")),
  removeNumbers = TRUE,
  tolower = TRUE
)
tdm <- TermDocumentMatrix(corpus, control = tdm_control)

# Convert to a plain matrix (terms in rows, documents in columns).
tdm <- as.matrix(tdm)

# Total count per term, most frequent first.
word_freqs <- sort(rowSums(tdm), decreasing = TRUE)

# Drop the top-ranked words that don't generate insights (such as "the", "a",
# "and", etc.). Here the first 9 entries of the sorted list are removed.
word_freqs <- word_freqs[-seq_len(9)]

# Data frame of words and their frequencies.
dm <- data.frame(word = names(word_freqs), freq = word_freqs)

# Plot the 50 most frequent words as a colored word cloud
# (colors come from the RColorBrewer package).
wordcloud(
  head(dm$word, 50),
  head(dm$freq, 50),
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)

# Check the 20 most mentioned words.
head(word_freqs, 20)

##          students    eduaubdedububa      eduaubdedubu              glbs 
##                 9                 8                 8                 7 
##    ubliubrooklynu          ufufefpm           gameday              ufpm 
##                 7                 7                 6                 6 
##           francis       advocacyday            albany standupstudentaid 
##                 6                 5                 5                 5 
##  weareliubrooklyn               big           onepack             saint 
##                 5                 5                 5                 5 
##              wrac   blackbirdnation         spotlight        blackbirds 
##                 5                 5                 4                 4

Week 3-Homework CS695

Aneudis Salcedo

Install necessary packages

Collect tweets from Twitter API

Process data

Create word cloud of tweets from LIU-Brooklyn