Now we are going to read the file, create a corpus, convert everything to lowercase, remove all punctuation, numbers, and stop words, and plot a histogram of what we found. Loading the whole Twitter file took about 30 minutes, so we’ll just run this with the first 1% of the data.
# Packages: tm (corpus handling), RWeka (n-gram tokenizer), wordcloud (plots).
library(tm)
## Loading required package: NLP
library(RWeka)
library(wordcloud)
## Loading required package: RColorBrewer

# Read only the first 2360 lines of the Twitter file to keep the run short.
aFile <- readLines("en_US.twitter.txt", n = 2360)

# Build a volatile corpus. VCorpus is used instead of Corpus() because newer
# tm versions return a SimpleCorpus from Corpus(), and TermDocumentMatrix()
# ignores a custom tokenizer for that class.
myCorpus <- VCorpus(VectorSource(aFile))

# Clean-up steps: lowercase, strip punctuation, strip digits, drop stop words.
# tolower() is a base function, so it must be wrapped in content_transformer()
# to keep each document a TextDocument; this replaces the older workaround of
# re-wrapping the corpus with tm_map(myCorpus, PlainTextDocument).
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus <- tm_map(myCorpus, removeNumbers)
# NOTE(review): punctuation is already gone at this point, so stop words that
# contain an apostrophe (e.g. "don't") no longer match and remain as "dont".
# Confirm whether stop words should be removed before punctuation instead.
myCorpus <- tm_map(myCorpus, removeWords, stopwords("english"))

# Tokenizer that emits only two-word sequences (bigrams).
BigramTokenizer <- function(x) {
  NGramTokenizer(x, Weka_control(min = 2, max = 2))
}

# Term-document matrix whose terms are bigrams.
myDTM <- TermDocumentMatrix(
  myCorpus,
  control = list(tokenize = BigramTokenizer)
)

# List bigrams occurring at least 50 and at least 100 times.
findFreqTerms(myDTM, lowfreq = 50)
## character(0)
findFreqTerms(myDTM, lowfreq = 100)
## character(0)
# We can also identify high-frequency terms.
# as.matrix() expands the sparse term-document matrix into a dense
# terms x documents matrix, so this step is only practical for small samples.
dtm.matrix <- as.matrix(myDTM)

# Total occurrences of each bigram across all documents, most frequent first.
# Sorting once here lets every later step reuse the ranked vector.
words <- sort(rowSums(dtm.matrix), decreasing = TRUE)

# Distribution of bigram counts.
hist(words)

# No bigram in this sample reaches 50 occurrences (see the findFreqTerms()
# output), so a high fixed min.freq would leave the cloud empty. Keep bigrams
# seen at least twice and cap the plot at the 100 most frequent ones.
wordcloud(names(words), words, min.freq = 2, max.words = 100)

# Six most frequent bigrams.
head(words)
## last night cant wait right now dont know just got lets go
## 15 14 14 9 9 9
## in the for the of the to be on the thanks for
## 481 442 356 303 275 268
# Six entries from the bottom of the descending frequency ranking.
tail(x = sort(x = words, decreasing = TRUE), n = 6L)
## zombie magazine zone block zone game zoom meant
## 1 1 1 1
## zooming overused zutara ect
## 1 1
# Set the warning level to 0, which is R's default setting for `warn`.
options(warn = 0)
## zooming is zooming the zurawik at zutara and zygodactyl bird zygodactylous a
## 1 1 1 1 1 1
# Another way to do it is with the tau library. We will probably continue with the tm library.
#library(tau)
#bigrams = textcnt(aFile, n = 2, method = "string")
#bigrams = bigrams[order(bigrams, decreasing = TRUE)]
#trigrams = textcnt(aFile, n = 3, method = "string")
#trigrams = trigrams[order(trigrams, decreasing = TRUE)]
#TrigramTokenizer <- function(x) NGramTokenizer(x,
# Weka_control(min = 3, max = 3))
#tdm2 <- TermDocumentMatrix(a, control = list(tokenize = TrigramTokenizer))
#inspect(tdm2)