This is the exploratory data analysis milestone of the Data Science Capstone project.
library(tm)
getSources()
## [1] "DataframeSource" "DirSource" "URISource" "VectorSource"
## [5] "XMLSource"
getReaders()
## [1] "readDOC" "readPDF"
## [3] "readPlain" "readRCV1"
## [5] "readRCV1asPlain" "readReut21578XML"
## [7] "readReut21578XMLasPlain" "readTabular"
## [9] "readXML"
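For reference, DirSource from the list above can build a corpus over a whole directory in one step. A minimal sketch (it reads each file in en_US as one large document, so it is memory-hungry):
docs.dir <- Corpus(DirSource("en_US", encoding = "UTF-8"),
                   readerControl = list(reader = readPlain, language = "en"))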
# Read a large file in fixed-size chunks, splitting into lines as we go,
# so that no more than buf.size characters of raw input are held at once.
my.read.lines <- function(fname, buf.size = 5e7) {
    s <- file.info(fname)$size          # bytes left to read
    in.file <- file(fname, "r")
    buf <- ""
    res <- list()
    i <- 1
    while (s > 0) {
        n <- min(c(buf.size, s))
        buf <- paste(buf, readChar(in.file, n), sep = "")
        s <- s - n
        r <- strsplit(buf, "\n", fixed = TRUE, useBytes = TRUE)[[1]]
        n <- nchar(buf)
        if (substr(buf, n, n) == "\n") {   # chunk ends exactly on a newline
            res[[i]] <- r
            buf <- ""
        } else {                           # carry the partial last line into the next chunk
            res[[i]] <- head(r, -1)
            buf <- tail(r, 1)
        }
        i <- i + 1
    }
    close(in.file)
    if (nchar(buf) > 0) res[[i]] <- buf    # flush a final line with no trailing newline
    unlist(res)
}
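A usage sketch (the path matches the Twitter file examined below; at roughly 167 MB the full read takes a while):
twitter.lines <- my.read.lines("en_US/en_US.twitter.txt")
length(twitter.lines)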
cname <- file.path(".", "en_US", "./")
cname
## [1] "./en_US/./"
length(dir(cname))
## [1] 3
dir(cname)
## [1] "en_US.blogs.txt" "en_US.news.txt" "en_US.twitter.txt"
library(tm)
filet ="en_US/en_US.twitter.txt"
file.info(filet)
## size isdir mode mtime
## en_US/en_US.twitter.txt 167105338 FALSE 644 2014-07-22 10:12:58
## ctime atime uid gid
## en_US/en_US.twitter.txt 2014-08-26 17:15:03 2014-09-17 10:11:31 501 20
## uname grname
## en_US/en_US.twitter.txt wangwf staff
doc1 <- readLines(filet, n=5000)
docs <- Corpus(VectorSource(doc1))
rm(doc1)
class(docs)
## [1] "VCorpus" "Corpus"
class(docs[[1]])
## [1] "PlainTextDocument" "TextDocument"
summary(docs[c(1:10)])
## Length Class Mode
## 1 2 PlainTextDocument list
## 2 2 PlainTextDocument list
## 3 2 PlainTextDocument list
## 4 2 PlainTextDocument list
## 5 2 PlainTextDocument list
## 6 2 PlainTextDocument list
## 7 2 PlainTextDocument list
## 8 2 PlainTextDocument list
## 9 2 PlainTextDocument list
## 10 2 PlainTextDocument list
inspect(docs[1])
## <<VCorpus (documents: 1, metadata (corpus/indexed): 0/0)>>
##
## [[1]]
## <<PlainTextDocument (metadata: 7)>>
## How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long.
The function tm_map() applies a transformation to every document in the corpus. We apply the transformations below sequentially to remove unwanted characters from the text.
getTransformations()
## [1] "removeNumbers" "removePunctuation" "removeWords"
## [4] "stemDocument" "stripWhitespace"
Convert the text to lower case, then apply removeNumbers, removePunctuation, and remove English stop words:
docs <- tm_map(docs, content_transformer(tolower))
#inspect(docs[[1]])
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeWords, stopwords("english"))
stopwords("english")
## [1] "i" "me" "my" "myself" "we"
## [6] "our" "ours" "ourselves" "you" "your"
## [11] "yours" "yourself" "yourselves" "he" "him"
## [16] "his" "himself" "she" "her" "hers"
## [21] "herself" "it" "its" "itself" "they"
## [26] "them" "their" "theirs" "themselves" "what"
## [31] "which" "who" "whom" "this" "that"
## [36] "these" "those" "am" "is" "are"
## [41] "was" "were" "be" "been" "being"
## [46] "have" "has" "had" "having" "do"
## [51] "does" "did" "doing" "would" "should"
## [56] "could" "ought" "i'm" "you're" "he's"
## [61] "she's" "it's" "we're" "they're" "i've"
## [66] "you've" "we've" "they've" "i'd" "you'd"
## [71] "he'd" "she'd" "we'd" "they'd" "i'll"
## [76] "you'll" "he'll" "she'll" "we'll" "they'll"
## [81] "isn't" "aren't" "wasn't" "weren't" "hasn't"
## [86] "haven't" "hadn't" "doesn't" "don't" "didn't"
## [91] "won't" "wouldn't" "shan't" "shouldn't" "can't"
## [96] "cannot" "couldn't" "mustn't" "let's" "that's"
## [101] "who's" "what's" "here's" "there's" "when's"
## [106] "where's" "why's" "how's" "a" "an"
## [111] "the" "and" "but" "if" "or"
## [116] "because" "as" "until" "while" "of"
## [121] "at" "by" "for" "with" "about"
## [126] "against" "between" "into" "through" "during"
## [131] "before" "after" "above" "below" "to"
## [136] "from" "up" "down" "in" "out"
## [141] "on" "off" "over" "under" "again"
## [146] "further" "then" "once" "here" "there"
## [151] "when" "where" "why" "how" "all"
## [156] "any" "both" "each" "few" "more"
## [161] "most" "other" "some" "such" "no"
## [166] "nor" "not" "only" "own" "same"
## [171] "so" "than" "too" "very"
length(stopwords("english"))
## [1] 174
# remove own stop words
#docs <- tm_map(docs, removeWords, c("department", "email"))
docs <- tm_map(docs, stripWhitespace)
Stemming uses an algorithm that removes common word endings from English words, such as “es”, “ed”, and “s”. The functionality for stemming is provided by wordStem() from the SnowballC package.
library(SnowballC)
docs <- tm_map(docs, stemDocument)
#docs <- tm_map(docs, content_transformer(stemCompletion))
##
#bio <- tm_map(docs[3], grep, pattern="\\<biostats")
#sum(unlist(bio))
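As a quick illustration of what the stemmer does (a minimal sketch; the words are arbitrary examples):
# Porter stemming collapses inflected forms onto a common stem,
# e.g. "running" and "runs" both map to "run".
SnowballC::wordStem(c("running", "runs", "cats"), language = "english")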
library(RWeka)
# n-gram tokenizers for unigrams, bigrams, and trigrams
OnegramTokenizer <- function(x) RWeka::NGramTokenizer(x, RWeka::Weka_control(min = 1, max = 1))
TwogramTokenizer <- function(x) RWeka::NGramTokenizer(x, RWeka::Weka_control(min = 2, max = 2))
TrigramTokenizer <- function(x) RWeka::NGramTokenizer(x, RWeka::Weka_control(min = 3, max = 3))
#oneG <- OnegramTokenizer(docs[1])
#twoG <- TwogramTokenizer(docs[1])
#tdm <- TermDocumentMatrix(docs) # , control = list(wordLengths = c(1, Inf)))
#tdm = TermDocumentMatrix(docs, control = list(tokenize = words, dictionary = wrdList, bounds = list(global = c(2,3))))
#tdm <- TermDocumentMatrix(docs, control = list(tokenize = twoG))
#tdm
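A working version of the commented-out idea above might look like this (a sketch; on a larger corpus the bigram matrix grows quickly):
tdm2 <- TermDocumentMatrix(docs, control = list(tokenize = TwogramTokenizer))
inspect(tdm2[1:5, 1:5])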
A document-term matrix is a matrix with documents as the rows, terms as the columns, and the frequency counts of the words as the cells. Its transpose, the term-document matrix, is created with TermDocumentMatrix().
dtm <- DocumentTermMatrix(docs)
dtm
inspect(dtm[1:2, 100:200])
class(dtm)
dim(dtm)
# Remove sparse terms: sparse = 0.1 keeps only terms present in at least 90% of documents
dtms <- removeSparseTerms(dtm, 0.1)
dim(dtms)
freq <- colSums(as.matrix(dtm))
length(freq)
# ordering the frequencies
ord <- order(freq)
freq[head(ord)]
freq[tail(ord)]
head(table(freq), 15)
tail(table(freq), 15)
m <- as.matrix(dtm)
dim(m)
freq <- colSums(as.matrix(dtms))
freq
table(freq)
findFreqTerms(dtm, lowfreq=1000)
findFreqTerms(dtm, lowfreq=100)
#plot(dtm, term=findFreqTerms(dtm, lowfreq=100)[1:50], corThreshold=0.5)
freq <- sort(colSums(as.matrix(dtm)), decreasing = TRUE)
head(freq, 14)
wf <- data.frame(word = names(freq), freq = freq)
head(wf)
Plot the frequency of the words that occur at least 500 times:
library(ggplot2)
p <- ggplot(subset(wf, freq > 500), aes(word, freq))
p <- p + geom_bar(stat = "identity")
p <- p + theme(axis.text.x = element_text(angle = 45, hjust = 1))
p
library(wordcloud)
set.seed(123)
wordcloud(names(freq), freq, min.freq = 50)
set.seed(142)
wordcloud(names(freq), freq, min.freq = 100, colors = brewer.pal(6, "Dark2"))
#set.seed(142)
#wordcloud(names(freq), freq, min.freq=100, scale=c(5, .1), colors=brewer.pal(6, "Dark2"))
#findAssocs(dtm, "data", corlimit=0.6)
# Clean an input phrase with the same transformations used on the corpus,
# then print corpus associations for each remaining token.
testing <- function(str1){
    str1 <- removeNumbers(tolower(str1))
    str1 <- removePunctuation(str1)
    str1 <- removeWords(str1, stopwords("english"))
    str1 <- stripWhitespace(str1)
    str1 <- stemDocument(str1)
    str1 <- unlist(strsplit(str1, " "))
    str1 <- str1[nchar(str1) > 0]   # drop empty tokens left behind by the cleaning
    for(i in seq_along(str1)){
        print(str1[i])
        # print() is needed: results inside a loop are not auto-printed
        print(findAssocs(dtm, str1[i], corlimit = 0.1))
    }
}
str1 <- "The guy in front of me just bought a pound of bacon, a bouquet, and a case of"
testing(str1)
## [1] ""
## Error: object 'dtm' not found
str2 <- "You're the reason why I smile everyday. Can you follow me please? It would mean the"
testing(str2)
## [1] "youre"
## Error: object 'dtm' not found
str3 <-"Hey sunshine, can you follow me and make me the"
str4 <- "Very early observations on the Bills game: Offense still struggling but the"
str5 <-"Go on a romantic date at the"
str6 <-"Well I'm pretty sure my granny has some old bagpipes in her garage I'll dust them off and be on my"
str7 <- "Ohhhhh #PointBreak is on tomorrow. Love that film and haven't seen it in quite some"
str8 <-"After the ice bucket challenge Louis will push his long wet hair out of his eyes with his little"
str9 <- "Be grateful for the good times and keep the faith during the"
str10 <- "If this isn't the cutest thing you've ever seen, then you must be"
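The remaining phrases can be pushed through the same helper; a minimal sketch:
for (s in list(str3, str4, str5, str6, str7, str8, str9, str10)) {
    testing(s)
}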