##   NewsMB   BlogsMB TwitterMB   TotalMB
## 19.17972 248.49350 301.39670 569.06992
## $NewsSummary
##    Length     Class      Mode
##     77259 character character
##
## $BlogsSummary
##    Length     Class      Mode
##    899288 character character
##
## $TwitterSummary
##    Length     Class      Mode
##   2360148 character character
##
## $NewsSample
## [1] "He wasn't home alone, apparently."
##
## $BlogsSample
## [1] "In the years thereafter, most of the Oil fields and platforms were named after pagan <U+0093>gods<U+0094>."
##
## $TwitterSample
## [1] "How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long."
##  news  blogs twitter totalLines
## 77259 899288 2360148    3336695
## newsWords blogWords twitterWords totalWords
##   2643969  37334131     30373543   70351643
## newsReduced blogsReduced twitterReduced totalReduced
##        1000         1000           1000         3000
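The news_trunc, blogs_trunc and twitter_trunc vectors used below hold the 1,000-line samples reported above. They are assumed to have been drawn at random from the full line vectors; a minimal sketch of that step (the names news, blogs and twitter for the full vectors, and the seed, are assumptions):
#Illustrative only: draw a reproducible 1,000-line sample from each source
set.seed(1234)
news_trunc    <- sample(news, 1000)
blogs_trunc   <- sample(blogs, 1000)
twitter_trunc <- sample(twitter, 1000)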
#Combine the three 1,000-line samples, strip non-ASCII characters, and build a tm corpus
comb_trunc <- c(news_trunc, blogs_trunc, twitter_trunc)
comb_trunc <- iconv(comb_trunc, "latin1", "ASCII", sub = "")
combCorpus <- Corpus(VectorSource(comb_trunc))
inspect(combCorpus[4:6])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 3
##
## [1] When you look at the internal data on international tests you find a telling dichotomy regarding American scores. America has a disproportional number of students doing very, very well and a large portion doing poorly. The Organization for Economic Co-operation and Development, which administers the PISA test, explains this dichotomy: the number of children in poverty. Twenty-two percent of American children live in mind-numbing poverty, which is far greater than the next highest country. Children do not learn well when they are hungry.
## [2] Another strong month of hiring makes it less likely that the Federal Reserve will take additional steps to boost the economy at its meeting next week.
## [3] "I doubt you're ever going to see me kicking dirt, throwing bases, that kind of stuff," Matheny said after the Cardinals' 2-1 loss to Washington. "I don't think it's going to happen, but I don't know. I've lost it a couple times (as a player). ... Mostly, in spring training you don't see that."
#Free the memory used by the raw sample vectors
rm(news_trunc, blogs_trunc, twitter_trunc, comb_trunc)
gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  4983215 266.2   14442815  771.4  11669289  623.3
## Vcells 66068555 504.1  208068404 1587.5 173323665 1322.4
#Transformer that replaces a matched pattern with a single space
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
combCorpusS <- tm_map(combCorpus, toSpace, "/")
combCorpusS <- tm_map(combCorpusS, toSpace, "@")
combCorpusSL <- tm_map(combCorpusS, content_transformer(tolower))
#One can inspect the corpus objects at any stage, e.g.:
#inspect(combCorpusSL)
#Remove numbers, English stop words, profanity and punctuation, strip extra
#whitespace, then stem the remaining words
combCorpusSLN <- tm_map(combCorpusSL, content_transformer(removeNumbers))
combCorpusSLNS <- tm_map(combCorpusSLN, removeWords, stopwords("english"))
#'profane' is a one-column data frame of banned words (assumed loaded earlier)
names(profane) <- "profane"
combCorpusSLNSP <- tm_map(combCorpusSLNS, removeWords, profane[, 1])
combCorpusSLNSPP <- tm_map(combCorpusSLNSP, content_transformer(removePunctuation))
combCorpusSLNSPPW <- tm_map(combCorpusSLNSPP, content_transformer(stripWhitespace))
combCorpusSLNSPPWSt <- tm_map(combCorpusSLNSPPW, content_transformer(stemDocument))
#writeCorpus(combCorpusSLNSPPWSt)
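For completeness, the profane word list removed in the pipeline above is assumed to be a one-column data frame read from a local file; a minimal sketch (the file name is hypothetical):
#Hypothetical load of a profanity word list into a one-column data frame
profane <- read.csv("profanity_list.txt", header = FALSE, stringsAsFactors = FALSE)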
#Collapse the cleaned corpus into a single string for the ngram package
strCorpus <- concatenate(lapply(combCorpusSLNSPPWSt, "[", 1))
#Build the 1- to 4-gram models
ng1 <- ngram(strCorpus, n = 1)
ng2 <- ngram(strCorpus, n = 2)
ng3 <- ngram(strCorpus, n = 3)
ng4 <- ngram(strCorpus, n = 4)
head(get.phrasetable(ng1), 5)
## ngrams freq prop
## 1 said 304 0.006414585
## 2 like 280 0.005908170
## 3 one 268 0.005654963
## 4 will 266 0.005612762
## 5 just 251 0.005296253
head(get.phrasetable(ng2), 5)
## ngrams freq prop
## 1 new york 26 0.0005486274
## 2 last year 22 0.0004642232
## 3 last night 16 0.0003376168
## 4 right now 15 0.0003165158
## 5 feel like 14 0.0002954147
head(get.phrasetable(ng3), 5)
## ngrams freq prop
## 1 cinco de mayo 5 1.055075e-04
## 2 cricket world cup 4 8.440599e-05
## 3 osama bin laden 4 8.440599e-05
## 4 new york citi 4 8.440599e-05
## 5 want make sure 3 6.330449e-05
head(get.phrasetable(ng4), 5)
## ngrams freq prop
## 1 cricket world cup dvd 3 6.330583e-05
## 2 overal hire remain strong 2 4.220389e-05
## 3 littl stage puppet theater 2 4.220389e-05
## 4 roman cathol code canon 2 4.220389e-05
## 5 done unto us believ 2 4.220389e-05
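The frequency tables above can also be inspected graphically; a minimal sketch using base R's barplot on the unigram model built earlier (the plot itself is not shown here):
#Plot the ten most frequent (stemmed, stop-word-free) unigrams
top1 <- head(get.phrasetable(ng1), 10)
barplot(top1$freq, names.arg = top1$ngrams, las = 2, cex.names = 0.7,
        main = "Top 10 unigrams in the sampled corpus", ylab = "Frequency")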
#Helper: how many of the most frequent words are needed to cover a given
#share (coverage) of all word instances in the corpus
getCoverage <- function(unigram, coverage) {
  #start with zero accumulated frequency
  frequency <- 0
  #target frequency implied by the requested coverage
  coverageFrequency <- coverage * sum(unigram$freq)
  #the phrase table is already sorted by decreasing frequency
  for (i in 1:nrow(unigram)) {
    if (frequency >= coverageFrequency) {
      return(i)
    }
    frequency <- frequency + unigram[i, "freq"]
  }
  return(nrow(unigram))
}
#Build the unigram phrase table (sorted by decreasing frequency)
unigram <- get.phrasetable(ng1)
#Number of distinct words needed for 50% and 90% coverage of all word instances
getCoverage(unigram, coverage = 0.5)
## [1] 598
getCoverage(unigram, coverage = 0.9)
## [1] 5092
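Because the phrase table is sorted by decreasing frequency, this coverage count can also be computed without an explicit loop; a vectorized sketch (it may differ by one from the loop above, which returns the index just after the threshold is crossed):
#Vectorized coverage count: first rank at which the cumulative frequency
#reaches the requested share of all word instances
getCoverageVec <- function(unigram, coverage) {
  which(cumsum(unigram$freq) >= coverage * sum(unigram$freq))[1]
}
getCoverageVec(unigram, coverage = 0.5)
getCoverageVec(unigram, coverage = 0.9)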
#Build a term-document matrix: rows are terms, columns are the 3,000 sample documents
dtm <- TermDocumentMatrix(combCorpusSLNSPPWSt)
dim(dtm)
## [1] 9626 3000
inspect(dtm[1:10,])
## <<TermDocumentMatrix (terms: 10, documents: 3000)>>
## Non-/sparse entries: 144/29856
## Sparsity : 100%
## Maximal term length: 8
## Weighting : term frequency (tf)
## Sample :
## Docs
## Terms 1 1428 1940 310 339 398 543 751 767 781
## aggress 1 0 0 0 0 0 0 0 0 0
## bank 2 0 0 4 0 0 0 1 0 1
## confer 1 0 0 0 0 0 0 0 0 0
## debtthat 1 0 0 0 0 0 0 0 0 0
## eas 1 0 0 0 0 0 0 0 0 0
## ecb 1 0 0 0 0 0 0 0 0 0
## european 1 0 0 0 0 0 0 0 1 0
## expans 1 0 0 0 0 0 0 0 0 0
## govern 1 0 0 0 3 0 2 1 1 1
## help 1 3 3 0 0 2 0 0 2 2
findFreqTerms(dtm, 100) #also use: findAssocs(dtm, "word to associate with", 0.8)
## [1] "said" "new" "look" "like" "make" "week" "will" "know"
## [9] "see" "think" "time" "good" "just" "can" "get" "year"
## [17] "work" "say" "also" "much" "day" "use" "way" "two"
## [25] "need" "come" "peopl" "one" "now" "want" "right" "first"
## [33] "last" "love"
#Optionally drop very sparse terms (those absent from more than 95% of documents)
#dtm <- removeSparseTerms(dtm, 0.95)
#str(dtm) #also use: inspect(removeSparseTerms(dtm, 0.95))
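The word-frequency table shown below was presumably built from the term-document matrix roughly as follows (a sketch; the intermediate object names freq and wordFreq are assumptions):
#Total frequency of each term across all sample documents, sorted decreasing
freq <- sort(rowSums(as.matrix(dtm)), decreasing = TRUE)
wordFreq <- data.frame(word = names(freq), freq = freq)
head(wordFreq, 10)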
##      word freq
## said said  304
## like like  280
## one   one  268
## will will  266
## just just  251
## time time  236
## get   get  230
## day   day  207
## year year  196
## can   can  195