Title: Text Mining

This report is an exploratory data analysis for the Data Science Capstone project: text mining of the en_US blogs, news, and Twitter files with the tm package.

Loading Data

Getting Ready: Corpus Sources and Readers

The tm package lists the available corpus sources with getSources() and the available readers with getReaders():

getSources()
## [1] "DataframeSource" "DirSource"       "URISource"       "VectorSource"   
## [5] "XMLSource"
getReaders()
## [1] "readDOC"                 "readPDF"                
## [3] "readPlain"               "readRCV1"               
## [5] "readRCV1asPlain"         "readReut21578XML"       
## [7] "readReut21578XMLasPlain" "readTabular"            
## [9] "readXML"
# Read a large text file in chunks of buf.size characters and split the
# chunks on newlines, so the whole file is never passed to readLines() at once.
my.read.lines <- function(fname, buf.size = 5e7) {
  s <- file.info(fname)$size          # characters still to be read
  in.file <- file(fname, "r")
  buf <- ""
  res <- list()
  i <- 1
  while (s > 0) {
    n <- min(c(buf.size, s))
    buf <- paste(buf, readChar(in.file, n), sep = "")
    s <- s - n
    r <- strsplit(buf, "\n", fixed = TRUE, useBytes = TRUE)[[1]]
    n <- nchar(buf)
    if (substr(buf, n, n) == "\n") {
      # the chunk ended exactly on a newline: keep every line
      res[[i]] <- r
      buf <- ""
    } else {
      # the last piece is an incomplete line: carry it over to the next chunk
      res[[i]] <- head(r, -1)
      buf <- tail(r, 1)
    }
    i <- i + 1
  }
  close(in.file)
  c(unlist(res), buf)
}
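
As a usage sketch (not evaluated here; the object name is only illustrative):

#twitter.lines <- my.read.lines("en_US/en_US.twitter.txt")
#length(twitter.lines)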

Loading a Corpus: Text Documents

cname <- file.path(".", "en_US", "./")
cname
## [1] "./en_US/./"
length(dir(cname))
## [1] 3
dir(cname)
## [1] "en_US.blogs.txt"   "en_US.news.txt"    "en_US.twitter.txt"
library(tm)
filet ="en_US/en_US.twitter.txt"
file.info(filet)
##                              size isdir mode               mtime
## en_US/en_US.twitter.txt 167105338 FALSE  644 2014-07-22 10:12:58
##                                       ctime               atime uid gid
## en_US/en_US.twitter.txt 2014-08-26 17:15:03 2014-09-17 10:11:31 501  20
##                          uname grname
## en_US/en_US.twitter.txt wangwf  staff
doc1 <- readLines(filet, n=5000)
docs <- Corpus(VectorSource(doc1))
rm(doc1)
class(docs)
## [1] "VCorpus" "Corpus"
class(docs[[1]])
## [1] "PlainTextDocument" "TextDocument"
summary(docs[c(1:10)])
##    Length Class             Mode
## 1  2      PlainTextDocument list
## 2  2      PlainTextDocument list
## 3  2      PlainTextDocument list
## 4  2      PlainTextDocument list
## 5  2      PlainTextDocument list
## 6  2      PlainTextDocument list
## 7  2      PlainTextDocument list
## 8  2      PlainTextDocument list
## 9  2      PlainTextDocument list
## 10 2      PlainTextDocument list

Inspect documents using inspect()

inspect(docs[1])
## <<VCorpus (documents: 1, metadata (corpus/indexed): 0/0)>>
## 
## [[1]]
## <<PlainTextDocument (metadata: 7)>>
## How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long.

Preparing the Corpus

The function tm_map() applies a transformation to every document in the corpus. We apply the transformations sequentially to remove unwanted characters from the text; getTransformations() lists the built-in ones.

getTransformations()
## [1] "removeNumbers"     "removePunctuation" "removeWords"      
## [4] "stemDocument"      "stripWhitespace"

Convert to lower case, then apply removeNumbers, removePunctuation, and removeWords with the English stop-word list:

docs <- tm_map(docs, content_transformer(tolower))
#inspect(docs[[1]])
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeWords, stopwords("english"))
stopwords("english")
##   [1] "i"          "me"         "my"         "myself"     "we"        
##   [6] "our"        "ours"       "ourselves"  "you"        "your"      
##  [11] "yours"      "yourself"   "yourselves" "he"         "him"       
##  [16] "his"        "himself"    "she"        "her"        "hers"      
##  [21] "herself"    "it"         "its"        "itself"     "they"      
##  [26] "them"       "their"      "theirs"     "themselves" "what"      
##  [31] "which"      "who"        "whom"       "this"       "that"      
##  [36] "these"      "those"      "am"         "is"         "are"       
##  [41] "was"        "were"       "be"         "been"       "being"     
##  [46] "have"       "has"        "had"        "having"     "do"        
##  [51] "does"       "did"        "doing"      "would"      "should"    
##  [56] "could"      "ought"      "i'm"        "you're"     "he's"      
##  [61] "she's"      "it's"       "we're"      "they're"    "i've"      
##  [66] "you've"     "we've"      "they've"    "i'd"        "you'd"     
##  [71] "he'd"       "she'd"      "we'd"       "they'd"     "i'll"      
##  [76] "you'll"     "he'll"      "she'll"     "we'll"      "they'll"   
##  [81] "isn't"      "aren't"     "wasn't"     "weren't"    "hasn't"    
##  [86] "haven't"    "hadn't"     "doesn't"    "don't"      "didn't"    
##  [91] "won't"      "wouldn't"   "shan't"     "shouldn't"  "can't"     
##  [96] "cannot"     "couldn't"   "mustn't"    "let's"      "that's"    
## [101] "who's"      "what's"     "here's"     "there's"    "when's"    
## [106] "where's"    "why's"      "how's"      "a"          "an"        
## [111] "the"        "and"        "but"        "if"         "or"        
## [116] "because"    "as"         "until"      "while"      "of"        
## [121] "at"         "by"         "for"        "with"       "about"     
## [126] "against"    "between"    "into"       "through"    "during"    
## [131] "before"     "after"      "above"      "below"      "to"        
## [136] "from"       "up"         "down"       "in"         "out"       
## [141] "on"         "off"        "over"       "under"      "again"     
## [146] "further"    "then"       "once"       "here"       "there"     
## [151] "when"       "where"      "why"        "how"        "all"       
## [156] "any"        "both"       "each"       "few"        "more"      
## [161] "most"       "other"      "some"       "such"       "no"        
## [166] "nor"        "not"        "only"       "own"        "same"      
## [171] "so"         "than"       "too"        "very"
length(stopwords("english"))
## [1] 174
# remove own stop words
#docs <- tm_map(docs, removeWords, c("department", "email"))
docs <- tm_map(docs, stripWhitespace)

Stemming

Stemming removes common word endings from English words, such as "es", "ed", and "s". The underlying functionality is provided by wordStem() from the SnowballC package; tm's stemDocument() applies it to each document.

library(SnowballC)
docs <- tm_map(docs, stemDocument)
#docs <- tm_map(docs, content_transformer(stemCompletion))
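
For intuition, wordStem() can also be called directly on a character vector (a small illustrative example; the words here are arbitrary):

wordStem(c("running", "runs", "houses", "trying"), language = "english")
# the Porter stemmer collapses inflected forms, e.g. "running" and "runs" both become "run"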

N-gram Tokenizers

#bio <- tm_map(docs[3], grep, pattern="\\<biostats")
#sum(unlist(bio))

library(RWeka)
OnegramTokenizer <- function(x) {
    RWeka::NGramTokenizer(x, RWeka::Weka_control(min = 1, max = 1))
}

TwogramTokenizer <- function(x) {
    RWeka::NGramTokenizer(x, RWeka::Weka_control(min = 2, max = 2))
}

TrigramTokenizer <- function(x) {
    RWeka::NGramTokenizer(x, RWeka::Weka_control(min = 3, max = 3))
}
#oneG <- OnegramTokenizer(docs[1])
#twoG <- TwogramTokenizer(docs[1])

#tdm <- TermDocumentMatrix(docs) # , control = list(wordLengths = c(1, Inf)))
#tdm = TermDocumentMatrix(docs, control = list(tokenize = words, dictionary = wrdList, bounds = list(global = c(2,3))))
#tdm <- TermDocumentMatrix(docs, control = list(tokenize = twoG))
#tdm
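
The commented lines above hint at the intended use: the tokenizer function itself, not its tokenized output, is passed as the tokenize control option. A bigram term-document matrix would look like this (a sketch, not evaluated here):

#tdm2 <- TermDocumentMatrix(docs, control = list(tokenize = TwogramTokenizer))
#findFreqTerms(tdm2, lowfreq = 50)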

Creating a Document Term Matrix

A document-term matrix has documents as rows, terms as columns, and the frequency of each term in each document as the cell values. The transpose, with terms as rows and documents as columns, is created with TermDocumentMatrix().

dtm <- DocumentTermMatrix(docs)
dtm
inspect(dtm[1:2, 100:200])
class(dtm)
dim(dtm)
# Removing Sparse Terms
dtms <- removeSparseTerms(dtm, 0.1)
dim(dtms)
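
To make the transpose relationship concrete (a sketch, not part of the run above):

#tdm <- TermDocumentMatrix(docs)
#dim(tdm)  # terms x documents, i.e. rev(dim(dtm))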

Exploring the Document Term Matrix

freq <- colSums(as.matrix(dtm))
## Error: object 'dtm' not found
length(freq)
## Error: object 'freq' not found
# ordering the frequencies
ord <- order(freq)
## Error: object 'freq' not found
freq[head(ord)]
## Error: object 'freq' not found
freq[tail(ord)]
## Error: object 'freq' not found

Distribution of Term Frequencies

head(table(freq), 15)
## Error: error in evaluating the argument 'x' in selecting a method for function 'head': Error in table(freq) : object 'freq' not found
tail(table(freq), 15)
## Error: error in evaluating the argument 'x' in selecting a method for function 'tail': Error in table(freq) : object 'freq' not found

Conversion to Matrix and Save to CSV

m <- as.matrix(dtm)
## Error: object 'dtm' not found
dim(m)
## Error: object 'm' not found
freq <- colSums(as.matrix(dtms))
## Error: object 'dtms' not found
freq
## Error: object 'freq' not found
table(freq)
## Error: object 'freq' not found
findFreqTerms(dtm, lowfreq=1000)
## Error: object 'dtm' not found
findFreqTerms(dtm, lowfreq=100)
## Error: object 'dtm' not found
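
Once dtm exists, the matrix can be saved to CSV as the section title suggests (a sketch; the file name is arbitrary):

#m <- as.matrix(dtm)
#write.csv(m, file = "DocumentTermMatrix.csv")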

Correlation Plots

#plot(dtm,    term=findFreqTerms(dtm, lowfreq=100)[1:50],    corThreshold=0.5)
freq <- sort(colSums(as.matrix(dtm)), decreasing =TRUE)
## Error: object 'dtm' not found
head(freq, 14)
## Error: error in evaluating the argument 'x' in selecting a method for function 'head': Error: object 'freq' not found
wf <- data.frame(word = names(freq), freq = freq)
## Error: object 'freq' not found
head(wf)
## Error: error in evaluating the argument 'x' in selecting a method for function 'head': Error: object 'wf' not found

Plot the frequency of the words that occur at least 500 times:

library(ggplot2)
p <- ggplot(subset(wf, freq>500), aes(word, freq))
## Error: object 'wf' not found
p <- p + geom_bar(stat="identity")
## Error: object 'p' not found
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
## Error: object 'p' not found
p
## Error: object 'p' not found

Word Clouds

library(wordcloud)
set.seed(123)
wordcloud(names(freq), freq, min.freq=50)
## Error: object 'freq' not found
set.seed(142)
wordcloud(names(freq), freq, min.freq=100, colors=brewer.pal(6, "Dark2"))
## Error: object 'freq' not found
#set.seed(142)
#wordcloud(names(freq), freq, min.freq=100, scale=c(5, .1), colors=brewer.pal(6, "Dark2"))

Testing Word Associations

#findAssocs(dtm, "data", corlimit=0.6)
# Apply the same cleaning steps to a query string, then look up terms
# associated with each remaining word in the document-term matrix.
testing <- function(str1) {
    str1 <- removeNumbers(tolower(str1))
    str1 <- removePunctuation(str1)
    str1 <- removeWords(str1, stopwords("english"))
    str1 <- stripWhitespace(str1)
    str1 <- stemDocument(str1)
    str1 <- unlist(strsplit(str1, " "))
    for (i in 1:length(str1)) {
        print(str1[i])
        print(findAssocs(dtm, str1[i], corlimit = 0.1))
    }
}
str1 <- "The guy in front of me just bought a pound of bacon, a bouquet, and a case of"
testing(str1)
## [1] ""
## Error: object 'dtm' not found
str2 <- "You're the reason why I smile everyday. Can you follow me please? It would mean the"
testing(str2)
## [1] "youre"
## Error: object 'dtm' not found
str3 <-"Hey sunshine, can you follow me and make me the"

str4 <- "Very early observations on the Bills game: Offense still struggling but the"

str5 <-"Go on a romantic date at the"

str6 <-"Well I'm pretty sure my granny has some old bagpipes in her garage I'll dust them off and be on my"

str7 <- "Ohhhhh #PointBreak is on tomorrow. Love that film and haven't seen it in quite some"
str8 <-"After the ice bucket challenge Louis will push his long wet hair out of his eyes with his little"

str9 <- "Be grateful for the good times and keep the faith during the"

str10 <- "If this isn't the cutest thing you've ever seen, then you must be"