Step 1: Install the required packages
# Main Text Mining Package
#install.packages("tm")
# Package for creating word clouds (visualization tool)
#install.packages("wordcloud")
# Stemming Package
#install.packages("SnowballC")
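If you would rather not re-run install.packages() by hand each time, here is a minimal sketch (assuming the usual CRAN setup) that installs a package only when it is missing:
# Install each package only if it is not already available
for (pkg in c("tm", "wordcloud", "SnowballC")) {
  if (!requireNamespace(pkg, quietly = TRUE)) install.packages(pkg)
}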
Step 2: Load the packages
library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
library(SnowballC)
Step 3: Create Corpus
docs <- c("This is a data mining course.",
          "We are studying text mining. Text mining is a subfield of data mining.",
          "Mining text is interesting, and I am interested in it.")
docs
## [1] "This is a data mining course."
## [2] "We are studying text mining. Text mining is a subfield of data mining."
## [3] "Mining text is interesting, and I am interested in it."
docCorpus <- Corpus(VectorSource(docs))
inspect(docCorpus)
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 3
##
## [1] This is a data mining course.
## [2] We are studying text mining. Text mining is a subfield of data mining.
## [3] Mining text is interesting, and I am interested in it.
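Here VectorSource() builds the corpus from an in-memory character vector. For larger projects the documents usually live on disk; a sketch of that variant (the folder path below is hypothetical) reads every plain-text file in a directory instead:
# Build a corpus from all text files in a (hypothetical) folder
# fileCorpus <- Corpus(DirSource("path/to/texts", encoding = "UTF-8"))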
Step 4: Visualize Corpus (for fun!)
wordcloud(docCorpus, min.freq = 1, random.order = FALSE)
wordcloud(docCorpus, min.freq = 1, random.order = FALSE, colors = "orange")
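Because wordcloud pulls in RColorBrewer, you can also pass a whole palette rather than a single colour; "Dark2" below is just one of the built-in palette names:
# Colour words by frequency using an RColorBrewer palette
wordcloud(docCorpus, min.freq = 1, random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))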
Step 5: Carry out case folding
# Wrap tolower() in content_transformer() so the result stays a valid tm corpus
corpus_clean <- tm_map(docCorpus, content_transformer(tolower))
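To confirm the case folding worked, print a document from the cleaned corpus:
# The first document should now be all lower case
as.character(corpus_clean[[1]])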
Step 6: Remove stopwords and punctuation
# Examine the default (English) stopwords
stopwords()
## [1] "i" "me" "my" "myself" "we"
## [6] "our" "ours" "ourselves" "you" "your"
## [11] "yours" "yourself" "yourselves" "he" "him"
## [16] "his" "himself" "she" "her" "hers"
## [21] "herself" "it" "its" "itself" "they"
## [26] "them" "their" "theirs" "themselves" "what"
## [31] "which" "who" "whom" "this" "that"
## [36] "these" "those" "am" "is" "are"
## [41] "was" "were" "be" "been" "being"
## [46] "have" "has" "had" "having" "do"
## [51] "does" "did" "doing" "would" "should"
## [56] "could" "ought" "i'm" "you're" "he's"
## [61] "she's" "it's" "we're" "they're" "i've"
## [66] "you've" "we've" "they've" "i'd" "you'd"
## [71] "he'd" "she'd" "we'd" "they'd" "i'll"
## [76] "you'll" "he'll" "she'll" "we'll" "they'll"
## [81] "isn't" "aren't" "wasn't" "weren't" "hasn't"
## [86] "haven't" "hadn't" "doesn't" "don't" "didn't"
## [91] "won't" "wouldn't" "shan't" "shouldn't" "can't"
## [96] "cannot" "couldn't" "mustn't" "let's" "that's"
## [101] "who's" "what's" "here's" "there's" "when's"
## [106] "where's" "why's" "how's" "a" "an"
## [111] "the" "and" "but" "if" "or"
## [116] "because" "as" "until" "while" "of"
## [121] "at" "by" "for" "with" "about"
## [126] "against" "between" "into" "through" "during"
## [131] "before" "after" "above" "below" "to"
## [136] "from" "up" "down" "in" "out"
## [141] "on" "off" "over" "under" "again"
## [146] "further" "then" "once" "here" "there"
## [151] "when" "where" "why" "how" "all"
## [156] "any" "both" "each" "few" "more"
## [161] "most" "other" "some" "such" "no"
## [166] "nor" "not" "only" "own" "same"
## [171] "so" "than" "too" "very"
corpus_clean <- tm_map(corpus_clean, removeWords, stopwords())
# Remove punctuation
corpus_clean <- tm_map(corpus_clean, removePunctuation)
Alternatively, define your own set of stopwords
mystopwords <- c("and", "for", "in", "is", "it", "not", "the", "to", "i")
corpus_clean <- tm_map(corpus_clean, removeWords, mystopwords)
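The two approaches can also be combined: removeWords simply takes a character vector, so you can append domain-specific terms to the default list. The extra word "course" below is only an illustration, and the line is left commented out so it does not change the results shown later:
# Remove the default stopwords plus our own additions in one pass
# corpus_clean <- tm_map(corpus_clean, removeWords, c(stopwords(), "course"))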
Step 7: Carry out stemming
corpus_clean <- tm_map(corpus_clean, stemDocument, language = "english")
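To see what the stemmer does to individual words, SnowballC's wordStem() can be called directly on a character vector; the words below are just sampled from our documents:
# Inspect the stems of a few words from the corpus
wordStem(c("mining", "studying", "interesting", "interested"), language = "english")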
Step 8: Carry out other cleaning operations (we don’t need this for our example)
# Remove numbers
corpus_clean <- tm_map(corpus_clean, removeNumbers)
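Another common clean-up step follows the same tm_map pattern: collapsing the runs of whitespace left behind by the earlier removals.
# Collapse multiple spaces into a single space
corpus_clean <- tm_map(corpus_clean, stripWhitespace)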
Step 9: Convert the corpus into a document-term matrix
# The default is a document-term matrix with term frequency counts
docMatrix <- DocumentTermMatrix(corpus_clean)
# View the matrix
inspect(docMatrix)
## <<DocumentTermMatrix (documents: 3, terms: 7)>>
## Non-/sparse entries: 11/10
## Sparsity : 48%
## Maximal term length: 8
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs cours data interest mine studi subfield text
## 1 1 1 0 1 0 0 0
## 2 0 1 0 3 1 1 2
## 3 0 0 2 1 0 0 1
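With the frequency-weighted matrix in hand, tm's findFreqTerms() gives a quick way to list the terms that occur at least a given number of times across the corpus:
# List terms appearing at least twice overall
findFreqTerms(docMatrix, lowfreq = 2)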
# Build a document-term matrix using binary weights
docMatrix <- DocumentTermMatrix(corpus_clean, control = list(weighting = weightBin))
# Build a document-term matrix using TF-IDF weights
docMatrix <- DocumentTermMatrix(corpus_clean, control = list(weighting = weightTfIdf))
inspect(docMatrix)
## <<DocumentTermMatrix (documents: 3, terms: 7)>>
## Non-/sparse entries: 8/13
## Sparsity : 62%
## Maximal term length: 8
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
## Sample :
## Terms
## Docs cours data interest mine studi subfield text
## 1 0.5283208 0.19498750 0.0000000 0 0.0000000 0.0000000 0.0000000
## 2 0.0000000 0.07312031 0.0000000 0 0.1981203 0.1981203 0.1462406
## 3 0.0000000 0.00000000 0.7924813 0 0.0000000 0.0000000 0.1462406
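Most downstream algorithms (clustering, classification, and so on) expect an ordinary matrix or data frame rather than tm's sparse representation, so a typical last step is a dense conversion:
# Convert the sparse document-term matrix to a dense matrix for modelling
dtm <- as.matrix(docMatrix)
dim(dtm) # 3 documents by 7 terms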