Step 1: Install the required packages
# Main Text Mining Package
#install.packages("tm")
# Package for creating word clouds (visualization tool)
#install.packages("wordcloud")
# Stemming Package
#install.packages("SnowballC")
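If you would rather not re-run install.packages() by hand each time, here is a minimal sketch (assuming the usual CRAN setup) that installs a package only when it is missing:
# Install each package only if it is not already available
for (pkg in c("tm", "wordcloud", "SnowballC")) {
  if (!requireNamespace(pkg, quietly = TRUE)) install.packages(pkg)
}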
Step 2: Load the packages
library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
library(SnowballC)
Step 3: Create Corpus
docs <- c("This is a data mining course.",
          "We are studying text mining. Text mining is a subfield of data mining.",
          "Mining text is interesting, and I am interested in it.")
docs
## [1] "This is a data mining course."
## [2] "We are studying text mining. Text mining is a subfield of data mining."
## [3] "Mining text is interesting, and I am interested in it."
docCorpus <- Corpus(VectorSource(docs))
inspect(docCorpus)
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 3
##
## [1] This is a data mining course.
## [2] We are studying text mining. Text mining is a subfield of data mining.
## [3] Mining text is interesting, and I am interested in it.
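Here VectorSource() builds the corpus from an in-memory character vector. For larger projects the documents usually live on disk; a sketch of that variant (the folder path below is hypothetical) reads every plain-text file in a directory instead:
# Build a corpus from all text files in a (hypothetical) folder
# fileCorpus <- Corpus(DirSource("path/to/texts", encoding = "UTF-8"))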
Step 4: Visualize Corpus (for fun!)
wordcloud(docCorpus, min.freq = 1, random.order = FALSE)
wordcloud(docCorpus, min.freq = 1, random.order = FALSE, colors = "orange")
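Because wordcloud pulls in RColorBrewer, you can also pass a whole palette rather than a single colour; "Dark2" below is just one of the built-in palette names:
# Colour words by frequency using an RColorBrewer palette
wordcloud(docCorpus, min.freq = 1, random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))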
Step 5: Carry out case folding
# Wrap tolower() in content_transformer() so the result stays a valid tm corpus
corpus_clean <- tm_map(docCorpus, content_transformer(tolower))
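To confirm the case folding worked, print a document from the cleaned corpus:
# The first document should now be all lower case
as.character(corpus_clean[[1]])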
Step 6: Remove stopwords and punctuation
# Examine the default (English) stopwords
stopwords()
## [1] "i" "me" "my" "myself" "we"
## [6] "our" "ours" "ourselves" "you" "your"
## [11] "yours" "yourself" "yourselves" "he" "him"
## [16] "his" "himself" "she" "her" "hers"
## [21] "herself" "it" "its" "itself" "they"
## [26] "them" "their" "theirs" "themselves" "what"
## [31] "which" "who" "whom" "this" "that"
## [36] "these" "those" "am" "is" "are"
## [41] "was" "were" "be" "been" "being"
## [46] "have" "has" "had" "having" "do"
## [51] "does" "did" "doing" "would" "should"
## [56] "could" "ought" "i'm" "you're" "he's"
## [61] "she's" "it's" "we're" "they're" "i've"
## [66] "you've" "we've" "they've" "i'd" "you'd"
## [71] "he'd" "she'd" "we'd" "they'd" "i'll"
## [76] "you'll" "he'll" "she'll" "we'll" "they'll"
## [81] "isn't" "aren't" "wasn't" "weren't" "hasn't"
## [86] "haven't" "hadn't" "doesn't" "don't" "didn't"
## [91] "won't" "wouldn't" "shan't" "shouldn't" "can't"
## [96] "cannot" "couldn't" "mustn't" "let's" "that's"
## [101] "who's" "what's" "here's" "there's" "when's"
## [106] "where's" "why's" "how's" "a" "an"
## [111] "the" "and" "but" "if" "or"
## [116] "because" "as" "until" "while" "of"
## [121] "at" "by" "for" "with" "about"
## [126] "against" "between" "into" "through" "during"
## [131] "before" "after" "above" "below" "to"
## [136] "from" "up" "down" "in" "out"
## [141] "on" "off" "over" "under" "again"
## [146] "further" "then" "once" "here" "there"
## [151] "when" "where" "why" "how" "all"
## [156] "any" "both" "each" "few" "more"
## [161] "most" "other" "some" "such" "no"
## [166] "nor" "not" "only" "own" "same"
## [171] "so" "than" "too" "very"
corpus_clean <- tm_map(corpus_clean, removeWords, stopwords())
# Remove punctuation
corpus_clean <- tm_map(corpus_clean, removePunctuation)
Alternatively, define your own set of stopwords
mystopwords <- c("and", "for", "in", "is", "it", "not", "the", "to", "i")
corpus_clean <- tm_map(corpus_clean, removeWords, mystopwords)
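The two approaches can also be combined: removeWords simply takes a character vector, so you can append domain-specific terms to the default list. The extra word "course" below is only an illustration, and the line is left commented out so it does not change the results shown later:
# Remove the default stopwords plus our own additions in one pass
# corpus_clean <- tm_map(corpus_clean, removeWords, c(stopwords(), "course"))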
Step 7: Carry out stemming
corpus_clean <- tm_map(corpus_clean, stemDocument, language = "english")
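To see what the stemmer does to individual words, SnowballC's wordStem() can be called directly on a character vector; the words below are just sampled from our documents:
# Inspect the stems of a few words from the corpus
wordStem(c("mining", "studying", "interesting", "interested"), language = "english")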
Step 8: Carry out other cleaning operations (we don’t need this for our example)
# Remove numbers
corpus_clean <- tm_map(corpus_clean, removeNumbers)
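Another common clean-up step follows the same tm_map pattern: collapsing the runs of whitespace left behind by the earlier removals.
# Collapse multiple spaces into a single space
corpus_clean <- tm_map(corpus_clean, stripWhitespace)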
Step 9: Convert the corpus into a document-term matrix
# The default is a document-term matrix with term frequency counts
docMatrix <- DocumentTermMatrix(corpus_clean)
# View the matrix
inspect(docMatrix)
## <<DocumentTermMatrix (documents: 3, terms: 7)>>
## Non-/sparse entries: 11/10
## Sparsity : 48%
## Maximal term length: 8
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs cours data interest mine studi subfield text
## 1 1 1 0 1 0 0 0
## 2 0 1 0 3 1 1 2
## 3 0 0 2 1 0 0 1
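With the frequency-weighted matrix in hand, tm's findFreqTerms() gives a quick way to list the terms that occur at least a given number of times across the corpus:
# List terms appearing at least twice overall
findFreqTerms(docMatrix, lowfreq = 2)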
# Build a document-term matrix using binary weights
docMatrix <- DocumentTermMatrix(corpus_clean, control = list(weighting = weightBin))
# Build a document-term matrix using TF-IDF weights
docMatrix <- DocumentTermMatrix(corpus_clean, control = list(weighting = weightTfIdf))
inspect(docMatrix)
## <<DocumentTermMatrix (documents: 3, terms: 7)>>
## Non-/sparse entries: 8/13
## Sparsity : 62%
## Maximal term length: 8
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
## Sample :
## Terms
## Docs cours data interest mine studi subfield text
## 1 0.5283208 0.19498750 0.0000000 0 0.0000000 0.0000000 0.0000000
## 2 0.0000000 0.07312031 0.0000000 0 0.1981203 0.1981203 0.1462406
## 3 0.0000000 0.00000000 0.7924813 0 0.0000000 0.0000000 0.1462406
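Most downstream algorithms (clustering, classification, and so on) expect an ordinary matrix or data frame rather than tm's sparse representation, so a typical last step is a dense conversion:
# Convert the sparse document-term matrix to a dense matrix for modelling
dtm <- as.matrix(docMatrix)
dim(dtm) # 3 documents by 7 terms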