A. Load Libraries

# Install
#install.packages(c("tm", "SnowballC", "wordcloud", "RColorBrewer"))

# Load Libraries
library("tm")
library("SnowballC")
library("wordcloud")
library("RColorBrewer")

Text Mining

First, we work ONLY with the text from the 3rd column of the package descriptions listed on the Bioconductor “All Packages” page.

B. Import Text/Corpus/Data

# Read the input file; the result has one character element per line
text <- readLines(con = "BioC_pack_col3.txt")

# Treat every element of the vector as one document and collect the
# documents into a corpus
line_source <- VectorSource(text)
docs <- Corpus(line_source)

# To look at the full content of the corpus, uncomment the line below.
#inspect(docs)

C. Text Preprocessing & Cleaning Text

Preprocessing is performed with the tm_map() function, which can be used, for example, to replace or remove special characters in the text.

# Clean the corpus step by step. The order of the steps matters:
#   * separator symbols are turned into spaces first, so the words on either
#     side of "/", "@" or "|" stay separate once punctuation is stripped;
#   * stop words are removed before punctuation, so contracted stop words
#     such as "don't" still match the entries of stopwords("english").
# The "## Warning" lines appear to be console output recorded from an earlier
# run; each one is kept next to the call it was reported for.

# Replace "/", "@" and "|" with a space
docs <- tm_map(docs, content_transformer(function(x) gsub("[/@|]", " ", x)))
# Convert all text to lower case
docs <- tm_map(docs, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(docs, content_transformer(tolower)):
## transformation drops documents
# Remove numbers
docs <- tm_map(docs, removeNumbers)
## Warning in tm_map.SimpleCorpus(docs, removeNumbers): transformation drops
## documents
# Remove common English stop words
docs <- tm_map(docs, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(docs, removeWords, stopwords("english")):
## transformation drops documents
# Remove your own stop words
# ADD stop words as a character vector
docs <- tm_map(docs, removeWords, c("big", "small"))
## Warning in tm_map.SimpleCorpus(docs, removeWords, c("big", "small")):
## transformation drops documents
# Remove punctuation (only after the stop words have been matched)
docs <- tm_map(docs, removePunctuation)
## Warning in tm_map.SimpleCorpus(docs, removePunctuation): transformation
## drops documents
# Text stemming
docs <- tm_map(docs, stemDocument)
## Warning in tm_map.SimpleCorpus(docs, stemDocument): transformation drops
## documents

Replacing “/”, “@” and “|” with space

It is even possible to be very specific regarding the symbols or words to remove.

# Build a corpus transformation that swaps every match of a regular
# expression for a single space
toSpace <- content_transformer(function(txt, regex) {
  gsub(regex, " ", txt)
})

# Slash, at-sign and pipe each become a space. The pipe is escaped because a
# bare "|" means alternation in a regular expression.
# NOTE(review): punctuation is stripped from docs earlier in the script, so by
# this point these patterns may have nothing left to match - confirm.
docs <- tm_map(docs, toSpace, "/")
## Warning in tm_map.SimpleCorpus(docs, toSpace, "/"): transformation drops
## documents
docs <- tm_map(docs, toSpace, "@")
## Warning in tm_map.SimpleCorpus(docs, toSpace, "@"): transformation drops
## documents
docs <- tm_map(docs, toSpace, "\\|")
## Warning in tm_map.SimpleCorpus(docs, toSpace, "\\|"): transformation drops
## documents
# Collapse the runs of blanks left behind by the removals above
docs <- tm_map(docs, stripWhitespace)
## Warning in tm_map.SimpleCorpus(docs, stripWhitespace): transformation drops
## documents

D. Build a Term-document Matrix

The term-document matrix is a table containing the frequency of the words. Row names are words (terms) and column names are documents. The function TermDocumentMatrix() from the text mining package (tm) can be used as follows:

# Count the occurrences of every term in every document
doc_mat <- TermDocumentMatrix(x = docs)

# Dense copy of the counts; each row belongs to one term
m <- as.matrix(doc_mat)

# Total count per term over all documents, largest total first
term_totals <- rowSums(m)
v <- sort(term_totals, decreasing = TRUE)

# Frequency table: the term itself and its total count
d_BioC <- data.frame(word = names(v), freq = v)

# Show the five most frequent terms
head(d_BioC, n = 5)
##            word freq
## data       data  486
## analysi analysi  363
## gene       gene  189
## express express  153
## packag   packag  143

E. Generate First Word Cloud from Bioconductor package descriptions

The importance of words can be illustrated as a word cloud where font size of the word shows its importance.

# wordcloud() positions the words using random numbers; fixing the seed makes
# the figure come out the same on every run.
set.seed(1234)

wordcloud(words = d_BioC$word,
          freq = d_BioC$freq,
          min.freq = 1,           # words rarer than this are not drawn
          max.words = 100,        # upper limit on the number of words drawn
          random.order = FALSE,   # draw words in decreasing frequency
          rot.per = 0.0,          # proportion of words with 90 degree rotation
          colors = brewer.pal(4, "Set1"))

Going Further

Explore frequent terms and their associations

One can have a look at the frequent terms in the term-document matrix as follows. In the example below we want to find the words which occur at least 13 times (frequency >= 13, set with lowfreq = 13).

# Terms whose total count in doc_mat is at least lowfreq (13 here).
# NOTE(review): the name says "Rcran", but doc_mat is built from the
# Bioconductor text; the name is kept so code that relies on it keeps working.
Rcran_list <- findFreqTerms(doc_mat, lowfreq = 13)
# Print the stored result instead of running the same search a second time
Rcran_list
##   [1] "affymetrix"     "analysi"        "array"          "autom"         
##   [5] "packag"         "base"           "classif"        "enrich"        
##   [9] "express"        "gene"           "region"         "data"          
##  [13] "genom"          "microarray"     "statist"        "differ"        
##  [17] "method"         "model"          "rnaseq"         "detect"        
##  [21] "differenti"     "class"          "function"       "algorithm"     
##  [25] "comput"         "experi"         "object"         "use"           
##  [29] "multipl"        "test"           "cluster"        "file"          
##  [33] "assess"         "structur"       "analys"         "qualiti"       
##  [37] "tool"           "generat"        "analyz"         "process"       
##  [41] "cancer"         "sampl"          "variat"         "explor"        
##  [45] "map"            "approach"       "coexpress"      "network"       
##  [49] "interact"       "plot"           "integr"         "mirna"         
##  [53] "copi"           "number"         "annot"          "biolog"        
##  [57] "visualis"       "databas"        "interfac"       "bioconductor"  
##  [61] "access"         "perform"        "estim"          "protein"       
##  [65] "normal"         "visual"         "high"           "pipelin"       
##  [69] "sequenc"        "set"            "illumina"       "methyl"        
##  [73] "predict"        "site"           "associ"         "studi"         
##  [77] "identif"        "splice"         "select"         "signatur"      
##  [81] "control"        "cell"           "identifi"       "singlecel"     
##  [85] "bayesian"       "bind"           "factor"         "transcript"    
##  [89] "distribut"      "read"           "call"           "effect"        
##  [93] "chipseq"        "dna"            "highthroughput" "singl"         
##  [97] "graph"          "similar"        "profil"         "omic"          
## [101] "manipul"        "target"         "util"           "infer"         
## [105] "regulatori"     "metaanalysi"    "mass"           "chromosom"     
## [109] "cytometri"      "featur"         "correl"         "genet"         
## [113] "inform"         "dataset"        "proteom"        "pathway"       
## [117] "variant"        "motif"          "transcriptom"   "design"        
## [121] "phenotyp"       "metabolom"      "flow"           "align"
# Terms whose correlation with "model" is at least 0.05
findAssocs(x = doc_mat, terms = "model", corlimit = 0.05)
## $model
##                ise             hidden             mixtur 
##               0.33               0.32               0.29 
##           bayesian            describ            livecel 
##               0.25               0.23               0.23 
##             linear              equat           hierarch 
##               0.21               0.20               0.20 
##              dynam dirichletmultinomi             markov 
##               0.19               0.16               0.16 
##                bma                fit            regress 
##               0.16               0.15               0.13 
##          genomewid             surviv               rnai 
##               0.13               0.13               0.13 
##             spline             averag            absolut 
##               0.13               0.12               0.11 
##             absseq           subtract          implement 
##               0.11               0.11               0.11 
##          singlecel         constraint               nois 
##               0.11               0.11               0.11 
##          higherord              mskcc             growth 
##               0.11               0.11               0.11 
##            boolean              assay              grade 
##               0.11               0.11               0.11 
##              tuqtl               gaga            catalog 
##               0.11               0.11               0.11 
##              error             impuls             laplac 
##               0.11               0.11               0.11 
##            linnorm            dropout    michaelismenten 
##               0.11               0.11               0.11 
##           piecewis              medip            lognorm 
##               0.11               0.11               0.11 
##             bifurc        determinist     networkregular 
##               0.11               0.11               0.11 
##      regressionbas                law              plgem 
##               0.11               0.11               0.11 
##      likelihoodbas            baselin                cox 
##               0.11               0.11               0.11 
##             hazard              decay         likelihood 
##               0.11               0.11               0.11 
##          mutagenet           rtreemix             latent 
##               0.11               0.11               0.11 
##             follow        mixedeffect              imbal 
##               0.11               0.11               0.11 
##             vegamc         crossstudi                xde 
##               0.11               0.11               0.11 
##           zeroinfl        reconstruct              trait 
##               0.11               0.10               0.09 
##             binomi              negat             smooth 
##               0.09               0.09               0.09 
##                new         differenti           isotherm 
##               0.08               0.08               0.08 
##           langmuir             degrad         membership 
##               0.08               0.08               0.08 
##           coeffici           chipchip             calibr 
##               0.08               0.08               0.08 
##           progress          pairedend           knowledg 
##               0.08               0.08               0.08 
##              logic              prior              train 
##               0.08               0.08               0.08 
##               time          polymorph                cpg 
##               0.08               0.08               0.08 
##               seri           phenotyp            dispers 
##               0.08               0.08               0.08 
##           shrinkag               iter                run 
##               0.08               0.08               0.08 
##            emblebi           epistasi            toolset 
##               0.08               0.08               0.08 
##            truncat         waveletbas               nest 
##               0.08               0.08               0.08 
##             propag          prioritis          shortread 
##               0.08               0.08               0.08 
##                mix           mnaseseq             rnaseq 
##               0.08               0.08               0.07 
##               imag         probelevel             driver 
##               0.07               0.06               0.06 
##            metabol            variabl            factori 
##               0.06               0.06               0.06 
##         experiment         networkbas           gaussian 
##               0.06               0.06               0.06 
##               eqtl          timecours         decomposit 
##               0.06               0.06               0.06 
##                use 
##               0.05
# First ten rows of the frequency table
head(d_BioC, n = 10)
##                  word freq
## data             data  486
## analysi       analysi  363
## gene             gene  189
## express       express  153
## packag         packag  143
## sequenc       sequenc  101
## differenti differenti   94
## use               use   90
## tool             tool   86
## genom           genom   82

Plot word frequencies of Bioconductor descriptions

The frequencies of the 25 most frequent words are plotted.

# Bar chart of the first 25 rows of the frequency table. head() is used
# instead of d_BioC[1:25, ] so that a table with fewer than 25 rows does not
# contribute NA bars.
top_words <- head(d_BioC, 25)

barplot(top_words$freq,
        las = 2, # draw the axis labels perpendicular to the axis
        names.arg = top_words$word,
        col = "lightyellow",
        # The counts come from the Bioconductor text, so the title says so
        main = "Most Frequent Words From Bioconductor Packages",
        ylab = "Word Count")