# Install
#install.packages(c("tm", "SnowballC", "wordcloud", "RColorBrewer"))
# Load Libraries
library("tm")
library("SnowballC")
library("wordcloud")
library("RColorBrewer")
First, work with the text from ONLY the 3rd column of the ‘Bioconductor’ text found in the Bioconductor All Packages descriptions.
# Read the description text: one character string per line of the file
text <- readLines(con = "BioC_pack_col3.txt")
# Wrap the character vector in a tm corpus
docs <- Corpus(x = VectorSource(x = text))
# To inspect all the words, uncomment the line below.
#inspect(docs)
Preprocessing is performed using the tm_map() function, for example to replace or remove special characters from the text.
# Clean the corpus step by step with tm_map(). Every call overwrites `docs`
# with the transformed corpus, so the order of the steps matters.
# The `## Warning` lines are the console output recorded for each call.
# NOTE(review): the warnings are raised by tm_map.SimpleCorpus; presumably
# building the corpus with VCorpus() would avoid them -- TODO confirm.
# Convert all text to lower case
docs <- tm_map(docs, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(docs, content_transformer(tolower)):
## transformation drops documents
# Remove punctuation
docs <- tm_map(docs, removePunctuation)
## Warning in tm_map.SimpleCorpus(docs, removePunctuation): transformation
## drops documents
# Remove numbers
docs <- tm_map(docs, removeNumbers)
## Warning in tm_map.SimpleCorpus(docs, removeNumbers): transformation drops
## documents
# Remove common English stopwords
docs <- tm_map(docs, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(docs, removeWords, stopwords("english")):
## transformation drops documents
# Remove your own stop words:
# ADD them to the character vector below
docs <- tm_map(docs, removeWords, c("big", "small"))
## Warning in tm_map.SimpleCorpus(docs, removeWords, c("big", "small")):
## transformation drops documents
# Text stemming: words are reduced to their stems, which is why the results
# further down show terms such as "analysi" and "packag"
docs <- tm_map(docs, stemDocument)
## Warning in tm_map.SimpleCorpus(docs, stemDocument): transformation drops
## documents
It is even possible to be very specific regarding the symbols or words to remove.
# Build a transformer that replaces every match of `pattern` with a space
toSpace <- content_transformer(function (x , pattern ) gsub(pattern, " ", x))
# NOTE(review): these replacements run after removePunctuation (applied
# earlier in this script), which presumably has already deleted "/", "@"
# and "|", so they likely match nothing here. Consider running them before
# removePunctuation so that joined words are split instead of fused.
# TODO confirm.
# Replace "/" with a space
docs <- tm_map(docs, toSpace, "/")
## Warning in tm_map.SimpleCorpus(docs, toSpace, "/"): transformation drops
## documents
# Replace "@" with a space
docs <- tm_map(docs, toSpace, "@")
## Warning in tm_map.SimpleCorpus(docs, toSpace, "@"): transformation drops
## documents
# Replace "|" with a space; the backslashes escape the regex alternation
# operator so that a literal "|" is matched
docs <- tm_map(docs, toSpace, "\\|")
## Warning in tm_map.SimpleCorpus(docs, toSpace, "\\|"): transformation drops
## documents
# Eliminate extra white spaces
docs <- tm_map(docs, stripWhitespace)
## Warning in tm_map.SimpleCorpus(docs, stripWhitespace): transformation drops
## documents
The term-document matrix is a table containing the frequency of the words. Row names are words (terms) and column names are documents. The function TermDocumentMatrix() from the tm text mining package can be used as follows:
# Build the term-document matrix from the cleaned corpus
doc_mat <- TermDocumentMatrix(x = docs)
# Convert to an ordinary matrix so that rowSums() can total each term
m <- as.matrix(x = doc_mat)
# Total count per term, most frequent first
v <- sort(x = rowSums(x = m), decreasing = TRUE)
# Summary table: the term and its total count
d_BioC <- data.frame(
  word = names(v),
  freq = v
)
# Show the five most frequent terms
head(d_BioC, n = 5)
## word freq
## data data 486
## analysi analysi 363
## gene gene 189
## express express 153
## packag packag 143
The importance of words can be illustrated as a word cloud, where the font size of each word shows its importance.
# Draw the word cloud from the frequency table: at most 100 words, most
# frequent words placed first, and no rotated words.
wordcloud(
  words = d_BioC[["word"]],
  freq = d_BioC[["freq"]],
  min.freq = 1,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0, # proportion of words with 90 degree rotation
  colors = brewer.pal(n = 4, name = "Set1")
)
Explore frequent terms and their associations
One can have a look at the frequent terms in the term-document matrix as follows. In the example below we want to find the words which occur at least 13 times (lowfreq = 13).
# Terms that occur at least 13 times in the term-document matrix.
# NOTE(review): the name says "Rcran" but `doc_mat` is built from the
# Bioconductor text; the name is kept in case it is used elsewhere.
Rcran_list <- findFreqTerms(x = doc_mat, lowfreq = 13)
# Print the stored result instead of computing it a second time
Rcran_list
## [1] "affymetrix" "analysi" "array" "autom"
## [5] "packag" "base" "classif" "enrich"
## [9] "express" "gene" "region" "data"
## [13] "genom" "microarray" "statist" "differ"
## [17] "method" "model" "rnaseq" "detect"
## [21] "differenti" "class" "function" "algorithm"
## [25] "comput" "experi" "object" "use"
## [29] "multipl" "test" "cluster" "file"
## [33] "assess" "structur" "analys" "qualiti"
## [37] "tool" "generat" "analyz" "process"
## [41] "cancer" "sampl" "variat" "explor"
## [45] "map" "approach" "coexpress" "network"
## [49] "interact" "plot" "integr" "mirna"
## [53] "copi" "number" "annot" "biolog"
## [57] "visualis" "databas" "interfac" "bioconductor"
## [61] "access" "perform" "estim" "protein"
## [65] "normal" "visual" "high" "pipelin"
## [69] "sequenc" "set" "illumina" "methyl"
## [73] "predict" "site" "associ" "studi"
## [77] "identif" "splice" "select" "signatur"
## [81] "control" "cell" "identifi" "singlecel"
## [85] "bayesian" "bind" "factor" "transcript"
## [89] "distribut" "read" "call" "effect"
## [93] "chipseq" "dna" "highthroughput" "singl"
## [97] "graph" "similar" "profil" "omic"
## [101] "manipul" "target" "util" "infer"
## [105] "regulatori" "metaanalysi" "mass" "chromosom"
## [109] "cytometri" "featur" "correl" "genet"
## [113] "inform" "dataset" "proteom" "pathway"
## [117] "variant" "motif" "transcriptom" "design"
## [121] "phenotyp" "metabolom" "flow" "align"
# Terms whose correlation with "model" is at least 0.05
findAssocs(x = doc_mat, terms = "model", corlimit = 0.05)
## $model
## ise hidden mixtur
## 0.33 0.32 0.29
## bayesian describ livecel
## 0.25 0.23 0.23
## linear equat hierarch
## 0.21 0.20 0.20
## dynam dirichletmultinomi markov
## 0.19 0.16 0.16
## bma fit regress
## 0.16 0.15 0.13
## genomewid surviv rnai
## 0.13 0.13 0.13
## spline averag absolut
## 0.13 0.12 0.11
## absseq subtract implement
## 0.11 0.11 0.11
## singlecel constraint nois
## 0.11 0.11 0.11
## higherord mskcc growth
## 0.11 0.11 0.11
## boolean assay grade
## 0.11 0.11 0.11
## tuqtl gaga catalog
## 0.11 0.11 0.11
## error impuls laplac
## 0.11 0.11 0.11
## linnorm dropout michaelismenten
## 0.11 0.11 0.11
## piecewis medip lognorm
## 0.11 0.11 0.11
## bifurc determinist networkregular
## 0.11 0.11 0.11
## regressionbas law plgem
## 0.11 0.11 0.11
## likelihoodbas baselin cox
## 0.11 0.11 0.11
## hazard decay likelihood
## 0.11 0.11 0.11
## mutagenet rtreemix latent
## 0.11 0.11 0.11
## follow mixedeffect imbal
## 0.11 0.11 0.11
## vegamc crossstudi xde
## 0.11 0.11 0.11
## zeroinfl reconstruct trait
## 0.11 0.10 0.09
## binomi negat smooth
## 0.09 0.09 0.09
## new differenti isotherm
## 0.08 0.08 0.08
## langmuir degrad membership
## 0.08 0.08 0.08
## coeffici chipchip calibr
## 0.08 0.08 0.08
## progress pairedend knowledg
## 0.08 0.08 0.08
## logic prior train
## 0.08 0.08 0.08
## time polymorph cpg
## 0.08 0.08 0.08
## seri phenotyp dispers
## 0.08 0.08 0.08
## shrinkag iter run
## 0.08 0.08 0.08
## emblebi epistasi toolset
## 0.08 0.08 0.08
## truncat waveletbas nest
## 0.08 0.08 0.08
## propag prioritis shortread
## 0.08 0.08 0.08
## mix mnaseseq rnaseq
## 0.08 0.08 0.07
## imag probelevel driver
## 0.07 0.06 0.06
## metabol variabl factori
## 0.06 0.06 0.06
## experiment networkbas gaussian
## 0.06 0.06 0.06
## eqtl timecours decomposit
## 0.06 0.06 0.06
## use
## 0.05
# Show the ten most frequent terms
head(d_BioC, n = 10)
## word freq
## data data 486
## analysi analysi 363
## gene gene 189
## express express 153
## packag packag 143
## sequenc sequenc 101
## differenti differenti 94
## use use 90
## tool tool 86
## genom genom 82
The frequencies of the 25 most frequent words are plotted.
# Bar chart of the most frequent words in the Bioconductor descriptions.
# `d_BioC` is already sorted by decreasing frequency, so its first rows are
# the most frequent words. head() is used instead of `[1:25, ]` so that a
# table with fewer than 25 words does not produce NA rows.
top_words_BioC <- head(d_BioC, n = 25)
barplot(
  top_words_BioC$freq,
  las = 2, # axis labels perpendicular to the axis, so words are readable
  names.arg = top_words_BioC$word,
  col = "lightyellow",
  # The data plotted is the Bioconductor table, so the title says so
  main = "Most Frequent Words From Bioconductor Packages",
  ylab = "Word Count"
)