This script takes the abstracts of ten earache articles from NCBI’s PubMed, then extracts their words for text mining and to build a word cloud. The CSV file used is ‘NCBI-EarAche-PubMed.csv’, available on GitHub.

This first part extracts the abstracts from the first column and writes each one to a separate text file in a folder that the text mining package (tm) uses to build the document term matrix. The extracted words are stemmed, which reduces each word to its root without using the word's meaning; this gives a quicker analysis than lemmatization, which is used in the second part. Search for ‘%^&’ in this document to jump to the second part with lemmatization.

library(tm)
## Loading required package: NLP
library(SnowballC)
library(wordcloud)
## Loading required package: RColorBrewer
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(textstem)
## Loading required package: koRpus.lang.en
## Loading required package: koRpus
## Loading required package: sylly
## For information on available language packages for 'koRpus', run
## 
##   available.koRpus.lang()
## 
## and see ?install.koRpus.lang()
library(DT)
# Read the PubMed export without a header row; empty or single-space cells are
# treated as NA. The two columns are then named 'abstract' and 'source'.
Auto <- read.csv("NCBI-EarAche-PubMed.csv", sep = ",",
                 header = FALSE, na.strings = c("", " "))
colnames(Auto) <- c("abstract", "source")

# Keep only the rows whose abstract is not missing.
auto <- Auto[complete.cases(Auto$abstract), ]

# Folder that will hold one text file per abstract. showWarnings = FALSE keeps
# a re-run quiet when the folder already exists.
dir.create("./Earache", showWarnings = FALSE)

ea <- as.character(auto$abstract)

# Write each abstract to Earache/EA.<j>.txt. The path is built with file.path()
# so the working directory is never changed, and seq_along() writes nothing
# when there are no abstracts (1:length(ea) would iterate over 1 and 0).
for (j in seq_along(ea)) {
  write(ea[j], file = file.path("Earache", paste0("EA.", j, ".txt")))
}

This section creates the corpus of documents for tm to text mine. It then removes punctuation and numbers, converts the words to lower case, and removes English stop words such as ‘a’ or ‘the’. Whitespace is also stripped, and the words are stemmed so that the root/stem words are counted in the document term matrix.

# Build the corpus from every file in the 'Earache' folder.
Earache <- Corpus(DirSource("Earache"))


Earache
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 10
# Clean the text: drop punctuation and digits, lower-case, remove English stop
# words, collapse whitespace, then stem each word.
Earache <- tm_map(Earache, removePunctuation)
Earache <- tm_map(Earache, removeNumbers)
# tolower() is a base R function rather than a tm transformation; wrapping it
# in content_transformer() keeps the result a valid corpus even if Corpus()
# returns a VCorpus instead of a SimpleCorpus.
Earache <- tm_map(Earache, content_transformer(tolower))
Earache <- tm_map(Earache, removeWords, stopwords("english"))
Earache <- tm_map(Earache, stripWhitespace)
Earache <- tm_map(Earache, stemDocument)

# Documents in rows, stemmed terms in columns, term counts as values.
dtmEarache <- DocumentTermMatrix(Earache)
dtmEarache
## <<DocumentTermMatrix (documents: 10, terms: 417)>>
## Non-/sparse entries: 652/3518
## Sparsity           : 84%
## Maximal term length: 17
## Weighting          : term frequency (tf)
# Total count of each term across all documents (column sums of the dense
# document term matrix).
freq <- colSums(as.matrix(dtmEarache))

FREQ <- data.frame(freq)

# Interactive table of the term counts with per-column filters at the top and
# column-visibility / CSV export buttons.
freq2 <- datatable(
  data = FREQ,
  rownames = TRUE,
  filter = list(position = "top"),
  extensions = c("Buttons", "Responsive"),
  options = list(
    dom = "Bfrtip",
    buttons = c("colvis", "csv"),
    language = list(sSearch = "Filter:")
  )
)
freq2
# Term positions ordered from most to least frequent.
ord <- order(freq, decreasing = TRUE)

# The 25 most frequent terms with their counts.
freq[head(ord, n = 25)]
##   patient    earach   otalgia       ear      pain   symptom      caus    examin 
##        32        26        26        22        15        15        14        13 
##     refer agreement     evalu treatment    common       ent       can   histori 
##        13        11         8         8         7         7         7         7 
##     medic  antibiot      mani    normal  diagnosi    diseas   present     studi 
##         7         7         6         6         6         6         6         6 
##   primari 
##         6
# Terms whose correlation with "patient" is at least 0.5.
findAssocs(dtmEarache, terms = "patient", corlimit = 0.5)
## $patient
##  particip       set     visit      four       two      care     advis  antibiot 
##      0.79      0.79      0.74      0.74      0.74      0.71      0.71      0.71 
##    appear  appropri     centr  consecut    design      educ     given     hundr 
##      0.71      0.71      0.71      0.71      0.71      0.71      0.71      0.71 
##      most necessari    object     other    outcom physician  prescrib prescript 
##      0.71      0.71      0.71      0.71      0.71      0.71      0.71      0.71 
##    relief    remain    return     rural selflimit      seri  strategi   subsequ 
##      0.71      0.71      0.71      0.71      0.71      0.71      0.71      0.71 
##      thus  unilater      wait    walkin     watch   whether     first    measur 
##      0.71      0.71      0.71      0.71      0.71      0.71      0.70      0.66 
##    famili    clinic     peopl   conclus    earach     clear    condit    receiv 
##      0.65      0.63      0.63      0.54      0.53      0.51      0.51      0.51
# Terms whose correlation with "ear" is at least 0.5.
findAssocs(dtmEarache, terms = "ear", corlimit = 0.5)
## $ear
##              pain              risk             joint temporomandibular 
##              0.91              0.88              0.87              0.87 
##            dental             older             studi             sourc 
##              0.67              0.66              0.66              0.66 
##           analysi            associ          chisquar    conclusionthes 
##              0.62              0.62              0.62              0.62 
##           conduct         crosssect            dearth             elder 
##              0.62              0.62              0.62              0.62 
##            episod              full             futur              hear 
##              0.62              0.62              0.62              0.62 
##         highlight          independ          individu    introductionan 
##              0.62              0.62              0.62              0.62 
##              live              loss          methodsa               mix 
##              0.62              0.62              0.62              0.62 
##            modifi              none       objectiveto               odd 
##              0.62              0.62              0.62              0.62 
##              part            preval             ratio            regard 
##              0.62              0.62              0.62              0.62 
##             relat         resultsof             short          signific 
##              0.62              0.62              0.62              0.62 
##           statist           subject              test              tmjd 
##              0.62              0.62              0.62              0.62 
##             tmjds            verifi              well              imag 
##              0.62              0.62              0.62              0.62 
##            abnorm           alcohol             appar          arthriti 
##              0.62              0.62              0.62              0.62 
##           consult          determin            diabet        erythrocyt 
##              0.62              0.62              0.62              0.62 
##           externa          fiberopt           increas            magnet 
##              0.62              0.62              0.62              0.62 
##             media  nasolaryngoscopi            occult            option 
##              0.62              0.62              0.62              0.62 
##             otiti  otolaryngologist           pharyng           potenti 
##              0.62              0.62              0.62              0.62 
##              rate             reson         secondari          sediment 
##              0.62              0.62              0.62              0.62 
##           serious         suspicion         symptomat           syndrom 
##              0.62              0.62              0.62              0.62 
##             trial             typic             whose           without 
##              0.62              0.62              0.62              0.62 
##           histori             evalu       epidemiolog           general 
##              0.60              0.53              0.53              0.53 
##          identifi              long           process         difficult 
##              0.53              0.53              0.53              0.53 
##             drink             smoke           complex            consid 
##              0.53              0.53              0.53              0.53 
##            factor            innerv 
##              0.52              0.52
# Terms whose correlation with "pain" is at least 0.5.
findAssocs(dtmEarache, terms = "pain", corlimit = 0.5)
## $pain
##               ear             sourc            abnorm           alcohol 
##              0.91              0.87              0.83              0.83 
##             appar          arthriti           consult          determin 
##              0.83              0.83              0.83              0.83 
##            diabet        erythrocyt           externa          fiberopt 
##              0.83              0.83              0.83              0.83 
##           increas            magnet             media  nasolaryngoscopi 
##              0.83              0.83              0.83              0.83 
##            occult            option             otiti  otolaryngologist 
##              0.83              0.83              0.83              0.83 
##           pharyng           potenti              rate             reson 
##              0.83              0.83              0.83              0.83 
##         secondari          sediment           serious         suspicion 
##              0.83              0.83              0.83              0.83 
##         symptomat           syndrom             trial             typic 
##              0.83              0.83              0.83              0.83 
##             whose           without              imag             joint 
##              0.83              0.83              0.82              0.79 
## temporomandibular              caus            examin            physic 
##              0.79              0.76              0.76              0.74 
##              mani         difficult             drink             smoke 
##              0.69              0.69              0.69              0.69 
##           complex            consid            dental           histori 
##              0.69              0.69              0.68              0.67 
##              risk            innerv             usual           primari 
##              0.67              0.67              0.65              0.61 
##            common             evalu             often             older 
##              0.60              0.57              0.55              0.54
# Word/count table used for plotting.
wf <- data.frame(word = names(freq), freq = freq)

# Bar chart of the terms that occur more than four times, with the x-axis
# labels rotated so they stay readable.
p <- ggplot(subset(wf, freq > 4), aes(word, freq)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
p

# Word cloud of the terms that occur at least four times.
wordcloud(
  words = names(freq), freq = freq,
  min.freq = 4, colors = brewer.pal(3, "Dark2")
)

# Word cloud limited to the 40 most frequent terms.
wordcloud(
  words = names(freq), freq = freq,
  max.words = 40, colors = brewer.pal(6, "Dark2")
)

%^&

This is the second part of the text mining of the earache PubMed articles. It analyzes the same documents but lemmatizes them, reducing each word to its root by meaning, before building the document term matrix of word count frequencies. It uses the data read in earlier, stored in the data frame named auto.

# Lemmatize every abstract using the hash_lemmas dictionary from the lexicon
# package.
lemma <- lemmatize_strings(
  auto$abstract,
  dictionary = lexicon::hash_lemmas
)

# Put the lemmatized text next to the original abstract and its source, then
# save the combined table without row names.
Lemma <- cbind(as.data.frame(lemma), auto)
colnames(Lemma) <- c("lemmatizedAbstract", "abstract", "source")

write.csv(Lemma, file = "LemmatizedEarAche.csv", row.names = FALSE)

The following creates a separate folder of lemmatized corpus documents on earaches from the lemmatized data frame named Lemma.

# Folder that will hold one text file per lemmatized abstract.
# showWarnings = FALSE keeps a re-run quiet when the folder already exists.
dir.create("./EarAche-Lemma", showWarnings = FALSE)

ea <- as.character(Lemma$lemmatizedAbstract)

# Write each lemmatized abstract to EarAche-Lemma/EAL.<j>.txt. The path is
# built with file.path() so the working directory is never changed, and
# seq_along() writes nothing when there are no abstracts (1:length(ea) would
# iterate over 1 and 0).
for (j in seq_along(ea)) {
  write(ea[j], file = file.path("EarAche-Lemma", paste0("EAL.", j, ".txt")))
}
# Build the corpus from every file in the 'EarAche-Lemma' folder.
Earache <- Corpus(DirSource("EarAche-Lemma"))

Earache
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 10
# Clean the text: drop punctuation and digits, lower-case, remove English stop
# words and collapse whitespace. No stemming step is applied here.
Earache <- tm_map(Earache, removePunctuation)
Earache <- tm_map(Earache, removeNumbers)
# tolower() is a base R function rather than a tm transformation; wrapping it
# in content_transformer() keeps the result a valid corpus even if Corpus()
# returns a VCorpus instead of a SimpleCorpus.
Earache <- tm_map(Earache, content_transformer(tolower))
Earache <- tm_map(Earache, removeWords, stopwords("english"))
Earache <- tm_map(Earache, stripWhitespace)

# Documents in rows, terms in columns, term counts as values.
dtmEarache <- DocumentTermMatrix(Earache)
dtmEarache
## <<DocumentTermMatrix (documents: 10, terms: 432)>>
## Non-/sparse entries: 669/3651
## Sparsity           : 85%
## Maximal term length: 17
## Weighting          : term frequency (tf)
# Total count of each term across all documents (column sums of the dense
# document term matrix). Computed once; the original repeated these two
# assignments verbatim.
freq <- colSums(as.matrix(dtmEarache))

FREQ <- data.frame(freq)

# Interactive table of the term counts with per-column filters at the top and
# column-visibility / CSV export buttons.
freq2 <- datatable(data=FREQ,  rownames=TRUE,
                      filter=list(position='top'),
                      options=list(
                        dom='Bfrtip',
                        buttons=c('colvis','csv'),
                        language=list(sSearch='Filter:')),
                      extensions=c('Buttons','Responsive')
)
freq2
# Term positions ordered from most to least frequent.
ord <- order(freq, decreasing=TRUE)

# The 25 most frequent terms with their counts.
freq[head(ord, 25)]
##     patient     earache     otalgia         ear     symptom        pain 
##          31          26          26          22          17          15 
##       cause examination       refer   agreement   treatment      common 
##          14          12          12          11           8           7 
##         ent         can     history     medical  antibiotic      report 
##           7           7           7           7           7           7 
##  evaluation        many        much      normal   diagnosis     disease 
##           6           6           6           6           6           6 
##     present 
##           6
# Terms associated with `term` at or above `corlimit`, as a one-column data
# frame (column named after the term, associated terms as row names).
assoc_frame <- function(term, corlimit) {
  as.data.frame(findAssocs(dtmEarache, term, corlimit = corlimit))
}

patient <- assoc_frame("patient", 0.6)
earache <- assoc_frame("earache", 0.55)
treatment <- assoc_frame("treatment", 0.55)
# Word/count table used for plotting.
wf <- data.frame(word = names(freq), freq = freq)

# Bar chart of the terms that occur more than four times, with the x-axis
# labels rotated so they stay readable.
p <- ggplot(subset(wf, freq > 4), aes(word, freq)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
p

# Word cloud of the terms that occur at least five times.
wordcloud(
  words = names(freq), freq = freq,
  min.freq = 5, colors = brewer.pal(3, "Dark2")
)

# Word cloud limited to the 40 most frequent terms.
wordcloud(
  words = names(freq), freq = freq,
  max.words = 40, colors = brewer.pal(6, "Dark2")
)