This script takes the abstracts of ten earache articles from NCBI’s PubMed, then extracts their words for text mining and to build a word cloud. The CSV file used is ‘NCBI-EarAche-PubMed.csv’, available on GitHub.
This first part extracts the abstracts from the first column and writes each one to a separate text file in a folder that the text mining package (tm) uses to build the document-term matrix. The extracted words are stemmed: stemming reduces each word to its root without considering the meaning of the word, which makes for a quicker analysis than lemmatization. Lemmatization is covered in the second part; search for ‘%^&’ in this document to jump to it.
library(tm)
## Loading required package: NLP
library(SnowballC)
library(wordcloud)
## Loading required package: RColorBrewer
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(textstem)
## Loading required package: koRpus.lang.en
## Loading required package: koRpus
## Loading required package: sylly
## For information on available language packages for 'koRpus', run
##
## available.koRpus.lang()
##
## and see ?install.koRpus.lang()
library(DT)
# Read the PubMed export. The file has no header row, and empty or
# single-space cells are treated as missing values.
Auto <- read.csv("NCBI-EarAche-PubMed.csv", sep = ",",
                 header = FALSE, na.strings = c("", " "))
colnames(Auto) <- c("abstract", "source")

# Keep only the rows that actually have an abstract.
auto <- Auto[complete.cases(Auto$abstract), ]
ea <- as.character(auto$abstract)

# Create the output folder only when it is missing, so re-running the script
# does not raise an "already exists" warning.
if (!dir.exists("Earache")) {
  dir.create("./Earache")
}

# Write each abstract to its own file, Earache/EA.<j>.txt. Paths are built
# with file.path() rather than by switching the working directory, so an
# error part-way through cannot leave the session in the wrong folder.
# seq_along() skips the loop entirely when there are no abstracts, whereas
# 1:length(ea) would iterate over c(1, 0).
for (j in seq_along(ea)) {
  write(ea[j], file.path("Earache", paste0("EA.", j, ".txt")))
}
This section builds the corpus of documents for tm to mine. It then removes punctuation and numbers, converts the words to lower case, and removes English stop words such as ‘a’ and ‘the’. Whitespace is also stripped, and the words are reduced to their root/stem forms so that they can be counted in the document-term matrix.
# Load every file in the "Earache" folder into one tm corpus and print its
# summary.
Earache <- Corpus(x = DirSource(directory = "Earache"))
print(Earache)
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 10
# Clean the corpus: drop punctuation and digits, lower-case the text, remove
# English stop words, collapse repeated whitespace, then stem each word.
# NOTE(review): removePunctuation() deletes characters without inserting a
# space, which is presumably why fused terms such as "conclusionthes" appear
# in the association results - confirm against the raw abstracts.
Earache <- tm_map(Earache, removePunctuation)
Earache <- tm_map(Earache, removeNumbers)
# tolower() is a base function, not a tm transformation. Wrapping it in
# content_transformer() is the form tm documents for tm_map(), and it keeps
# the documents valid whichever corpus class Corpus() returned.
Earache <- tm_map(Earache, content_transformer(tolower))
Earache <- tm_map(Earache, removeWords, stopwords("english"))
Earache <- tm_map(Earache, stripWhitespace)
Earache <- tm_map(Earache, stemDocument)

# Count the stemmed terms per document and print the matrix summary.
dtmEarache <- DocumentTermMatrix(Earache)
dtmEarache
## <<DocumentTermMatrix (documents: 10, terms: 417)>>
## Non-/sparse entries: 652/3518
## Sparsity : 84%
## Maximal term length: 17
## Weighting : term frequency (tf)
# Total count of each term across all documents (column sums of the DTM).
freq <- colSums(as.matrix(dtmEarache))
FREQ <- data.frame(freq)

# Interactive table of the term counts with column filters at the top and
# column-visibility / CSV export buttons.
freq2 <- datatable(
  data = FREQ,
  rownames = TRUE,
  filter = list(position = "top"),
  extensions = c("Buttons", "Responsive"),
  options = list(
    dom = "Bfrtip",
    buttons = c("colvis", "csv"),
    language = list(sSearch = "Filter:")
  )
)
freq2

# The 25 most frequent terms, highest count first.
ord <- order(freq, decreasing = TRUE)
head(freq[ord], 25)
## patient earach otalgia ear pain symptom caus examin
## 32 26 26 22 15 15 14 13
## refer agreement evalu treatment common ent can histori
## 13 11 8 8 7 7 7 7
## medic antibiot mani normal diagnosi diseas present studi
## 7 7 6 6 6 6 6 6
## primari
## 6
# Terms associated with "patient", using a lower correlation limit of 0.5.
findAssocs(dtmEarache, terms = "patient", corlimit = 0.5)
## $patient
## particip set visit four two care advis antibiot
## 0.79 0.79 0.74 0.74 0.74 0.71 0.71 0.71
## appear appropri centr consecut design educ given hundr
## 0.71 0.71 0.71 0.71 0.71 0.71 0.71 0.71
## most necessari object other outcom physician prescrib prescript
## 0.71 0.71 0.71 0.71 0.71 0.71 0.71 0.71
## relief remain return rural selflimit seri strategi subsequ
## 0.71 0.71 0.71 0.71 0.71 0.71 0.71 0.71
## thus unilater wait walkin watch whether first measur
## 0.71 0.71 0.71 0.71 0.71 0.71 0.70 0.66
## famili clinic peopl conclus earach clear condit receiv
## 0.65 0.63 0.63 0.54 0.53 0.51 0.51 0.51
# Terms associated with "ear", using a lower correlation limit of 0.5.
findAssocs(dtmEarache, terms = "ear", corlimit = 0.5)
## $ear
## pain risk joint temporomandibular
## 0.91 0.88 0.87 0.87
## dental older studi sourc
## 0.67 0.66 0.66 0.66
## analysi associ chisquar conclusionthes
## 0.62 0.62 0.62 0.62
## conduct crosssect dearth elder
## 0.62 0.62 0.62 0.62
## episod full futur hear
## 0.62 0.62 0.62 0.62
## highlight independ individu introductionan
## 0.62 0.62 0.62 0.62
## live loss methodsa mix
## 0.62 0.62 0.62 0.62
## modifi none objectiveto odd
## 0.62 0.62 0.62 0.62
## part preval ratio regard
## 0.62 0.62 0.62 0.62
## relat resultsof short signific
## 0.62 0.62 0.62 0.62
## statist subject test tmjd
## 0.62 0.62 0.62 0.62
## tmjds verifi well imag
## 0.62 0.62 0.62 0.62
## abnorm alcohol appar arthriti
## 0.62 0.62 0.62 0.62
## consult determin diabet erythrocyt
## 0.62 0.62 0.62 0.62
## externa fiberopt increas magnet
## 0.62 0.62 0.62 0.62
## media nasolaryngoscopi occult option
## 0.62 0.62 0.62 0.62
## otiti otolaryngologist pharyng potenti
## 0.62 0.62 0.62 0.62
## rate reson secondari sediment
## 0.62 0.62 0.62 0.62
## serious suspicion symptomat syndrom
## 0.62 0.62 0.62 0.62
## trial typic whose without
## 0.62 0.62 0.62 0.62
## histori evalu epidemiolog general
## 0.60 0.53 0.53 0.53
## identifi long process difficult
## 0.53 0.53 0.53 0.53
## drink smoke complex consid
## 0.53 0.53 0.53 0.53
## factor innerv
## 0.52 0.52
# Terms associated with "pain", using a lower correlation limit of 0.5.
findAssocs(dtmEarache, terms = "pain", corlimit = 0.5)
## $pain
## ear sourc abnorm alcohol
## 0.91 0.87 0.83 0.83
## appar arthriti consult determin
## 0.83 0.83 0.83 0.83
## diabet erythrocyt externa fiberopt
## 0.83 0.83 0.83 0.83
## increas magnet media nasolaryngoscopi
## 0.83 0.83 0.83 0.83
## occult option otiti otolaryngologist
## 0.83 0.83 0.83 0.83
## pharyng potenti rate reson
## 0.83 0.83 0.83 0.83
## secondari sediment serious suspicion
## 0.83 0.83 0.83 0.83
## symptomat syndrom trial typic
## 0.83 0.83 0.83 0.83
## whose without imag joint
## 0.83 0.83 0.82 0.79
## temporomandibular caus examin physic
## 0.79 0.76 0.76 0.74
## mani difficult drink smoke
## 0.69 0.69 0.69 0.69
## complex consid dental histori
## 0.69 0.69 0.68 0.67
## risk innerv usual primari
## 0.67 0.67 0.65 0.61
## common evalu often older
## 0.60 0.57 0.55 0.54
# Bar chart of every stemmed term that occurs more than four times, with the
# x-axis labels rotated so the words stay readable.
wf <- data.frame(word = names(freq), freq = freq)
p <- ggplot(subset(wf, freq > 4), aes(x = word, y = freq)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
p

# Word clouds: first every term seen at least four times, then the 40 most
# frequent terms with a wider palette.
wordcloud(words = names(freq), freq = freq, min.freq = 4,
          colors = brewer.pal(n = 3, name = "Dark2"))
wordcloud(words = names(freq), freq = freq, max.words = 40,
          colors = brewer.pal(n = 6, name = "Dark2"))
%^&
This is the second part of the text mining of the earache PubMed articles. It analyzes the same abstracts again, but this time reduces each word to its root by meaning (lemmatization) before building the document-term matrix of word-count frequencies. It uses the original file read in earlier, held in the data frame named auto.
# Lemmatize every abstract using the lexicon package's lemma dictionary.
lemma <- lemmatize_strings(auto$abstract, dictionary = lexicon::hash_lemmas)

# Put the lemmatized text alongside the original abstract and its source,
# then save the combined table without row names.
Lemma <- cbind(as.data.frame(lemma), auto)
names(Lemma) <- c("lemmatizedAbstract", "abstract", "source")
write.csv(Lemma, file = "LemmatizedEarAche.csv", row.names = FALSE)
The following creates a separate folder of lemmatized earache documents, one file per abstract, from the lemmatized data frame named Lemma.
# Create the output folder only when it is missing, so re-running the script
# does not raise an "already exists" warning.
if (!dir.exists("EarAche-Lemma")) {
  dir.create("./EarAche-Lemma")
}
ea <- as.character(Lemma$lemmatizedAbstract)

# Write each lemmatized abstract to EarAche-Lemma/EAL.<j>.txt. Paths are
# built with file.path() rather than by switching the working directory, so
# an error part-way through cannot leave the session in the wrong folder.
# seq_along() skips the loop entirely when there are no abstracts, whereas
# 1:length(ea) would iterate over c(1, 0).
for (j in seq_along(ea)) {
  write(ea[j], file.path("EarAche-Lemma", paste0("EAL.", j, ".txt")))
}
# Load every file in the "EarAche-Lemma" folder into one tm corpus (replacing
# the earlier Earache object) and print its summary.
Earache <- Corpus(x = DirSource(directory = "EarAche-Lemma"))
print(Earache)
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 10
# Clean the lemmatized corpus: drop punctuation and digits, lower-case the
# text, remove English stop words and collapse repeated whitespace. No
# stemming step here, because the text was already lemmatized.
Earache <- tm_map(Earache, removePunctuation)
Earache <- tm_map(Earache, removeNumbers)
# tolower() is a base function, not a tm transformation. Wrapping it in
# content_transformer() is the form tm documents for tm_map(), and it keeps
# the documents valid whichever corpus class Corpus() returned.
Earache <- tm_map(Earache, content_transformer(tolower))
Earache <- tm_map(Earache, removeWords, stopwords("english"))
Earache <- tm_map(Earache, stripWhitespace)

# Count the lemmatized terms per document and print the matrix summary.
dtmEarache <- DocumentTermMatrix(Earache)
dtmEarache
## <<DocumentTermMatrix (documents: 10, terms: 432)>>
## Non-/sparse entries: 669/3651
## Sparsity : 85%
## Maximal term length: 17
## Weighting : term frequency (tf)
# Total count of each lemmatized term across all documents (column sums of
# the DTM), computed once.
freq <- colSums(as.matrix(dtmEarache))
FREQ <- data.frame(freq)

# Interactive table of the term counts with column filters at the top and
# column-visibility / CSV export buttons.
freq2 <- datatable(
  data = FREQ,
  rownames = TRUE,
  filter = list(position = "top"),
  options = list(
    dom = "Bfrtip",
    buttons = c("colvis", "csv"),
    language = list(sSearch = "Filter:")
  ),
  extensions = c("Buttons", "Responsive")
)
freq2

# The 25 most frequent terms, highest count first.
ord <- order(freq, decreasing = TRUE)
freq[head(ord, 25)]
## patient earache otalgia ear symptom pain
## 31 26 26 22 17 15
## cause examination refer agreement treatment common
## 14 12 12 11 8 7
## ent can history medical antibiotic report
## 7 7 7 7 7 7
## evaluation many much normal diagnosis disease
## 6 6 6 6 6 6
## present
## 6
# Store the associations for three key terms as data frames, each with its
# own lower correlation limit.
patient <- as.data.frame(
  findAssocs(dtmEarache, terms = "patient", corlimit = 0.6)
)
earache <- as.data.frame(
  findAssocs(dtmEarache, terms = "earache", corlimit = 0.55)
)
treatment <- as.data.frame(
  findAssocs(dtmEarache, terms = "treatment", corlimit = 0.55)
)
# Bar chart of every lemmatized term that occurs more than four times, with
# the x-axis labels rotated so the words stay readable.
wf <- data.frame(word = names(freq), freq = freq)
p <- ggplot(subset(wf, freq > 4), aes(x = word, y = freq)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
p

# Word clouds: first every term seen at least five times, then the 40 most
# frequent terms with a wider palette.
wordcloud(words = names(freq), freq = freq, min.freq = 5,
          colors = brewer.pal(n = 3, name = "Dark2"))
wordcloud(words = names(freq), freq = freq, max.words = 40,
          colors = brewer.pal(n = 6, name = "Dark2"))