library(tm)
## Loading required package: NLP
library(SnowballC)
library(wordcloud)
## Loading required package: RColorBrewer
setwd("D:/bkup/Bioinfo.2016/Bioinfo_Jobs")
#Create Corpus
docs <- Corpus(DirSource("D:/bkup/Bioinfo.2016/Bioinfo_Jobs"))
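#DirSource treats every file in the folder as one document; length(docs)
#returns how many were read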

# getTransformations() # To see available commands

#create the 'toSpace' content transformer
toSpace <- content_transformer(function(x, pattern) {return (gsub(pattern, " ", x))})
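#tm_map needs a transformation that returns a tm document; content_transformer
#wraps an ordinary function such as gsub so the document class and its
#metadata are preserved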

#Now we can use this content transformer to eliminate colons and hyphens:
docs <- tm_map(docs, toSpace, "-")
docs <- tm_map(docs, toSpace, ":")
#Remove punctuation
docs <- tm_map(docs, removePunctuation)
# Remove non-std punctuation
docs <- tm_map(docs, toSpace, "'")
docs <- tm_map(docs, toSpace, "`")
docs <- tm_map(docs, toSpace, "_")
#Remove a few additional filler words (removeWords deletes whole words only,
#unlike the gsub-based toSpace above)
docs <- tm_map(docs, removeWords, c("across", "also"))

#Transform to lower case (need to wrap in content_transformer)
docs <- tm_map(docs,content_transformer(tolower))
#Strip digits (std transformation, so no need for content_transformer)
docs <- tm_map(docs, removeNumbers)
#remove stopwords using the standard list in tm
docs <- tm_map(docs, removeWords, stopwords("english"))
#Strip extra whitespace left behind by the removals above (largely cosmetic)
docs <- tm_map(docs, stripWhitespace)

#Stem document (optional; left commented out so full words appear in the
#frequency tables and plots below)
# docs <- tm_map(docs, stemDocument)
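#For reference, stemming collapses related word forms to a common stem; a
#quick check with SnowballC (sample words chosen here for illustration):
# wordStem(c("sequencing", "sequences", "genomic", "genomics"), language = "english")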
writeLines(as.character(docs[[30]]))
## bioinformatics scientist
## ngm biopharmaceuticals inc
## submit
## add job folder view jobs back search results
## location 
## south san francisco ca
## posted date 
## 
## position type 
## full time
## job code 
## hiah
## salary 
## required education 
## masters degree
## areas expertise desired 
## bioinformatics biology computational genomics pharmaceutical phd
## job description
## ngm biopharmaceuticals inc private biotech start dedicated discovering next generation medicines treatment metabolic hepatic diseases experienced scientific team joined forces impressive group industry professionals nobel laureates distinguished researchers form company innovation cutting edge science will provide foundation robust drug discovery engine recently signed five year broad collaboration merck total funding million dollars partnership provided strong validation ngm biologics discovery engine will generate transformational medicines unmet medical needs
##  currently looking enthusiastic highly motivated bioinformatician join discovery efforts candidate opportunity contribute hisher superb technical skills scientific expertise towards discovery transformational new therapeutics particular looking support expertise efforts involving data analysis eg next gen sequencing data analysis proteomics data analysis data mining database generation support target discovery efforts ngm candidate must display good communication skills able function effectively part research team
## requirements
## applicants self motivated possess bachelor’s degree higher bioinformatics computer science life sciences related field years working knowledge computational biology including algorithm development knowledge genomics proteomics software platforms sequence databases highly desirable position
dtm <- DocumentTermMatrix(docs)
# inspect(dtm[1:2, 1000:1005])
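# dim(dtm) returns c(number of documents, number of distinct terms)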

freq <- colSums(as.matrix(dtm))
#length should be total number of terms
length(freq)
## [1] 3692
#create sort order (descending)
ord <- order(freq,decreasing=TRUE)
#inspect most frequently occurring terms
freq[head(ord)]
##           will     experience           data       research bioinformatics 
##            421            394            389            368            270 
##    development 
##            261
#inspect least frequently occurring terms
freq[tail(ord)]
##     wyatt xmlnshttp    yearly  years’       yes       yet 
##         1         1         1         1         1         1
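#Tighten the matrix: wordLengths = c(4, 20) keeps only terms of 4 to 20
#characters, and bounds = list(global = c(3, 27)) keeps terms that appear in
#at least 3 and at most 27 documents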
dtmr <- DocumentTermMatrix(docs,
                           control = list(wordLengths = c(4, 20),
                                          bounds = list(global = c(3, 27))))
freqr <- colSums(as.matrix(dtmr))
#length should be total number of terms
length(freqr)
## [1] 1245
#create sort order (descending)
ordr <- order(freqr,decreasing=TRUE)
#inspect most frequently occurring terms
freqr[head(ordr)]
##      freqr    minfreq       page biomedical   national     leidos 
##        239        235        234         88         83         61
#inspect least frequently occurring terms
freqr[tail(ordr)]
## transition  unixlinux     values     viable    welcome       year 
##          3          3          3          3          3          3
findFreqTerms(dtmr,lowfreq=60)
## [1] "biomedical" "freqr"      "leidos"     "minfreq"    "national"  
## [6] "page"
findAssocs(dtmr,"biomed",0.6)
## $biomed
## numeric(0)
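#numeric(0) here means "biomed" is not a term in dtmr (the corpus was not
#stemmed), so no associations can be computed; querying the full term should
#work, e.g.:
# findAssocs(dtmr, "biomedical", 0.6)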
wf <- data.frame(term = names(freqr), occurrences = freqr)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
p <- ggplot(subset(wf, occurrences > 50), aes(term, occurrences))
p <- p + geom_bar(stat="identity")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p
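#For readability, the bars can be ordered by frequency; a minimal variant
#(p2 is a new name introduced here):
p2 <- ggplot(subset(wf, occurrences > 50),
             aes(reorder(term, -occurrences), occurrences)) +
  geom_bar(stat = "identity") +
  labs(x = "term") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
p2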

#wordcloud
library(wordcloud)
#setting the same seed each time ensures consistent look across clouds
set.seed(42)
#limit words by specifying min frequency
wordcloud(names(freqr), freqr, min.freq = 40)

#Add color with an RColorBrewer palette
wordcloud(names(freqr), freqr, min.freq = 40, colors = brewer.pal(6, "Dark2"))
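#Other wordcloud arguments worth trying (a sketch, not run here): max.words
#caps how many words are drawn, scale sets the size range
# wordcloud(names(freqr), freqr, max.words = 100, scale = c(4, 0.5),
#           colors = brewer.pal(6, "Dark2"))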