Load the text into a corpus
review_source <- VectorSource(review_text)
corpus <- Corpus(review_source)
Clean the data - convert everything to lowercase, strip extra whitespace, remove numbers, and remove stop words and other uninformative terms
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removeWords, c("andor", "status well", "within",
                                        "business", "learning", "field",
                                        "span", "amp", "across", "strong", "large", "using",
                                        "help", "clients", "class", "classresultlinkbarcontainer",
                                        "every", "work", "can", "position", "risk", "global",
                                        "will", "brbr", "resultlinkbarviewjob", "years",
                                        "idjobsummary", "div", "new", "skills"))
dtm <- DocumentTermMatrix(corpus)
dtm2 <- as.matrix(dtm)
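A quick sanity check on the resulting matrix (an optional step, not in the original code) confirms its shape before computing term totals:
dim(dtm2)   # rows = documents, columns = distinct terms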
Find the most frequent terms in the entire dataset - all job postings, all words (except the stop words and custom terms removed above)
frequency <- colSums(dtm2)
frequency <- sort(frequency, decreasing=T)
table <- head(frequency)
kable(table)
|word       | frequency|
|:----------|---------:|
|data       |      2779|
|experience |      1311|
|team       |       760|
|machine    |       700|
|summary    |       657|
|result     |       653|
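As a quick cross-check (not part of the original pipeline), tm's findFreqTerms() pulls the terms above a chosen frequency straight from the document-term matrix; the cutoff of 500 used here is just an illustrative threshold.
# Terms appearing at least 500 times across the postings (illustrative cutoff)
findFreqTerms(dtm, lowfreq = 500)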
Create word cloud
words <- names(frequency)
wordcloud(words[1:100], frequency[1:100],
          colors = brewer.pal(8, "Dark2"))
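The cloud layout is randomized, so as a small optional tweak (not in the original code) a fixed seed and an explicit word cap make the figure reproducible; the seed and max.words values below are arbitrary.
set.seed(123)   # arbitrary seed so the word placement is reproducible
wordcloud(words[1:100], frequency[1:100],
          max.words = 100, random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))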

wf <- data.frame(word=names(frequency), frequency=frequency)
head(wf)
## word frequency
## data data 2779
## experience experience 1311
## team team 760
## machine machine 700
## summary summary 657
## result result 653
p <- ggplot(subset(wf, frequency > 200), aes(x = reorder(word, -frequency), y = frequency)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
p
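With this many bars, an equivalent optional layout (not in the original code) flips the axes instead of rotating the labels; a minimal variant of the same plot:
# Horizontal bars; reorder() without the minus sign puts the largest bar on top
ggplot(subset(wf, frequency > 200), aes(x = reorder(word, frequency), y = frequency)) +
  geom_bar(stat = "identity") +
  coord_flip()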

Filter by specific skills that are needed by Data Scientists:
review_text <- paste(url, collapse=" ")
review_source <- VectorSource(review_text)
corpus <- Corpus(review_source)
keep <- c("python", "sql", "hadoop", "statistics", "tableau", "analysis", "Java", "Machine",
          "geo", "jupyter notebook", "sas", "scala", "matlab", "big data", "database", "data mining",
          "collaboration", "unsupervised", "visulization", "sra", "svm", "hdfs", "linux", "map reduce",
          "pig", "decision forests", "phd", "masters", "bachelors", "r", "c", "experience")
keepOnlyWords <- content_transformer(function(x, words) {
  regmatches(x,
             gregexpr(paste0("(\\b", paste(words, collapse = "\\b|\\b"), "\\b)"), x),
             invert = TRUE) <- " "
  x
})
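To see what this transformer does, here is a minimal illustration on a made-up sentence (the toy text and keep list are hypothetical): everything outside the listed words is replaced with spaces, so only the skill terms survive into the document-term matrix.
toy <- Corpus(VectorSource("We want Python, SQL and Hadoop experience"))
toy <- tm_map(toy, content_transformer(tolower))
toy <- tm_map(toy, keepOnlyWords, c("python", "sql", "hadoop"))
as.character(toy[[1]])   # roughly: " python  sql  hadoop "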
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, keepOnlyWords, keep)
inspect(DocumentTermMatrix(corpus, control = list(wordLengths = c(1, Inf))))
## <<DocumentTermMatrix (documents: 1, terms: 32)>>
## Non-/sparse entries: 32/0
## Sparsity           : 0%
## Maximal term length: 13
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs analysis big data experience mining phd python   r sql statistics
##    1      445 112  200       1311     88 121    292 201 183        252
dtm <- DocumentTermMatrix(corpus, control = list(wordLengths = c(1, Inf)))
dtm2 <- as.matrix(dtm)
frequency <- colSums(dtm2)
frequency <- sort(frequency, decreasing=T)
kable(frequency)
|word          | frequency|
|:-------------|---------:|
|experience    |      1311|
|analysis      |       445|
|python        |       292|
|statistics    |       252|
|r             |       201|
|data          |       200|
|sql           |       183|
|phd           |       121|
|big           |       112|
|mining        |        88|
|hadoop        |        87|
|c             |        81|
|scala         |        58|
|collaboration |        49|
|database      |        41|
|sas           |        41|
|tableau       |        34|
|masters       |        31|
|matlab        |        30|
|linux         |        21|
|pig           |        18|
|unsupervised  |        15|
|bachelors     |         8|
|hdfs          |         6|
|svm           |         6|
|map           |         4|
|reduce        |         4|
|decision      |         2|
|forests       |         2|
|jupyter       |         2|
|notebook      |         2|
|geo           |         1|
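One caveat when reading this table: the multi-word entries in the keep list ("big data", "data mining", "map reduce", "decision forests", "jupyter notebook") survive keepOnlyWords as phrases but are then split into single tokens by DocumentTermMatrix, which is why they appear as separate rows. Because those single tokens ("big", "mining", "reduce", etc.) are not in the keep list on their own, each of those token counts should equal its phrase count; a small illustrative snippet (not in the original code) reads them back out:
# Phrase counts recovered from the split tokens (illustrative only)
c("big data"         = unname(frequency["big"]),
  "data mining"      = unname(frequency["mining"]),
  "map reduce"       = unname(frequency["reduce"]),
  "decision forests" = unname(frequency["forests"]),
  "jupyter notebook" = unname(frequency["notebook"]))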
wf <- data.frame(word=names(frequency), frequency=frequency)
head(wf)
## word frequency
## experience experience 1311
## analysis analysis 445
## python python 292
## statistics statistics 252
## r r 201
## data data 200
p <- ggplot(subset(wf, frequency > 50), aes(x = reorder(word, -frequency), y = frequency)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
p
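To save the figure for the write-up (an optional step not in the original code; the filename and dimensions are placeholders):
ggsave("skill_frequency.png", p, width = 8, height = 5)   # hypothetical output path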
