Text Mining (tm) - Project 3

Nicholas Schettini

March 21, 2018

library(tidyverse)
library(tm)
library(wordcloud)
library(knitr)

Read the file into R

url <- readLines("https://raw.githubusercontent.com/nschettini/CUNY-MSDS-DATA-607/master/cleanjobfiles.txt")
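A quick sanity check, for example, confirms the file came down before moving on:

length(url)   # number of lines read from the raw GitHub file
head(url, 2)  # peek at the first couple of lines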

Combine all lines into a single string

review_text <- paste(url, collapse=" ")

Load the text into a corpus

review_source <- VectorSource(review_text)
corpus <- Corpus(review_source)
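Since all lines were collapsed into one string, the corpus should hold exactly one document; a quick check (a sketch, not in the original):

length(corpus)                             # expect 1
substr(as.character(corpus[[1]]), 1, 80)   # first 80 characters of that document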

Clean the data: convert everything to lowercase, strip extra whitespace, remove numbers, and remove English stop words along with a custom list of unneeded words (mostly HTML leftovers such as "brbr", "div", and "span")

corpus <- tm_map(corpus, content_transformer(tolower))

corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removeWords, c("andor", "status well", "within",
                                        "business", "learning", "field",
                                        "span", "amp", "across", "strong", "large", "using",
                                        "help", "clients", "class", "classresultlinkbarcontainer",
                                        "every", "work", "can", "position", "risk", "global",
                                        "will", "brbr", "resultlinkbarviewjob", "years",
                                        "idjobsummary", "div", "new", "skills"))
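# Optional extra pass, not part of the original pipeline (the frequency tables
# below were produced without it): tm also ships a punctuation stripper.
# corpus <- tm_map(corpus, removePunctuation)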
dtm <- DocumentTermMatrix(corpus)
dtm2 <- as.matrix(dtm)

Find the most frequent terms in the entire dataset: all job postings, all words except the stop words removed above

frequency <- colSums(dtm2)
frequency <- sort(frequency, decreasing=T)
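Note that as.matrix() densifies the whole document-term matrix, which is harmless for one collapsed document but can exhaust memory on a large corpus. Since tm stores the matrix sparsely (via the slam package it depends on), the same counts can be computed without densifying; a minimal sketch:

frequency <- sort(slam::col_sums(dtm), decreasing = TRUE)  # same result, stays sparse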

top_terms <- head(frequency)  # avoid naming this `table`, which would mask base::table
kable(top_terms)
|term       | frequency|
|:----------|---------:|
|data       |      2174|
|experience |      1316|
|research   |       837|
|summary    |       651|
|bar        |       607|
|result     |       607|
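tm also offers a shortcut for this kind of question: findFreqTerms() returns every term at or above a given frequency floor, without building the counts by hand.

findFreqTerms(dtm, lowfreq = 500)  # all terms appearing 500 or more times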

Create a word cloud

words <- names(frequency)
wordcloud(words[1:100], frequency[1:100], 
          colors=brewer.pal(8, "Dark2"))
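wordcloud() places words with some randomness, so the layout shifts on every knit. Fixing the seed (and, optionally, drawing the most frequent words first) makes the figure reproducible; a minimal variant:

set.seed(1234)
wordcloud(words[1:100], frequency[1:100],
          random.order = FALSE,  # most frequent words drawn first, near the center
          colors = brewer.pal(8, "Dark2"))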

wf <- data.frame(word=names(frequency), frequency=frequency)
head(wf)
##                  word frequency
## data             data      2174
## experience experience      1316
## research     research       837
## summary       summary       651
## bar               bar       607
## result         result       607
p <- ggplot(subset(wf, frequency>200), aes(x = reorder(word, -frequency), y = frequency)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x=element_text(angle=45, hjust=1))
p
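geom_bar(stat = "identity") works, but ggplot2 also provides geom_col() as the idiomatic equivalent when the bar heights are already computed:

ggplot(subset(wf, frequency > 200), aes(x = reorder(word, -frequency), y = frequency)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))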

Filter for specific skills that data scientist postings call for:

review_text <- paste(url, collapse=" ")

review_source <- VectorSource(review_text)
corpus <- Corpus(review_source)

# Note: terms must be lowercase here, since the corpus is lowercased before filtering
keep = c("python", "sql", "hadoop", "statistics", "tableau", "analysis", "java", "machine",
         "geo", "jupyter notebook", "sas", "scala", "matlab", "big data", "database", "data mining",
         "collaboration", "unsupervised", "visualization", "sra", "svm", "hdfs", "linux", "map reduce",
         "pig", "decision forests", "phd", "masters", "bachelors", "r", "c", "experience")
# Keep only the listed words: build one regex of word-boundary alternatives,
# then blank out every stretch of text that does NOT match it (invert = TRUE)
keepOnlyWords <- content_transformer(function(x, words) {
  regmatches(x,
             gregexpr(paste0("(\\b", paste(words, collapse = "\\b|\\b"), "\\b)"), x),
             invert = TRUE) <- " "
  x
})
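To see what the transformer does, here is a toy run of the same logic on a plain string (illustration only, with a hypothetical demo helper):

demo <- function(x, words) {
  regmatches(x,
             gregexpr(paste0("(\\b", paste(words, collapse = "\\b|\\b"), "\\b)"), x),
             invert = TRUE) <- " "
  x
}
demo("5+ years experience with r, python and sql", c("r", "python", "sql"))
# every stretch outside the kept words collapses to a single space,
# leaving just "r", "python", and "sql"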

corpus <- tm_map(corpus, content_transformer(tolower))

corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, keepOnlyWords, keep)
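Note that the DocumentTermMatrix calls below pass wordLengths = c(1, Inf): tm drops terms shorter than three characters by default, which would silently discard the one-letter skills "r" and "c".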


inspect(DocumentTermMatrix(corpus, control = list(wordLengths = c(1, Inf))))
## <<DocumentTermMatrix (documents: 1, terms: 31)>>
## Non-/sparse entries: 31/0
## Sparsity           : 0%
## Maximal term length: 13
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs analysis big data experience phd python   r sas sql statistics
##    1      489 101  161       1316  95    185 178  81 136        176
dtm <- DocumentTermMatrix(corpus, control = list(wordLengths = c(1, Inf)))
dtm2 <- as.matrix(dtm)

frequency <- colSums(dtm2)
frequency <- sort(frequency, decreasing=T)


kable(frequency)
|term          | frequency|
|:-------------|---------:|
|experience    |      1316|
|analysis      |       489|
|python        |       185|
|r             |       178|
|statistics    |       176|
|data          |       161|
|sql           |       136|
|big           |       101|
|phd           |        95|
|sas           |        81|
|database      |        79|
|hadoop        |        77|
|mining        |        60|
|collaboration |        58|
|c             |        57|
|scala         |        44|
|tableau       |        38|
|matlab        |        26|
|masters       |        24|
|linux         |        23|
|pig           |        14|
|unsupervised  |         8|
|svm           |         7|
|bachelors     |         4|
|hdfs          |         3|
|geo           |         2|
|map           |         2|
|reduce        |         2|
|jupyter       |         1|
|notebook      |         1|
|sra           |         1|
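Notice that the multiword entries in keep never appear intact: the document-term matrix tokenizes on whitespace, so "big data" shows up as separate big and data counts, "map reduce" as map and reduce, and "jupyter notebook" as jupyter and notebook. One common workaround, sketched here but not part of the original analysis, is to glue such phrases into single tokens before building the matrix (the keep vector would then need the underscored forms as well):

joinPhrases <- content_transformer(function(x) {
  x <- gsub("big data", "big_data", x, fixed = TRUE)
  x <- gsub("map reduce", "map_reduce", x, fixed = TRUE)
  gsub("jupyter notebook", "jupyter_notebook", x, fixed = TRUE)
})
corpus <- tm_map(corpus, joinPhrases)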
wf <- data.frame(word=names(frequency), frequency=frequency)
head(wf)
##                  word frequency
## experience experience      1316
## analysis     analysis       489
## python         python       185
## r                   r       178
## statistics statistics       176
## data             data       161
p <- ggplot(subset(wf, frequency>50), aes(x = reorder(word, -frequency), y = frequency)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x=element_text(angle=45, hjust=1))
p