Project 3

References: [http://www.sthda.com/english/wiki/text-mining-and-word-cloud-fundamentals-in-r-5-simple-steps-you-should-know] [https://www.tidytextmining.com/nasa.html#word-co-ocurrences-and-correlations]

library(tm)

## Loading required package: NLP

library(SnowballC)
library(wordcloud)

## Loading required package: RColorBrewer

library(RColorBrewer)
library(stringr)




# Load the job-postings CSV (path is relative to the working directory).
jobs <- read.csv("data_scientist_united_states_job_postings_jobspikr.csv")

head(jobs)

# Turn whitespace characters (newline, tab, carriage return, ...) into plain
# spaces BEFORE filtering. They fall outside the printable range 0x20-0x7E, so
# the filter below would otherwise delete them and fuse the words on either
# side of a line break (e.g. "python\nsql" -> "pythonsql").
jobs$job_description <- gsub("[[:space:]]", " ", jobs$job_description)

# Remove every remaining character outside printable ASCII (0x20-0x7E).
jobs$job_description <- gsub("[^\x20-\x7E]", "", jobs$job_description)



# Build a tm corpus from the job description column, one element per document.
descriptions <- Corpus(VectorSource(jobs$job_description))

# Transformer that replaces every match of `pattern` with a single space.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# NOTE: when this was last run, every tm_map() call below emitted
# "Warning in tm_map.SimpleCorpus(...): transformation drops documents".

# Replace separator characters with spaces, one pattern at a time, in order.
descriptions <- Reduce(
  function(corpus, pattern) tm_map(corpus, toSpace, pattern),
  c("/", "\n", "@", "\\|"),
  descriptions
)

# Normalise the text: lower-case it, drop English stop words, strip
# punctuation, then collapse runs of whitespace.
descriptions <- tm_map(descriptions, content_transformer(tolower))
descriptions <- tm_map(descriptions, removeWords, stopwords("english"))
descriptions <- tm_map(descriptions, removePunctuation)
descriptions <- tm_map(descriptions, stripWhitespace)

# Stemming (reducing common word endings such as ing, es, s) is currently
# disabled; uncomment to enable it.
# descriptions <- tm_map(descriptions, stemDocument)


# Term-document matrix of the cleaned corpus (rows are terms, columns are
# documents).
dtm <- TermDocumentMatrix(descriptions)

# NOTE(review): as.matrix() materialises the full dense matrix; this can be
# memory-hungry for a large corpus -- confirm it fits for this data set.
m <- as.matrix(dtm)

# Total count of each term across all documents, most frequent first.
v <- rowSums(m)
v <- sort(v, decreasing = TRUE)

# Frequency table with one row per term: the term itself and its total count.
d <- data.frame(
  word = names(v),
  freq = v
)

This is how the word frequencies look when drawn as a word cloud:

# Set up and draw the word cloud.
# The seed is fixed so repeated runs start from the same random state.
set.seed(1234)

# Only terms occurring at least 100 times are drawn, capped at 200 words;
# 35% of the words are rotated, and the most frequent terms are placed first.
with(
  d,
  wordcloud(
    words = word,
    freq = freq,
    min.freq = 100,
    max.words = 200,
    random.order = FALSE,
    rot.per = 0.35,
    colors = brewer.pal(8, "Dark2")
  )
)

Project 3

Sung Lee

3/18/2020

This is how the frequency looks