# Load libraries (readr and stringr are attached as part of the tidyverse)
library(tidyverse)
library(stringi)
library(tm)
library(corpus)
library(wordcloud)
library(data.table)
library(kableExtra)
Read in data and filter by position
all_data <- read.csv("https://raw.githubusercontent.com/cassandra-coste/CUNY607/main/project_3/raw_jobdata.csv")
all_data$position <- tolower(all_data$position)
# Lowercase descriptions and strip the mojibake left by mis-decoded curly apostrophes
all_data$description <- tolower(all_data$description) %>%
  str_remove_all("â|€|™")
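As an aside, "â€™" is the UTF-8 right single quote mis-decoded as Windows-1252, so the damage can also be reversed rather than stripped. A minimal sketch with stringi on a hypothetical input string, assuming the text round-trips through Windows-1252 cleanly:
bad <- "candidateâ€™s"   # hypothetical mojibake example
raw_bytes <- stri_encode(bad, from = "UTF-8", to = "windows-1252", to_raw = TRUE)[[1]]
good <- rawToChar(raw_bytes)   # reinterpret the recovered bytes
Encoding(good) <- "UTF-8"
good   # "candidate's" with a proper curly apostrophe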
# Filter for data science positions (the [b-z] class deliberately skips "data analyst")
data_scientists <- all_data %>%
  filter(str_detect(position, "data [b-z]|ai|machine"))
# Filter for data analyst positions
data_analysts <- all_data %>%
  filter(str_detect(position, "anal"))
rm(all_data)
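One caveat with the pattern above: the bare "ai" alternative also matches inside words, so a title like "maintenance analyst" would land in the data science set. A quick check on hypothetical titles, showing how a word boundary would narrow it if that ever matters:
str_detect(c("ai engineer", "maintenance analyst"), "ai")        # TRUE TRUE
str_detect(c("ai engineer", "maintenance analyst"), "\\bai\\b")  # TRUE FALSE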
Filter for targeted skills
Using the same search approach as above, create new columns containing the strings of interest to work with later. One issue with this approach is that it produces a large number of empty (NA) values.
# The same twelve context windows are pulled for both job families, so define
# the extraction once and apply it to each data frame
extract_snippets <- function(df) {
  df %>%
    mutate(skill       = str_extract_all(description, " .{75,100} skill.{150,200} "),
           must_have   = str_extract_all(description, "must have.{150,200} "),
           knowledge   = str_extract_all(description, " .{75,100} knowledge.{150,200} "),
           experience  = str_extract_all(description, " .{100,150} exper.{100,150} "),
           excel       = str_extract_all(description, "excel at.{150,200} |excel with.{150,200} |excel in.{150,200} "),
           responsible = str_extract_all(description, "responsible.{150,200} "),
           proficient  = str_extract_all(description, " .{100,150} profi.{110,160} "),
           understands = str_extract_all(description, " .{100,150} understand.{150,200} "),
           utilize     = str_extract_all(description, "utilize.{150,200} "),
           lead        = str_extract_all(description, " .{150,200} lead.{150,200} "),
           work        = str_extract_all(description, " .{50,75} work.{150,200} "),
           looking     = str_extract_all(description, "looking.{150,200} "))
}
ds <- extract_snippets(data_scientists)
# Flatten each list-column of matches into a single space-separated string
collapse_hits <- function(col) lapply(col, function(x) paste(unlist(x), collapse = " "))
extract_cols <- c("skill", "must_have", "knowledge", "experience", "excel", "responsible",
                  "proficient", "understands", "utilize", "lead", "work", "looking")
ds[extract_cols] <- lapply(ds[extract_cols], collapse_hits)
da <- extract_snippets(data_analysts)
da[extract_cols] <- lapply(da[extract_cols], collapse_hits)
Make the corpus; remove punctuation, numbers, and stopwords; convert case; strip whitespace
# text holds the extracted description snippets assembled above
corpus <- VCorpus(VectorSource(text))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
# Drop the anchor words used for extraction so they don't dominate the cloud
# (note: "excels at" is effectively a no-op here, since stopword removal already dropped "at")
corpus <- tm_map(corpus, removeWords, c("skill", "responsible", "proficient", "knowledge", "understands", "must", "experience", "character", "will", "looking", "excels at", "work", "lead", "utilize"))
corpus_Clean <- tm_map(corpus, stripWhitespace)
wordcloud(corpus_Clean, max.words = 50, colors = colorRampPalette(brewer.pal(7, "Dark2"))(32))

Tokenize the text body into unigrams (one word), bigrams (two words), and trigrams (three words)
# Unigrams
unigramTokenizer <- function(x) { unlist(lapply(ngrams(words(x), 1), paste, collapse = " "), use.names = FALSE) }
unigram <- TermDocumentMatrix(corpus, control = list(wordLengths = c(1, 20), tokenize = unigramTokenizer))
# Bigrams
bigramTokenizer <- function(x) { unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE) }
bigram <- TermDocumentMatrix(corpus, control = list(wordLengths = c(3, 40), tokenize = bigramTokenizer))
# Trigrams
trigramTokenizer <- function(x) { unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE) }
trigram <- TermDocumentMatrix(corpus, control = list(wordLengths = c(3, 60), tokenize = trigramTokenizer))
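As a sanity check on what these tokenizers emit, NLP's ngrams() can be run directly on a toy token vector (hypothetical input):
toks <- c("machine", "learning", "with", "python")
unlist(lapply(ngrams(toks, 2), paste, collapse = " "))
# "machine learning" "learning with" "with python"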
Plot unigram
# Unigrams
unigramrow <- sort(slam::row_sums(unigram), decreasing = TRUE)
unigramfreq <- data.table(tok = names(unigramrow), freq = unigramrow)
ggplot(unigramfreq[1:25, ], aes(x = reorder(tok, freq), y = freq)) +
  geom_col(fill = "coral") + coord_flip() + theme_bw() +
  ggtitle("Top 25 Unigrams") + labs(x = "", y = "")

Plot bigram
# Bigrams
bigramrow <- sort(slam::row_sums(bigram), decreasing = TRUE)
bigramfreq <- data.table(tok = names(bigramrow), freq = bigramrow)
ggplot(bigramfreq[1:25, ], aes(x = reorder(tok, freq), y = freq)) +
  geom_col(fill = "coral") + coord_flip() + theme_bw() +
  ggtitle("Top 25 Bigrams") + labs(x = "", y = "")

Plot trigram
# Trigrams
trigramrow <- sort(slam::row_sums(trigram), decreasing = TRUE)
trigramfreq <- data.table(tok = names(trigramrow), freq = trigramrow)
ggplot(trigramfreq[1:25, ], aes(x = reorder(tok, freq), y = freq)) +
  geom_col(fill = "coral") + coord_flip() + theme_bw() +
  ggtitle("Top 25 Trigrams") + labs(x = "", y = "")

Create dictionary and search for its strings in the description column
# Load dictionaries (the ?token= query strings on raw.githubusercontent.com URLs
# expire, so they are dropped here; the repos must be public for these to resolve)
dictionary_analyst <- read.csv("https://raw.githubusercontent.com/cassandra-coste/CUNY607/main/project_3/Eric_DataAnalystDictionary.csv", header = FALSE, fileEncoding = "UTF-8-BOM")
dictionary_ngrams <- read.csv("https://raw.githubusercontent.com/ericonsi/Project3/master/dictionary/Ngrams_dictionary.csv", fileEncoding = "UTF-8-BOM")
os <- read.csv("https://raw.githubusercontent.com/ericonsi/Project3/master/dictionary/OS_dictionary_skills.csv", fileEncoding = "UTF-8-BOM")
onet <- read.csv("https://raw.githubusercontent.com/ericonsi/Project3/master/dictionary/ONET%20Technology%20Skills.csv", fileEncoding = "UTF-8-BOM")
# Create dictionaries with a single skill column for the merge
dictionary_onet <- onet %>% select(skill = Skill)
dictionary_os <- os %>% select(skill)
# Assign the same column name to the analyst and ngram dictionaries
names(dictionary_analyst) <- "skill"
names(dictionary_ngrams) <- "skill"
# Convert to lowercase and remove special characters where needed
dictionary_analyst <- dictionary_analyst %>% mutate(across(where(is.character), tolower))
dictionary_ngrams <- dictionary_ngrams %>% mutate(across(where(is.character), tolower))
dictionary_os <- dictionary_os %>% mutate(across(where(is.character), tolower))
dictionary_onet <- dictionary_onet %>%
  mutate(across(where(is.character), tolower)) %>%
  mutate(across(everything(), ~ gsub("[[:punct:]]", "", .x)))
# Merge all four dictionaries and drop duplicate skills
MyMerge <- function(x, y) merge(x, y, all = TRUE)
dictionary <- Reduce(MyMerge, list(dictionary_analyst, dictionary_ngrams, dictionary_onet, dictionary_os)) %>%
  distinct()
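Since all four frames now share the single skill column, bind_rows() is an equivalent and slightly more direct alternative (sketch):
dictionary <- bind_rows(dictionary_analyst, dictionary_ngrams, dictionary_onet, dictionary_os) %>%
  distinct()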
# Remove common character strings found within words and phrases so they can be analyzed separately
# Drop skills that should be excluded when they appear alone but kept when part of a phrase
# (lookbehind/lookahead: match only when not surrounded by spaces, i.e. standing alone)
dictionary$skill <- str_remove(dictionary$skill, "(?<! )(ai|science|business)(?! )")
# Remove short skills that would otherwise be picked up inside longer words
dictionary$skill <- str_remove(dictionary$skill, "(?:^|\\W)(r|c|ms|go)(?:$|\\W)")
# Turn the dictionary into a vector and drop emptied rows
dictionary <- dictionary[!apply(dictionary == "", 1, all), ]
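A quick illustration of the lookaround behavior on hypothetical dictionary entries: the bare terms are blanked while phrases survive.
str_remove(c("ai", "data science", "business analysis"),
           "(?<! )(ai|science|business)(?! )")
# ""  "data science"  "business analysis"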
Get counts for r, ms, ai, and go (the short skills removed from the dictionary above)
data_sci_count <- data.frame(
  r  = sum(str_count(data_scientists$description, " r | r,| r\\.")),
  ms = sum(str_count(data_scientists$description, " ms | ms,| ms\\.")),
  go = sum(str_count(data_scientists$description, " go ")),
  ai = sum(str_count(data_scientists$description, " ai | ai,| ai\\.")))
data_ana_count <- data.frame(
  r  = sum(str_count(data_analysts$description, " r | r,| r\\.")),
  ms = sum(str_count(data_analysts$description, " ms | ms,| ms\\.")),
  go = sum(str_count(data_analysts$description, " go ")),
  ai = sum(str_count(data_analysts$description, " ai | ai,| ai\\.")))
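The space/comma/period alternation misses tokens at the start of a description or followed by other punctuation (e.g. "r/python", "(r)"); word boundaries are a slightly more robust variant (sketch, not a change to the counts above):
sum(str_count(data_scientists$description, "\\br\\b"))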
Detect dictionary words in original descriptions
# For each posting, keep every dictionary skill whose pattern appears in the description
setDT(data_scientists)[, skills := paste(dictionary[unlist(lapply(dictionary, function(x) grepl(x, description, ignore.case = TRUE)))], collapse = ","), by = 1:nrow(data_scientists)]
setDT(data_analysts)[, skills := paste(dictionary[unlist(lapply(dictionary, function(x) grepl(x, description, ignore.case = TRUE)))], collapse = ","), by = 1:nrow(data_analysts)]
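One caveat: grepl() treats each dictionary entry as a regular expression, so an entry containing metacharacters (a hypothetical "c++", say) is safer matched literally:
grepl("c++", "experience with c++ and r", fixed = TRUE)  # TRUE, matched as a literal string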
# Create a count of skills for data science
skillsfreq_ds <- data_scientists %>%
  separate_rows(skills, sep = ",") %>%
  group_by(skills = tolower(skills)) %>%
  summarise(count = n())
# Create a count of skills for data analyst
skillsfreq_da <- data_analysts %>%
  separate_rows(skills, sep = ",") %>%
  group_by(skills = tolower(skills)) %>%
  summarise(count = n())
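As an aside, dplyr's count() collapses the separate group_by()/summarise() pair; an equivalent sketch for the data science side:
skillsfreq_ds <- data_scientists %>%
  separate_rows(skills, sep = ",") %>%
  count(skills = tolower(skills), name = "count")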
# Reshape and merge in the counts for r, ms, ai, and go
data_sci_count <- data_sci_count %>% pivot_longer(everything(), names_to = "skills", values_to = "count")
data_ana_count <- data_ana_count %>% pivot_longer(everything(), names_to = "skills", values_to = "count")
skillsfreq_ds <- merge(skillsfreq_ds, data_sci_count, all = TRUE)
skillsfreq_da <- merge(skillsfreq_da, data_ana_count, all = TRUE)
Table of top data scientist skills
top_skills_ds <- skillsfreq_ds %>% arrange(desc(count)) %>% select(skills, count)
head(top_skills_ds) %>%
kbl(caption = "Top Skills") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Top Skills

| skills           | count |
|------------------|------:|
| python           |  5250 |
| machine learning |  1693 |
| design           |  1468 |
| computer         |  1460 |
| science          |  1383 |
| research         |  1254 |
Table of top data analyst skills
top_skills_da <- skillsfreq_da %>% arrange(desc(count)) %>% select(skills, count)
head(top_skills_da) %>%
kbl(caption = "Top Skills") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Top Skills

| skills        | count |
|---------------|------:|
| python        |  1275 |
| research      |   785 |
| communication |   776 |
| analytical    |   653 |
| design        |   627 |
| organization  |   605 |
Merge top data skills for data scientist and data analyst
skillsfreq_all <- full_join(skillsfreq_ds, skillsfreq_da, by = "skills") %>%
  rename(data_scientist = count.x, data_analyst = count.y)
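Skills found in only one job family come through the full join as NA; if those should be treated as zero before ranking, a sketch with tidyr's replace_na():
skillsfreq_all <- skillsfreq_all %>%
  mutate(across(c(data_scientist, data_analyst), ~ replace_na(.x, 0)))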
Table of top data skills
top_skills_all <- skillsfreq_all %>% arrange(desc(data_scientist)) %>% select(skills, data_scientist, data_analyst)
head(top_skills_all) %>%
kbl(caption = "Top Skills") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Top Skills

| skills           | data_scientist | data_analyst |
|------------------|---------------:|-------------:|
| python           |           5250 |         1275 |
| machine learning |           1693 |          207 |
| design           |           1468 |          627 |
| computer         |           1460 |          380 |
| science          |           1383 |          312 |
| research         |           1254 |          785 |
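To close the comparison visually, a sketch in the style of the n-gram plots above, charting the top skills for both roles side by side (assumes NAs have been zero-filled as sketched earlier):
top_skills_all %>%
  head(10) %>%
  pivot_longer(c(data_scientist, data_analyst), names_to = "role", values_to = "count") %>%
  ggplot(aes(x = reorder(skills, count), y = count, fill = role)) +
  geom_col(position = "dodge") + coord_flip() + theme_bw() +
  ggtitle("Top Skills: Data Scientist vs. Data Analyst") + labs(x = "", y = "")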