# Load libraries (readr and stringr are attached as part of the tidyverse)
library(tidyverse)
library(stringi)
library(tm)
library(corpus)
library(wordcloud)
library(data.table)
library(kableExtra)
Read in data and filter by position
all_data <- read.csv("https://raw.githubusercontent.com/cassandra-coste/CUNY607/main/project_3/raw_jobdata.csv")
all_data$position <- tolower(all_data$position)
# Lowercase descriptions and strip the mojibake left by mis-decoded curly apostrophes
all_data$description <- tolower(all_data$description) %>%
  str_remove_all("â|€|™")
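As an aside, "â€™" is the UTF-8 right single quote mis-decoded as Windows-1252, so the damage can also be reversed rather than stripped. A minimal sketch with stringi on a hypothetical input string, assuming the text round-trips through Windows-1252 cleanly:
bad <- "candidateâ€™s"   # hypothetical mojibake example
raw_bytes <- stri_encode(bad, from = "UTF-8", to = "windows-1252", to_raw = TRUE)[[1]]
good <- rawToChar(raw_bytes)   # reinterpret the recovered bytes
Encoding(good) <- "UTF-8"
good   # "candidate's" with a proper curly apostrophe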
# Filter for data science positions (the [b-z] class deliberately skips "data analyst")
data_scientists <- all_data %>%
  filter(str_detect(position, "data [b-z]|ai|machine"))
# Filter for data analyst positions
data_analysts <- all_data %>%
  filter(str_detect(position, "anal"))
rm(all_data)
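One caveat with the pattern above: the bare "ai" alternative also matches inside words, so a title like "maintenance analyst" would land in the data science set. A quick check on hypothetical titles, showing how a word boundary would narrow it if that ever matters:
str_detect(c("ai engineer", "maintenance analyst"), "ai")        # TRUE TRUE
str_detect(c("ai engineer", "maintenance analyst"), "\\bai\\b")  # TRUE FALSE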
Filter for targeted skills
Using the same search approach as above, create new columns containing the strings of interest to work with later. One issue with this approach is that it produces a large number of empty (NA) values.
# The same twelve context windows are pulled for both job families, so define
# the extraction once and apply it to each data frame
extract_snippets <- function(df) {
  df %>%
    mutate(skill       = str_extract_all(description, " .{75,100} skill.{150,200} "),
           must_have   = str_extract_all(description, "must have.{150,200} "),
           knowledge   = str_extract_all(description, " .{75,100} knowledge.{150,200} "),
           experience  = str_extract_all(description, " .{100,150} exper.{100,150} "),
           excel       = str_extract_all(description, "excel at.{150,200} |excel with.{150,200} |excel in.{150,200} "),
           responsible = str_extract_all(description, "responsible.{150,200} "),
           proficient  = str_extract_all(description, " .{100,150} profi.{110,160} "),
           understands = str_extract_all(description, " .{100,150} understand.{150,200} "),
           utilize     = str_extract_all(description, "utilize.{150,200} "),
           lead        = str_extract_all(description, " .{150,200} lead.{150,200} "),
           work        = str_extract_all(description, " .{50,75} work.{150,200} "),
           looking     = str_extract_all(description, "looking.{150,200} "))
}
ds <- extract_snippets(data_scientists)
# Flatten each list-column of matches into a single space-separated string
collapse_hits <- function(col) lapply(col, function(x) paste(unlist(x), collapse = " "))
extract_cols <- c("skill", "must_have", "knowledge", "experience", "excel", "responsible",
                  "proficient", "understands", "utilize", "lead", "work", "looking")
ds[extract_cols] <- lapply(ds[extract_cols], collapse_hits)
da <- extract_snippets(data_analysts)
da[extract_cols] <- lapply(da[extract_cols], collapse_hits)
Make the corpus; remove punctuation, numbers, and stopwords; convert case; strip whitespace
# text holds the extracted description snippets assembled above
corpus <- VCorpus(VectorSource(text))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
# Drop the anchor words used for extraction so they don't dominate the cloud
# (note: "excels at" is effectively a no-op here, since stopword removal already dropped "at")
corpus <- tm_map(corpus, removeWords, c("skill", "responsible", "proficient", "knowledge", "understands", "must", "experience", "character", "will", "looking", "excels at", "work", "lead", "utilize"))
corpus_Clean <- tm_map(corpus, stripWhitespace)
wordcloud(corpus_Clean, max.words = 50, colors = colorRampPalette(brewer.pal(7, "Dark2"))(32))

Tokenize the text body into unigrams (one word), bigrams (two words), and trigrams (three words)
# Unigrams
unigramTokenizer <- function(x) { unlist(lapply(ngrams(words(x), 1), paste, collapse = " "), use.names = FALSE) }
unigram <- TermDocumentMatrix(corpus, control = list(wordLengths = c(1, 20), tokenize = unigramTokenizer))
# Bigrams
bigramTokenizer <- function(x) { unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE) }
bigram <- TermDocumentMatrix(corpus, control = list(wordLengths = c(3, 40), tokenize = bigramTokenizer))
# Trigrams
trigramTokenizer <- function(x) { unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE) }
trigram <- TermDocumentMatrix(corpus, control = list(wordLengths = c(3, 60), tokenize = trigramTokenizer))
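As a sanity check on what these tokenizers emit, NLP's ngrams() can be run directly on a toy token vector (hypothetical input):
toks <- c("machine", "learning", "with", "python")
unlist(lapply(ngrams(toks, 2), paste, collapse = " "))
# "machine learning" "learning with" "with python"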
Plot unigram
# Unigrams
unigramrow <- sort(slam::row_sums(unigram), decreasing = TRUE)
unigramfreq <- data.table(tok = names(unigramrow), freq = unigramrow)
ggplot(unigramfreq[1:25, ], aes(x = reorder(tok, freq), y = freq)) +
  geom_col(fill = "coral") + coord_flip() + theme_bw() +
  ggtitle("Top 25 Unigrams") + labs(x = "", y = "")

Plot bigram
# Bigrams
bigramrow <- sort(slam::row_sums(bigram), decreasing = TRUE)
bigramfreq <- data.table(tok = names(bigramrow), freq = bigramrow)
ggplot(bigramfreq[1:25, ], aes(x = reorder(tok, freq), y = freq)) +
  geom_col(fill = "coral") + coord_flip() + theme_bw() +
  ggtitle("Top 25 Bigrams") + labs(x = "", y = "")

Plot trigram
# Trigrams
trigramrow <- sort(slam::row_sums(trigram), decreasing = TRUE)
trigramfreq <- data.table(tok = names(trigramrow), freq = trigramrow)
ggplot(trigramfreq[1:25, ], aes(x = reorder(tok, freq), y = freq)) +
  geom_col(fill = "coral") + coord_flip() + theme_bw() +
  ggtitle("Top 25 Trigrams") + labs(x = "", y = "")

Create dictionary and search for its strings in the description column
# Load dictionaries (the ?token= query strings on raw.githubusercontent.com URLs
# expire, so they are dropped here; the repos must be public for these to resolve)
dictionary_analyst <- read.csv("https://raw.githubusercontent.com/cassandra-coste/CUNY607/main/project_3/Eric_DataAnalystDictionary.csv", header = FALSE, fileEncoding = "UTF-8-BOM")
dictionary_ngrams <- read.csv("https://raw.githubusercontent.com/ericonsi/Project3/master/dictionary/Ngrams_dictionary.csv", fileEncoding = "UTF-8-BOM")
os <- read.csv("https://raw.githubusercontent.com/ericonsi/Project3/master/dictionary/OS_dictionary_skills.csv", fileEncoding = "UTF-8-BOM")
onet <- read.csv("https://raw.githubusercontent.com/ericonsi/Project3/master/dictionary/ONET%20Technology%20Skills.csv", fileEncoding = "UTF-8-BOM")
# Create dictionaries with a single skill column for the merge
dictionary_onet <- onet %>% select(skill = Skill)
dictionary_os <- os %>% select(skill)
# Assign the same column name to the analyst and ngram dictionaries
names(dictionary_analyst) <- "skill"
names(dictionary_ngrams) <- "skill"
# Convert to lowercase and remove special characters where needed
dictionary_analyst <- dictionary_analyst %>% mutate(across(where(is.character), tolower))
dictionary_ngrams <- dictionary_ngrams %>% mutate(across(where(is.character), tolower))
dictionary_os <- dictionary_os %>% mutate(across(where(is.character), tolower))
dictionary_onet <- dictionary_onet %>%
  mutate(across(where(is.character), tolower)) %>%
  mutate(across(everything(), ~ gsub("[[:punct:]]", "", .x)))
# Merge all four dictionaries and drop duplicate skills
MyMerge <- function(x, y) merge(x, y, all = TRUE)
dictionary <- Reduce(MyMerge, list(dictionary_analyst, dictionary_ngrams, dictionary_onet, dictionary_os)) %>%
  distinct()
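Since all four frames now share the single skill column, bind_rows() is an equivalent and slightly more direct alternative (sketch):
dictionary <- bind_rows(dictionary_analyst, dictionary_ngrams, dictionary_onet, dictionary_os) %>%
  distinct()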
# Remove common character strings found within words and phrases so they can be analyzed separately
# Drop skills that should be excluded when they appear alone but kept when part of a phrase
# (lookbehind/lookahead: match only when not surrounded by spaces, i.e. standing alone)
dictionary$skill <- str_remove(dictionary$skill, "(?<! )(ai|science|business)(?! )")
# Remove short skills that would otherwise be picked up inside longer words
dictionary$skill <- str_remove(dictionary$skill, "(?:^|\\W)(r|c|ms|go)(?:$|\\W)")
# Turn the dictionary into a vector and drop emptied rows
dictionary <- dictionary[!apply(dictionary == "", 1, all), ]
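A quick illustration of the lookaround behavior on hypothetical dictionary entries: the bare terms are blanked while phrases survive.
str_remove(c("ai", "data science", "business analysis"),
           "(?<! )(ai|science|business)(?! )")
# ""  "data science"  "business analysis"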
Get counts for r, ms, ai, and go (the short skills removed from the dictionary above)
data_sci_count <- data.frame(
  r  = sum(str_count(data_scientists$description, " r | r,| r\\.")),
  ms = sum(str_count(data_scientists$description, " ms | ms,| ms\\.")),
  go = sum(str_count(data_scientists$description, " go ")),
  ai = sum(str_count(data_scientists$description, " ai | ai,| ai\\.")))
data_ana_count <- data.frame(
  r  = sum(str_count(data_analysts$description, " r | r,| r\\.")),
  ms = sum(str_count(data_analysts$description, " ms | ms,| ms\\.")),
  go = sum(str_count(data_analysts$description, " go ")),
  ai = sum(str_count(data_analysts$description, " ai | ai,| ai\\.")))
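The space/comma/period alternation misses tokens at the start of a description or followed by other punctuation (e.g. "r/python", "(r)"); word boundaries are a slightly more robust variant (sketch, not a change to the counts above):
sum(str_count(data_scientists$description, "\\br\\b"))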
Detect dictionary words in original descriptions
# For each posting, keep every dictionary skill whose pattern appears in the description
setDT(data_scientists)[, skills := paste(dictionary[unlist(lapply(dictionary, function(x) grepl(x, description, ignore.case = TRUE)))], collapse = ","), by = 1:nrow(data_scientists)]
setDT(data_analysts)[, skills := paste(dictionary[unlist(lapply(dictionary, function(x) grepl(x, description, ignore.case = TRUE)))], collapse = ","), by = 1:nrow(data_analysts)]
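One caveat: grepl() treats each dictionary entry as a regular expression, so an entry containing metacharacters (a hypothetical "c++", say) is safer matched literally:
grepl("c++", "experience with c++ and r", fixed = TRUE)  # TRUE, matched as a literal string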
# Create a count of skills for data science
skillsfreq_ds <- data_scientists %>%
  separate_rows(skills, sep = ",") %>%
  group_by(skills = tolower(skills)) %>%
  summarise(count = n())
# Create a count of skills for data analyst
skillsfreq_da <- data_analysts %>%
  separate_rows(skills, sep = ",") %>%
  group_by(skills = tolower(skills)) %>%
  summarise(count = n())
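As an aside, dplyr's count() collapses the separate group_by()/summarise() pair; an equivalent sketch for the data science side:
skillsfreq_ds <- data_scientists %>%
  separate_rows(skills, sep = ",") %>%
  count(skills = tolower(skills), name = "count")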
# Reshape and merge in the counts for r, ms, ai, and go
data_sci_count <- data_sci_count %>% pivot_longer(everything(), names_to = "skills", values_to = "count")
data_ana_count <- data_ana_count %>% pivot_longer(everything(), names_to = "skills", values_to = "count")
skillsfreq_ds <- merge(skillsfreq_ds, data_sci_count, all = TRUE)
skillsfreq_da <- merge(skillsfreq_da, data_ana_count, all = TRUE)
Table of top data scientist skills
top_skills_ds <- skillsfreq_ds %>% arrange(desc(count)) %>% select(skills, count)
head(top_skills_ds) %>%
kbl(caption = "Top Skills") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Top Skills

| skills           | count |
|------------------|------:|
| python           |  5250 |
| machine learning |  1693 |
| design           |  1468 |
| computer         |  1460 |
| science          |  1383 |
| research         |  1254 |
Table of top data analyst skills
top_skills_da <- skillsfreq_da %>% arrange(desc(count)) %>% select(skills, count)
head(top_skills_da) %>%
kbl(caption = "Top Skills") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Top Skills

| skills        | count |
|---------------|------:|
| python        |  1275 |
| research      |   785 |
| communication |   776 |
| analytical    |   653 |
| design        |   627 |
| organization  |   605 |
Merge top data skills for data scientist and data analyst
skillsfreq_all <- full_join(skillsfreq_ds, skillsfreq_da, by = "skills") %>%
  rename(data_scientist = count.x, data_analyst = count.y)
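Skills found in only one job family come through the full join as NA; if those should be treated as zero before ranking, a sketch with tidyr's replace_na():
skillsfreq_all <- skillsfreq_all %>%
  mutate(across(c(data_scientist, data_analyst), ~ replace_na(.x, 0)))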
Table of top data skills
top_skills_all <- skillsfreq_all %>% arrange(desc(data_scientist)) %>% select(skills, data_scientist, data_analyst)
head(top_skills_all) %>%
kbl(caption = "Top Skills") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Top Skills

| skills           | data_scientist | data_analyst |
|------------------|---------------:|-------------:|
| python           |           5250 |         1275 |
| machine learning |           1693 |          207 |
| design           |           1468 |          627 |
| computer         |           1460 |          380 |
| science          |           1383 |          312 |
| research         |           1254 |          785 |
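To close the comparison visually, a sketch in the style of the n-gram plots above, charting the top skills for both roles side by side (assumes NAs have been zero-filled as sketched earlier):
top_skills_all %>%
  head(10) %>%
  pivot_longer(c(data_scientist, data_analyst), names_to = "role", values_to = "count") %>%
  ggplot(aes(x = reorder(skills, count), y = count, fill = role)) +
  geom_col(position = "dodge") + coord_flip() + theme_bw() +
  ggtitle("Top Skills: Data Scientist vs. Data Analyst") + labs(x = "", y = "")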