library(readr)
library(tidyverse)
library(stringi)
library(tm)
library(corpus)
library(wordcloud)
library(data.table)
library(stringr)
library(kableExtra)

Read in data and filter by position

all_data<-read.csv("https://raw.githubusercontent.com/cassandra-coste/CUNY607/main/project_3/raw_jobdata.csv")
all_data$position<-tolower(all_data$position)
all_data$description<-tolower(all_data$description)
all_data$description<-tolower(all_data$description)%>%
  str_remove_all("â|€|™")

##filters for data science positions
data_scientists<-all_data%>%
  mutate(contents = str_detect(tolower(position), "data [b-z]|ai|machine"))%>%
  filter(contents == TRUE)

data_analysts<-all_data%>%
  mutate(contents = str_detect(tolower(position), "anal"))%>%
  filter(contents == TRUE)

rm(all_data) 

Filter for targeted skills

using the same search criteria above make new columns containing the strings of interest to be worked with later. issues with this is there are tons of NA values

ds<-data_scientists%>%
  mutate(skill = str_extract_all(data_scientists$description, " .{75,100}   skill.{150,200} "))%>%
  mutate(must_have=str_extract_all(data_scientists$description, "must have.{150,200} "))%>%
  mutate(knowledge=str_extract_all(data_scientists$description, " .{75,100} knowledge.{150,200} "))%>%
  mutate(experience=str_extract_all(data_scientists$description, " .{100,150} exper.{100,150} "))%>%
  mutate(excel=str_extract_all(data_scientists$description, "excel at.{150,200} |excel with.{150,200} |excel in.{150,200} "))%>%
  mutate(responsible=str_extract_all(data_scientists$description, "responsible.{150,200} "))%>%
  mutate(proficient=str_extract_all(data_scientists$description," .{100,150} profi.{110,160} "))%>%
  mutate(understands=str_extract_all(data_scientists$description, " .{100,150} understand.{150,200} "))%>%
  mutate(utilize=str_extract_all(data_scientists$description, "utilize.{150,200} "))%>%
  mutate(lead=str_extract_all(data_scientists$description, " .{150,200} lead.{150,200} "))%>%
  mutate(work=str_extract_all(data_scientists$description, " .{50,75} work.{150,200} "))%>%
  mutate(looking=str_extract_all(data_scientists$description, "looking.{150,200} "))


ds$skill<-lapply(ds$skill, function(x)paste(unlist(x), collapse=' '))

ds$must_have<-lapply(ds$must_have, function(x)paste(unlist(x), collapse=' '))

ds$knowledge<-lapply(ds$knowledge, function(x)paste(unlist(x), collapse=' '))

ds$understands<-lapply(ds$understands, function(x)paste(unlist(x), collapse=' '))

ds$experience<-lapply(ds$experience, function(x)paste(unlist(x), collapse=' '))

ds$excel<-lapply(ds$excel, function(x)paste(unlist(x), collapse=' '))

ds$responsible<-lapply(ds$responsible, function(x)paste(unlist(x), collapse=' '))

ds$proficient<-lapply(ds$proficient, function(x)paste(unlist(x), collapse=' '))

ds$understands<-lapply(ds$understands, function(x)paste(unlist(x), collapse=' '))

ds$utilize<-lapply(ds$utilize, function(x)paste(unlist(x), collapse=' '))

ds$lead<-lapply(ds$lead, function(x)paste(unlist(x), collapse=' '))

ds$work<-lapply(ds$work, function(x)paste(unlist(x), collapse=' '))

ds$looking<-lapply(ds$looking, function(x)paste(unlist(x), collapse=' '))
da<-data_analysts%>%
  mutate(skill = str_extract_all(data_analysts$description, " .{75,100}   skill.{150,200} "))%>%
  mutate(must_have=str_extract_all(data_analysts$description, "must have.{150,200} "))%>%
  mutate(knowledge=str_extract_all(data_analysts$description, " .{75,100} knowledge.{150,200} "))%>%
  mutate(experience=str_extract_all(data_analysts$description, " .{100,150} exper.{100,150} "))%>%
  mutate(excel=str_extract_all(data_analysts$description, "excel at.{150,200} |excel with.{150,200} |excel in.{150,200} "))%>%
  mutate(responsible=str_extract_all(data_analysts$description, "responsible.{150,200} "))%>%
  mutate(proficient=str_extract_all(data_analysts$description," .{100,150} profi.{110,160} "))%>%
  mutate(understands=str_extract_all(data_analysts$description, " .{100,150} understand.{150,200} "))%>%
  mutate(utilize=str_extract_all(data_analysts$description, "utilize.{150,200} "))%>%
  mutate(lead=str_extract_all(data_analysts$description, " .{150,200} lead.{150,200} "))%>%
  mutate(work=str_extract_all(data_analysts$description, " .{50,75} work.{150,200} "))%>%
  mutate(looking=str_extract_all(data_analysts$description, "looking.{150,200} "))


da$skill<-lapply(da$skill, function(x)paste(unlist(x), collapse=' '))

da$must_have<-lapply(da$must_have, function(x)paste(unlist(x), collapse=' '))

da$knowledge<-lapply(da$knowledge, function(x)paste(unlist(x), collapse=' '))

da$understands<-lapply(da$understands, function(x)paste(unlist(x), collapse=' '))

da$experience<-lapply(da$experience, function(x)paste(unlist(x), collapse=' '))

da$excel<-lapply(da$excel, function(x)paste(unlist(x), collapse=' '))

da$responsible<-lapply(da$responsible, function(x)paste(unlist(x), collapse=' '))

da$proficient<-lapply(da$proficient, function(x)paste(unlist(x), collapse=' '))

da$understands<-lapply(da$understands, function(x)paste(unlist(x), collapse=' '))

da$utilize<-lapply(da$utilize, function(x)paste(unlist(x), collapse=' '))

da$lead<-lapply(da$lead, function(x)paste(unlist(x), collapse=' '))

da$work<-lapply(da$work, function(x)paste(unlist(x), collapse=' '))

da$looking<-lapply(da$looking, function(x)paste(unlist(x), collapse=' '))

Transform Data Science dataframe to long

ds_long <- ds %>% gather(keyword, text, 7:18)

text <- ds_long %>% select(text)

Make corpus and remove punctuation, numbers, stopwords, convert cases, etc

corpus <- VCorpus(VectorSource(text))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removeWords, c("skill","responsible","proficient","knowledge","understands", "must", "experience", "character", "will", "looking", "excels at", "work", "lead", "utilize"))
corpus_Clean <- tm_map(corpus, stripWhitespace)
wordcloud(corpus, max.words = 50, colors = colorRampPalette(brewer.pal(7, "Dark2"))(32))

Tokenization of textbody into unigrams (one word), bigrams (two words), trigrams (three words), and quadgrams(four words)

#Unigrams
unigramTokenizer <- function(x) { unlist(lapply(ngrams(words(x), 1), paste, collapse = " "), use.names = FALSE) }
unigram <- TermDocumentMatrix(corpus, control = list(wordLengths = c(1, 20)))


#Bigrams
bigramTokenizer <- function(x) { unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE) }
bigram <- TermDocumentMatrix(corpus, control = list(wordLengths = c(3, 40),tokenize = bigramTokenizer))


#Trigrams
trigramTokenizer <- function(x) { unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE) }
trigram <- TermDocumentMatrix(corpus, control = list(wordLengths = c(3, 60),tokenize = trigramTokenizer))

Plot unigram

#Unigrams

unigramrow <- sort(slam::row_sums(unigram), decreasing=T)
unigramfreq <- data.table(tok = names(unigramrow), freq = unigramrow)


ggplot(unigramfreq[1:25,], aes(x = reorder(tok,freq), y = freq)) + coord_flip() +
     geom_bar(stat = "identity", fill = "coral") + theme_bw() +
     ggtitle("Top 25 Unigrams") +labs(x = "", y = "")

Plot bigram

#Bigrams

bigramrow <- sort(slam::row_sums(bigram), decreasing=T)
bigramfreq <- data.table(tok = names(bigramrow), freq = bigramrow)

ggplot(bigramfreq[1:25,], aes(x = reorder(tok,freq), y = freq)) + coord_flip() +
     geom_bar(stat = "identity", fill = "coral") + theme_bw() +
     ggtitle("Top 25 Bigrams") +labs(x = "", y = "")

Plot trigram

#Trigrams

trigramrow <- sort(slam::row_sums(trigram), decreasing=T)
trigramfreq <- data.table(tok = names(trigramrow), freq = trigramrow)

ggplot(trigramfreq[1:25,], aes(x = reorder(tok,freq), y = freq)) + coord_flip() +
     geom_bar(stat = "identity", fill = "coral") + theme_bw() +
     ggtitle("Top 25 Trigrams") +labs(x = "", y = "")

Create dictionary and search for string in column

#Load dictionary 

dictionary_analyst <- read.csv("https://raw.githubusercontent.com/cassandra-coste/CUNY607/main/project_3/Eric_DataAnalystDictionary.csv", header=FALSE, fileEncoding = "UTF-8-BOM")

dictionary_ngrams <- read.csv("https://raw.githubusercontent.com/ericonsi/Project3/master/dictionary/Ngrams_dictionary.csv?token=ASWDA45DKKSCY6O2LEEIEZ3AMNTEM", fileEncoding = "UTF-8-BOM")

os <- read.csv("https://raw.githubusercontent.com/ericonsi/Project3/master/dictionary/OS_dictionary_skills.csv?token=ASWDA4ZNPKGI4AE2TNYGNTDAMPRL2", fileEncoding = "UTF-8-BOM")

onet <- read.csv("https://raw.githubusercontent.com/ericonsi/Project3/master/dictionary/ONET%20Technology%20Skills.csv?token=ASWDA42WGNR5QH4OV5EZATTAMPSEI", fileEncoding = "UTF-8-BOM")


# create dictionary with single skill column for merge

dictionary_onet <- onet %>% select(skill = Skill)
dictionary_os <- os %>% select(skill)

#Assign column name to analyst dictionary 

names(dictionary_analyst) <- ('skill')
names(dictionary_ngrams) <- ('skill')

#convert to lowercase and remove special characters where needed

dictionary_analyst <- dictionary_analyst %>% mutate(across(where(is.character), tolower))
dictionary_ngrams <- dictionary_ngrams  %>% mutate(across(where(is.character), tolower))
dictionary_os <- dictionary_os  %>% mutate(across(where(is.character), tolower))
dictionary_onet <- dictionary_onet  %>% mutate(across(where(is.character), tolower)) %>% mutate_all(funs(gsub("[[:punct:]]", "", .)))
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
#merge dictionaries and remove duplicates 

MyMerge <- function(x, y){
  df <- merge(x, y, all = TRUE)
  return(df)
}

# merge all four dictionaries and delete duplicate skills

dictionary <- Reduce(MyMerge, list(dictionary_analyst, dictionary_ngrams, dictionary_onet, dictionary_os)) %>% distinct()


#remove common character strings found within words and phrases to analyze separately 

#remove skills that need to be removed when then appear alone but not when in a phrase

dictionary$skill <- str_remove(dictionary$skill, "(?! )(ai|science|business)(?! )")

#remove short character skills that will be picked up within words

dictionary$skill <- str_remove(dictionary$skill, "(?:^|\\W)(r|c|ms|go)(?:$|\\W)")

# turn dictionary into vector and remove empty rows

dictionary <- dictionary[!apply(dictionary == "", 1, all),]

get counts for r, ms, ai, go

data_sci_count<-data.frame("r" = sum(str_count(data_scientists$description, " r | r,| r\\.")), "ms" = sum(str_count(data_scientists$description, " ms | ms,| ms.")), Go = sum(str_count(data_scientists$description, " go ")), "ai" = sum(str_count(data_scientists$description, " ai | ai,| ai\\.")))

data_ana_count<-data.frame("r" = sum(str_count(data_analysts$description, " r | r,| r\\.")), "ms" = sum(str_count(data_analysts$description, " ms | ms,| ms\\.")), Go = sum(str_count(data_analysts$description, " go ")), "ai" = sum(str_count(data_analysts$description, " ai | ai,| ai\\.")))

Detect dictionary words in original descriptions

# Pulls skills out of description based on dictionary

setDT(data_scientists)[, skills := paste(dictionary[unlist(lapply(dictionary, function(x) grepl(x, description, ignore.case = T)))], collapse = ","), by = 1:nrow(data_scientists)]

setDT(data_analysts)[, skills := paste(dictionary[unlist(lapply(dictionary, function(x) grepl(x, description, ignore.case = T)))], collapse = ","), by = 1:nrow(data_analysts)]



# Create a count of skills for data science

skillsfreq_ds <- data_scientists %>%
  separate_rows(skills, sep = ',') %>%
  group_by(skills = tolower(skills)) %>%
  summarise(count = n())

# Create a count of skills for data analyst

skillsfreq_da <- data_analysts %>%
  separate_rows(skills, sep = ',') %>%
  group_by(skills = tolower(skills)) %>%
  summarise(count = n())

# Merge counts for r, ms, ai, go

data_sci_count <- data_sci_count %>% gather(skills, count, 1:4)

data_ana_count <- data_ana_count %>% gather(skills, count, 1:4)

skillsfreq_ds <- merge(skillsfreq_ds, data_sci_count, all = TRUE)

skillsfreq_da <- merge(skillsfreq_da, data_ana_count, all = TRUE)

Table top data scientist skills

top_skills_ds <- skillsfreq_ds %>% arrange(desc(count)) %>% select(skills, count)                     
head(top_skills_ds) %>%
  kbl(caption = "Top Skills") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Top Skills
skills count
python 5250
machine learning 1693
design 1468
computer 1460
science 1383
research 1254

Table top data analyst skills

top_skills_da <- skillsfreq_da %>% arrange(desc(count)) %>% select(skills, count)                     
head(top_skills_da) %>%
  kbl(caption = "Top Skills") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Top Skills
skills count
python 1275
research 785
communication 776
analytical 653
design 627
organization 605

Merge top data skills for data scientist and data analyst

skillsfreq_all <- full_join(skillsfreq_ds ,skillsfreq_da,by="skills") %>% rename(data_scientist = count.x, data_analyst = count.y)

Table top data skills

top_skills_all <- skillsfreq_all %>% arrange(desc(data_scientist)) %>% select(skills, data_scientist, data_analyst)       

head(top_skills_all) %>%
  kbl(caption = "Top Skills") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Top Skills
skills data_scientist data_analyst
python 5250 1275
machine learning 1693 207
design 1468 627
computer 1460 380
science 1383 312
research 1254 785