Hard and Tech Skills
Using the N-gram function, which can extract single-word, two-word, or longer word combinations, we extracted the most common hard, technical, and soft skills that we knew from prior experience and from the articles linked below. Below are the Top 20 Hard and Technical Skills in order of frequency.
The Most In-Demand Tech Skills for Data Scientists
Data Scientist Resume Sample Template & Data-Driven Guide
We used the N-gram function twice: once to extract single-word skills, and once more to extract two-word skills such as "quantitative analysis". In conjunction with the N-gram function, we leveraged dplyr functions to tidy our data, ggplot for the bar graphs, and reactable for the table visual of our data.
# Sources for the skill keyword lists used in this section.
# hard skills:
#https://towardsdatascience.com/the-most-in-demand-tech-skills-for-data-scientists-d716d10c191d
# technical, hard and soft skills:
#https://zety.com/blog/data-scientist-resume-example

# Job-description text taken from result_set; job_descr_text is the copy
# that is fed to createNgram().
job_descr_phrases <- result_set$job_descr
job_descr_text <- job_descr_phrases

# Build the one-word and two-word n-gram tables.
job_phrases_1 <- createNgram(job_descr_text, 1)
job_phrases_2 <- createNgram(job_descr_text, 2)

# Lowercase the token columns so that differently-cased spellings of the
# same term are counted together.
job_phrases_1$w1 <- tolower(job_phrases_1$w1)
job_phrases_2$w1w2 <- tolower(job_phrases_2$w1w2)
# Single-word hard and technical skills to look for among the one-word
# n-grams. Matching is done with %in% against lowercased tokens, so every
# entry must be spelled exactly as it appears in the text; a misspelled
# entry silently matches nothing. Each skill is listed once.
# NOTE(review): "R" can never match the lowercased tokens used in this
# section; it is kept only in case the vector is matched against
# non-lowercased text elsewhere -- confirm and drop if unused.
techskill <- c(
  "python", "R", "r", "sql", "spark", "hadoop", "java",
  "tableau", "aws", "sas", "hive", "tensorflow", "scala",
  "c++", "excel", "azure", "mathematics", "statistics",
  "programming", "debugging", "probability", "modeling",
  "matplotlib", "openrefine", "matlab", "bigml", "d3.js",
  "ggplot2", "jupyter", "nltk", "scikit-learn",
  "weka", "predictive", "ai", "c", "linux", "nosql",
  "basic", "bayesian", "mapreduce"
)

# Two-word hard skills to look for among the two-word n-grams.
hardskill <- c(
  "data visualization", "quantitative analysis", "data analysis",
  "artificial intelligence", "predictive analysis", "predictive modeling",
  "bachelors degree", "masters degree"
)
# Reduce the one-word n-gram table to the listed tech skills and total the
# remaining columns per skill, most frequent first.
# summarise_all(sum) replaces summarise_all(funs(sum)): funs() is deprecated
# in dplyr and passing the function directly gives the same result.
job_phrases_1 <-
  job_phrases_1 %>%
  # Drops the third column by position.
  # NOTE(review): which column that is depends on createNgram() -- confirm.
  select(-3) %>%
  filter(w1 %in% techskill) %>%
  group_by(w1) %>%
  summarise_all(sum) %>%
  arrange(desc(freq))

# Give the skill column the display name shared with the two-word table.
colnames(job_phrases_1)[1] <- "Hard and Tech Skills"
# Hard and tech two-word skills.
# Reduce the two-word n-gram table to the listed hard skills and total the
# remaining columns per skill, most frequent first.
# summarise_all(sum) replaces summarise_all(funs(sum)): funs() is deprecated
# in dplyr and passing the function directly gives the same result.
job_phrases_2 <-
  job_phrases_2 %>%
  # Drops the third column by position.
  # NOTE(review): which column that is depends on createNgram() -- confirm.
  select(-3) %>%
  filter(w1w2 %in% hardskill) %>%
  group_by(w1w2) %>%
  summarise_all(sum) %>%
  arrange(desc(freq))

# Same display name as the one-word table so the two can be row-bound.
colnames(job_phrases_2)[1] <- "Hard and Tech Skills"
# Stack the one-word and two-word skill counts into a single table,
# ordered from most to least frequent.
totaltech <- arrange(rbind(job_phrases_1, job_phrases_2), desc(freq))

# Preview the leading rows as a striped table.
kableExtra::kable_styling(
  knitr::kable(head(totaltech), caption = "Tech Skills for Data Science "),
  bootstrap_options = "striped"
)
Tech Skills for Data Science
Hard and Tech Skills
|
freq
|
python
|
531
|
sql
|
481
|
modeling
|
432
|
r
|
426
|
programming
|
284
|
spark
|
245
|
# Horizontal bar chart of the top 20 hard and tech skills.
# top_n() is called without a weight column, so it selects by the last
# column of totaltech (freq).
totaltech %>%
  top_n(20) %>%
  ggplot(aes(x = fct_reorder(`Hard and Tech Skills`, freq), y = freq)) +
  geom_col(fill = "#f68060", alpha = 0.6, width = 0.4) +
  coord_flip() +
  labs(x = "", y = "Frequency", title = "Top 20 Hard and Tech Skills")
## Selecting by freq

# Interactive, filterable table of every hard/tech skill count,
# paginated at 10 rows by default.
reactable(
  totaltech,
  bordered = TRUE,
  striped = TRUE,
  highlight = TRUE,
  filterable = TRUE,
  showPagination = TRUE,
  showPageSizeOptions = TRUE,
  pageSizeOptions = c(5, 10, 20),
  defaultPageSize = 10
)
Soft Skills
Note: for soft skills, we extracted words similar to one another, such as lead, leadership, and leader, and then manually grouped them together since they are all suggestive of the same skill. The soft skills were extracted and organized in a similar fashion to the hard and technical skills above.
# Soft skills, single word: rebuild the one-word n-gram table from the job
# text and lowercase its tokens so matching is case-insensitive.
job_phrases_3 <- createNgram(job_descr_text, 1)
job_phrases_3$w1 <- tolower(job_phrases_3$w1)
# Single-word soft skills to look for among the one-word n-grams.
# Matching is an exact %in% lookup, so entries must be spelled correctly;
# "perceptiveness" and "perceptive" were previously misspelled and could
# never match. Each skill is listed once.
softskill <- c(
  "communication", "collaboration", "teamwork", "collaborate",
  "professional", "veteran", "lead", "leadership", "leader",
  "innovation", "innovate", "innovative", "collaborative",
  "passionate", "creative", "motivated", "integrity", "effectiveness",
  "pioneering", "perceptiveness", "perceptive"
)
# Two-word soft skills to look for among the two-word n-grams.
softskill2 = c(
  "critical thinking", "problem solving",
  "interpersonal skills", "time management"
)

# Two-word n-gram table for the soft-skill search, tokens lowercased.
job_phrases_4 <- createNgram(job_descr_text, 2)
job_phrases_4$w1w2 <- tolower(job_phrases_4$w1w2)
# Reduce the one-word n-gram table to the listed soft skills and total the
# remaining columns per skill, most frequent first.
# summarise_all(sum) replaces summarise_all(funs(sum)): funs() is deprecated
# in dplyr and passing the function directly gives the same result.
job_phrases_3 <-
  job_phrases_3 %>%
  # Drops the third column by position.
  # NOTE(review): which column that is depends on createNgram() -- confirm.
  select(-3) %>%
  filter(w1 %in% softskill) %>%
  group_by(w1) %>%
  summarise_all(sum) %>%
  arrange(desc(freq))

# Give the skill column the display name shared with the two-word table.
colnames(job_phrases_3)[1] <- "Soft Skills"
# Soft skills, two words.
# Reduce the two-word n-gram table to the listed soft skills and total the
# remaining columns per skill, most frequent first.
# summarise_all(sum) replaces summarise_all(funs(sum)): funs() is deprecated
# in dplyr and passing the function directly gives the same result.
job_phrases_4 <-
  job_phrases_4 %>%
  # Drops the third column by position.
  # NOTE(review): which column that is depends on createNgram() -- confirm.
  select(-3) %>%
  filter(w1w2 %in% softskill2) %>%
  group_by(w1w2) %>%
  summarise_all(sum) %>%
  arrange(desc(freq))

# Same display name as the one-word table so the two can be row-bound.
colnames(job_phrases_4)[1] <- "Soft Skills"
# Combine the single-word and two-word soft skills into one table and fold
# spelling variants of the same skill into a single row.
#
# Variants are merged by name and their counts are summed from the data.
# The previous approach deleted rows at fixed positions and overwrote counts
# with hand-typed sums, which is only correct for the exact row order and
# frequencies of one particular data pull.
#
# The three families are the variant groups present in `softskill`
# (lead/leader/leadership, collaboration/collaborative/collaborate,
# innovation/innovate/innovative).
# NOTE(review): the labels "leadership" and "collaborate" follow the
# previously rendered table; the third family's label ("innovative") is an
# assumption -- confirm the intended grouping.
totalsoft <-
  rbind(job_phrases_3, job_phrases_4) %>%
  mutate(
    `Soft Skills` = case_when(
      `Soft Skills` %in% c("lead", "leader") ~ "leadership",
      `Soft Skills` %in% c("collaboration", "collaborative") ~ "collaborate",
      `Soft Skills` %in% c("innovation", "innovate") ~ "innovative",
      TRUE ~ `Soft Skills`
    )
  ) %>%
  group_by(`Soft Skills`) %>%
  summarise(freq = sum(freq)) %>%
  arrange(desc(freq)) # most frequent skill first
# Preview the leading rows of the soft-skill table as a striped table.
kableExtra::kable_styling(
  knitr::kable(head(totalsoft), caption = "Soft skills for Data Science jos "),
  bootstrap_options = "striped"
)
Soft skills for Data Science jos
Soft Skills
|
freq
|
leadership
|
601
|
collaborate
|
466
|
professional
|
400
|
communication
|
356
|
veteran
|
265
|
innovative
|
264
|
# Horizontal bar chart of the top 10 soft skills.
# top_n() is called without a weight column, so it selects by the last
# column of totalsoft (freq).
totalsoft %>%
  top_n(10) %>%
  ggplot(aes(x = fct_reorder(`Soft Skills`, freq), y = freq)) +
  geom_col(fill = "#f68060", alpha = 0.6, width = 0.4) +
  coord_flip() +
  labs(x = "", y = "Frequency", title = "Top Ten Soft Skills")
## Selecting by freq

# Interactive, filterable table of every soft-skill count,
# paginated at 5 rows by default.
reactable(
  totalsoft,
  bordered = TRUE,
  striped = TRUE,
  highlight = TRUE,
  filterable = TRUE,
  showPagination = TRUE,
  showPageSizeOptions = TRUE,
  pageSizeOptions = c(5, 10),
  defaultPageSize = 5
)