I came across a study conducted by Jeff Hale in 2018 about The Most In-Demand Skills for Data Scientists. More can be found here
I will be analyzing his two datasets to determine “Which are the most valued data science skills?”
I will use the two datasets from Jeff Hale’s study. I have hosted them on my GitHub repository.
# Raw GitHub URL of the general-skills keyword counts (CSV).
gen_skills_url <- "https://raw.githubusercontent.com/nathtrish334/Data-607/main/ds_general_skills_revised.csv"
# Raw GitHub URL of the job-listing *software* keyword counts (CSV).
# NOTE(review): despite the `soft_` prefix this points at the software file,
# whose rows are labelled "Tech" further down in this script.
soft_skills_url <- "https://raw.githubusercontent.com/nathtrish334/Data-607/main/ds_job_listing_software.csv"
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.6 v dplyr 1.0.4
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(rvest)
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
##
## pluck
## The following object is masked from 'package:readr':
##
## guess_encoding
library(tm)
## Warning: package 'tm' was built under R version 4.0.4
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(SnowballC)
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 4.0.4
library(stringr)
library(plotly)
## Warning: package 'plotly' was built under R version 4.0.4
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Download and parse the general-skills CSV into a tibble.
gen_skills <- gen_skills_url %>% read_csv()
##
## -- Column specification --------------------------------------------------------
## cols(
## Keyword = col_character(),
## LinkedIn = col_number(),
## Indeed = col_number(),
## SimplyHired = col_number(),
## Monster = col_number()
## )
# Download and parse the software-skills CSV into a tibble.
soft_skills <- soft_skills_url %>% read_csv()
##
## -- Column specification --------------------------------------------------------
## cols(
## Keyword = col_character(),
## LinkedIn = col_number(),
## Indeed = col_number(),
## SimplyHired = col_number(),
## Monster = col_number(),
## `LinkedIn %` = col_character(),
## `Indeed %` = col_character(),
## `SimplyHired %` = col_character(),
## `Monster %` = col_character(),
## `Avg %` = col_character(),
## `GlassDoor Self Reported % 2017` = col_character(),
## Difference = col_character()
## )
I cleaned the two datasets and joined them into one
# Build one long table of keyword counts with a Skill_Set label per source.
#
# Both inputs are reduced to the same five columns *by name* so the join does
# not depend on the column order of the CSVs (the software file carries extra
# percentage columns that are not needed here).
# General-skills rows are tagged "Soft", software rows are tagged "Tech";
# later filters rely on these exact labels.
df_1 <- gen_skills %>%
  as_tibble() %>%
  select(Keyword, LinkedIn, Indeed, SimplyHired, Monster) %>%
  mutate(Skill_Set = "Soft")
df_2 <- soft_skills %>%
  as_tibble() %>%
  select(Keyword, LinkedIn, Indeed, SimplyHired, Monster) %>%
  mutate(Skill_Set = "Tech")
# Stack the two tables; columns are matched by name.
combined_df <- bind_rows(df_1, df_2)
head(combined_df)
## # A tibble: 6 x 6
## Keyword LinkedIn Indeed SimplyHired Monster Skill_Set
## <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 machine learning 5701 3439 2561 2340 Soft
## 2 analysis 5168 3500 2668 3306 Soft
## 3 statistics 4893 2992 2308 2399 Soft
## 4 computer science 4517 2739 2093 1900 Soft
## 5 communication 3404 2344 1791 2053 Soft
## 6 mathematics 2605 1961 1497 1815 Soft
# Rename the variable
# Defensive only: normalises a mangled "ï..Keyword" header if one is present;
# when the column is already called "Keyword" this line changes nothing.
names(combined_df)[names(combined_df) == "ï..Keyword"] <- "Keyword"
# Tidying the combined dataframe
# Total = sum of the four per-site counts for each keyword. The columns are
# addressed by name and summed directly, which also works for 0- or 1-row
# input (the count columns are expected to be numeric already).
combined_df$Total <- rowSums(
  combined_df[, c("LinkedIn", "Indeed", "SimplyHired", "Monster")]
)
# Put the identifying columns first, then the per-site counts, then Total.
combined_df <- combined_df[, c(
  "Keyword", "Skill_Set",
  "LinkedIn", "Indeed", "SimplyHired", "Monster",
  "Total"
)]
# Downstream code indexes this object as a plain data.frame.
combined_df <- data.frame(combined_df)
colnames(combined_df)
## [1] "Keyword" "Skill_Set" "LinkedIn" "Indeed" "SimplyHired"
## [6] "Monster" "Total"
Keyword counts as a percentage of each site's listings
# Per-site denominators used to turn raw keyword counts into percentages.
# NOTE(review): presumably the total number of data science listings found on
# each site -- confirm against the original study.
LinkedIn <- 8610
Indeed <- 5138
SimplyHired <- 3829
Monster <- 3746
# Denominator for the Total column: all four sites combined.
Total <- sum(LinkedIn, Indeed, SimplyHired, Monster)

# Rescale every count column in place: count / denominator * 100.
combined_df[["LinkedIn"]] <- combined_df[["LinkedIn"]] / LinkedIn * 100
combined_df[["Indeed"]] <- combined_df[["Indeed"]] / Indeed * 100
combined_df[["SimplyHired"]] <- combined_df[["SimplyHired"]] / SimplyHired * 100
combined_df[["Monster"]] <- combined_df[["Monster"]] / Monster * 100
combined_df[["Total"]] <- combined_df[["Total"]] / Total * 100

# Quick look at the shape and the first rows of the rescaled table.
dim(combined_df)
## [1] 74 7
head(combined_df, n = 10)
## Keyword Skill_Set LinkedIn Indeed SimplyHired Monster Total
## 1 machine learning Soft 66.21370 66.93266 66.88430 62.46663 65.84908
## 2 analysis Soft 60.02323 68.11989 69.67877 88.25414 68.66764
## 3 statistics Soft 56.82927 58.23278 60.27683 64.04164 59.05360
## 4 computer science Soft 52.46225 53.30868 54.66179 50.72077 52.75524
## 5 communication Soft 39.53542 45.62086 46.77461 54.80513 44.98429
## 6 mathematics Soft 30.25552 38.16660 39.09637 48.45168 36.94602
## 7 visualization Soft 21.82346 27.50097 30.11230 32.22104 26.50659
## 8 AI composite Soft 18.21138 21.89568 21.18046 18.33956 19.65483
## 9 deep learning Soft 15.21487 19.05411 17.62862 16.17726 16.74248
## 10 NLP composite Soft 14.07666 17.71117 17.23688 15.53657 15.77639
# Rows labelled "Soft" (the general-skills keywords), in their original order.
combined_df_soft <- dplyr::filter(combined_df, Skill_Set == "Soft")
dim(combined_df_soft)
## [1] 30 7
# Ten soft skills with the highest Total percentage.
# Sort on Total *before* truncating: slicing rows 1:10 first would only
# re-order whichever ten keywords happen to come first in the file.
# head() also copes with fewer than ten rows without padding NA rows.
combined_df_soft_top10 <- combined_df_soft %>%
  arrange(desc(Total)) %>%
  select(Keyword, LinkedIn, Indeed, SimplyHired, Monster, Total) %>%
  head(10)
combined_df_soft_top10
## Keyword LinkedIn Indeed SimplyHired Monster Total
## 1 analysis 60.02323 68.11989 69.67877 88.25414 68.66764
## 2 machine learning 66.21370 66.93266 66.88430 62.46663 65.84908
## 3 statistics 56.82927 58.23278 60.27683 64.04164 59.05360
## 4 computer science 52.46225 53.30868 54.66179 50.72077 52.75524
## 5 communication 39.53542 45.62086 46.77461 54.80513 44.98429
## 6 mathematics 30.25552 38.16660 39.09637 48.45168 36.94602
## 7 visualization 21.82346 27.50097 30.11230 32.22104 26.50659
## 8 AI composite 18.21138 21.89568 21.18046 18.33956 19.65483
## 9 deep learning 15.21487 19.05411 17.62862 16.17726 16.74248
## 10 NLP composite 14.07666 17.71117 17.23688 15.53657 15.77639
# Top ten Soft skills as a percentage of total Data Science jobs
# One bar per keyword: x is the Keyword column (not the whole data frame),
# y is the Total percentage. The x categories are pinned to the row order of
# combined_df_soft_top10 so the bars appear in ranked order.
plot_ly(combined_df_soft_top10,
        x = ~Keyword,
        y = ~Total,
        type = 'bar',
        name = ~Keyword) %>%
  layout(title = 'Top 10 Soft Skills for Data Scientists',
         yaxis = list(title = '% of Data Science Jobs'),
         xaxis = list(title = 'Soft Skills',
                      categoryorder = 'array',
                      categoryarray = combined_df_soft_top10$Keyword))
# Per-site view of the top-ten soft skills used by the grouped bar chart.
# The top-ten table already holds exactly the columns needed, so it is used
# directly instead of being column-bound to a prefixed copy of itself.
soft_top10_by_site <- combined_df_soft_top10
soft_visual_top10 <- soft_top10_by_site[, c('Keyword', 'LinkedIn', 'Indeed', 'SimplyHired', 'Monster', 'Total')]
head(soft_visual_top10, 5)
## Keyword LinkedIn Indeed SimplyHired Monster Total
## 1 analysis 60.02323 68.11989 69.67877 88.25414 68.66764
## 2 machine learning 66.21370 66.93266 66.88430 62.46663 65.84908
## 3 statistics 56.82927 58.23278 60.27683 64.04164 59.05360
## 4 computer science 52.46225 53.30868 54.66179 50.72077 52.75524
## 5 communication 39.53542 45.62086 46.77461 54.80513 44.98429
# Grouped bar chart: for each soft skill, one bar per job site plus one bar
# for the Total column (shown in the legend as "Average").
plot_ly(
  soft_visual_top10,
  x = ~Keyword,
  y = ~LinkedIn,
  type = "bar",
  name = "Linkedin"
) %>%
  add_trace(y = ~Indeed, name = "Indeed") %>%
  add_trace(y = ~SimplyHired, name = "SimplyHired") %>%
  add_trace(y = ~Monster, name = "Monster") %>%
  add_trace(y = ~Total, name = "Average") %>%
  layout(
    title = "Soft Skills by Job Sites",
    yaxis = list(title = "% of Data Science Jobs"),
    xaxis = list(title = "Soft Skills"),
    barmode = "group"
  )
# Rows labelled "Tech" (the software keywords), in their original order.
combined_df_tech <- dplyr::filter(combined_df, Skill_Set == "Tech")
dim(combined_df_tech)
## [1] 44 7
# Ten technical skills with the highest Total percentage.
# Sort on Total *before* truncating: slicing rows 1:10 first would only
# re-order whichever ten keywords happen to come first in the file.
# head() also copes with fewer than ten rows without padding NA rows.
combined_df_tech_top10 <- combined_df_tech %>%
  arrange(desc(Total)) %>%
  select(Keyword, LinkedIn, Indeed, SimplyHired, Monster, Total) %>%
  head(10)
combined_df_tech_top10
## Keyword LinkedIn Indeed SimplyHired Monster Total
## 1 Python 73.71661 74.30907 75.42439 67.91244 73.14637
## 2 R 52.88037 60.45154 62.49674 63.13401 58.23289
## 3 SQL 45.05226 51.14831 53.69548 49.14576 48.79238
## 4 Hadoop 24.87805 30.71234 30.39958 32.03417 28.53257
## 5 Spark 25.19164 30.18684 30.47793 28.35024 27.89945
## 6 Java 22.57840 26.80031 27.65735 26.74853 25.24035
## 7 SAS 19.89547 22.07084 23.76600 26.10785 22.20607
## 8 Tableau 14.12311 19.69638 20.37085 19.86119 17.59602
## 9 Hive 13.72822 16.15415 16.63620 16.52429 15.32617
## 10 Scala 12.07898 14.38303 15.38261 13.88147 13.54406
# Top 10 Technical skills as a percentage of total Data Science jobs
# One bar per keyword: x is the Keyword column (not the whole data frame),
# y is the Total percentage. The x categories are pinned to the row order of
# combined_df_tech_top10 so the bars appear in ranked order.
plot_ly(combined_df_tech_top10,
        x = ~Keyword,
        y = ~Total,
        type = 'bar',
        name = ~Keyword) %>%
  layout(title = 'Top 10 Technical Skills for Data Scientists',
         yaxis = list(title = '% of Data Science Jobs'),
         xaxis = list(title = 'Technical Skills',
                      categoryorder = 'array',
                      categoryarray = combined_df_tech_top10$Keyword))
# Per-site view of the top-ten technical skills used by the grouped bar chart.
# The top-ten table already holds exactly the columns needed, so it is used
# directly instead of being column-bound to a prefixed copy of itself.
tech_top10_by_site <- combined_df_tech_top10
tech_visual_top10 <- tech_top10_by_site[, c('Keyword', 'LinkedIn', 'Indeed', 'SimplyHired', 'Monster', 'Total')]
head(tech_visual_top10, 5)
## Keyword LinkedIn Indeed SimplyHired Monster Total
## 1 Python 73.71661 74.30907 75.42439 67.91244 73.14637
## 2 R 52.88037 60.45154 62.49674 63.13401 58.23289
## 3 SQL 45.05226 51.14831 53.69548 49.14576 48.79238
## 4 Hadoop 24.87805 30.71234 30.39958 32.03417 28.53257
## 5 Spark 25.19164 30.18684 30.47793 28.35024 27.89945
# Grouped bar chart: for each technical skill, one bar per job site plus one
# bar for the Total column (shown in the legend as "Average").
plot_ly(
  tech_visual_top10,
  x = ~Keyword,
  y = ~LinkedIn,
  type = "bar",
  name = "Linkedin"
) %>%
  add_trace(y = ~Indeed, name = "Indeed") %>%
  add_trace(y = ~SimplyHired, name = "SimplyHired") %>%
  add_trace(y = ~Monster, name = "Monster") %>%
  add_trace(y = ~Total, name = "Average") %>%
  layout(
    title = "Technical Skills by Job Sites",
    yaxis = list(title = "% of Data Science Jobs"),
    xaxis = list(title = "Technical Skills"),
    barmode = "group"
  )
# Top Technical skill required
# Pick the row with the highest Total percentage rather than whichever row
# happens to be first in the table.
combined_df_tech[which.max(combined_df_tech$Total), ]
## Keyword Skill_Set LinkedIn Indeed SimplyHired Monster Total
## 1 Python Tech 73.71661 74.30907 75.42439 67.91244 73.14637
# Top Soft skill required
# Pick the row with the highest Total percentage rather than whichever row
# happens to be first in the table.
combined_df_soft[which.max(combined_df_soft$Total), ]
## Keyword Skill_Set LinkedIn Indeed SimplyHired Monster Total
## 1 machine learning Soft 66.2137 66.93266 66.8843 62.46663 65.84908
According to this dataset, the top soft skill that employers are looking for when hiring a data scientist is Analysis. The top technical skill being targeted is Python.