DATA 607 - Project 3

INTRODUCTION

I came across a study conducted by Jeff Hale in 2018 about The Most In-Demand Skills for Data Scientists. More can be found here.
I will be analyzing his two datasets to determine “Which are the most valued data science skills?”

Data Source

I will use the two datasets from Jeff Hale’s study. I have hosted them on my GitHub repository.

# Both CSV files live in the same GitHub repository; build each raw-file URL
# from the shared base path.
data_repo_base <- "https://raw.githubusercontent.com/nathtrish334/Data-607/main"
gen_skills_url <- file.path(data_repo_base, "ds_general_skills_revised.csv")
# Despite the "soft" prefix, this URL points at the job-listing *software* file.
soft_skills_url <- file.path(data_repo_base, "ds_job_listing_software.csv")

DATA LOADING

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.6     v dplyr   1.0.4
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(ggplot2)
library(rvest)

## Loading required package: xml2

## 
## Attaching package: 'rvest'

## The following object is masked from 'package:purrr':
## 
##     pluck

## The following object is masked from 'package:readr':
## 
##     guess_encoding

library(tm)

## Warning: package 'tm' was built under R version 4.0.4

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

library(SnowballC)
library(wordcloud2)

## Warning: package 'wordcloud2' was built under R version 4.0.4

library(stringr)
library(plotly)

## Warning: package 'plotly' was built under R version 4.0.4

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

# Download and parse the general-skills CSV from the URL defined above.
gen_skills <- readr::read_csv(gen_skills_url)

## 
## -- Column specification --------------------------------------------------------
## cols(
##   Keyword = col_character(),
##   LinkedIn = col_number(),
##   Indeed = col_number(),
##   SimplyHired = col_number(),
##   Monster = col_number()
## )

# Download and parse the job-listing software CSV from the URL defined above.
soft_skills <- readr::read_csv(soft_skills_url)

## 
## -- Column specification --------------------------------------------------------
## cols(
##   Keyword = col_character(),
##   LinkedIn = col_number(),
##   Indeed = col_number(),
##   SimplyHired = col_number(),
##   Monster = col_number(),
##   `LinkedIn %` = col_character(),
##   `Indeed %` = col_character(),
##   `SimplyHired %` = col_character(),
##   `Monster %` = col_character(),
##   `Avg %` = col_character(),
##   `GlassDoor Self Reported % 2017` = col_character(),
##   Difference = col_character()
## )

Data Cleaning

I cleaned the two datasets and joined them into one

# Label each dataset with its skill category and stack them into one table.
# `key = "ï..Keyword"` was dropped from the as_tibble() calls: it is not an
# as_tibble() parameter and had no effect.

# General-skills file -> "Soft".
df_1 <- as_tibble(gen_skills)
df_1$Skill_Set <- "Soft"

# Software file -> "Tech". Keep only the keyword and the raw per-site counts;
# the percentage / comparison columns are not used downstream. Selecting by
# name (instead of dropping positions 6-12) survives column reordering.
df_2 <- as_tibble(soft_skills)
df_2$Skill_Set <- "Tech"
df_2 <- select(
  df_2,
  all_of(c("Keyword", "LinkedIn", "Indeed", "SimplyHired", "Monster")),
  Skill_Set
)

# bind_rows() matches columns by name rather than by position.
combined_df <- bind_rows(df_1, df_2)
head(combined_df)

## # A tibble: 6 x 6
##   Keyword          LinkedIn Indeed SimplyHired Monster Skill_Set
##   <chr>               <dbl>  <dbl>       <dbl>   <dbl> <chr>    
## 1 machine learning     5701   3439        2561    2340 Soft     
## 2 analysis             5168   3500        2668    3306 Soft     
## 3 statistics           4893   2992        2308    2399 Soft     
## 4 computer science     4517   2739        2093    1900 Soft     
## 5 communication        3404   2344        1791    2053 Soft     
## 6 mathematics          2605   1961        1497    1815 Soft

# Rename the variable
# Guard for a mangled header name (presumably a byte-order-mark artifact from a
# different CSV reader); a no-op when the column is already called "Keyword".
names(combined_df)[names(combined_df) == "ï..Keyword"] <- "Keyword"

#Tidying the combined dataframe
# Total = sum of the four per-site counts for each keyword. Reduce() over the
# columns works for any number of rows, whereas sapply() collapses to a plain
# vector for a single-row frame and breaks rowSums().
site_cols <- c("LinkedIn", "Indeed", "SimplyHired", "Monster")
combined_df$Total <- Reduce(`+`, lapply(combined_df[site_cols], as.numeric))
# Put the identifying columns first, then the per-site counts, then the total.
combined_df <- combined_df[, c("Keyword", "Skill_Set", site_cols, "Total")]
combined_df <- data.frame(combined_df)
colnames(combined_df)

## [1] "Keyword"     "Skill_Set"   "LinkedIn"    "Indeed"      "SimplyHired"
## [6] "Monster"     "Total"

Averages of keywords

# Denominators for converting raw keyword counts into percentages.
# NOTE(review): presumably the total number of data science listings found on
# each site -- confirm against the source data.
LinkedIn <- 8610
Indeed <- 5138
SimplyHired <- 3829
Monster <- 3746
Total <- sum(LinkedIn, Indeed, SimplyHired, Monster)

# Rescale every count column (and the row total) to a percentage of its
# matching denominator.
site_denominators <- c(
  LinkedIn = LinkedIn,
  Indeed = Indeed,
  SimplyHired = SimplyHired,
  Monster = Monster,
  Total = Total
)
for (site in names(site_denominators)) {
  combined_df[[site]] <- combined_df[[site]] / site_denominators[[site]] * 100
}
dim(combined_df)

## [1] 74  7

head(combined_df, 10)

##             Keyword Skill_Set LinkedIn   Indeed SimplyHired  Monster    Total
## 1  machine learning      Soft 66.21370 66.93266    66.88430 62.46663 65.84908
## 2          analysis      Soft 60.02323 68.11989    69.67877 88.25414 68.66764
## 3        statistics      Soft 56.82927 58.23278    60.27683 64.04164 59.05360
## 4  computer science      Soft 52.46225 53.30868    54.66179 50.72077 52.75524
## 5     communication      Soft 39.53542 45.62086    46.77461 54.80513 44.98429
## 6       mathematics      Soft 30.25552 38.16660    39.09637 48.45168 36.94602
## 7     visualization      Soft 21.82346 27.50097    30.11230 32.22104 26.50659
## 8      AI composite      Soft 18.21138 21.89568    21.18046 18.33956 19.65483
## 9     deep learning      Soft 15.21487 19.05411    17.62862 16.17726 16.74248
## 10    NLP composite      Soft 14.07666 17.71117    17.23688 15.53657 15.77639

ANALYSIS

Top 10 Soft Skills for Data Scientists

# Keep only the rows labelled as soft skills.
combined_df_soft <- combined_df %>%
  filter(Skill_Set == "Soft")
dim(combined_df_soft)

## [1] 30  7

# Take the first ten rows in file order (without the Skill_Set column), then
# rank those ten by Total, highest first.
# NOTE(review): this relies on the source file already listing the most
# frequent skills first -- confirm before reusing with other data.
combined_df_soft_top10 <- combined_df_soft[
  1:10,
  c("Keyword", "LinkedIn", "Indeed", "SimplyHired", "Monster", "Total")
] %>%
  arrange(desc(Total))
combined_df_soft_top10

##             Keyword LinkedIn   Indeed SimplyHired  Monster    Total
## 1          analysis 60.02323 68.11989    69.67877 88.25414 68.66764
## 2  machine learning 66.21370 66.93266    66.88430 62.46663 65.84908
## 3        statistics 56.82927 58.23278    60.27683 64.04164 59.05360
## 4  computer science 52.46225 53.30868    54.66179 50.72077 52.75524
## 5     communication 39.53542 45.62086    46.77461 54.80513 44.98429
## 6       mathematics 30.25552 38.16660    39.09637 48.45168 36.94602
## 7     visualization 21.82346 27.50097    30.11230 32.22104 26.50659
## 8      AI composite 18.21138 21.89568    21.18046 18.33956 19.65483
## 9     deep learning 15.21487 19.05411    17.62862 16.17726 16.74248
## 10    NLP composite 14.07666 17.71117    17.23688 15.53657 15.77639

# Top ten Soft skills as a percentage of total Data Science jobs
# One bar per keyword: x is the Keyword column and y is the Total percentage.
# (x was previously mapped to the whole data frame rather than to a column.)
plot_ly(
  combined_df_soft_top10,
  x = ~Keyword,
  y = ~Total,
  type = "bar",
  name = ~Keyword
) %>%
  layout(
    title = "Top 10 Soft Skills for Data Scientists",
    yaxis = list(title = "% of Data Science Jobs"),
    xaxis = list(title = "Soft Skills")
  )

Top 10 Soft skills as a percentage of total data science jobs by job site

# Data for the per-site chart: the same ten rows as the ranking above.
# (Passing the frame a second time as a `Keyword =` argument to data.frame()
# duplicated every column under a "Keyword." prefix; a plain copy is enough.)
soft_top10_by_site <- data.frame(combined_df_soft_top10)

# Fix the column order used by the chart: keyword, four sites, then the total.
soft_visual_top10 <- soft_top10_by_site[
  ,
  c("Keyword", "LinkedIn", "Indeed", "SimplyHired", "Monster", "Total")
]
head(soft_visual_top10, 5)

##            Keyword LinkedIn   Indeed SimplyHired  Monster    Total
## 1         analysis 60.02323 68.11989    69.67877 88.25414 68.66764
## 2 machine learning 66.21370 66.93266    66.88430 62.46663 65.84908
## 3       statistics 56.82927 58.23278    60.27683 64.04164 59.05360
## 4 computer science 52.46225 53.30868    54.66179 50.72077 52.75524
## 5    communication 39.53542 45.62086    46.77461 54.80513 44.98429

# Grouped bar chart: one bar per job site for each soft skill, plus the
# Total column shown under the label "Average".
plot_ly(
  soft_visual_top10,
  x = ~Keyword,
  y = ~LinkedIn,
  type = "bar",
  name = "Linkedin"
) %>%
  add_trace(y = ~Indeed, name = "Indeed") %>%
  add_trace(y = ~SimplyHired, name = "SimplyHired") %>%
  add_trace(y = ~Monster, name = "Monster") %>%
  add_trace(y = ~Total, name = "Average") %>%
  layout(
    title = "Soft Skills by Job Sites",
    yaxis = list(title = "% of Data Science Jobs"),
    xaxis = list(title = "Soft Skills"),
    barmode = "group"
  )

Top 10 Technical Skills for Data Scientists

# Keep only the rows labelled as technical skills.
combined_df_tech <- combined_df %>%
  filter(Skill_Set == "Tech")
dim(combined_df_tech)

## [1] 44  7

# Take the first ten rows in file order (without the Skill_Set column), then
# rank those ten by Total, highest first.
# NOTE(review): this relies on the source file already listing the most
# frequent skills first -- confirm before reusing with other data.
combined_df_tech_top10 <- combined_df_tech[
  1:10,
  c("Keyword", "LinkedIn", "Indeed", "SimplyHired", "Monster", "Total")
] %>%
  arrange(desc(Total))
combined_df_tech_top10

##    Keyword LinkedIn   Indeed SimplyHired  Monster    Total
## 1   Python 73.71661 74.30907    75.42439 67.91244 73.14637
## 2        R 52.88037 60.45154    62.49674 63.13401 58.23289
## 3      SQL 45.05226 51.14831    53.69548 49.14576 48.79238
## 4   Hadoop 24.87805 30.71234    30.39958 32.03417 28.53257
## 5    Spark 25.19164 30.18684    30.47793 28.35024 27.89945
## 6     Java 22.57840 26.80031    27.65735 26.74853 25.24035
## 7      SAS 19.89547 22.07084    23.76600 26.10785 22.20607
## 8  Tableau 14.12311 19.69638    20.37085 19.86119 17.59602
## 9     Hive 13.72822 16.15415    16.63620 16.52429 15.32617
## 10   Scala 12.07898 14.38303    15.38261 13.88147 13.54406

# Top 10 Technical skills as a percentage of total Data Science jobs
# One bar per keyword: x is the Keyword column and y is the Total percentage.
# (x was previously mapped to the whole data frame rather than to a column.)
plot_ly(
  combined_df_tech_top10,
  x = ~Keyword,
  y = ~Total,
  type = "bar",
  name = ~Keyword
) %>%
  layout(
    title = "Top 10 Technical Skills for Data Scientists",
    yaxis = list(title = "% of Data Science Jobs"),
    xaxis = list(title = "Technical Skills")
  )

Top 10 Technical skills as a percentage of total data science jobs by job site

# Data for the per-site chart: the same ten rows as the ranking above.
# (Passing the frame a second time as a `Keyword =` argument to data.frame()
# duplicated every column under a "Keyword." prefix; a plain copy is enough.)
tech_top10_by_site <- data.frame(combined_df_tech_top10)

# Fix the column order used by the chart: keyword, four sites, then the total.
tech_visual_top10 <- tech_top10_by_site[
  ,
  c("Keyword", "LinkedIn", "Indeed", "SimplyHired", "Monster", "Total")
]
head(tech_visual_top10, 5)

##   Keyword LinkedIn   Indeed SimplyHired  Monster    Total
## 1  Python 73.71661 74.30907    75.42439 67.91244 73.14637
## 2       R 52.88037 60.45154    62.49674 63.13401 58.23289
## 3     SQL 45.05226 51.14831    53.69548 49.14576 48.79238
## 4  Hadoop 24.87805 30.71234    30.39958 32.03417 28.53257
## 5   Spark 25.19164 30.18684    30.47793 28.35024 27.89945

# Grouped bar chart: one bar per job site for each technical skill, plus the
# Total column shown under the label "Average".
plot_ly(
  tech_visual_top10,
  x = ~Keyword,
  y = ~LinkedIn,
  type = "bar",
  name = "Linkedin"
) %>%
  add_trace(y = ~Indeed, name = "Indeed") %>%
  add_trace(y = ~SimplyHired, name = "SimplyHired") %>%
  add_trace(y = ~Monster, name = "Monster") %>%
  add_trace(y = ~Total, name = "Average") %>%
  layout(
    title = "Technical Skills by Job Sites",
    yaxis = list(title = "% of Data Science Jobs"),
    xaxis = list(title = "Technical Skills"),
    barmode = "group"
  )

Top Soft and Technical skills needed for a data scientist

# Top Soft skill required
# The *_top10 tables are sorted by Total (descending), so their first row is
# the highest-ranked skill. The unsorted combined_df_soft starts with
# "machine learning", which is not the highest Total.
head(combined_df_soft_top10, 1)

##    Keyword LinkedIn   Indeed SimplyHired  Monster    Total
## 1 analysis 60.02323 68.11989    69.67877 88.25414 68.66764

# Top Technical skill required
head(combined_df_tech_top10, 1)

##   Keyword LinkedIn   Indeed SimplyHired  Monster    Total
## 1  Python 73.71661 74.30907    75.42439 67.91244 73.14637

CONCLUSION

According to this dataset, the top soft skill that employers are looking for when hiring a data scientist is Analysis. The top technical skill being targeted is Python.