This project is an opportunity to practice both the hard technical skills around R and SQL as well as the soft skills of distance collaboration with classmates. The primary question of this project is: Which are the most valued data science skills? Groups are to seek an answer to this question while also completing each of the following:
Documentation for this project was developed and maintained in a sepereate file. This was done both to allow parallel development but also to minimize the length each document. The documentaion can be found at:
#load the necessary supporting packages
library(stringr)
library(dplyr)
library(tidyr)
library(Hmisc)
library(ggplot2)
file_location_1 <- 'https://raw.githubusercontent.com/ZacharyHerold/CUNY-DATA607/master/SCData.csv'
ds_user_profiles_skills <- read.table(file_location_1, sep = ",", header = TRUE, stringsAsFactors = FALSE)
file_location_2 <- 'https://raw.githubusercontent.com/ZacharyHerold/CUNY-DATA607/master/ds_general_skills_revised.csv'
ds_job_postings_skills <- read.table(file_location_2, sep = ",", header = TRUE, stringsAsFactors = FALSE)
file_location_3 <- 'https://raw.githubusercontent.com/ZacharyHerold/CUNY-DATA607/master/ds_job_listing_software.csv'
ds_job_postings_software <- read.table(file_location_3, sep = ",", header = TRUE, fill = TRUE, stringsAsFactors = FALSE, strip.white = TRUE, comment.char = "")
Tidy and transform the user profile datadata
#ds_user_profiles_skills:
ds_user_profiles <- ds_user_profiles_skills %>%
mutate(Skill = tolower(Skill)) %>%
mutate(Skill = capitalize(Skill)) %>%
mutate(Skill_Type = ifelse(Skill %in% c("R"
,"Python"
,"Sql"
,"Matlab"
,"Java"
,"C++"
,"Sas"
,"Hadoop")
,"Software/Language Skill"
,"General Skill")) %>%
select("Skill_Type", "Skill", "Percentage")
Pull out and tidy the count of job postings by site
#ds_job_postings_n_postings
#pull out the total number of postings (to be used later)
ds_jp_n_postings <- ds_job_postings_software[c(41),]
ds_jp_n_postings <- ds_jp_n_postings %>%
select("LinkedIn"
, "Indeed"
, "SimplyHired"
, "Monster")
ds_jp_n_postings <- gather(ds_jp_n_postings, "Site", "N_Postings", 1:4)
ds_jp_n_postings <- ds_jp_n_postings %>%
mutate(N_Postings = str_replace_all(N_Postings, ",", "")) %>%
mutate(N_Postings = as.numeric(N_Postings))
#ds_jp_n_postings
Tidy and transform the two job posting data sets
#ds_job_postings_skills:
#limit to desired rows
ds_job_postings_skills <- ds_job_postings_skills[c(1:15, 17),]
#cleanup
ds_job_postings_skills <- ds_job_postings_skills %>%
mutate(Skill_Type = "General Skill") %>%
select("Skill_Type"
, "Keyword"
, "LinkedIn"
, "Indeed"
, "SimplyHired"
, "Monster")
#ds_job_postings_software:
#limit to desired rows
ds_job_postings_software <- ds_job_postings_software[c(1:37, 39),]
#cleanup
ds_job_postings_software <- ds_job_postings_software %>%
#filter(c(1:37)) %>%
mutate(Skill_Type = "Software/Language Skill") %>%
select("Skill_Type"
, "Keyword"
, "LinkedIn"
, "Indeed"
, "SimplyHired"
, "Monster")
# union (append) the two job posting datasets
ds_job_postings <- union(ds_job_postings_skills
,ds_job_postings_software)
#tidy the unioned job postings dataset
ds_job_postings <- ds_job_postings %>%
rename(Skill = "Keyword") %>%
filter(Skill != "Total") %>%
mutate(Skill = tolower(Skill)) %>%
mutate(Skill = capitalize(Skill))
#transform to "normalized" structure
ds_job_postings <-gather(ds_job_postings, "Site", "N_Records", 3:6)
#cast to numeric
ds_job_postings <- ds_job_postings %>%
mutate(N_Records = str_replace_all(N_Records, ",", "")) %>%
filter(!is.na(N_Records)) %>%
filter(N_Records != "") %>%
mutate(N_Records = as.numeric(N_Records))
In job posting data set, add percent of postings
ds_job_postings <- inner_join(ds_job_postings
, ds_jp_n_postings
, by = "Site")
ds_job_postings <- ds_job_postings %>%
mutate(Pct_Postings = round(N_Records/N_Postings*100, 2)) %>%
select(-N_Postings)
head(ds_job_postings, n=10)
## Skill_Type Skill Site N_Records
## 1 Software/Language Skill Javascript LinkedIn 328
## 2 Software/Language Skill Scikit-learn LinkedIn 474
## 3 Software/Language Skill Sql LinkedIn 3879
## 4 General Skill Analysis LinkedIn 5168
## 5 General Skill Data engineering LinkedIn 514
## 6 Software/Language Skill Mysql LinkedIn 278
## 7 General Skill Communication LinkedIn 3404
## 8 Software/Language Skill Hbase LinkedIn 302
## 9 Software/Language Skill Tensorflow LinkedIn 844
## 10 Software/Language Skill Docker LinkedIn 290
## Pct_Postings
## 1 3.81
## 2 5.51
## 3 45.05
## 4 60.02
## 5 5.97
## 6 3.23
## 7 39.54
## 8 3.51
## 9 9.80
## 10 3.37
write.csv(x = ds_user_profiles
, file = "C:/Users/cbailey/Documents/GitHub/607-Week8-Project3/05_ds_profiles.csv"
,row.names = FALSE)
write.csv(x = ds_job_postings
, file = "C:/Users/cbailey/Documents/GitHub/607-Week8-Project3/05_ds_job_postings.csv"
,row.names = FALSE)
Pull the top skills (listed >50% of postings) across sites
jp_avg_across_sites <- ds_job_postings %>%
group_by(Skill_Type, Skill) %>%
summarise(Avg_Pct_Postings = mean(Pct_Postings)) %>%
arrange(desc(Avg_Pct_Postings))
jp_avg_across_sites <- as.data.frame(jp_avg_across_sites)
jp_avg_across_sites_top <- jp_avg_across_sites %>%
filter(Avg_Pct_Postings >= 50)
jp_avg_across_sites_top
## Skill_Type Skill Avg_Pct_Postings
## 1 Software/Language Skill Python 72.8400
## 2 General Skill Analysis 71.5175
## 3 General Skill Machine learning 65.6225
## 4 General Skill Statistics 59.8450
## 5 Software/Language Skill R 59.7400
## 6 General Skill Computer science 52.7875
Pull the top skills (listed >50% of postings) across sites
Also, review the frequency across sites and the order within sites
jp_avg_within_sites <- ds_job_postings %>%
group_by(Site, Skill_Type, Skill) %>%
summarise(Avg_Pct_Postings = mean(Pct_Postings)) %>%
arrange(Site, desc(Avg_Pct_Postings)) %>%
filter(Avg_Pct_Postings >= 50)
jp_avg_within_sites <- as.data.frame(jp_avg_within_sites)
jp_avg_within_sites
## Site Skill_Type Skill Avg_Pct_Postings
## 1 Indeed Software/Language Skill Python 74.31
## 2 Indeed General Skill Analysis 68.12
## 3 Indeed General Skill Machine learning 66.93
## 4 Indeed Software/Language Skill R 60.45
## 5 Indeed General Skill Statistics 58.23
## 6 Indeed General Skill Computer science 53.31
## 7 Indeed Software/Language Skill Sql 51.15
## 8 LinkedIn Software/Language Skill Python 73.72
## 9 LinkedIn General Skill Machine learning 66.21
## 10 LinkedIn General Skill Analysis 60.02
## 11 LinkedIn General Skill Statistics 56.83
## 12 LinkedIn Software/Language Skill R 52.88
## 13 LinkedIn General Skill Computer science 52.46
## 14 Monster General Skill Analysis 88.25
## 15 Monster Software/Language Skill Python 67.91
## 16 Monster General Skill Statistics 64.04
## 17 Monster Software/Language Skill R 63.13
## 18 Monster General Skill Machine learning 62.47
## 19 Monster General Skill Communication 54.81
## 20 Monster General Skill Computer science 50.72
## 21 SimplyHired Software/Language Skill Python 75.42
## 22 SimplyHired General Skill Analysis 69.68
## 23 SimplyHired General Skill Machine learning 66.88
## 24 SimplyHired Software/Language Skill R 62.50
## 25 SimplyHired General Skill Statistics 60.28
## 26 SimplyHired General Skill Computer science 54.66
## 27 SimplyHired Software/Language Skill Sql 53.70
Compare that to the skills listed by data scientists
# change term in ds profile data set to allow for joining later
ds_user_profiles_top <-ds_user_profiles %>%
mutate(Skill = str_replace_all(Skill, "Data analysis", "Analysis")) %>%
arrange(desc(Percentage)) %>%
top_n(7, Percentage)
ds_user_profiles
## Skill_Type Skill Percentage
## 1 General Skill Data analysis 55.69
## 2 Software/Language Skill R 48.01
## 3 Software/Language Skill Python 46.31
## 4 General Skill Data mining 45.94
## 5 General Skill Machine learning 45.32
## 6 General Skill Statistics 41.52
## 7 Software/Language Skill Sql 39.00
## 8 General Skill Analytics 35.16
## 9 Software/Language Skill Matlab 30.55
## 10 Software/Language Skill Java 30.02
## 11 General Skill Statistical modeling 27.15
## 12 General Skill Algorithm design 27.04
## 13 Software/Language Skill C++ 25.99
## 14 General Skill Business intelligence 23.78
## 15 General Skill Big data 23.68
## 16 Software/Language Skill Sas 22.78
## 17 Software/Language Skill Hadoop 20.98
## 18 General Skill Programming 19.77
## 19 General Skill Research 19.70
## 20 General Skill Software engineering 19.51
# join the job posting and user profile data sets
ds_overlap <- inner_join(ds_user_profiles_top
,jp_avg_across_sites
,by = c("Skill", "Skill_Type"))
ds_overlap <- ds_overlap %>%
mutate(Avg_Pct_Postings = round(Avg_Pct_Postings,2)) %>%
select(Skill = Skill
, Skill_Type = Skill_Type
, DS_Profile_Pct = Percentage
, DS_Posting_Pct = Avg_Pct_Postings)
ds_overlap
## Skill Skill_Type DS_Profile_Pct DS_Posting_Pct
## 1 Analysis General Skill 55.69 71.52
## 2 R Software/Language Skill 48.01 59.74
## 3 Python Software/Language Skill 46.31 72.84
## 4 Machine learning General Skill 45.32 65.62
## 5 Statistics General Skill 41.52 59.84
## 6 Sql Software/Language Skill 39.00 49.76
Scatter plot of Data Scientist User Profile Skills vs Data Science Job Posting Skills
scatter <- ggplot(ds_overlap, aes(x= DS_Profile_Pct, y = DS_Posting_Pct))
scatter <- scatter + geom_point(aes(color = Skill))
scatter <- scatter + xlim(0, 100) + ylim(0, 100)
scatter
Bar graph of the top 20 skills listed on Data Science Profiles
bar <- ggplot(data = ds_user_profiles
, aes(x = reorder(Skill, -Percentage)
,y = Percentage
,fill = Skill_Type))
bar <- bar + xlab("Skill")
bar <- bar + geom_bar(stat="identity")
bar <- bar + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
bar
Bar graph of top 20 skills from job postings (averaged across sites)
jp_avg_across_sites <- jp_avg_across_sites %>%
top_n(20, Avg_Pct_Postings)
bar <- ggplot(data = jp_avg_across_sites
, aes(x = reorder(Skill, -Avg_Pct_Postings)
,y = Avg_Pct_Postings
,fill = Skill_Type))
bar <- bar + xlab("Skill")
bar <- bar + geom_bar(stat="identity")
bar <- bar + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
bar
While the exact order of “top” skills varies slightly by source, the following six skills were consistently identified by data scientists’ and data scientist job postings across multiple sites.
These six skills show that “top” skills include both general technical skills (analysis, machine learning, and Statistics) and software/language skills (R, Python, SQL). This is more clearly seen in the two bar graph displays in which the color for each skill type is roughly equally present.
Other skills such as computer science or communication were mentioned by some sources but not consistently across sources. While it may not have consistently placed as a top skill in our selected data sets, communication skills were found to be invaluable while working on this project. Learning to work together to plan and execute this project across many time zones has been an interesting challenge and we have all learned invaluable lessons.