Open Skill data science skills analysis

Use data in Open Skills web site to find job titles with Data Science skills.
Data from site to be used is clean_title_count

Use tidy workflow to produce analysis:

Libraries

library(RCurl)
library(dplyr)
library(tidyr)
library(rvest)
library(stringr)
library(ggplot2)
library(kableExtra)
library(RMySQL)
library(wordcloud2)

Import

Get list of csv files to import from Open Skills website

csvFiles<-read_html("https://open-skills-datasets.s3-us-west-2.amazonaws.com/") %>% html_nodes("key") %>% html_text() %>% str_extract("cleaned_title_count/[:print:]+") %>% str_extract("[0-9]+Q[1-4].csv")
csvFiles<-csvFiles[!is.na(csvFiles)] 
csvFiles %>% kable(align = 'c') %>% kable_styling() %>% scroll_box(width = "300px",height="400px") %>% kable_styling(position = "center") 
x
2011Q1.csv
2011Q2.csv
2011Q3.csv
2011Q4.csv
2012Q1.csv
2012Q2.csv
2012Q3.csv
2012Q4.csv
2013Q1.csv
2013Q2.csv
2013Q3.csv
2013Q4.csv
2014Q1.csv
2014Q2.csv
2015Q1.csv
2016Q1.csv
2016Q2.csv
2016Q3.csv
2016Q4.csv
2017Q1.csv

Load last 4 csv files - last year worth of data

out<-vector()
for (i in 0:3) {
  url<-paste("https://open-skills-datasets.s3-us-west-2.amazonaws.com/cleaned_title_count/",csvFiles[length(csvFiles)-i],sep="") 
  x<-getURL(url)
  out<-rbind(out,read.csv(textConnection(x)))
}
dataTable<-tbl_df(out)
head(dataTable,n=20) %>% kable() %>% kable_styling() %>% scroll_box(width = "910px",height="400px")
title counts_total skills_1 skills_2 skills_3 skills_4 skills_5 skills_6 skills_7 skills_8 skills_9 skills_10 soc_code_common_1 soc_code_common_2 soc_code_common_total soc_code_top_1 soc_code_top_2 soc_code_top_total soc_code_given_1 soc_code_given_2 soc_code_given_total
1 39-9032.00 1 13-1071.00 1 35-2014.00 NA 1
rsaf language training elt instructors 2 c reduce english language 25-1123.00 1 27-3031.00 21-1093.00 2 NA 1
integrator 2 troubleshooting skill red hat enterprise linux linux design 15-1142.00 1 15-1122.00 15-1143.00 2 NA 1
correctional officer 1 transportation brakes 33-1012.00 1 29-2012.00 1 33-3012.00 NA 1
store all positions 2 time management 13-1199.00 51-9151.00 2 43-4161.00 43-4051.03 2 NA 1
head of office 2 writing forth 11-2022.00 11-3021.00 2 13-1151.00 11-3021.00 2 NA 1
security positions 1 rules closedcircuit tv cameras security cameras bulletproof vests surveillance cameras alarms 33-3051.01 1 33-3051.01 1 NA 1
sr manager event communications 1 speaking writing forth 11-2031.00 1 11-2011.00 1 11-2021.00 NA 1
assistant director of engineering and construction 1 ada mechanical levels impact 11-9041.00 1 17-2141.00 1 NA 1
scheduling coordinator 3 microsoft office 31-1014.00 29-2061.00 2 11-9111.00 41-4011.00 3 43-5061.00 NA 1
systems engineer fair 1 skill 17-2071.00 1 17-2141.00 1 NA 1
school bus driver 2 skill transportation route 25-2012.00 1 25-2052.00 25-9041.00 2 53-3022.00 NA 1
systems analyst 1 skill c simulation software mathematics mechanical physics science design 13-1111.00 1 15-1121.00 1 15-1121.00 NA 1
warehouser 1 trucks pallet jacks forklifts sweepers 33-3051.01 1 49-9071.00 1 49-9071.00 NA 1
cloud solution architect 2 redmine c programming python visualization javascript puppet apache tomcat stencils oracle 15-1132.00 15-1121.00 2 15-1132.00 15-1143.00 2 15-1199.02 NA 1
insemination crew worker 4 speaking self personal protective equipment 43-5061.00 27-2011.00 4 35-3021.00 37-2011.00 4 NA 1
product support sales representative 2 stamina 41-4012.00 1 41-4011.00 41-2031.00 2 NA 1
direct support professional team leader 1 29-1181.00 1 15-1122.00 1 NA 1
data scientist 1 skill science git python mathematics r derive 15-1134.00 1 17-2112.00 1 15-1111.00 NA 1
part bus driver commercial drivers license 1 33-3051.01 1 53-3022.00 1 NA 1

Tidy

Build long tables for skills and jobs

#Get skills
skills <- dataTable %>%select( starts_with('skill') )    %>%
  gather(3:10,  value = "skill") %>%
  select( starts_with('skill')) %>% 
  distinct() %>% 
  filter(skill != '')
## Warning: attributes are not identical across measure variables;
## they will be dropped
#Get job titles 
jobs <- dataTable %>%select( 1)  %>%
  filter(title != '')   %>% 
  distinct()  
 names(jobs)<-c('job_name')
 names(skills) <- c('skill_name')
head(jobs,n=20) %>% kable() %>% kable_styling() %>% scroll_box(width = "910px",height="400px")
job_name
rsaf language training elt instructors
integrator
correctional officer
store all positions
head of office
security positions
sr manager event communications
assistant director of engineering and construction
scheduling coordinator
systems engineer fair
school bus driver
systems analyst
warehouser
cloud solution architect
insemination crew worker
product support sales representative
direct support professional team leader
data scientist
part bus driver commercial drivers license
senior engineer
head(skills,n=20) %>% kable() %>% kable_styling() %>% scroll_box(width = "910px",height="400px")
skill_name
c
troubleshooting
transportation
time management
writing
rules
speaking
ada
microsoft office
skill
trucks
redmine
stamina
hoists
visualization
meat slicers
sales and marketing
negotiation
coordination
trash bags

Save skills and jobs to DB

con = dbConnect(MySQL(), user='data607p3', password='data607p3', dbname='job_skills', host='localhost')

########################################################################
#only run this once otherwise tables will have duplicate data
RMySQL::dbWriteTable(con, 'jobs', jobs, append = TRUE, row.names = FALSE)
## [1] TRUE
RMySQL::dbWriteTable(con, 'skills', skills, append = TRUE, row.names = FALSE)
## [1] TRUE
#########################################################################

Read jobs and skills back

We are using auto increment id column for primary key so we need to read the tables back to do the mapping table

job_db <- dbReadTable(con, 'jobs')
skills_db<- dbReadTable(con, 'skills')
head(job_db,n=20) %>% kable() %>% kable_styling() %>% scroll_box(width = "910px",height="400px")
job_id job_name
1 rsaf language training elt instructors
2 integrator
3 correctional officer
4 store all positions
5 head of office
6 security positions
7 sr manager event communications
8 assistant director of engineering and construction
9 scheduling coordinator
10 systems engineer fair
11 school bus driver
12 systems analyst
13 warehouser
14 cloud solution architect
15 insemination crew worker
16 product support sales representative
17 direct support professional team leader
18 data scientist
19 part bus driver commercial drivers license
20 senior engineer
head(skills_db,n=20) %>% kable() %>% kable_styling() %>% scroll_box(width = "910px",height="400px")
skill_id skill_name
1 c
2 troubleshooting
3 transportation
4 time management
5 writing
6 rules
7 speaking
8 ada
9 microsoft office
10 skill
11 trucks
12 redmine
13 stamina
14 hoists
15 visualization
16 meat slicers
17 sales and marketing
18 negotiation
19 coordination
20 trash bags

Tidy & Transform job to skills mapping then save it to db

job_skills <-  dataTable   %>%
  gather(3:10,  value = "skill_name", key="job_name") %>%
  filter(skill_name != '') %>%
  filter(title != '') %>%
  select(title, skill_name)%>%
  as.data.frame()
## Warning: attributes are not identical across measure variables;
## they will be dropped
names(job_skills) <- c('job_name','skill_name')
head(job_skills,n=20) %>% kable() %>% kable_styling() %>% scroll_box(width = "910px",height="400px")
job_name skill_name
rsaf language training elt instructors c
integrator troubleshooting
correctional officer transportation
store all positions time management
head of office writing
security positions rules
sr manager event communications speaking
assistant director of engineering and construction ada
scheduling coordinator microsoft office
systems engineer fair skill
school bus driver skill
systems analyst skill
warehouser trucks
cloud solution architect redmine
insemination crew worker speaking
product support sales representative stamina
data scientist skill
senior engineer skill
network engineer i troubleshooting
cdla truck drivers trucks
# save jobs_skills mapping. we are doing this because we have auto increment private keys.
job_skills_map <- job_skills %>%
                  inner_join(skills_db )  %>%
                  inner_join(job_db )   %>%
                  select(job_id, skill_id) %>%
                  distinct()
## Joining, by = "skill_name"
## Joining, by = "job_name"
## Warning: Column `job_name` joining factor and character vector, coercing
## into character vector
RMySQL::dbWriteTable(con, 'job_skills', job_skills_map, append = TRUE, row.names = FALSE)
## [1] TRUE

Read tables from db

#Read tables from db.
jobs_db <- dbReadTable(con, 'jobs')
skills_db<- dbReadTable(con, 'skills')
job_skills_db<- dbReadTable(con, 'job_skills')

# Join tables to get skills for jobs. 
job_skills_db <- jobs_db %>%
  inner_join(job_skills_db )%>%
  inner_join(skills_db ) %>%
  select(2,4)
## Joining, by = "job_id"
## Joining, by = "skill_id"

Question 1 - What are the most important skills in Data Science?

Transform

Filter table for data science titles only

job_skills_dataScience<-filter(job_skills_db,str_detect(job_name,"data scientist"))
head(job_skills_dataScience,n=20) %>% kable() %>% kable_styling() %>% scroll_box(width = "910px",height="400px")
job_name skill_name
data scientist skill
data scientist science
data scientist python
data scientist mathematics
data scientist programming
data scientist git
data scientist derive
data scientist r
data scientist design
data scientist c
data scientist visualization
data scientist writing
data scientist intern r
data scientist intern design
data scientist intern science
data scientist intern programming
data scientist intern visualization
data scientist intern python
data scientist intern programming languages
data scientist intern drupal

Create a vector with all skills

skillsDataScience<-pull(job_skills_dataScience,'skill_name')
skillsDataScience<-table(skillsDataScience)
head(skillsDataScience,n=10) %>% kable() %>% kable_styling() %>% scroll_box(width = "910px",height="400px")
skillsDataScience Freq
23 2
apache hadoop 3
biology 1
c 13
chemistry 1
cmake 4
collectors 1
compliance software 1
coordination 2
critical thinking 5

Visualize

Order data then visualize using ggplot

Also we plot a word soup using wordcloud2 package

skillsDataScienceOrdered<-skillsDataScience %>% as.data.frame() %>% tbl_df()
skillsDataScienceOrdered$skillsDataScience <- factor(skillsDataScienceOrdered$skillsDataScience,levels=skillsDataScienceOrdered$skillsDataScience[order(-skillsDataScienceOrdered$Freq)])

ggplot(skillsDataScienceOrdered,aes(x=skillsDataScience,y=Freq))+geom_bar(stat="identity")+theme(axis.text = element_text(angle=90))

wordcloud2(skillsDataScienceOrdered,shuffle = TRUE,size = .5,gridSize = 15)

Question 2 - What Data Science job tittles require the most skills?

Transform

Same as in question 1 we filter our long tidy table for data science titles only, but this time we group them

job_skills_dataScience<-filter(job_skills_db,str_detect(job_name,"data scientist")) %>% group_by(job_name) %>% count()
head(job_skills_dataScience,n=20) %>% kable() %>% kable_styling() %>% scroll_box(width = "910px",height="400px")
job_name n
chief data scientist 6
cmi data scientist 6
cs data scientist mid 8
data architect data scientist 8
data scientist 12
data scientist analyst 3
data scientist analyst staff 8
data scientist analyst statistician 7
data scientist associate manager 8
data scientist bi and crm 4
data scientist data analytics 7
data scientist data scientist senior usa 8
data scientist devops senior 8
data scientist financial services analytics 11
data scientist for cutting edge analytic firm 5
data scientist insight studio consulting mit 8
data scientist insight studio consulting northeastern 8
data scientist intern 11
data scientist intern of champagne 5
data scientist intern tech 5

Visualize

We order data and visualize using ggplot

skillsDataScienceOrdered<-job_skills_dataScience
skillsDataScienceOrdered$job_name <- factor(skillsDataScienceOrdered$job_name,levels=skillsDataScienceOrdered$job_name[order(-skillsDataScienceOrdered$n)])

ggplot(skillsDataScienceOrdered,aes(x=job_name,y=n))+geom_bar(stat="identity")+theme(axis.text = element_text(angle=90))

Question 3 - What other job titles have similar skills to Data Science?

Transform

Once again we filter data scientist, and then we do a join with all other job names to find those with similar skills

job_skills_dataScience<-filter(job_skills_db,str_detect(job_name,"data scientist"))
job_skills_dataScience<-job_skills_dataScience %>% filter(skill_name!='skill')
same_skills<-job_skills %>% right_join(job_skills_dataScience,by='skill_name') %>% subset(!(job_name.x %in% job_skills_dataScience$job_name)) %>% group_by(job_name.x) %>% count()

Visualize

We order and visualize the data using ggplot

same_skills_Ordered<-same_skills %>% filter(n>550) %>% as.data.frame() %>% tbl_df()
same_skills_Ordered$job_name <- factor(same_skills_Ordered$job_name.x,levels=same_skills_Ordered$job_name.x[order(-same_skills_Ordered$n)])

ggplot(same_skills_Ordered,aes(x=job_name,y=n))+geom_bar(stat="identity")+theme(axis.text = element_text(angle=90))

Question 4 - What percentage of people have the skills to be a Data Scientist?

We define a model in which: - A random person has 10 random skills from the list of possible skills in the dataset - A person with more than 5 data science skills has the skills to be a data scientist

With this model, we test out 1000 random people and calculate a percentage

skill_all<-job_skills_db$skill_name
skill_datascience<-job_skills_dataScience$skill_name
skill_not_datascience<-setdiff(job_skills$skill_name,skill_datascience)

number_of_data_sientists<-0
people_in_population<-1000
for (i in 1:people_in_population) {
  person<-skill_all[as.integer(runif(10,0,length(skill_all)))]
  if (length(setdiff(person,skill_datascience))>5) {
    #print(paste("Person",toString(i),"has skills to be a data scientist"))
    number_of_data_sientists<-number_of_data_sientists+1
  } else {
    #print(paste("Person",toString(i),"does not skills to be a data scientist"))
  }
}
proportion<-number_of_data_sientists/people_in_population*100
print(paste("The percentage of people with data science skills is: ",toString(proportion),"%",sep=""))
## [1] "The percentage of people with data science skills is: 11%"
result<-as.data.frame(c(100-proportion,proportion))
colnames(result)<-c("Percentage")
result$Skill_Set<-c("Non Data Science","Data Science")
ggplot(result,aes(y=Percentage,x=Skill_Set,fill=Skill_Set))+geom_bar(stat="identity")+geom_text(data=result,aes(y=Percentage,x=Skill_Set,label=paste(Percentage,"%",sep="")))