Open Skill data science skills analysis

Use data in Open Skills web site to find job titles with Data Science skills.
Data from site to be used is clean_title_count

Use tidy workflow to produce analysis:

Libraries

library(RCurl)
library(dplyr)
library(tidyr)
library(rvest)
library(stringr)
library(ggplot2)
library(kableExtra)
library(RMySQL)
library(wordcloud2)

Import

Get list of csv files to import from Open Skills website

csvFiles<-read_html("https://open-skills-datasets.s3-us-west-2.amazonaws.com/") %>% html_nodes("key") %>% html_text() %>% str_extract("cleaned_title_count/[:print:]+") %>% str_extract("[0-9]+Q[1-4].csv")
csvFiles<-csvFiles[!is.na(csvFiles)] 
csvFiles %>% kable(align = 'c') %>% kable_styling() %>% scroll_box(width = "300px",height="400px") %>% kable_styling(position = "center")

x
2011Q1.csv
2011Q2.csv
2011Q3.csv
2011Q4.csv
2012Q1.csv
2012Q2.csv
2012Q3.csv
2012Q4.csv
2013Q1.csv
2013Q2.csv
2013Q3.csv
2013Q4.csv
2014Q1.csv
2014Q2.csv
2015Q1.csv
2016Q1.csv
2016Q2.csv
2016Q3.csv
2016Q4.csv
2017Q1.csv

Load last 4 csv files - last year worth of data

out<-vector()
for (i in 0:3) {
  url<-paste("https://open-skills-datasets.s3-us-west-2.amazonaws.com/cleaned_title_count/",csvFiles[length(csvFiles)-i],sep="") 
  x<-getURL(url)
  out<-rbind(out,read.csv(textConnection(x)))
}
dataTable<-tbl_df(out)
head(dataTable,n=20) %>% kable() %>% kable_styling() %>% scroll_box(width = "910px",height="400px")

title	counts_total	skills_1	skills_2	skills_3	skills_4	skills_5	skills_6	skills_7	skills_8	skills_9	skills_10	soc_code_common_1	soc_code_common_2	soc_code_common_total	soc_code_top_1	soc_code_top_2	soc_code_top_total	soc_code_given_1	soc_code_given_2	soc_code_given_total
	1											39-9032.00		1	13-1071.00		1	35-2014.00	NA	1
rsaf language training elt instructors	2	c	reduce	english language								25-1123.00		1	27-3031.00	21-1093.00	2		NA	1
integrator	2	troubleshooting	skill	red hat enterprise linux	linux	design						15-1142.00		1	15-1122.00	15-1143.00	2		NA	1
correctional officer	1	transportation	brakes									33-1012.00		1	29-2012.00		1	33-3012.00	NA	1
store all positions	2	time management										13-1199.00	51-9151.00	2	43-4161.00	43-4051.03	2		NA	1
head of office	2	writing	forth									11-2022.00	11-3021.00	2	13-1151.00	11-3021.00	2		NA	1
security positions	1	rules	closedcircuit tv cameras	security cameras	bulletproof vests	surveillance cameras	alarms					33-3051.01		1	33-3051.01		1		NA	1
sr manager event communications	1	speaking	writing	forth								11-2031.00		1	11-2011.00		1	11-2021.00	NA	1
assistant director of engineering and construction	1	ada	mechanical	levels	impact							11-9041.00		1	17-2141.00		1		NA	1
scheduling coordinator	3	microsoft office										31-1014.00	29-2061.00	2	11-9111.00	41-4011.00	3	43-5061.00	NA	1
systems engineer fair	1	skill										17-2071.00		1	17-2141.00		1		NA	1
school bus driver	2	skill	transportation	route								25-2012.00		1	25-2052.00	25-9041.00	2	53-3022.00	NA	1
systems analyst	1	skill	c	simulation software	mathematics	mechanical	physics	science	design			13-1111.00		1	15-1121.00		1	15-1121.00	NA	1
warehouser	1	trucks	pallet jacks	forklifts	sweepers							33-3051.01		1	49-9071.00		1	49-9071.00	NA	1
cloud solution architect	2	redmine	c	programming	python	visualization	javascript	puppet	apache tomcat	stencils	oracle	15-1132.00	15-1121.00	2	15-1132.00	15-1143.00	2	15-1199.02	NA	1
insemination crew worker	4	speaking	self	personal protective equipment								43-5061.00	27-2011.00	4	35-3021.00	37-2011.00	4		NA	1
product support sales representative	2	stamina										41-4012.00		1	41-4011.00	41-2031.00	2		NA	1
direct support professional team leader	1											29-1181.00		1	15-1122.00		1		NA	1
data scientist	1	skill	science	git	python	mathematics	r	derive				15-1134.00		1	17-2112.00		1	15-1111.00	NA	1
part bus driver commercial drivers license	1											33-3051.01		1	53-3022.00		1		NA	1

Tidy

Build long tables for skills and jobs

#Get skills
skills <- dataTable %>%select( starts_with('skill') )    %>%
  gather(3:10,  value = "skill") %>%
  select( starts_with('skill')) %>% 
  distinct() %>% 
  filter(skill != '')

## Warning: attributes are not identical across measure variables;
## they will be dropped

#Get job titles 
jobs <- dataTable %>%select( 1)  %>%
  filter(title != '')   %>% 
  distinct()  
 names(jobs)<-c('job_name')
 names(skills) <- c('skill_name')
head(jobs,n=20) %>% kable() %>% kable_styling() %>% scroll_box(width = "910px",height="400px")

job_name
rsaf language training elt instructors
integrator
correctional officer
store all positions
head of office
security positions
sr manager event communications
assistant director of engineering and construction
scheduling coordinator
systems engineer fair
school bus driver
systems analyst
warehouser
cloud solution architect
insemination crew worker
product support sales representative
direct support professional team leader
data scientist
part bus driver commercial drivers license
senior engineer

head(skills,n=20) %>% kable() %>% kable_styling() %>% scroll_box(width = "910px",height="400px")

skill_name
c
troubleshooting
transportation
time management
writing
rules
speaking
ada
microsoft office
skill
trucks
redmine
stamina
hoists
visualization
meat slicers
sales and marketing
negotiation
coordination
trash bags

Save skills and jobs to DB

con = dbConnect(MySQL(), user='data607p3', password='data607p3', dbname='job_skills', host='localhost')

########################################################################
#only run this once otherwise tables will have duplicate data
RMySQL::dbWriteTable(con, 'jobs', jobs, append = TRUE, row.names = FALSE)

## [1] TRUE

RMySQL::dbWriteTable(con, 'skills', skills, append = TRUE, row.names = FALSE)

## [1] TRUE

#########################################################################

Read jobs and skills back

We are using auto increment id column for primary key so we need to read the tables back to do the mapping table

job_db <- dbReadTable(con, 'jobs')
skills_db<- dbReadTable(con, 'skills')
head(job_db,n=20) %>% kable() %>% kable_styling() %>% scroll_box(width = "910px",height="400px")

job_id	job_name
1	rsaf language training elt instructors
2	integrator
3	correctional officer
4	store all positions
5	head of office
6	security positions
7	sr manager event communications
8	assistant director of engineering and construction
9	scheduling coordinator
10	systems engineer fair
11	school bus driver
12	systems analyst
13	warehouser
14	cloud solution architect
15	insemination crew worker
16	product support sales representative
17	direct support professional team leader
18	data scientist
19	part bus driver commercial drivers license
20	senior engineer

head(skills_db,n=20) %>% kable() %>% kable_styling() %>% scroll_box(width = "910px",height="400px")

skill_id	skill_name
1	c
2	troubleshooting
3	transportation
4	time management
5	writing
6	rules
7	speaking
8	ada
9	microsoft office
10	skill
11	trucks
12	redmine
13	stamina
14	hoists
15	visualization
16	meat slicers
17	sales and marketing
18	negotiation
19	coordination
20	trash bags

Tidy & Transform job to skills mapping then save it to db

job_skills <-  dataTable   %>%
  gather(3:10,  value = "skill_name", key="job_name") %>%
  filter(skill_name != '') %>%
  filter(title != '') %>%
  select(title, skill_name)%>%
  as.data.frame()

## Warning: attributes are not identical across measure variables;
## they will be dropped

names(job_skills) <- c('job_name','skill_name')
head(job_skills,n=20) %>% kable() %>% kable_styling() %>% scroll_box(width = "910px",height="400px")

job_name	skill_name
rsaf language training elt instructors	c
integrator	troubleshooting
correctional officer	transportation
store all positions	time management
head of office	writing
security positions	rules
sr manager event communications	speaking
assistant director of engineering and construction	ada
scheduling coordinator	microsoft office
systems engineer fair	skill
school bus driver	skill
systems analyst	skill
warehouser	trucks
cloud solution architect	redmine
insemination crew worker	speaking
product support sales representative	stamina
data scientist	skill
senior engineer	skill
network engineer i	troubleshooting
cdla truck drivers	trucks

# save jobs_skills mapping. we are doing this because we have auto increment private keys.
job_skills_map <- job_skills %>%
                  inner_join(skills_db )  %>%
                  inner_join(job_db )   %>%
                  select(job_id, skill_id) %>%
                  distinct()

## Joining, by = "skill_name"

## Joining, by = "job_name"

## Warning: Column `job_name` joining factor and character vector, coercing
## into character vector

RMySQL::dbWriteTable(con, 'job_skills', job_skills_map, append = TRUE, row.names = FALSE)

## [1] TRUE

Read tables from db

#Read tables from db.
jobs_db <- dbReadTable(con, 'jobs')
skills_db<- dbReadTable(con, 'skills')
job_skills_db<- dbReadTable(con, 'job_skills')

# Join tables to get skills for jobs. 
job_skills_db <- jobs_db %>%
  inner_join(job_skills_db )%>%
  inner_join(skills_db ) %>%
  select(2,4)

## Joining, by = "job_id"

## Joining, by = "skill_id"

Question 1 - What are the most important skills in Data Science?

Transform

Filter table for data science titles only

job_skills_dataScience<-filter(job_skills_db,str_detect(job_name,"data scientist"))
head(job_skills_dataScience,n=20) %>% kable() %>% kable_styling() %>% scroll_box(width = "910px",height="400px")

job_name	skill_name
data scientist	skill
data scientist	science
data scientist	python
data scientist	mathematics
data scientist	programming
data scientist	git
data scientist	derive
data scientist	r
data scientist	design
data scientist	c
data scientist	visualization
data scientist	writing
data scientist intern	r
data scientist intern	design
data scientist intern	science
data scientist intern	programming
data scientist intern	visualization
data scientist intern	python
data scientist intern	programming languages
data scientist intern	drupal

Create a vector with all skills

skillsDataScience<-pull(job_skills_dataScience,'skill_name')
skillsDataScience<-table(skillsDataScience)
head(skillsDataScience,n=10) %>% kable() %>% kable_styling() %>% scroll_box(width = "910px",height="400px")

skillsDataScience	Freq
23	2
apache hadoop	3
biology	1
c	13
chemistry	1
cmake	4
collectors	1
compliance software	1
coordination	2
critical thinking	5

Visualize

Order data then visualize using ggplot

Also we plot a word soup using wordcloud2 package

skillsDataScienceOrdered<-skillsDataScience %>% as.data.frame() %>% tbl_df()
skillsDataScienceOrdered$skillsDataScience <- factor(skillsDataScienceOrdered$skillsDataScience,levels=skillsDataScienceOrdered$skillsDataScience[order(-skillsDataScienceOrdered$Freq)])

ggplot(skillsDataScienceOrdered,aes(x=skillsDataScience,y=Freq))+geom_bar(stat="identity")+theme(axis.text = element_text(angle=90))

wordcloud2(skillsDataScienceOrdered,shuffle = TRUE,size = .5,gridSize = 15)

Question 2 - What Data Science job tittles require the most skills?

Transform

Same as in question 1 we filter our long tidy table for data science titles only, but this time we group them

job_skills_dataScience<-filter(job_skills_db,str_detect(job_name,"data scientist")) %>% group_by(job_name) %>% count()
head(job_skills_dataScience,n=20) %>% kable() %>% kable_styling() %>% scroll_box(width = "910px",height="400px")

job_name	n
chief data scientist	6
cmi data scientist	6
cs data scientist mid	8
data architect data scientist	8
data scientist	12
data scientist analyst	3
data scientist analyst staff	8
data scientist analyst statistician	7
data scientist associate manager	8
data scientist bi and crm	4
data scientist data analytics	7
data scientist data scientist senior usa	8
data scientist devops senior	8
data scientist financial services analytics	11
data scientist for cutting edge analytic firm	5
data scientist insight studio consulting mit	8
data scientist insight studio consulting northeastern	8
data scientist intern	11
data scientist intern of champagne	5
data scientist intern tech	5

Visualize

We order data and visualize using ggplot

skillsDataScienceOrdered<-job_skills_dataScience
skillsDataScienceOrdered$job_name <- factor(skillsDataScienceOrdered$job_name,levels=skillsDataScienceOrdered$job_name[order(-skillsDataScienceOrdered$n)])

ggplot(skillsDataScienceOrdered,aes(x=job_name,y=n))+geom_bar(stat="identity")+theme(axis.text = element_text(angle=90))

Question 3 - What other job titles have similar skills to Data Science?

Transform

Once again we filter data scientist, and then we do a join with all other job names to find those with similar skills

job_skills_dataScience<-filter(job_skills_db,str_detect(job_name,"data scientist"))
job_skills_dataScience<-job_skills_dataScience %>% filter(skill_name!='skill')
same_skills<-job_skills %>% right_join(job_skills_dataScience,by='skill_name') %>% subset(!(job_name.x %in% job_skills_dataScience$job_name)) %>% group_by(job_name.x) %>% count()

Visualize

We order and visualize the data using ggplot

same_skills_Ordered<-same_skills %>% filter(n>550) %>% as.data.frame() %>% tbl_df()
same_skills_Ordered$job_name <- factor(same_skills_Ordered$job_name.x,levels=same_skills_Ordered$job_name.x[order(-same_skills_Ordered$n)])

ggplot(same_skills_Ordered,aes(x=job_name,y=n))+geom_bar(stat="identity")+theme(axis.text = element_text(angle=90))

Question 4 - What percentage of people have the skills to be a Data Scientist?

We define a model in which: - A random person has 10 random skills from the list of possible skills in the dataset - A person with more than 5 data science skills has the skills to be a data scientist

With this model, we test out 1000 random people and calculate a percentage

skill_all<-job_skills_db$skill_name
skill_datascience<-job_skills_dataScience$skill_name
skill_not_datascience<-setdiff(job_skills$skill_name,skill_datascience)

number_of_data_sientists<-0
people_in_population<-1000
for (i in 1:people_in_population) {
  person<-skill_all[as.integer(runif(10,0,length(skill_all)))]
  if (length(setdiff(person,skill_datascience))>5) {
    #print(paste("Person",toString(i),"has skills to be a data scientist"))
    number_of_data_sientists<-number_of_data_sientists+1
  } else {
    #print(paste("Person",toString(i),"does not skills to be a data scientist"))
  }
}
proportion<-number_of_data_sientists/people_in_population*100
print(paste("The percentage of people with data science skills is: ",toString(proportion),"%",sep=""))

## [1] "The percentage of people with data science skills is: 11%"

result<-as.data.frame(c(100-proportion,proportion))
colnames(result)<-c("Percentage")
result$Skill_Set<-c("Non Data Science","Data Science")
ggplot(result,aes(y=Percentage,x=Skill_Set,fill=Skill_Set))+geom_bar(stat="identity")+geom_text(data=result,aes(y=Percentage,x=Skill_Set,label=paste(Percentage,"%",sep="")))

Data Science Skills Project

Joby John and Peter Kowalchuk

October 14, 2018

Open Skill data science skills analysis

Libraries

Import

Get list of csv files to import from Open Skills website

Load last 4 csv files - last year worth of data

Tidy

Build long tables for skills and jobs

Save skills and jobs to DB

Read jobs and skills back

Tidy & Transform job to skills mapping then save it to db

Read tables from db

Question 1 - What are the most important skills in Data Science?

Transform

Visualize

Question 2 - What Data Science job tittles require the most skills?

Transform

Visualize

Question 3 - What other job titles have similar skills to Data Science?

Transform

Visualize

Question 4 - What percentage of people have the skills to be a Data Scientist?