Use data in Open Skills web site to find job titles with Data Science skills.
Data from site to be used is clean_title_count
Use tidy workflow to produce analysis:
library(RCurl)
library(dplyr)
library(tidyr)
library(rvest)
library(stringr)
library(ggplot2)
library(kableExtra)
library(RMySQL)
library(wordcloud2)
csvFiles<-read_html("https://open-skills-datasets.s3-us-west-2.amazonaws.com/") %>% html_nodes("key") %>% html_text() %>% str_extract("cleaned_title_count/[:print:]+") %>% str_extract("[0-9]+Q[1-4].csv")
csvFiles<-csvFiles[!is.na(csvFiles)]
csvFiles %>% kable(align = 'c') %>% kable_styling() %>% scroll_box(width = "300px",height="400px") %>% kable_styling(position = "center")
x |
---|
2011Q1.csv |
2011Q2.csv |
2011Q3.csv |
2011Q4.csv |
2012Q1.csv |
2012Q2.csv |
2012Q3.csv |
2012Q4.csv |
2013Q1.csv |
2013Q2.csv |
2013Q3.csv |
2013Q4.csv |
2014Q1.csv |
2014Q2.csv |
2015Q1.csv |
2016Q1.csv |
2016Q2.csv |
2016Q3.csv |
2016Q4.csv |
2017Q1.csv |
out<-vector()
for (i in 0:3) {
url<-paste("https://open-skills-datasets.s3-us-west-2.amazonaws.com/cleaned_title_count/",csvFiles[length(csvFiles)-i],sep="")
x<-getURL(url)
out<-rbind(out,read.csv(textConnection(x)))
}
dataTable<-tbl_df(out)
head(dataTable,n=20) %>% kable() %>% kable_styling() %>% scroll_box(width = "910px",height="400px")
title | counts_total | skills_1 | skills_2 | skills_3 | skills_4 | skills_5 | skills_6 | skills_7 | skills_8 | skills_9 | skills_10 | soc_code_common_1 | soc_code_common_2 | soc_code_common_total | soc_code_top_1 | soc_code_top_2 | soc_code_top_total | soc_code_given_1 | soc_code_given_2 | soc_code_given_total |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 39-9032.00 | 1 | 13-1071.00 | 1 | 35-2014.00 | NA | 1 | |||||||||||||
rsaf language training elt instructors | 2 | c | reduce | english language | 25-1123.00 | 1 | 27-3031.00 | 21-1093.00 | 2 | NA | 1 | |||||||||
integrator | 2 | troubleshooting | skill | red hat enterprise linux | linux | design | 15-1142.00 | 1 | 15-1122.00 | 15-1143.00 | 2 | NA | 1 | |||||||
correctional officer | 1 | transportation | brakes | 33-1012.00 | 1 | 29-2012.00 | 1 | 33-3012.00 | NA | 1 | ||||||||||
store all positions | 2 | time management | 13-1199.00 | 51-9151.00 | 2 | 43-4161.00 | 43-4051.03 | 2 | NA | 1 | ||||||||||
head of office | 2 | writing | forth | 11-2022.00 | 11-3021.00 | 2 | 13-1151.00 | 11-3021.00 | 2 | NA | 1 | |||||||||
security positions | 1 | rules | closedcircuit tv cameras | security cameras | bulletproof vests | surveillance cameras | alarms | 33-3051.01 | 1 | 33-3051.01 | 1 | NA | 1 | |||||||
sr manager event communications | 1 | speaking | writing | forth | 11-2031.00 | 1 | 11-2011.00 | 1 | 11-2021.00 | NA | 1 | |||||||||
assistant director of engineering and construction | 1 | ada | mechanical | levels | impact | 11-9041.00 | 1 | 17-2141.00 | 1 | NA | 1 | |||||||||
scheduling coordinator | 3 | microsoft office | 31-1014.00 | 29-2061.00 | 2 | 11-9111.00 | 41-4011.00 | 3 | 43-5061.00 | NA | 1 | |||||||||
systems engineer fair | 1 | skill | 17-2071.00 | 1 | 17-2141.00 | 1 | NA | 1 | ||||||||||||
school bus driver | 2 | skill | transportation | route | 25-2012.00 | 1 | 25-2052.00 | 25-9041.00 | 2 | 53-3022.00 | NA | 1 | ||||||||
systems analyst | 1 | skill | c | simulation software | mathematics | mechanical | physics | science | design | 13-1111.00 | 1 | 15-1121.00 | 1 | 15-1121.00 | NA | 1 | ||||
warehouser | 1 | trucks | pallet jacks | forklifts | sweepers | 33-3051.01 | 1 | 49-9071.00 | 1 | 49-9071.00 | NA | 1 | ||||||||
cloud solution architect | 2 | redmine | c | programming | python | visualization | javascript | puppet | apache tomcat | stencils | oracle | 15-1132.00 | 15-1121.00 | 2 | 15-1132.00 | 15-1143.00 | 2 | 15-1199.02 | NA | 1 |
insemination crew worker | 4 | speaking | self | personal protective equipment | 43-5061.00 | 27-2011.00 | 4 | 35-3021.00 | 37-2011.00 | 4 | NA | 1 | ||||||||
product support sales representative | 2 | stamina | 41-4012.00 | 1 | 41-4011.00 | 41-2031.00 | 2 | NA | 1 | |||||||||||
direct support professional team leader | 1 | 29-1181.00 | 1 | 15-1122.00 | 1 | NA | 1 | |||||||||||||
data scientist | 1 | skill | science | git | python | mathematics | r | derive | 15-1134.00 | 1 | 17-2112.00 | 1 | 15-1111.00 | NA | 1 | |||||
part bus driver commercial drivers license | 1 | 33-3051.01 | 1 | 53-3022.00 | 1 | NA | 1 |
#Get skills
skills <- dataTable %>%select( starts_with('skill') ) %>%
gather(3:10, value = "skill") %>%
select( starts_with('skill')) %>%
distinct() %>%
filter(skill != '')
## Warning: attributes are not identical across measure variables;
## they will be dropped
#Get job titles
jobs <- dataTable %>%select( 1) %>%
filter(title != '') %>%
distinct()
names(jobs)<-c('job_name')
names(skills) <- c('skill_name')
head(jobs,n=20) %>% kable() %>% kable_styling() %>% scroll_box(width = "910px",height="400px")
job_name |
---|
rsaf language training elt instructors |
integrator |
correctional officer |
store all positions |
head of office |
security positions |
sr manager event communications |
assistant director of engineering and construction |
scheduling coordinator |
systems engineer fair |
school bus driver |
systems analyst |
warehouser |
cloud solution architect |
insemination crew worker |
product support sales representative |
direct support professional team leader |
data scientist |
part bus driver commercial drivers license |
senior engineer |
head(skills,n=20) %>% kable() %>% kable_styling() %>% scroll_box(width = "910px",height="400px")
skill_name |
---|
c |
troubleshooting |
transportation |
time management |
writing |
rules |
speaking |
ada |
microsoft office |
skill |
trucks |
redmine |
stamina |
hoists |
visualization |
meat slicers |
sales and marketing |
negotiation |
coordination |
trash bags |
con = dbConnect(MySQL(), user='data607p3', password='data607p3', dbname='job_skills', host='localhost')
########################################################################
#only run this once otherwise tables will have duplicate data
RMySQL::dbWriteTable(con, 'jobs', jobs, append = TRUE, row.names = FALSE)
## [1] TRUE
RMySQL::dbWriteTable(con, 'skills', skills, append = TRUE, row.names = FALSE)
## [1] TRUE
#########################################################################
We are using auto increment id column for primary key so we need to read the tables back to do the mapping table
job_db <- dbReadTable(con, 'jobs')
skills_db<- dbReadTable(con, 'skills')
head(job_db,n=20) %>% kable() %>% kable_styling() %>% scroll_box(width = "910px",height="400px")
job_id | job_name |
---|---|
1 | rsaf language training elt instructors |
2 | integrator |
3 | correctional officer |
4 | store all positions |
5 | head of office |
6 | security positions |
7 | sr manager event communications |
8 | assistant director of engineering and construction |
9 | scheduling coordinator |
10 | systems engineer fair |
11 | school bus driver |
12 | systems analyst |
13 | warehouser |
14 | cloud solution architect |
15 | insemination crew worker |
16 | product support sales representative |
17 | direct support professional team leader |
18 | data scientist |
19 | part bus driver commercial drivers license |
20 | senior engineer |
head(skills_db,n=20) %>% kable() %>% kable_styling() %>% scroll_box(width = "910px",height="400px")
skill_id | skill_name |
---|---|
1 | c |
2 | troubleshooting |
3 | transportation |
4 | time management |
5 | writing |
6 | rules |
7 | speaking |
8 | ada |
9 | microsoft office |
10 | skill |
11 | trucks |
12 | redmine |
13 | stamina |
14 | hoists |
15 | visualization |
16 | meat slicers |
17 | sales and marketing |
18 | negotiation |
19 | coordination |
20 | trash bags |
job_skills <- dataTable %>%
gather(3:10, value = "skill_name", key="job_name") %>%
filter(skill_name != '') %>%
filter(title != '') %>%
select(title, skill_name)%>%
as.data.frame()
## Warning: attributes are not identical across measure variables;
## they will be dropped
names(job_skills) <- c('job_name','skill_name')
head(job_skills,n=20) %>% kable() %>% kable_styling() %>% scroll_box(width = "910px",height="400px")
job_name | skill_name |
---|---|
rsaf language training elt instructors | c |
integrator | troubleshooting |
correctional officer | transportation |
store all positions | time management |
head of office | writing |
security positions | rules |
sr manager event communications | speaking |
assistant director of engineering and construction | ada |
scheduling coordinator | microsoft office |
systems engineer fair | skill |
school bus driver | skill |
systems analyst | skill |
warehouser | trucks |
cloud solution architect | redmine |
insemination crew worker | speaking |
product support sales representative | stamina |
data scientist | skill |
senior engineer | skill |
network engineer i | troubleshooting |
cdla truck drivers | trucks |
# save jobs_skills mapping. we are doing this because we have auto increment private keys.
job_skills_map <- job_skills %>%
inner_join(skills_db ) %>%
inner_join(job_db ) %>%
select(job_id, skill_id) %>%
distinct()
## Joining, by = "skill_name"
## Joining, by = "job_name"
## Warning: Column `job_name` joining factor and character vector, coercing
## into character vector
RMySQL::dbWriteTable(con, 'job_skills', job_skills_map, append = TRUE, row.names = FALSE)
## [1] TRUE
#Read tables from db.
jobs_db <- dbReadTable(con, 'jobs')
skills_db<- dbReadTable(con, 'skills')
job_skills_db<- dbReadTable(con, 'job_skills')
# Join tables to get skills for jobs.
job_skills_db <- jobs_db %>%
inner_join(job_skills_db )%>%
inner_join(skills_db ) %>%
select(2,4)
## Joining, by = "job_id"
## Joining, by = "skill_id"
Filter table for data science titles only
job_skills_dataScience<-filter(job_skills_db,str_detect(job_name,"data scientist"))
head(job_skills_dataScience,n=20) %>% kable() %>% kable_styling() %>% scroll_box(width = "910px",height="400px")
job_name | skill_name |
---|---|
data scientist | skill |
data scientist | science |
data scientist | python |
data scientist | mathematics |
data scientist | programming |
data scientist | git |
data scientist | derive |
data scientist | r |
data scientist | design |
data scientist | c |
data scientist | visualization |
data scientist | writing |
data scientist intern | r |
data scientist intern | design |
data scientist intern | science |
data scientist intern | programming |
data scientist intern | visualization |
data scientist intern | python |
data scientist intern | programming languages |
data scientist intern | drupal |
Create a vector with all skills
skillsDataScience<-pull(job_skills_dataScience,'skill_name')
skillsDataScience<-table(skillsDataScience)
head(skillsDataScience,n=10) %>% kable() %>% kable_styling() %>% scroll_box(width = "910px",height="400px")
skillsDataScience | Freq |
---|---|
23 | 2 |
apache hadoop | 3 |
biology | 1 |
c | 13 |
chemistry | 1 |
cmake | 4 |
collectors | 1 |
compliance software | 1 |
coordination | 2 |
critical thinking | 5 |
Order data then visualize using ggplot
Also we plot a word soup using wordcloud2 package
skillsDataScienceOrdered<-skillsDataScience %>% as.data.frame() %>% tbl_df()
skillsDataScienceOrdered$skillsDataScience <- factor(skillsDataScienceOrdered$skillsDataScience,levels=skillsDataScienceOrdered$skillsDataScience[order(-skillsDataScienceOrdered$Freq)])
ggplot(skillsDataScienceOrdered,aes(x=skillsDataScience,y=Freq))+geom_bar(stat="identity")+theme(axis.text = element_text(angle=90))
wordcloud2(skillsDataScienceOrdered,shuffle = TRUE,size = .5,gridSize = 15)
Same as in question 1 we filter our long tidy table for data science titles only, but this time we group them
job_skills_dataScience<-filter(job_skills_db,str_detect(job_name,"data scientist")) %>% group_by(job_name) %>% count()
head(job_skills_dataScience,n=20) %>% kable() %>% kable_styling() %>% scroll_box(width = "910px",height="400px")
job_name | n |
---|---|
chief data scientist | 6 |
cmi data scientist | 6 |
cs data scientist mid | 8 |
data architect data scientist | 8 |
data scientist | 12 |
data scientist analyst | 3 |
data scientist analyst staff | 8 |
data scientist analyst statistician | 7 |
data scientist associate manager | 8 |
data scientist bi and crm | 4 |
data scientist data analytics | 7 |
data scientist data scientist senior usa | 8 |
data scientist devops senior | 8 |
data scientist financial services analytics | 11 |
data scientist for cutting edge analytic firm | 5 |
data scientist insight studio consulting mit | 8 |
data scientist insight studio consulting northeastern | 8 |
data scientist intern | 11 |
data scientist intern of champagne | 5 |
data scientist intern tech | 5 |
We order data and visualize using ggplot
skillsDataScienceOrdered<-job_skills_dataScience
skillsDataScienceOrdered$job_name <- factor(skillsDataScienceOrdered$job_name,levels=skillsDataScienceOrdered$job_name[order(-skillsDataScienceOrdered$n)])
ggplot(skillsDataScienceOrdered,aes(x=job_name,y=n))+geom_bar(stat="identity")+theme(axis.text = element_text(angle=90))
Once again we filter data scientist, and then we do a join with all other job names to find those with similar skills
job_skills_dataScience<-filter(job_skills_db,str_detect(job_name,"data scientist"))
job_skills_dataScience<-job_skills_dataScience %>% filter(skill_name!='skill')
same_skills<-job_skills %>% right_join(job_skills_dataScience,by='skill_name') %>% subset(!(job_name.x %in% job_skills_dataScience$job_name)) %>% group_by(job_name.x) %>% count()
We order and visualize the data using ggplot
same_skills_Ordered<-same_skills %>% filter(n>550) %>% as.data.frame() %>% tbl_df()
same_skills_Ordered$job_name <- factor(same_skills_Ordered$job_name.x,levels=same_skills_Ordered$job_name.x[order(-same_skills_Ordered$n)])
ggplot(same_skills_Ordered,aes(x=job_name,y=n))+geom_bar(stat="identity")+theme(axis.text = element_text(angle=90))
We define a model in which: - A random person has 10 random skills from the list of possible skills in the dataset - A person with more than 5 data science skills has the skills to be a data scientist
With this model, we test out 1000 random people and calculate a percentage
skill_all<-job_skills_db$skill_name
skill_datascience<-job_skills_dataScience$skill_name
skill_not_datascience<-setdiff(job_skills$skill_name,skill_datascience)
number_of_data_sientists<-0
people_in_population<-1000
for (i in 1:people_in_population) {
person<-skill_all[as.integer(runif(10,0,length(skill_all)))]
if (length(setdiff(person,skill_datascience))>5) {
#print(paste("Person",toString(i),"has skills to be a data scientist"))
number_of_data_sientists<-number_of_data_sientists+1
} else {
#print(paste("Person",toString(i),"does not skills to be a data scientist"))
}
}
proportion<-number_of_data_sientists/people_in_population*100
print(paste("The percentage of people with data science skills is: ",toString(proportion),"%",sep=""))
## [1] "The percentage of people with data science skills is: 11%"
result<-as.data.frame(c(100-proportion,proportion))
colnames(result)<-c("Percentage")
result$Skill_Set<-c("Non Data Science","Data Science")
ggplot(result,aes(y=Percentage,x=Skill_Set,fill=Skill_Set))+geom_bar(stat="identity")+geom_text(data=result,aes(y=Percentage,x=Skill_Set,label=paste(Percentage,"%",sep="")))