The Data comes in 3 extracts :
| Data Science Datasets |
| Type | Sample | Observations |
|---|---|---|
| Job Seekers | All | 23859 |
| Job Seekers | Subset | 2325 |
| Job Openings | All | 7556 |
qry<-"select gender, location, education, title, skill from staging_db.JobSeekersStage limit 4;"
rs = dbSendQuery(conn, qry)
qry_df = fetch(rs, n=-1) # select all rows
knitr::kable(qry_df, caption='All Seekers : Primary Skill', table.attr = "style='width:100%;'") %>%
kable_styling(font_size = 12)| gender | location | education | title | skill |
|---|---|---|---|---|
| Female | United States of America | Doctoral degree | Consultant | other |
| Male | Indonesia | Bachelor’s degree | Other | other |
| Female | United States of America | Master’s degree | Data Scientist | java |
| Male | United States of America | Master’s degree | Not employed | python |
qry<-"select gender, location, education, title, skill from staging_db.JobSeekersStage limit 4;"
rs = dbSendQuery(conn, qry)
qry_df = fetch(rs, n=-1) # select all rows
knitr::kable(qry_df, caption='Subset Seekers : All Key Skills', table.attr = "style='width:100%;'") %>%
kable_styling(font_size = 12)| gender | location | education | title | skill |
|---|---|---|---|---|
| Female | United States of America | Doctoral degree | Consultant | other |
| Male | Indonesia | Bachelor’s degree | Other | other |
| Female | United States of America | Master’s degree | Data Scientist | java |
| Male | United States of America | Master’s degree | Not employed | python |
qry<-"select job_type, location, skill, min_salary, max_salary from staging_db.JobOpeningsStage limit 4;"
rs = dbSendQuery(conn, qry)
qry_df = fetch(rs, n=-1) # select all rows
knitr::kable(qry_df, caption='All Job Openings : All Skills', table.attr = "style='width:100%;'") %>%
kable_styling(font_size = 12)| job_type | location | skill | min_salary | max_salary |
|---|---|---|---|---|
| Analytics | Bengaluru | sql | 10 | 15 |
| Analytics | Bengaluru | python | 10 | 15 |
| Analytics | Bengaluru | analytics | 10 | 15 |
| Analytics | Bengaluru | machine learning | 10 | 15 |
Gender of Job Seekers.
qry<-"select gender, count(*) as count from staging_db.JobSeekersStage group by gender order by count desc;"
rs = dbSendQuery(conn, qry)
g_df = fetch(rs, n=-1) # select all rows
knitr::kable(g_df, caption='Titles', table.attr = "style='width:80%;'")| gender | count |
|---|---|
| Male | 19430 |
| Female | 4010 |
| Prefer not to say | 340 |
| Prefer to self-describe | 79 |
Do the Job Seekers consider themselves a Data Scientist ?.
qry<-"select dataScientist, count(*) from staging_db.JobSeekersStage group by dataScientist;"
rs = dbSendQuery(conn, qry)
ds_df = fetch(rs, n=-1) # select all rows
knitr::kable(ds_df, caption='Titles', table.attr = "style='width:80%;'")| dataScientist | count(*) |
|---|---|
| Maybe | 4184 |
| Definitely not | 1557 |
| Definitely yes | 4684 |
| Probably yes | 4893 |
| Probably not | 3162 |
| 5379 |
Majors of job seekers by percent.
qry<-"select r.short_description as major, count(*) as count from JobSeeker s, MajorReference r where s.major_id=r.id group by major_id order by count desc;"
rs = dbSendQuery(conn, qry)
majors_df = fetch(rs, n=-1) # select all rows
total<-sum(majors_df$count)
majors_df<-majors_df %>%
mutate(major = substr(major,0,16), pct=count/total )
majors_df<-majors_df %>%
select(major, pct)
ggplot(data=majors_df, aes(y=pct, x=major)) + geom_bar(stat="identity") + labs(title="Majors of Job Seekers", x=" ") + coord_flip()Major Distributions Comparisons : Data Scientists or Not ?".
qry<-"select r.short_description as major, count(*) as count from JobSeeker s, MajorReference r where s.major_id=r.id and data_scientist='Definitely yes' group by major_id order by count desc;"
rs = dbSendQuery(conn, qry)
majors_df = fetch(rs, n=-1) # select all rows
total<-sum(majors_df$count)
majors_df<-majors_df %>%
mutate(major = substr(major,0,16), pct=count/total )
majors_df<-majors_df %>%
select(major, pct)
p1<-ggplot(data=majors_df, aes(y=pct, x=major)) + geom_bar(stat="identity") + labs(title="Definitely Data Scientsts", x=" ") + coord_flip()
qry<-"select r.short_description as major, count(*) as count from JobSeeker s, MajorReference r where s.major_id=r.id and data_scientist='Definitely not' group by major_id order by count desc;"
rs = dbSendQuery(conn, qry)
majors_df = fetch(rs, n=-1) # select all rows
total<-sum(majors_df$count)
majors_df<-majors_df %>%
mutate(major = substr(major,0,16), pct=count/total )
majors_df<-majors_df %>%
select(major, pct)
p2<-ggplot(data=majors_df, aes(y=pct, x=major)) + geom_bar(stat="identity") + labs(title="Definitely Not Data Scientsts", x=" ") + coord_flip()
grid.arrange(p1, p2, nrow = 2)Titles of Job Seekers.
qry<-"select r.short_description as title, count(*) as count from JobSeeker s, TitleReference r where s.title_id=r.id group by title_id order by count desc;"
rs = dbSendQuery(conn, qry)
t_df = fetch(rs, n=-1) # select all rows
knitr::kable(t_df, caption='Titles', table.attr = "style='width:80%;'")| title | count |
|---|---|
| Student | 5253 |
| Data Scientist | 4137 |
| Software Engineer | 3130 |
| Data Analyst | 1922 |
| Other | 1322 |
| Research Scientist | 1189 |
| 959 | |
| Not employed | 842 |
| Consultant | 785 |
| Business Analyst | 772 |
| Data Engineer | 737 |
| Research Assistant | 600 |
| Manager | 590 |
| Product/Project Manager | 428 |
| Chief Officer | 360 |
| Statistician | 237 |
| DBA/Database Engineer | 145 |
| Developer Advocate | 117 |
| Marketing Analyst | 115 |
| Salesperson | 102 |
| Principal Investigator | 97 |
| Data Journalist | 20 |
All skill sets required by the companies.
qry<-"select r.short_description as skill, count(*) as count from JobRequirements s, SkillReference r where s.skill_id=r.id group by skill_id;"
rs = dbSendQuery(conn, qry)
skills_needed_df = fetch(rs, n=-1) # select all rows
ggplot(data=skills_needed_df , aes(y=count, x=skill)) + geom_bar(stat="identity") + labs(title="Job Requirements", x=" ") + coord_flip()# knitr::kable(skills_needed_df, caption='Skills Needed', table.attr = "style='width:80%;'")All skill sets offered by all job seekers.
qry<-"select r.short_description as skill, count(*) as count from JobSeekerPrimarySkills s, SkillReference r where s.skill_id=r.id group by skill_id;"
rs = dbSendQuery(conn, qry)
skills_offered_df = fetch(rs, n=-1) # select all rows
# knitr::kable(skills_offered_df, caption='All Job Openings : All Skills', table.attr = "style='width:90%;'")
ggplot(data=skills_offered_df , aes(y=count, x=skill)) + geom_bar(stat="identity") + labs(title="Primary Skills of Job Seekers", x=" ") + coord_flip()Key skill sets offered by a subset of job seekers.
qry<-"select r.short_description as skill, count(*) as count from JobSeekerKeySkills s, SkillReference r where s.skill_id=r.id group by skill_id;"
rs = dbSendQuery(conn, qry)
key_skills_offered_df = fetch(rs, n=-1) # select all rows
# knitr::kable(skills_offered_df, caption='All Job Openings : All Skills', table.attr = "style='width:80%;'")
ggplot(data=key_skills_offered_df , aes(y=count, x=skill)) + geom_bar(stat="identity") + labs(title="Key Skills of Job Seekers", x=" ") + coord_flip() Create seperate plots and display side by side.
p1<-ggplot(data=skills_offered_df, aes(y=count, x=skill)) + geom_bar(stat="identity") + labs(title="Skills Offered", x=" ") + coord_flip()
p2<-ggplot(data=skills_needed_df, aes(y=count, x=skill)) + geom_bar(stat="identity") + labs(title="Skills Needed", x=" ") + coord_flip()
grid.arrange(p1, p2, nrow = 2)Display them side by side for each skill set for the primary skill sets.
so_df<-subset(skills_offered_df, skill=="matlab" | skill=="r" | skill=="python" | skill=="sql" )
sn_df<-subset(skills_needed_df, skill=="matlab" | skill=="r" | skill=="python" | skill=="sql" )
# create the 3 columns and bind them into 1 df
column_what <- c(rep("Offered", nrow(as.data.frame(so_df))), rep("Needed", nrow(as.data.frame(sn_df))))
column_freq <- c(as.vector(so_df$count), as.vector(sn_df$count))
column_skill <- c(as.vector(so_df$skill), as.vector(sn_df$skill))
outcome_data <- cbind(column_what, column_skill, column_freq)
outcome_data_df<-data.frame(What = factor(column_what, levels = c("Offered", "Needed")),
Freq = column_freq, Skill=column_skill)
ggplot(data = outcome_data_df, aes(x = Skill, y = Freq, fill = What)) +
geom_bar(stat = "identity", position = position_dodge(), alpha = 0.75) +
# ylim(0,50) +
geom_text(aes(label = Freq), fontface = "bold", vjust = 1.5,
position = position_dodge(.9), size = 4) + # controls the number at top of bar
labs(x = "\n Skill", y = "Frequency\n", title = "\n Skills Offered vs Needed \n") +
theme(plot.title = element_text(hjust = 0.5, face="bold", colour="blue", size = 16), # center the title
axis.title.x = element_text(face="bold", colour="red", size = 12),
axis.title.y = element_text(face="bold", colour="red", size = 12),
legend.title = element_text(face="bold", colour="blue", size = 10))Display them side by side for each skill set for the key skill set holders.
so_df<-subset(key_skills_offered_df, skill=="matlab" | skill=="r" | skill=="python" | skill=="sql" )
sn_df<-subset(skills_needed_df, skill=="matlab" | skill=="r" | skill=="python" | skill=="sql" )
# create the 3 columns and bind them into 1 df
column_what <- c(rep("Offered", nrow(as.data.frame(so_df))), rep("Needed", nrow(as.data.frame(sn_df))))
column_freq <- c(as.vector(so_df$count), as.vector(sn_df$count))
column_skill <- c(as.vector(so_df$skill), as.vector(sn_df$skill))
outcome_data <- cbind(column_what, column_skill, column_freq)
outcome_data_df<-data.frame(What = factor(column_what, levels = c("Offered", "Needed")),
Freq = column_freq, Skill=column_skill)
ggplot(data = outcome_data_df, aes(x = Skill, y = Freq, fill = What)) +
geom_bar(stat = "identity", position = position_dodge(), alpha = 0.75) +
# ylim(0,50) +
geom_text(aes(label = Freq), fontface = "bold", vjust = 1.5,
position = position_dodge(.9), size = 4) + # controls the number at top of bar
labs(x = "\n Skill", y = "Frequency\n", title = "\n Skills Offered vs Needed \n") +
theme(plot.title = element_text(hjust = 0.5, face="bold", colour="blue", size = 16), # center the title
axis.title.x = element_text(face="bold", colour="red", size = 12),
axis.title.y = element_text(face="bold", colour="red", size = 12),
legend.title = element_text(face="bold", colour="blue", size = 10))