Data Overview




The Data comes in 3 extracts :



Data Science Datasets
Type Sample Observations
Job Seekers All 23859
Job Seekers Subset 2325
Job Openings All 7556














Glimpse the 3 Datasets





qry<-"select gender, location, education, title, skill from  staging_db.JobSeekersStage limit 4;"
rs = dbSendQuery(conn, qry)
qry_df = fetch(rs, n=-1)      # select all rows
knitr::kable(qry_df, caption='All Seekers : Primary Skill', table.attr = "style='width:100%;'") %>%
  kable_styling(font_size = 12)
All Seekers : Primary Skill
gender location education title skill
Female United States of America Doctoral degree Consultant other
Male Indonesia Bachelor’s degree Other other
Female United States of America Master’s degree Data Scientist java
Male United States of America Master’s degree Not employed python
qry<-"select gender, location, education, title, skill from  staging_db.JobSeekersStage limit 4;"
rs = dbSendQuery(conn, qry)
qry_df = fetch(rs, n=-1)      # select all rows
knitr::kable(qry_df, caption='Subset Seekers : All Key Skills', table.attr = "style='width:100%;'")  %>%
  kable_styling(font_size = 12)
Subset Seekers : All Key Skills
gender location education title skill
Female United States of America Doctoral degree Consultant other
Male Indonesia Bachelor’s degree Other other
Female United States of America Master’s degree Data Scientist java
Male United States of America Master’s degree Not employed python
qry<-"select job_type, location, skill, min_salary, max_salary  from staging_db.JobOpeningsStage  limit 4;"
rs = dbSendQuery(conn, qry)
qry_df = fetch(rs, n=-1)      # select all rows
knitr::kable(qry_df, caption='All Job Openings : All Skills', table.attr = "style='width:100%;'")  %>%
  kable_styling(font_size = 12)
All Job Openings : All Skills
job_type location skill min_salary max_salary
Analytics Bengaluru sql 10 15
Analytics Bengaluru python 10 15
Analytics Bengaluru analytics 10 15
Analytics Bengaluru machine learning 10 15








Demographic distributions








Gender of Job Seekers.



qry<-"select gender, count(*) as count from staging_db.JobSeekersStage group by gender order by count desc;"

rs = dbSendQuery(conn, qry)
g_df = fetch(rs, n=-1)      # select all rows


knitr::kable(g_df, caption='Titles', table.attr = "style='width:80%;'")
Titles
gender count
Male 19430
Female 4010
Prefer not to say 340
Prefer to self-describe 79







Do the Job Seekers consider themselves a Data Scientist ?.



qry<-"select dataScientist, count(*) from staging_db.JobSeekersStage group by dataScientist;"

rs = dbSendQuery(conn, qry)
ds_df = fetch(rs, n=-1)      # select all rows


knitr::kable(ds_df, caption='Titles', table.attr = "style='width:80%;'")
Titles
dataScientist count(*)
Maybe 4184
Definitely not 1557
Definitely yes 4684
Probably yes 4893
Probably not 3162
5379







Majors of job seekers by percent.



qry<-"select r.short_description as major,  count(*) as count from JobSeeker s, MajorReference r where s.major_id=r.id group by major_id order by count desc;"
rs = dbSendQuery(conn, qry)
majors_df = fetch(rs, n=-1)      # select all rows

total<-sum(majors_df$count)

majors_df<-majors_df %>%
  mutate(major = substr(major,0,16), pct=count/total )


majors_df<-majors_df %>%
  select(major, pct)

ggplot(data=majors_df, aes(y=pct, x=major)) +   geom_bar(stat="identity") + labs(title="Majors of Job Seekers", x=" ") + coord_flip()







Major Distributions Comparisons : Data Scientists or Not ?".



qry<-"select r.short_description as major,  count(*) as count from JobSeeker s, MajorReference r where s.major_id=r.id and data_scientist='Definitely yes'  group by major_id order by count desc;"
rs = dbSendQuery(conn, qry)
majors_df = fetch(rs, n=-1)      # select all rows

total<-sum(majors_df$count)

majors_df<-majors_df %>%
  mutate(major = substr(major,0,16), pct=count/total )


majors_df<-majors_df %>%
  select(major, pct)

p1<-ggplot(data=majors_df, aes(y=pct, x=major)) +   geom_bar(stat="identity") + labs(title="Definitely Data Scientsts", x=" ") + coord_flip()








qry<-"select r.short_description as major,  count(*) as count from JobSeeker s, MajorReference r where s.major_id=r.id and data_scientist='Definitely not'  group by major_id order by count desc;"
rs = dbSendQuery(conn, qry)
majors_df = fetch(rs, n=-1)      # select all rows

total<-sum(majors_df$count)

majors_df<-majors_df %>%
  mutate(major = substr(major,0,16), pct=count/total )


majors_df<-majors_df %>%
  select(major, pct)

p2<-ggplot(data=majors_df, aes(y=pct, x=major)) +   geom_bar(stat="identity") + labs(title="Definitely Not Data Scientsts", x=" ") + coord_flip()


grid.arrange(p1, p2, nrow = 2)







Titles of Job Seekers.



qry<-"select r.short_description as title,  count(*) as count from JobSeeker s, TitleReference r where s.title_id=r.id group by title_id order by count desc;"

rs = dbSendQuery(conn, qry)
t_df = fetch(rs, n=-1)      # select all rows


knitr::kable(t_df, caption='Titles', table.attr = "style='width:80%;'")
Titles
title count
Student 5253
Data Scientist 4137
Software Engineer 3130
Data Analyst 1922
Other 1322
Research Scientist 1189
959
Not employed 842
Consultant 785
Business Analyst 772
Data Engineer 737
Research Assistant 600
Manager 590
Product/Project Manager 428
Chief Officer 360
Statistician 237
DBA/Database Engineer 145
Developer Advocate 117
Marketing Analyst 115
Salesperson 102
Principal Investigator 97
Data Journalist 20








Analysis of Skill Sets








All skill sets required by the companies.



qry<-"select r.short_description as skill,  count(*) as count from JobRequirements s, SkillReference r where s.skill_id=r.id group by skill_id;"
rs = dbSendQuery(conn, qry)
skills_needed_df = fetch(rs, n=-1)      # select all rows



ggplot(data=skills_needed_df , aes(y=count, x=skill)) +   geom_bar(stat="identity") + labs(title="Job Requirements", x=" ") + coord_flip()

# knitr::kable(skills_needed_df, caption='Skills Needed', table.attr = "style='width:80%;'")







All skill sets offered by all job seekers.



qry<-"select r.short_description as skill,  count(*) as count from JobSeekerPrimarySkills s, SkillReference r where s.skill_id=r.id group by skill_id;"
rs = dbSendQuery(conn, qry)
skills_offered_df = fetch(rs, n=-1)      # select all rows


# knitr::kable(skills_offered_df, caption='All Job Openings : All Skills', table.attr = "style='width:90%;'")

ggplot(data=skills_offered_df , aes(y=count, x=skill)) +   geom_bar(stat="identity") + labs(title="Primary Skills of Job Seekers", x=" ") + coord_flip()







Key skill sets offered by a subset of job seekers.



qry<-"select r.short_description as skill,  count(*) as count from JobSeekerKeySkills s, SkillReference r where s.skill_id=r.id group by skill_id;"
rs = dbSendQuery(conn, qry)
key_skills_offered_df = fetch(rs, n=-1)      # select all rows


# knitr::kable(skills_offered_df, caption='All Job Openings : All Skills', table.attr = "style='width:80%;'")

ggplot(data=key_skills_offered_df , aes(y=count, x=skill)) +   geom_bar(stat="identity") + labs(title="Key Skills of Job Seekers", x=" ") + coord_flip()







Create seperate plots and display side by side.

p1<-ggplot(data=skills_offered_df, aes(y=count, x=skill)) +   geom_bar(stat="identity") + labs(title="Skills Offered", x=" ")  + coord_flip()
p2<-ggplot(data=skills_needed_df, aes(y=count, x=skill)) +   geom_bar(stat="identity") + labs(title="Skills Needed", x=" ")  + coord_flip()
grid.arrange(p1, p2, nrow = 2)







Display them side by side for each skill set for the primary skill sets.



so_df<-subset(skills_offered_df, skill=="matlab" | skill=="r" | skill=="python" | skill=="sql" )
                            
sn_df<-subset(skills_needed_df, skill=="matlab" | skill=="r" | skill=="python" | skill=="sql" )


# create the 3 columns and bind them into 1 df
column_what <- c(rep("Offered", nrow(as.data.frame(so_df))), rep("Needed", nrow(as.data.frame(sn_df))))
column_freq <- c(as.vector(so_df$count), as.vector(sn_df$count))
column_skill <- c(as.vector(so_df$skill), as.vector(sn_df$skill))
outcome_data <- cbind(column_what, column_skill, column_freq)

outcome_data_df<-data.frame(What = factor(column_what, levels = c("Offered", "Needed")),
                            Freq = column_freq, Skill=column_skill)


ggplot(data = outcome_data_df, aes(x = Skill, y = Freq, fill = What)) +
  geom_bar(stat = "identity", position = position_dodge(), alpha = 0.75)  +
 #  ylim(0,50) +
  geom_text(aes(label = Freq), fontface = "bold", vjust = 1.5,
            position = position_dodge(.9), size = 4) +         # controls the number at top of bar
  labs(x = "\n Skill", y = "Frequency\n", title = "\n Skills Offered vs Needed \n") +
  theme(plot.title = element_text(hjust = 0.5, face="bold", colour="blue", size = 16),      # center the title
        axis.title.x = element_text(face="bold", colour="red", size = 12),
        axis.title.y = element_text(face="bold", colour="red", size = 12),
        legend.title = element_text(face="bold", colour="blue", size = 10))







Display them side by side for each skill set for the key skill set holders.



so_df<-subset(key_skills_offered_df, skill=="matlab" | skill=="r" | skill=="python" | skill=="sql" )
                            
sn_df<-subset(skills_needed_df, skill=="matlab" | skill=="r" | skill=="python" | skill=="sql" )


# create the 3 columns and bind them into 1 df
column_what <- c(rep("Offered", nrow(as.data.frame(so_df))), rep("Needed", nrow(as.data.frame(sn_df))))
column_freq <- c(as.vector(so_df$count), as.vector(sn_df$count))
column_skill <- c(as.vector(so_df$skill), as.vector(sn_df$skill))
outcome_data <- cbind(column_what, column_skill, column_freq)

outcome_data_df<-data.frame(What = factor(column_what, levels = c("Offered", "Needed")),
                            Freq = column_freq, Skill=column_skill)


ggplot(data = outcome_data_df, aes(x = Skill, y = Freq, fill = What)) +
  geom_bar(stat = "identity", position = position_dodge(), alpha = 0.75)  +
 #  ylim(0,50) +
  geom_text(aes(label = Freq), fontface = "bold", vjust = 1.5,
            position = position_dodge(.9), size = 4) +         # controls the number at top of bar
  labs(x = "\n Skill", y = "Frequency\n", title = "\n Skills Offered vs Needed \n") +
  theme(plot.title = element_text(hjust = 0.5, face="bold", colour="blue", size = 16),      # center the title
        axis.title.x = element_text(face="bold", colour="red", size = 12),
        axis.title.y = element_text(face="bold", colour="red", size = 12),
        legend.title = element_text(face="bold", colour="blue", size = 10))