The dataset used to answer this question was sourced from Kaggle:
https://www.kaggle.com/elroyggj/indeed-dataset-data-scientistanalystengineer.
It contains information from 5,715 data-science related job postings on the job-listings site, Indeed. It includes information like the job title, description, technical skills required, state, salary, and more. The dataset was imported, tidied, then analyzed.
We looked at the top skills required for each job-type, the locations of jobs, and the skills that different pay-grades tend to require.
# Loading packages
knitr::opts_chunk$set(echo = TRUE)
library(stringr)
library(knitr)
library(tidyr)
library(Rmisc)## Warning: package 'Rmisc' was built under R version 3.6.3
## Loading required package: lattice
## Loading required package: plyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Warning: package 'wordcloud' was built under R version 3.6.3
## Loading required package: RColorBrewer
##
## Attaching package: 'maps'
## The following object is masked from 'package:plyr':
##
## ozone
## Warning: package 'kableExtra' was built under R version 3.6.3
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
stateFromLower <-function(x) {
#read 52 state codes into local variable [includes DC (Washington D.C. and PR (Puerto Rico)]
st.codes<-data.frame(
state=as.factor(c("AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", "GA",
"HI", "IA", "ID", "IL", "IN", "KS", "KY", "LA", "MA", "MD", "ME",
"MI", "MN", "MO", "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM",
"NV", "NY", "OH", "OK", "OR", "PA", "PR", "RI", "SC", "SD", "TN",
"TX", "UT", "VA", "VT", "WA", "WI", "WV", "WY")),
full=as.factor(c("alaska","alabama","arkansas","arizona","california","colorado",
"connecticut","district of columbia","delaware","florida","georgia",
"hawaii","iowa","idaho","illinois","indiana","kansas","kentucky",
"louisiana","massachusetts","maryland","maine","michigan","minnesota",
"missouri","mississippi","montana","north carolina","north dakota",
"nebraska","new hampshire","new jersey","new mexico","nevada",
"new york","ohio","oklahoma","oregon","pennsylvania","puerto rico",
"rhode island","south carolina","south dakota","tennessee","texas",
"utah","virginia","vermont","washington","wisconsin",
"west virginia","wyoming"))
)
#create an nx1 data.frame of state codes from source column
st.x<-data.frame(state=x)
#match source codes with codes from 'st.codes' local variable and use to return the full state name
refac.x<-st.codes$full[match(st.x$state,st.codes$state)]
#return the full state names in the same order in which they appeared in the original source
return(refac.x)
}Soft skills were not explicitly included in the dataset as a field. Luckily, they are mentioned in the job descriptions.
We begin by programatically reading through the job-descriptions to extract mentions of the ‘soft-skills’.
One of the tasks before this was to create a list of soft skills that we’d be looking for.
#The soft skills we will look will be stored as: 'keywords'
keywords<-unname(unlist(read.csv("https://raw.githubusercontent.com/davidblumenstiel/DATA-607-Project-3/master/softskills.csv", stringsAsFactors = FALSE)))
#Iniates new columns for each keyword; also makes list index list
df[keywords] <- NA
keywordColIndex <- seq(length(df)-length(keywords)+1,length(df),1)
#Will determine if contains keywords; occurs (1), or doesnt occur(0)
#Coerces multiple occurances to 1
i = 0
for (desc in df$Description) {
i = i + 1
df[i,keywordColIndex] <-(as.integer(str_detect(desc,fixed(keywords, ignore_case = TRUE))))
}
#Gets rid of the descriptions to save space
df$Description <- NULL
#Gathers soft skills into a tidy dataset: df2
df2 <- df[,c(1, 44:ncol(df))] %>%
gather(skill, flag, -X) %>%
filter(flag == 1) %>%
select(id = X, skill) %>%
mutate(skill_type = 'soft')soft_sum_3cols <- soft_summary %>% select(Job_Type,Queried_Salary,sum)
avg_ss_by_jobtype$Job_Type <- factor(avg_ss_by_jobtype$Job_Type,levels = c("data_analyst", "data_scientist", "data_engineer"))
#plot showing number soft skills on average per job type
ggplot(avg_ss_by_jobtype) +
aes(x = Job_Type, weight = x) +
geom_bar(fill = "#2171b5") +
labs(x = "Job Type", y = "avg # of soft skills in job description", title = "Soft Skills Mentioned per Job Type") +
theme_minimal()#we agregate the dataframe into averages for every combination of salary and job type
skillby_type_sal <-aggregate(x=soft_sum_3cols$sum, by = soft_sum_3cols[c("Job_Type", "Queried_Salary")],
FUN=mean, na.rm=TRUE)
# make variable factors and order them
skillby_type_sal$Queried_Salary <- factor(skillby_type_sal$Queried_Salary,levels = c("<80000", "80000-99999", "100000-119999", "120000-139999", "140000-159999", ">160000"))
skillby_type_sal$Job_Type <- factor(skillby_type_sal$Job_Type,levels = c("data_analyst", "data_scientist", "data_engineer"))
# heatmap of average skills mentioned by job type and salary
ggplot(skillby_type_sal) +
aes(x = Queried_Salary, y = Job_Type, fill = x) +
geom_tile(size = 1L) +
scale_fill_distiller(palette = "PuBu") +
labs(x = "Salary Band", y = "Job Type", title = "Soft Skills Mentioned across Salary and Job Type") +
theme_minimal()#not used for now bar chart for average # soft skills mentioned by salary
# avg_ss_by_salary$Queried_Salary <- factor(avg_ss_by_salary$Queried_Salary,levels = c("<80000", "80000-99999", "100000-119999", "120000-139999", "140000-159999", ">160000"))
# ggplot(avg_ss_by_salary) +
# aes(x = Queried_Salary, weight = x) +
# geom_bar(fill = "#1f9e89") +
# labs(x = "Salary Band", y = "avg # of soft skills in job description", title = "Soft Skill Mentions per Salary Band") +
# theme_minimal()eng_highs <- filter(df, Job_Type == "data_engineer" & Queried_Salary == ">160000")
number_eng_highs <- nrow(eng_highs)
eng_highs_long <- eng_highs[,c(44:ncol(eng_highs))] %>%
gather(skill, flag) %>%
filter(flag == 1)
eng_highs_count <-aggregate(x=eng_highs_long$flag, by = eng_highs_long[c("skill")],
FUN=sum, na.rm=TRUE)
eng_highs_count$x <- round(eng_highs_count$x/number_eng_highs,4)
eng_highs_count <- eng_highs_count[order(-eng_highs_count$x),]
rownames(eng_highs_count) <- NULLanl_lows <- filter(df, Job_Type == "data_analyst" & Queried_Salary == "<80000")
number_anl_lows <- nrow(anl_lows)
anl_lows_long <- anl_lows[,c(44:ncol(eng_highs))] %>%
gather(skill, flag) %>%
filter(flag == 1)
anl_lows_count <-aggregate(x=anl_lows_long$flag, by = anl_lows_long[c("skill")],
FUN=sum, na.rm=TRUE)
anl_lows_count$x <- round(anl_lows_count$x/number_anl_lows,4)
anl_lows_count <- anl_lows_count[order(-anl_lows_count$x),]
rownames(anl_lows_count) <- NULLtable_eng_highs <- eng_highs_count[1:22,]
table_anl_lows <- anl_lows_count[1:22,]
names(table_eng_highs)[2] <- "occurance"
names(table_anl_lows)[2] <- "occurance"
table_eng_highs %>%
kable("html",col.names = NA, align = 'clc', caption = 'Top Soft skills (engineers high salary)') %>%
kable_styling(full_width = F, position = "float_left")| skill | occurance |
|---|---|
| Team | 0.7885 |
| Management | 0.3846 |
| Problem | 0.3654 |
| communication | 0.3077 |
| Result | 0.2596 |
| Hands-on | 0.2404 |
| Insight | 0.2212 |
| detail | 0.2019 |
| Organization | 0.1923 |
| Collaborate | 0.1827 |
| Competitive | 0.1731 |
| Driven | 0.1731 |
| Responsible | 0.1635 |
| Collaboration | 0.1442 |
| Passionate | 0.1346 |
| Problem solving | 0.1250 |
| Innovation | 0.1154 |
| Influence | 0.1058 |
| Leadership | 0.1058 |
| Managing | 0.0962 |
| Team player | 0.0962 |
| Analyzing | 0.0865 |
table_anl_lows %>%
kable("html", align = 'clc', caption = 'Top Soft skills (analyst low salary)') %>%
kable_styling(full_width = F, position = "right")| skill | occurance |
|---|---|
| Team | 0.6227 |
| Management | 0.5386 |
| communication | 0.4656 |
| Problem | 0.4109 |
| Organization | 0.3829 |
| detail | 0.3240 |
| Result | 0.3128 |
| Research | 0.2917 |
| Train | 0.2693 |
| Presentation | 0.2567 |
| Training | 0.2216 |
| Responsible | 0.2188 |
| Independent | 0.2146 |
| Insight | 0.2020 |
| Analyzing | 0.1893 |
| Attention to detail | 0.1795 |
| Collaborate | 0.1697 |
| Writing | 0.1697 |
| Leadership | 0.1683 |
| Interpersonal | 0.1529 |
| Strategic | 0.1515 |
| Integrity | 0.1487 |
## skill occurance
## 1 Team 0.7885
## 2 Management 0.3846
## 3 Problem 0.3654
## 4 communication 0.3077
## 5 Result 0.2596
## 6 Hands-on 0.2404
## 7 Insight 0.2212
## 8 detail 0.2019
## 9 Organization 0.1923
## 10 Collaborate 0.1827
## 11 Competitive 0.1731
## 12 Driven 0.1731
## 13 Responsible 0.1635
## 14 Collaboration 0.1442
## 15 Passionate 0.1346
## 16 Problem solving 0.1250
## 17 Innovation 0.1154
## 18 Influence 0.1058
## 19 Leadership 0.1058
## 20 Managing 0.0962
## 21 Team player 0.0962
## 22 Analyzing 0.0865
Here the hard skills are extracted out from the inbuilt list of skills in the raw dataset.
# List all hard skills and how often they were listed in a separate dataframe
for (i in 1:nrow(df_raw)) {
slist <- str_extract_all(df_raw$Skill[i], "'.+?'")
temp_df <- data.frame(skill = slist[[1]])
if (nrow(temp_df) > 0) {
temp_df$id <- i - 1
temp_df$skill <- as.character(temp_df$skill)
temp_df <- select(temp_df, id, skill)
if (i == 1) {
sdf <- temp_df
} else {
sdf <- bind_rows(sdf, temp_df)
}
}
}
# Remove quotes and trim whitespace
sdf$skill <- str_replace_all(sdf$skill, "'", "")
sdf$skill <- str_replace_all(sdf$skill, '"', "")
sdf$skill <- str_trim(sdf$skill)
sdf$skill_type <- 'hard'
# Combine skills
skills <- bind_rows(df2, sdf)Here we look at the different types of jobs and industries and compare the skills mentioned.
We’re looking to identify roles that specifiy if roles are analyst, engineer, or scientists, as well as roles that are junior or senior level.
df <- df_raw
job.indeed.df <- df %>%
select(c(Id=X, (Job_Title)))
# Pattern Building:
pattern.analyst <- c('analyst','statistician','analysis','analytics')
pattern.engineer <- c('engineer', 'engg', 'technician','technologist','designer','architect')
pattern.scientist <- c('scientist','doctor','dr.')
pattern.junior <- c('junior','jr', 'entry','internship','jr.')
pattern.senior <- c('senior', 'sr','experienced','sr.')
# Intermedaite Data Frame for Titlles:
final.data.df <- data.frame(Id=integer(nrow(job.indeed.df)), Job_Title=character(nrow(job.indeed.df))
, analyst=integer(nrow(job.indeed.df)) ,engineer=integer(nrow(job.indeed.df)),scientist=integer(nrow(job.indeed.df))
, junior=integer(nrow(job.indeed.df)),senior=integer(nrow(job.indeed.df)))
final.data.df$Id <- job.indeed.df$Id
final.data.df$Job_Title <- as.character( as.character( job.indeed.df$Job_Title) )
# Working on the counts:
for (i in 1: nrow(job.indeed.df)) {
final.data.df$analyst[i] <- if(grepl(paste(pattern.analyst,collapse="|"), job.indeed.df$Job_Title[i], ignore.case = TRUE) ) 1 else 0
final.data.df$engineer[i] <- if(grepl(paste(pattern.engineer,collapse="|"), job.indeed.df$Job_Title[i], ignore.case = TRUE) ) 1 else 0
final.data.df$scientist[i] <- if(grepl(paste(pattern.scientist,collapse="|"), job.indeed.df$Job_Title[i], ignore.case = TRUE) ) 1 else 0
final.data.df$junior[i] <- if(grepl(paste(pattern.junior,collapse="|"), job.indeed.df$Job_Title[i], ignore.case = TRUE) ) 1 else 0
final.data.df$senior[i] <- if(grepl(paste(pattern.senior,collapse="|"), job.indeed.df$Job_Title[i], ignore.case = TRUE) ) 1 else 0
}This is the raw dataset with the job title flags joined.
# Add job title flags to original dataframe
# Merging with main Analysis Data Frame:
df_all <- left_join(df_raw[,1:16], final.data.df[,-2], by = c('X' = 'Id'))
names(df_all)[1] <- 'job_id'hs <- filter(skills, skill_type == 'hard') %>%
left_join(final.data.df, by = c('id' = 'Id')) %>%
mutate(analyst = ifelse(analyst == 1, id, NA),
engineer = ifelse(engineer == 1, id, NA),
scientist = ifelse(scientist == 1, id, NA),
junior = ifelse(junior == 1, id, NA),
senior = ifelse(senior == 1, id, NA)) %>%
dplyr::group_by(skill) %>%
dplyr::summarize(total_count = n(),
ana_ids = n_distinct(analyst),
ana_pct = ana_ids / sum(df_all$analyst),
eng_ids = n_distinct(engineer),
eng_pct = eng_ids / sum(df_all$engineer),
sci_ids = n_distinct(scientist),
sci_pct = sci_ids / sum(df_all$scientist),
jr_ids = n_distinct(junior),
jr_pct = jr_ids / sum(df_all$junior),
sr_ids = n_distinct(senior),
sr_pct = sr_ids / sum(df_all$senior))
# specify top number of skills
top_x <- 10Lets take a look at the relative demand for data analysts, engineers, and scientists.
#row count of dataset
post_count <- nrow(df_raw)
dplyr::summarize(final.data.df,
Analyst = sum(analyst) / n(),
Engineer = sum(engineer) / n(),
Scientist = sum(scientist) / n()) %>%
gather(position, pct_of_posts) %>%
ggplot(aes(x = position, y = pct_of_posts)) +
geom_col(fill = 'grey80') +
geom_hline(yintercept = .333, linetype = 'dotted') +
scale_y_continuous(label = percent_format(accuracy = 1)) +
labs(x = element_blank(),
y = element_blank(),
title = 'Job Title Distribution',
subtitle = str_c('Data Roles from ', comma(post_count), ' Posts')) +
theme(legend.position = 'none') +
theme_bw()What are the most in demand skills for data role in general?
ggplot( arrange(hs, desc(total_count))[1:top_x,], aes(x = reorder(skill, total_count), y = total_count/nrow(df_all))) +
geom_col(fill = 'grey60') +
coord_flip() +
labs(title = str_c('Top ', top_x, ' Technical Skills'),
subtitle = str_c('Data Analysts/Engineers/Scientists'),
y = "% of Job Posts",
x = element_blank()) +
scale_y_continuous(label = percent_format(accuracy = 1))ggplot( arrange(hs, desc(sci_ids))[1:top_x,], aes(x = reorder(skill, sci_pct), y = sci_pct)) +
geom_col(fill = 'grey60') +
coord_flip() +
labs(title = str_c('Top ', top_x, ' Technical Skills'),
subtitle = str_c('Data Scientists'),
y = "% of Job Posts",
x = element_blank()) +
scale_y_continuous(label = percent_format(accuracy = 1))ggplot( arrange(hs, desc(ana_ids))[1:top_x,], aes(x = reorder(skill, ana_pct), y = ana_pct)) +
geom_col(fill = 'grey60') +
coord_flip() +
labs(title = str_c('Top ', top_x, ' Technical Skills'),
subtitle = str_c('Data Analysts'),
y = "% of Job Posts",
x = element_blank()) +
scale_y_continuous(label = percent_format(accuracy = 1))ggplot( arrange(hs, desc(eng_ids))[1:top_x,], aes(x = reorder(skill, eng_pct), y = eng_pct)) +
geom_col(fill = 'grey60') +
coord_flip() +
labs(title = str_c('Top ', top_x, ' Technical Skills'),
subtitle = str_c('Data Engineers'),
y = "% of Job Posts",
x = element_blank()) +
scale_y_continuous(label = percent_format(accuracy = 1))What differences do we observe between junior and senior roles?
# new dataframe for only junior and seniors
jrs <- filter(final.data.df, junior == 1) %>%
select(Id, Analyst = analyst, Engineer = engineer, Scientist = scientist) %>%
gather(position, count, -Id)
jrs$level <- 'Junior'
srs <- filter(final.data.df, senior == 1) %>%
select(Id, Analyst = analyst, Engineer = engineer, Scientist = scientist) %>%
gather(position, count, -Id)
srs$level <- 'Senior'
pos_and_role <- bind_rows(jrs, srs) %>%
group_by(level, position) %>%
dplyr::summarize(count = sum(count))
ggplot(pos_and_role, aes(x = position, y = count, fill = level)) +
geom_col(position = 'fill') +
theme_bw() +
scale_y_continuous(label = percent_format(accuracy = 1)) +
scale_fill_manual(values = c('lightskyblue3', 'royalblue4')) +
labs(x = element_blank(),
y = element_blank(),
title = 'Distribution of Specified Levels')hs_jr_base <- filter(skills, skill_type == 'hard') %>%
inner_join(filter(final.data.df, junior == 1 & scientist == 1), by = c('id' = 'Id'))
hs_jr <- group_by(hs_jr_base, skill) %>%
dplyr::summarize(total_count = n())
hs_jr_denom <- n_distinct(hs_jr_base$id)
ggplot( arrange(hs_jr, desc(total_count))[1:top_x,], aes(x = reorder(skill, total_count), y = total_count/hs_jr_denom)) +
geom_col(fill = 'grey60') +
coord_flip() +
labs(title = str_c('Top ', top_x, ' Technical Skills'),
subtitle = str_c('Junior Data Scientists'),
y = "% of Job Posts",
x = element_blank()) +
scale_y_continuous(label = percent_format(accuracy = 1))hs_sr_base <- filter(skills, skill_type == 'hard') %>%
inner_join(filter(final.data.df, senior == 1 & scientist == 1), by = c('id' = 'Id'))
hs_sr <- group_by(hs_sr_base, skill) %>%
dplyr::summarize(total_count = n())
hs_sr_denom <- n_distinct(hs_sr_base$id)ggplot( arrange(hs_sr, desc(total_count))[1:top_x,], aes(x = reorder(skill, total_count), y = total_count/hs_sr_denom)) +
geom_col(fill = 'grey60') +
coord_flip() +
labs(title = str_c('Top ', top_x, ' Technical Skills'),
subtitle = str_c('Senior Data Scientists'),
y = "% of Job Posts",
x = element_blank()) +
scale_y_continuous(label = percent_format(accuracy = 1))sci_industry <- filter(df_all, scientist == 1 & Company_Industry != '') %>%
dplyr::group_by(Company_Industry) %>%
dplyr::summarize(postings = n()) %>%
arrange(desc(postings))Lets take a look at which companies are posting in search of data scientists.
ggplot(sci_industry[1:10,], aes(x = reorder(Company_Industry, postings), y = postings / sum(sci_industry$postings))) +
geom_col(fill = 'grey60') +
coord_flip() +
labs(title = 'Top 10 Company Industries',
subtitle = str_c('Data Scientist Posts'),
y = "% of Job Posts",
x = element_blank()) +
scale_y_continuous(label = percent_format(accuracy = 1))# WordCLoud map for the soft and hard skillset that high rating/star companies are
# looking for in engineer vs analyst/scientist Position:
df.stars.5 <- df_all %>%
filter(No_of_Stars ==5 ) %>%
select(job_id, Queried_Salary, Company_Industry, analyst, engineer, scientist , junior, senior ) %>%
inner_join(skills, by = c( 'job_id' = 'id'))
df.stars.5.engg <- df.stars.5 %>%
filter ( engineer ==1) %>%
group_by(skill) %>%
dplyr::summarize(total_count = n())
df.stars.5.ana.sci <- df.stars.5 %>%
filter ( analyst + scientist > 0 ) %>%
group_by(skill) %>%
dplyr::summarize(total_count = n())
set.seed(1234)
wordcloud(words = df.stars.5.engg$skill, freq = df.stars.5.engg$total_count, min.freq = 1,
max.words=100, random.order=FALSE, rot.per=0.10,
colors=brewer.pal(8, "Dark2"))## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Shell Scripting could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Business Intelligence could not be fit on page. It will
## not be plotted.
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Google Cloud Platform could not be fit on page. It will
## not be plotted.
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Natural Language Processing could not be fit on page. It
## will not be plotted.
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Oral communication skills could not be fit on page. It
## will not be plotted.
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Responsibility could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Work-Life Balance could not be fit on page. It will not be
## plotted.
wordcloud(words = df.stars.5.ana.sci$skill, freq = df.stars.5.ana.sci$total_count, min.freq = 1,
max.words=100, random.order=FALSE, rot.per=0.10,
colors=brewer.pal(8, "Dark2"))## Warning in wordcloud(words = df.stars.5.ana.sci$skill, freq = df.stars.
## 5.ana.sci$total_count, : Data-driven decision-making could not be fit on page.
## It will not be plotted.
## Warning in wordcloud(words = df.stars.5.ana.sci$skill, freq = df.stars.
## 5.ana.sci$total_count, : Survey Design could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = df.stars.5.ana.sci$skill, freq = df.stars.
## 5.ana.sci$total_count, : Written communication skills could not be fit on page.
## It will not be plotted.
# Salary Range by titles given by high rating/star companies:
df.stars.5.Salary <- df.stars.5 %>%
dplyr::group_by(Queried_Salary) %>%
dplyr::summarize(total_count = n(),
ana_total = sum(analyst),
eng_totals = sum(engineer),
sci_total = sum(scientist),
jr_total = sum(junior),
sr_total = sum(senior))
kable(df.stars.5.Salary)| Queried_Salary | total_count | ana_total | eng_totals | sci_total | jr_total | sr_total |
|---|---|---|---|---|---|---|
| <80000 | 37 | 37 | 0 | 0 | 0 | 0 |
| >160000 | 47 | 17 | 0 | 30 | 0 | 0 |
| 100000-119999 | 111 | 33 | 54 | 24 | 0 | 0 |
| 120000-139999 | 136 | 55 | 34 | 77 | 0 | 9 |
| 140000-159999 | 59 | 17 | 28 | 14 | 0 | 8 |
| 80000-99999 | 61 | 25 | 36 | 0 | 0 | 13 |
#Statewise Job posting for Data Science Jobs:
df.states <- df_all %>%
select(job_id, Location ) %>%
group_by(Location) %>%
dplyr::summarize(total_count = n())
all_states <- map_data("state")
df.states$region <- stateFromLower(df.states$Location)
Total <- merge(all_states, df.states, by="region")
Total <- Total[Total$region!="REMOTE",]
p <- ggplot()
p <- p + geom_polygon(data=Total, aes(x=long, y=lat, group = group, fill=Total$total_count),colour="white"
) + scale_fill_continuous(low = "#56B4E9", high = "#0072B2", guide="colorbar")
pFirst, we need to select and arrange the data we want to use
#Creates a hard-skill and salary, and job type dataset
Skill_Pay <- sdf[1:2]
Skill_Pay[c("Salary","Job_Type")] <- NA
#Populates the salary and job type columns from the raw dataset
for (i in Skill_Pay$id) {
Skill_Pay$Salary[Skill_Pay$id == i] <- df_raw$Queried_Salary[df_raw$X == i]
Skill_Pay$Job_Type[Skill_Pay$id == i] <- df_raw$Job_Type[df_raw$X == i]
}
#Going to focus on only data_science-type jobs
Skill_PayDS <- filter(Skill_Pay,Job_Type == 'data_scientist')
#Changes the 'skill' column to a factor type
Skill_PayDS$skill <- as.factor(Skill_PayDS$skill)
#Filters it to skills mentioned over 200 times (to keep sample size high for analysis)
mentions <- 200
stab = table(Skill_PayDS$skill)
Skill_PayDS <- Skill_PayDS[Skill_PayDS$skill %in% names(stab)[stab >= mentions],]
#Drops the now un-used factors from skills
Skill_PayDS <- droplevels(Skill_PayDS)
#Factors and re-orders Salary
Pay_Grades <- c("<80000", "80000-99999", "100000-119999", "120000-139999", "140000-159999",">160000")
Skill_PayDS$Salary <- factor(Skill_PayDS$Salary, levels = Pay_Grades)
#Makes a table of Skills vs Salary
PayTable<- table(Skill_PayDS$skill ,Skill_PayDS$Salary)
PayTable##
## <80000 80000-99999 100000-119999 120000-139999
## AI 5 50 105 131
## AWS 1 37 80 96
## Big Data 2 17 97 161
## C/C++ 6 36 98 141
## Data Mining 13 55 238 225
## Data Science 6 17 65 75
## Hadoop 3 27 163 297
## Hive 1 11 91 158
## Java 2 40 149 224
## Linux 2 44 55 66
## Machine Learning 30 145 467 577
## MATLAB 3 30 68 131
## Natural Language Processing 5 44 114 153
## Python 23 157 506 598
## R 19 127 460 502
## SAS 7 59 192 162
## Scala 1 7 67 119
## Spark 3 20 168 269
## SQL 13 134 354 371
## Tableau 5 38 157 164
## TensorFlow 6 46 86 128
##
## 140000-159999 >160000
## AI 114 46
## AWS 65 37
## Big Data 111 29
## C/C++ 99 51
## Data Mining 162 40
## Data Science 41 17
## Hadoop 244 93
## Hive 132 49
## Java 190 70
## Linux 48 10
## Machine Learning 440 183
## MATLAB 98 34
## Natural Language Processing 157 55
## Python 456 172
## R 328 114
## SAS 113 31
## Scala 100 44
## Spark 201 84
## SQL 232 89
## Tableau 94 26
## TensorFlow 100 54
#Specify the significance level
sig <- 0.05
#Loops through each skill
j = 0
k = 0
plist <- NA
SigSkills <- NA
while (j < nrow(PayTable)) {
j = j + 1
i = 0
unmentioned <- NA
#Loops through the elemenents of each skill, and builds a vector of the number of jobs for which a skill-salary combo wasn't mentioned: unmentioned
while (i < ncol(PayTable)) {
i = i + 1
unmentioned[i] <- Job_Totals[i] - PayTable[j,i]
}
#Performed a chi-squared tests for by each skill by paygrade to determine skills mentioned with significant relationship
#Saves the skills with significant relationships to a list of names and their p-values to a seperate list
if (chisq.test(cbind(unmentioned,PayTable[j,]))$p.value < sig) {
k = k + 1
plist[k] <- chisq.test(cbind(unmentioned,PayTable[j,]))$p.value
SigSkills[k] <- names(PayTable[,1])[j]
}
}## Warning in chisq.test(cbind(unmentioned, PayTable[j, ])): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(cbind(unmentioned, PayTable[j, ])): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(cbind(unmentioned, PayTable[j, ])): Chi-squared
## approximation may be incorrect
The plots below show the proportions of posts that mention a skill for each salary range. The horizontal bar would be the expected proportion if there was no difference in the proportion of mentions according to the salary range. The plots shown are only those for which there were over 200 skill mentions p > 0.05 (as determined by chi squared tests). There are several interesting findings below, and if one is interested in knowing which skills seem to correlate with the highest paying jobs: C/C++, Hadoop, Hive, Java, NLP, Scala, Spark, and TensorFlow.
i = 0
skillplots <- list()
while (i < length(SigSkills)) {
i = i + 1
plottemp <- as.data.frame(PayTable[SigSkills[i],]/Job_Totals)
print (ggplot(plottemp, aes(x = plottemp[,1], y = rownames(plottemp), fill = rownames(plottemp))) +
geom_bar(stat = "identity", width=0.9) +
scale_y_discrete(limits=rownames(plottemp)) +
geom_vline(xintercept = sum(PayTable[SigSkills[i],])/sum(Job_Totals)) +
scale_fill_brewer(palette=1, type = "qual") +
xlab("Proportion") +
ylab("Salary Range") +
ggtitle(paste("Proportion of Job Postings that Mention",SigSkills[i],"; p = ",round(plist[i], digits = 6))) +
labs(fill = "Salary Range")+
theme(axis.text.x = element_text(angle = 15)) +
coord_flip())
} ## Warning: position_stack requires non-overlapping x intervals
## Warning: position_stack requires non-overlapping x intervals
## Warning: position_stack requires non-overlapping x intervals
## Warning: position_stack requires non-overlapping x intervals
## Warning: position_stack requires non-overlapping x intervals
## Warning: position_stack requires non-overlapping x intervals
## Warning: position_stack requires non-overlapping x intervals
## Warning: position_stack requires non-overlapping x intervals
## Warning: position_stack requires non-overlapping x intervals
## Warning: position_stack requires non-overlapping x intervals
## Warning: position_stack requires non-overlapping x intervals
## Warning: position_stack requires non-overlapping x intervals
## Warning: position_stack requires non-overlapping x intervals
## Warning: position_stack requires non-overlapping x intervals
## Warning: position_stack requires non-overlapping x intervals
## Warning: position_stack requires non-overlapping x intervals
## Warning: position_stack requires non-overlapping x intervals
## Warning: position_stack requires non-overlapping x intervals
## Warning: position_stack requires non-overlapping x intervals
## Warning: position_stack requires non-overlapping x intervals
Lets create a connection Object to MySQL database.
We will connect to the sample database named “indeed” created on local host.
Pls create the database manually on your local MySQL installation with details as in the connection string below.
#install.packages("RMySQL")
#library("RMySQL")
## Connection String to connect to the local host database:
#mysqlconnection = dbConnect(MySQL(), user = 'indeed', password = 'Indeed@123', dbname = 'indeed',
# host = 'localhost')
## Lists the tables currently in the database:
#dbListTables(mysqlconnection)
## Creating table and inserting rows from the mentioned dataframe; if table exists then it will overwrite:
#dbWriteTable(mysqlconnection, "skills", skills[, ], overwrite = TRUE, row.names = FALSE)
#dbWriteTable(mysqlconnection, "df_raw", df_raw[, ], overwrite = TRUE, row.names = FALSE)
## Lists the tables existing and above two newly created ones:
#dbListTables(mysqlconnection)