Data607-Major Assignment-Project3-Most Valued Data Science Skills

Overview:

Import

#Loads the dataset
df_raw <- read.csv("https://raw.githubusercontent.com/davidblumenstiel/DATA-607-Project-3/master/indeed_job_dataset.csv",stringsAsFactors = FALSE)

#Create working df object
df <- df_raw

Soft Skills

Skills that are not technical were not included in the dataset. However, they are mentioned in the job descriptions. So, we begin by programatically reading through the job-descriptions to extract mentions of the ‘soft-skills’

#The soft skills we will look will be stored as: 'keywords'
keywords<-unname(unlist(read.csv("https://raw.githubusercontent.com/davidblumenstiel/DATA-607-Project-3/master/softskills.csv", stringsAsFactors = FALSE)))

#Iniates new columns for each keyword; also makes list index list
df[keywords] <- NA
keywordColIndex <- seq(length(df)-length(keywords)+1,length(df),1)

#Will determine if contains keywords; occurs (1), or doesnt occur(0)
#Coerces multiple occurances to 1
i = 0
for (desc in df$Description) {
  i = i + 1
  df[i,keywordColIndex] <-(as.integer(str_detect(desc,fixed(keywords, ignore_case = TRUE))))
}

#Gets rid of the descriptions to save space
df$Description <- NULL



#Gathers soft skills into a tidy dataset: df2
df2 <- df[,c(1, 44:ncol(df))] %>%
  gather(skill, flag, -X) %>%
  filter(flag == 1) %>%
  select(id = X, skill) %>%
  mutate(skill_type = 'soft')

We Create data frames that have social skills by salary and jobtype

soft_summary <- df %>%
   replace(is.na(.), 0) %>%
   mutate(sum = rowSums(.[43:ncol(df)]))

#mean(soft_summary$sum)

avg_ss_by_jobtype <- aggregate(soft_summary[, ncol(soft_summary)], list(soft_summary$Job_Type), mean)
names(avg_ss_by_jobtype)[1]<-"Job_Type"

avg_ss_by_salary <- aggregate(soft_summary[, ncol(soft_summary)], list(soft_summary$Queried_Salary), mean)
names(avg_ss_by_salary)[1]<-"Queried_Salary"

We create a dataframe containing only the total amount of softskills per job and the job type and the salary band

soft_sum_3cols <- soft_summary %>% select(Job_Type,Queried_Salary,sum)

avg_ss_by_jobtype$Job_Type <- factor(avg_ss_by_jobtype$Job_Type,levels = c("data_analyst", "data_scientist", "data_engineer"))

#plot showing number soft skills on average per job type

ggplot(avg_ss_by_jobtype) +
 aes(x = Job_Type, weight = x) +
 geom_bar(fill = "#2171b5") +
 labs(x = "Job Type", y = "avg # of soft skills in job description", title = "Soft Skills Mentioned per Job Type") +
 theme_minimal()

#we agregate the dataframe into averages for every combination of salary and job type
skillby_type_sal <-aggregate(x=soft_sum_3cols$sum, by = soft_sum_3cols[c("Job_Type", "Queried_Salary")],
  FUN=mean, na.rm=TRUE)

# make variable factors and order them

skillby_type_sal$Queried_Salary <- factor(skillby_type_sal$Queried_Salary,levels = c("<80000", "80000-99999", "100000-119999", "120000-139999", "140000-159999", ">160000"))

skillby_type_sal$Job_Type <- factor(skillby_type_sal$Job_Type,levels = c("data_analyst", "data_scientist", "data_engineer"))

# heatmap of average skills mentioned by job type and salary

ggplot(skillby_type_sal) +
 aes(x = Queried_Salary, y = Job_Type, fill = x) +
 geom_tile(size = 1L) +
 scale_fill_distiller(palette = "PuBu") +
 labs(x = "Salary Band", y = "Job Type", title = "Soft Skills Mentioned across Salary and Job Type") +
 theme_minimal()

#not used for now bar chart for average # soft skills mentioned by salary

# avg_ss_by_salary$Queried_Salary <- factor(avg_ss_by_salary$Queried_Salary,levels = c("<80000", "80000-99999", "100000-119999", "120000-139999", "140000-159999", ">160000"))

# ggplot(avg_ss_by_salary) +
#  aes(x = Queried_Salary, weight = x) +
#  geom_bar(fill = "#1f9e89") +
#  labs(x = "Salary Band", y = "avg # of soft skills in job description", title = "Soft Skill Mentions per Salary Band") +
#  theme_minimal()

Looking at the extreme case of high salary data engineer

eng_highs <- filter(df, Job_Type == "data_engineer" & Queried_Salary == ">160000")
number_eng_highs <- nrow(eng_highs)
eng_highs_long <- eng_highs[,c(44:ncol(eng_highs))] %>%
  gather(skill, flag) %>%
  filter(flag == 1)

eng_highs_count <-aggregate(x=eng_highs_long$flag, by = eng_highs_long[c("skill")],
  FUN=sum, na.rm=TRUE)

eng_highs_count$x <- round(eng_highs_count$x/number_eng_highs,4)

eng_highs_count <- eng_highs_count[order(-eng_highs_count$x),] 
rownames(eng_highs_count) <- NULL

Looking at the extreme case of low salary data analyst

anl_lows <- filter(df, Job_Type == "data_analyst" & Queried_Salary == "<80000")
number_anl_lows <- nrow(anl_lows)

anl_lows_long <- anl_lows[,c(44:ncol(eng_highs))] %>%
  gather(skill, flag) %>%
  filter(flag == 1)

anl_lows_count <-aggregate(x=anl_lows_long$flag, by = anl_lows_long[c("skill")],
  FUN=sum, na.rm=TRUE)

anl_lows_count$x <- round(anl_lows_count$x/number_anl_lows,4)

anl_lows_count <- anl_lows_count[order(-anl_lows_count$x),] 
rownames(anl_lows_count) <- NULL

Looking at the extreme cases of high salary data engineer and low salary data analyst together

library(kableExtra)

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

table_eng_highs <- eng_highs_count[1:22,]
table_anl_lows <- anl_lows_count[1:22,]

names(table_eng_highs)[2] <- "occurance"
names(table_anl_lows)[2] <- "occurance"

table_eng_highs %>%
  kable("html",col.names = NA, align = 'clc', caption = 'Top Soft skills (engineers high salary)') %>%
    kable_styling(full_width = F, position = "float_left")

Top Soft skills (engineers high salary)
skill	occurance
Team	0.7885
Management	0.3846
Problem	0.3654
communication	0.3077
Result	0.2596
Hands-on	0.2404
Insight	0.2212
detail	0.2019
Organization	0.1923
Collaborate	0.1827
Competitive	0.1731
Driven	0.1731
Responsible	0.1635
Collaboration	0.1442
Passionate	0.1346
Problem solving	0.1250
Innovation	0.1154
Influence	0.1058
Leadership	0.1058
Managing	0.0962
Team player	0.0962
Analyzing	0.0865

table_anl_lows %>%
  kable("html", align = 'clc', caption = 'Top Soft skills (analyst low salary)') %>%
    kable_styling(full_width = F, position = "right")

Top Soft skills (analyst low salary)
skill	occurance
Team	0.6227
Management	0.5386
communication	0.4656
Problem	0.4109
Organization	0.3829
detail	0.3240
Result	0.3128
Research	0.2917
Train	0.2693
Presentation	0.2567
Training	0.2216
Responsible	0.2188
Independent	0.2146
Insight	0.2020
Analyzing	0.1893
Attention to detail	0.1795
Collaborate	0.1697
Writing	0.1697
Leadership	0.1683
Interpersonal	0.1529
Strategic	0.1515
Integrity	0.1487

print(table_eng_highs)

##              skill occurance
## 1             Team    0.7885
## 2       Management    0.3846
## 3          Problem    0.3654
## 4    communication    0.3077
## 5           Result    0.2596
## 6         Hands-on    0.2404
## 7          Insight    0.2212
## 8           detail    0.2019
## 9     Organization    0.1923
## 10     Collaborate    0.1827
## 11     Competitive    0.1731
## 12          Driven    0.1731
## 13     Responsible    0.1635
## 14   Collaboration    0.1442
## 15      Passionate    0.1346
## 16 Problem solving    0.1250
## 17      Innovation    0.1154
## 18       Influence    0.1058
## 19      Leadership    0.1058
## 20        Managing    0.0962
## 21     Team player    0.0962
## 22       Analyzing    0.0865

Hard Skills

Here the hard skills are seperated out from the inbuilt list of skills in the raw dataset

# List all hard skills and how often they were listed in a separate dataframe
for (i in 1:nrow(df_raw)) {
  slist <- str_extract_all(df_raw$Skill[i], "'.+?'")
  temp_df <- data.frame(skill = slist[[1]])
  if (nrow(temp_df) > 0) {
    temp_df$id <- i - 1
    temp_df$skill <- as.character(temp_df$skill)
    temp_df <- select(temp_df, id, skill)
    if (i == 1) {
      sdf <- temp_df
    } else {
      sdf <- bind_rows(sdf, temp_df)
    }
  }
}

# Remove quotes and trim whitespace
sdf$skill <- str_replace_all(sdf$skill, "'", "")
sdf$skill <- str_replace_all(sdf$skill, '"', "")
sdf$skill <- str_trim(sdf$skill)
sdf$skill_type <- 'hard'

# Combine skills
skills <- bind_rows(df2, sdf)

Job Title

Here we look at the different types of jobs and industries and compare the skills mentioned

df <- df_raw
job.indeed.df  <- df %>%
  select(c(Id=X, (Job_Title)))

# Pattern Building:
pattern.analyst <- c('analyst','statistician','analysis','analytics')
pattern.engineer <- c('engineer', 'engg', 'technician','technologist','designer','architect')
pattern.scientist <- c('scientist','doctor','dr.')
pattern.junior <- c('junior','jr', 'entry','internship','jr.')
pattern.senior <- c('senior', 'sr','experienced','sr.')

# Intermedaite Data Frame for Titlles:
final.data.df <- data.frame(Id=integer(nrow(job.indeed.df)), Job_Title=character(nrow(job.indeed.df))
                            , analyst=integer(nrow(job.indeed.df)) ,engineer=integer(nrow(job.indeed.df)),scientist=integer(nrow(job.indeed.df))
                            , junior=integer(nrow(job.indeed.df)),senior=integer(nrow(job.indeed.df)))
final.data.df$Id <-   job.indeed.df$Id
final.data.df$Job_Title <- as.character( as.character( job.indeed.df$Job_Title) )

# Working on the counts:
for (i in 1: nrow(job.indeed.df)) {
  final.data.df$analyst[i] <- if(grepl(paste(pattern.analyst,collapse="|"), job.indeed.df$Job_Title[i], ignore.case = TRUE) )  1 else 0
  final.data.df$engineer[i] <- if(grepl(paste(pattern.engineer,collapse="|"), job.indeed.df$Job_Title[i], ignore.case = TRUE) )  1 else 0
  final.data.df$scientist[i] <- if(grepl(paste(pattern.scientist,collapse="|"), job.indeed.df$Job_Title[i], ignore.case = TRUE) )  1 else 0
  final.data.df$junior[i] <- if(grepl(paste(pattern.junior,collapse="|"), job.indeed.df$Job_Title[i], ignore.case = TRUE) )  1 else 0
  final.data.df$senior[i] <- if(grepl(paste(pattern.senior,collapse="|"), job.indeed.df$Job_Title[i], ignore.case = TRUE) )  1 else 0
}

Combine Data

# Add job title flags to original dataframe
# Merging with main Analysis Data Frame:
df_all <- left_join(df_raw[,1:16], final.data.df[,-2], by = c('X' = 'Id'))
names(df_all)[1] <- 'job_id'

Summary Visualizations

# Job Role Distribution by Position
post_count <- nrow(df_raw)

dplyr::summarize(final.data.df, 
                    Analyst = sum(analyst) / n(),
                    Engineer = sum(engineer) / n(),
                    Scientist = sum(scientist) / n()) %>%
  gather(position, pct_of_posts) %>%
  ggplot(aes(x = position, y = pct_of_posts)) +
  geom_col(fill = 'grey80') +
  geom_hline(yintercept = .333, linetype = 'dotted') +
  scale_y_continuous(label = percent_format(accuracy = 1)) +
  labs(x = element_blank(), 
       y = element_blank(), 
       title = 'Job Title Distribution', 
       subtitle = str_c('Data Roles from ', comma(post_count), ' Posts')) +
  theme(legend.position = 'none') +
  theme_bw()

# hard skills dataframe for visualizations
hs <- filter(skills, skill_type == 'hard') %>%
  left_join(final.data.df, by = c('id' = 'Id')) %>%
  mutate(analyst = ifelse(analyst == 1, id, NA),
         engineer = ifelse(engineer == 1, id, NA),
         scientist = ifelse(scientist == 1, id, NA),
         junior = ifelse(junior == 1, id, NA),
         senior = ifelse(senior == 1, id, NA)) %>% 
  dplyr::group_by(skill) %>%
  dplyr::summarize(total_count = n(),
            ana_ids = n_distinct(analyst),
            ana_pct = ana_ids / sum(df_all$analyst),
            eng_ids = n_distinct(engineer),
            eng_pct = eng_ids / sum(df_all$engineer),
            sci_ids = n_distinct(scientist),
            sci_pct = sci_ids / sum(df_all$scientist),
            jr_ids = n_distinct(junior),
            jr_pct = jr_ids / sum(df_all$junior),
            sr_ids = n_distinct(senior),
            sr_pct = sr_ids / sum(df_all$senior))


# specify top number of skills
top_x <- 10


# Top Hard Skills for All Data Postings
ggplot( arrange(hs, desc(total_count))[1:top_x,], aes(x = reorder(skill, total_count), y = total_count/nrow(df_all))) +
  geom_col(fill = 'grey60') +
  coord_flip() +
  labs(title = str_c('Top ', top_x, ' Technical Skills'),
       subtitle = str_c('Data Analysts/Engineers/Scientists'),
       y = "% of Job Posts",
       x = element_blank()) +
  scale_y_continuous(label = percent_format(accuracy = 1))

# Top Hard Skills for Data Scientists
ggplot( arrange(hs, desc(sci_ids))[1:top_x,], aes(x = reorder(skill, sci_pct), y = sci_pct)) +
  geom_col(fill = 'grey60') +
  coord_flip() +
  labs(title = str_c('Top ', top_x, ' Technical Skills'),
       subtitle = str_c('Data Scientists'),
       y = "% of Job Posts",
       x = element_blank()) +
  scale_y_continuous(label = percent_format(accuracy = 1))

# Top Hard Skills for Data Analysts
ggplot( arrange(hs, desc(ana_ids))[1:top_x,], aes(x = reorder(skill, ana_pct), y = ana_pct)) +
  geom_col(fill = 'grey60') +
  coord_flip() +
  labs(title = str_c('Top ', top_x, ' Technical Skills'),
       subtitle = str_c('Data Analysts'),
       y = "% of Job Posts",
       x = element_blank()) +
  scale_y_continuous(label = percent_format(accuracy = 1))

# Top Hard Skills for Data Engineers
ggplot( arrange(hs, desc(eng_ids))[1:top_x,], aes(x = reorder(skill, eng_pct), y = eng_pct)) +
  geom_col(fill = 'grey60') +
  coord_flip() +
  labs(title = str_c('Top ', top_x, ' Technical Skills'),
       subtitle = str_c('Data Engineers'),
       y = "% of Job Posts",
       x = element_blank()) +
  scale_y_continuous(label = percent_format(accuracy = 1))

# Distribution of Junior and Senior Data Scientist Roles
jrs <- filter(final.data.df, junior == 1) %>%
  select(Id, Analyst = analyst, Engineer = engineer, Scientist = scientist) %>%
  gather(position, count, -Id)
jrs$level <- 'Junior'

srs <- filter(final.data.df, senior == 1) %>%
  select(Id, Analyst = analyst, Engineer = engineer, Scientist = scientist) %>%
  gather(position, count, -Id)
srs$level <- 'Senior'

pos_and_role <- bind_rows(jrs, srs) %>%
  group_by(level, position) %>%
  dplyr::summarize(count = sum(count))
  
ggplot(pos_and_role, aes(x = position, y = count, fill = level)) +
  geom_col(position = 'fill') +
  theme_bw() +
  scale_y_continuous(label = percent_format(accuracy = 1)) +
  scale_fill_manual(values = c('lightskyblue3', 'royalblue4')) +
  labs(x = element_blank(), 
       y = element_blank(),
       title = 'Distribution of Specified Levels')

# Top Hard Skills for Junior Data Scientists
hs_jr_base <- filter(skills, skill_type == 'hard') %>%
  inner_join(filter(final.data.df, junior == 1 & scientist == 1), by = c('id' = 'Id')) 
hs_jr <-  group_by(hs_jr_base, skill) %>%
  dplyr::summarize(total_count = n())
hs_jr_denom <- n_distinct(hs_jr_base$id)

ggplot( arrange(hs_jr, desc(total_count))[1:top_x,], aes(x = reorder(skill, total_count), y = total_count/hs_jr_denom)) +
  geom_col(fill = 'grey60') +
  coord_flip() +
  labs(title = str_c('Top ', top_x, ' Technical Skills'),
       subtitle = str_c('Junior Data Scientists'),
       y = "% of Job Posts",
       x = element_blank()) +
  scale_y_continuous(label = percent_format(accuracy = 1))

hs_sr_base <- filter(skills, skill_type == 'hard') %>%
  inner_join(filter(final.data.df, senior == 1 & scientist == 1), by = c('id' = 'Id')) 
hs_sr <- group_by(hs_sr_base, skill) %>%
  dplyr::summarize(total_count = n())
hs_sr_denom <- n_distinct(hs_sr_base$id)


# Top Hard Skills for Senior Data Scientists
ggplot( arrange(hs_sr, desc(total_count))[1:top_x,], aes(x = reorder(skill, total_count), y = total_count/hs_sr_denom)) +
  geom_col(fill = 'grey60') +
  coord_flip() +
  labs(title = str_c('Top ', top_x, ' Technical Skills'),
       subtitle = str_c('Senior Data Scientists'),
       y = "% of Job Posts",
       x = element_blank()) +
  scale_y_continuous(label = percent_format(accuracy = 1))

sci_industry <- filter(df_all, scientist == 1 & Company_Industry != '') %>%
  dplyr::group_by(Company_Industry) %>%
  dplyr::summarize(postings = n()) %>%
  arrange(desc(postings))


# Top Company Industries Seeking Data Scientists
ggplot(sci_industry[1:10,], aes(x = reorder(Company_Industry, postings), y = postings / sum(sci_industry$postings))) +
  geom_col(fill = 'grey60') +
  coord_flip() +
  labs(title = 'Top 10 Company Industries',
       subtitle = str_c('Data Scientist Posts'),
       y = "% of Job Posts",
       x = element_blank()) +
  scale_y_continuous(label = percent_format(accuracy = 1))

Word Cloud

# WordCLoud map for the soft and hard skillset that high rating/star companies are 
# looking for in engineer vs analyst/scientist Position: 
df.stars.5 <- df_all %>%
  filter(No_of_Stars ==5  ) %>%
  select(job_id, Queried_Salary,   Company_Industry, analyst, engineer, scientist , junior, senior ) %>%
  inner_join(skills, by = c( 'job_id' = 'id'))

df.stars.5.engg <- df.stars.5 %>% 
  filter ( engineer ==1) %>%
  group_by(skill) %>%
  dplyr::summarize(total_count = n())

df.stars.5.ana.sci <- df.stars.5 %>% 
  filter ( analyst + scientist > 0 ) %>%
  group_by(skill) %>%
  dplyr::summarize(total_count = n())


set.seed(1234)
wordcloud(words = df.stars.5.engg$skill, freq = df.stars.5.engg$total_count, min.freq = 1,
          max.words=100, random.order=FALSE, rot.per=0.10,
          colors=brewer.pal(8, "Dark2"))

## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Shell Scripting could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Business Intelligence could not be fit on page. It will
## not be plotted.

## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Google Cloud Platform could not be fit on page. It will
## not be plotted.

## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Natural Language Processing could not be fit on page. It
## will not be plotted.

## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Oral communication skills could not be fit on page. It
## will not be plotted.

## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Responsibility could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Work-Life Balance could not be fit on page. It will not be
## plotted.

wordcloud(words = df.stars.5.ana.sci$skill, freq = df.stars.5.ana.sci$total_count, min.freq = 1,
          max.words=100, random.order=FALSE, rot.per=0.10, 
          colors=brewer.pal(8, "Dark2"))

## Warning in wordcloud(words = df.stars.5.ana.sci$skill, freq = df.stars.
## 5.ana.sci$total_count, : Data-driven decision-making could not be fit on page.
## It will not be plotted.

## Warning in wordcloud(words = df.stars.5.ana.sci$skill, freq = df.stars.
## 5.ana.sci$total_count, : Survey Design could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(words = df.stars.5.ana.sci$skill, freq = df.stars.
## 5.ana.sci$total_count, : Written communication skills could not be fit on page.
## It will not be plotted.

# Salary Range by titles given by high rating/star companies:
df.stars.5.Salary <- df.stars.5 %>% 
  dplyr::group_by(Queried_Salary) %>%
  dplyr::summarize(total_count = n(),
            ana_total = sum(analyst),
            eng_totals = sum(engineer),
            sci_total = sum(scientist),
            jr_total = sum(junior),
            sr_total = sum(senior))

kable(df.stars.5.Salary)

Queried_Salary	total_count	ana_total	eng_totals	sci_total	sr_total
<80000	37	37	0	0	0
>160000	47	17	0	30	0
100000-119999	111	33	54	24	0
120000-139999	136	55	34	77	9
140000-159999	59	17	28	14	8
80000-99999	61	25	36	0	13

# ==> We can see from above no junior level roles; No Engineer roles with greater then 
#     160,000 USD

#Statewise Job posting for Data Science Jobs: 
df.states <- df_all %>% 
  select(job_id, Location ) %>%
  group_by(Location) %>%
  dplyr::summarize(total_count = n())

all_states <- map_data("state")

df.states$region <- stateFromLower(df.states$Location)
Total <- merge(all_states, df.states, by="region")

Total <- Total[Total$region!="REMOTE",]

p <- ggplot()
p <- p + geom_polygon(data=Total, aes(x=long, y=lat, group = group, fill=Total$total_count),colour="white"
      ) + scale_fill_continuous(low = "#56B4E9", high = "#0072B2", guide="colorbar")

p

The Question on Everybody’s Mind: What Technical Skills will Get Me a High Paying Job?

First, we need to select and arrange the data we want to use

#Creates a hard-skill and salary, and job type dataset
Skill_Pay <- sdf[1:2]
Skill_Pay[c("Salary","Job_Type")] <- NA

#Populates the salary and job type columns from the raw dataset
for (i in Skill_Pay$id) {
  Skill_Pay$Salary[Skill_Pay$id == i] <- df_raw$Queried_Salary[df_raw$X == i]
  Skill_Pay$Job_Type[Skill_Pay$id == i] <- df_raw$Job_Type[df_raw$X == i]
  }

#Going to focus on only data_science-type jobs
Skill_PayDS <- filter(Skill_Pay,Job_Type == 'data_scientist')

#Changes the 'skill' column to a factor type
Skill_PayDS$skill <- as.factor(Skill_PayDS$skill)

#Filters it to skills mentioned over 200 times (to keep sample size high for analysis)
mentions <- 200
stab = table(Skill_PayDS$skill)
Skill_PayDS <- Skill_PayDS[Skill_PayDS$skill %in% names(stab)[stab >= mentions],]

#Drops the now un-used factors from skills
Skill_PayDS <- droplevels(Skill_PayDS)

#Factors and re-orders Salary
Pay_Grades <- c("<80000", "80000-99999", "100000-119999", "120000-139999", "140000-159999",">160000")
Skill_PayDS$Salary <- factor(Skill_PayDS$Salary, levels = Pay_Grades)

#Makes a table of Skills vs Salary
PayTable<- table(Skill_PayDS$skill ,Skill_PayDS$Salary)
PayTable

##                              
##                               <80000 80000-99999 100000-119999 120000-139999
##   AI                               5          50           105           131
##   AWS                              1          37            80            96
##   Big Data                         2          17            97           161
##   C/C++                            6          36            98           141
##   Data Mining                     13          55           238           225
##   Data Science                     6          17            65            75
##   Hadoop                           3          27           163           297
##   Hive                             1          11            91           158
##   Java                             2          40           149           224
##   Linux                            2          44            55            66
##   Machine Learning                30         145           467           577
##   MATLAB                           3          30            68           131
##   Natural Language Processing      5          44           114           153
##   Python                          23         157           506           598
##   R                               19         127           460           502
##   SAS                              7          59           192           162
##   Scala                            1           7            67           119
##   Spark                            3          20           168           269
##   SQL                             13         134           354           371
##   Tableau                          5          38           157           164
##   TensorFlow                       6          46            86           128
##                              
##                               140000-159999 >160000
##   AI                                    114      46
##   AWS                                    65      37
##   Big Data                              111      29
##   C/C++                                  99      51
##   Data Mining                           162      40
##   Data Science                           41      17
##   Hadoop                                244      93
##   Hive                                  132      49
##   Java                                  190      70
##   Linux                                  48      10
##   Machine Learning                      440     183
##   MATLAB                                 98      34
##   Natural Language Processing           157      55
##   Python                                456     172
##   R                                     328     114
##   SAS                                   113      31
##   Scala                                 100      44
##   Spark                                 201      84
##   SQL                                   232      89
##   Tableau                                94      26
##   TensorFlow                            100      54

#Gets a total count for the number of jobs by Salary: Job_Totals
q = 0
Job_Totals <- NA
for (i in Pay_Grades) {
  q = q + 1
  Job_Totals[q] <- nrow(subset(df_raw, df_raw$Queried_Salary == i & df_raw$Job_Type == "data_scientist"))
}

Statistical analysis using chi-squared to filter to only significant findings

#Specify the significance level
sig <- 0.05


#Loops through each skill
j = 0
k = 0

plist <- NA
SigSkills <- NA
while (j < nrow(PayTable)) {
  j = j + 1
  
  
  i = 0
  
  unmentioned <- NA
  
  #Loops through the elemenents of each skill, and builds a vector of the number of jobs for which a skill-salary combo wasn't   mentioned: unmentioned
  while (i < ncol(PayTable)) {
    i = i + 1
    unmentioned[i] <- Job_Totals[i] - PayTable[j,i]
    
  }

  #Performed a chi-squared tests for by each skill by paygrade to determine skills mentioned with significant relationship
  #Saves the skills with significant relationships to a list of names and their p-values to a seperate list
  
  
  if (chisq.test(cbind(unmentioned,PayTable[j,]))$p.value < sig) {
    k = k + 1
    plist[k] <- chisq.test(cbind(unmentioned,PayTable[j,]))$p.value
    SigSkills[k] <- names(PayTable[,1])[j]
    
    }
  
}

## Warning in chisq.test(cbind(unmentioned, PayTable[j, ])): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(cbind(unmentioned, PayTable[j, ])): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(cbind(unmentioned, PayTable[j, ])): Chi-squared
## approximation may be incorrect

Proportion plots for skills vs salary

The plots below show the proportions of posts that mention a skill for each salary range. The horizontal bar would be the expected proportion if there was no difference in the proportion of mentions according to the salary range. The plots shown are only those for which there were over 200 skill mentions p > 0.05 (as determined by chi squared tests). There are several interesting findings below, and if one is interested in knowing which skills seem to correlate with the highest paying jobs: C/C++, Hadoop, Hive, Java, NLP, Scala, Spark, and TensorFlow.

i = 0
skillplots <- list()
while (i < length(SigSkills)) {
  i = i + 1
  plottemp <- as.data.frame(PayTable[SigSkills[i],]/Job_Totals)
  print (ggplot(plottemp, aes(x = plottemp[,1], y = rownames(plottemp), fill = rownames(plottemp))) +
    geom_bar(stat = "identity", width=0.9) +
    scale_y_discrete(limits=rownames(plottemp)) +
    geom_vline(xintercept = sum(PayTable[SigSkills[i],])/sum(Job_Totals)) +
    scale_fill_brewer(palette=1, type = "qual") +
    xlab("Proportion") +
    ylab("Salary Range") +
    ggtitle(paste("Proportion of Job Postings that Mention",SigSkills[i],";  p = ",round(plist[i], digits = 6))) +
    labs(fill = "Salary Range")+
    theme(axis.text.x = element_text(angle = 15)) +
    coord_flip())
  
  
}

Create a connection Object to MySQL database.

We will connect to the sample database named “indeed” created on local host

Pls create the database manually on your local MySQL installation with details as in the connection string below

#install.packages("RMySQL")
#library("RMySQL")

## Connection String to connect to the local host database:
#mysqlconnection = dbConnect(MySQL(), user = 'indeed', password = 'Indeed@123', dbname = 'indeed',
#   host = 'localhost')

## Lists the tables currently in the database: 
#dbListTables(mysqlconnection)

## Creating table and inserting rows from the mentioned dataframe; if table exists then it will overwrite:
#dbWriteTable(mysqlconnection, "skills", skills[, ], overwrite = TRUE, row.names = FALSE)
#dbWriteTable(mysqlconnection, "df_raw", df_raw[, ], overwrite = TRUE, row.names = FALSE)

## Lists the tables existing and above two newly created ones:
#dbListTables(mysqlconnection)

Data607-Major Assignment-Project3-Most Valued Data Science Skills

Leo Yi / David Blumensteil / Manolis Manoli / Vinayak Kamath

3/11/2020

Overview:

Import

Soft Skills

Skills that are not technical were not included in the dataset. However, they are mentioned in the job descriptions. So, we begin by programatically reading through the job-descriptions to extract mentions of the ‘soft-skills’

We create a dataframe containing only the total amount of softskills per job and the job type and the salary band

Looking at the extreme case of high salary data engineer

Looking at the extreme case of low salary data analyst

Looking at the extreme cases of high salary data engineer and low salary data analyst together

Hard Skills

Here the hard skills are seperated out from the inbuilt list of skills in the raw dataset

Job Title

Here we look at the different types of jobs and industries and compare the skills mentioned

Combine Data

Summary Visualizations

Word Cloud

The Question on Everybody’s Mind: What Technical Skills will Get Me a High Paying Job?

First, we need to select and arrange the data we want to use

Statistical analysis using chi-squared to filter to only significant findings

Proportion plots for skills vs salary

Create a connection Object to MySQL database.

We will connect to the sample database named “indeed” created on local host

Pls create the database manually on your local MySQL installation with details as in the connection string below