library(tidyverse)
library(wordcloud)
library(RColorBrewer)
library(leaflet)
## Warning: package 'leaflet' was built under R version 4.0.4
library(tigris)
## Warning: package 'tigris' was built under R version 4.0.5
library(stringr)
library(scales)
Load Data:
glassdoor <- read.csv("Glass_door_DataAnalyst.csv")
glassdoorde <- read.csv("Glass_door_DataEngineer.csv")
glassdoords <- read.csv("Glass_door_DataScientist.csv")
glassdoorba <- read.csv("Glass_door_BusinessAnalyst.csv")
indeed <- read.csv("indeed_job_dataset.csv")
industry_survey <- read.csv("multiple_choice_responses.csv")
For this we’ll want to explore the data and answer specific questions:
here we’ll bind the extra datasets:
glassdoor <- glassdoor[,-1]
glassdoorba <- glassdoorba[,-c(1,2)]
glassdoords <- glassdoords[,-c(1,2)]
glassdoorba$Rating <- as.numeric(glassdoorba$Rating)
## Warning: NAs introduced by coercion
glassdoorba <- glassdoorba %>% filter(!is.na(Rating))
glassdoorba$Founded <- as.numeric(glassdoorba$Founded)
glassdoorba <- glassdoorba %>% filter(!is.na(Founded))
glassdoords$Rating <- as.numeric(glassdoords$Rating)
glassdoords <- glassdoords %>% filter(!is.na(Rating))
glassdoords$Founded <- as.numeric(glassdoords$Founded)
glassdoords <- glassdoords%>% filter(!is.na(Founded))
check the columns are the same:
all(names(glassdoor) == names(glassdoorde))
all(names(glassdoorde) == names(glassdoords))
all(names(glassdoords) == names(glassdoorba))
all(names(glassdoorba) == names(glassdoor))
all(glassdoor %>% summarise_all(class) == glassdoorba %>% summarise_all(class))
all(glassdoorba %>% summarise_all(class) == glassdoorde %>% summarise_all(class))
all(glassdoorde %>% summarise_all(class) == glassdoords %>% summarise_all(class))
all(glassdoords %>% summarise_all(class) == glassdoor %>% summarise_all(class))
add column descriptor:
glassdoor <- glassdoor %>% add_column(searchpram = "DA")
glassdoorde <- glassdoorde %>% add_column(searchpram = "DE")
glassdoords <- glassdoords %>% add_column(searchpram = "DS")
glassdoorba <- glassdoorba %>% add_column(searchpram = "BA")
bind all the rows:
glassdoor <- bind_rows(glassdoor,glassdoorde,glassdoords,glassdoorba)
How much do data scientists make:
low <- str_extract(glassdoor$Salary.Estimate, "\\$\\d*K")
high <- str_extract(glassdoor$Salary.Estimate, "-\\$\\d*K")
low <- extract_numeric(low)
## extract_numeric() is deprecated: please use readr::parse_number() instead
high <- extract_numeric(high)*-1
## extract_numeric() is deprecated: please use readr::parse_number() instead
middleglass <- (high+low)/2
glassdoor jobs:
hist(middleglass)
look at earnings in survey:
earnings_survey <- industry_survey[-1,21] %>%
str_remove_all("[$,]")
survey_low <- str_extract(earnings_survey, "\\d*-") %>% str_remove_all("[-]")
survey_high <- str_extract(earnings_survey, "-\\d*") %>% str_remove_all("[-]")
survey_mid <- data.frame(earn = (as.numeric(survey_low)+as.numeric(survey_high))/2000, role = as.factor(industry_survey[-1,7]), country = as.factor(industry_survey[-1,5]))
survey_mid <- subset(survey_mid,!(role %in% c("","Student","Research Scientist","Not employed","Other","Statistician"))) %>% filter( country == "United States of America")
levels(survey_mid$role)
## [1] "" "Business Analyst"
## [3] "Data Analyst" "Data Engineer"
## [5] "Data Scientist" "DBA/Database Engineer"
## [7] "Not employed" "Other"
## [9] "Product/Project Manager" "Research Scientist"
## [11] "Software Engineer" "Statistician"
## [13] "Student"
hist(survey_mid$earn)
And now ggplot histogram for current job holders:
meanearn <- mean(survey_mid$earn,na.rm = TRUE)
medianearn <- median(survey_mid$earn,na.rm = TRUE)
ggplot(survey_mid, aes(x=earn)) +
geom_histogram(aes(y=..density..), position="identity", alpha=0.5)+
geom_density(adjust = 2)+
geom_vline(aes(xintercept = meanearn),color ="blue")+
geom_vline(aes(xintercept = medianearn),color ="green")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 215 rows containing non-finite values (stat_bin).
## Warning: Removed 215 rows containing non-finite values (stat_density).
indeed jobs:
summary(as.factor(indeed$Queried_Salary))
## <80000 >160000 100000-119999 120000-139999 140000-159999
## 788 415 1394 1292 873
## 80000-99999
## 953
middleindeed <- indeed$Queried_Salary
middleindeed <- ifelse(middleindeed == "<80000",80000,
ifelse(middleindeed == ">160000",160000,
ifelse(middleindeed == "100000-119999",109999.5,
ifelse(middleindeed == "120000-139999",129999.5,
ifelse(middleindeed == "140000-159999",149999.5,
ifelse(middleindeed == "80000-99999",89999.5,0))))))
hist(middleindeed)
And now ggplot histogram for posted jobs:
jobearn <- data.frame(earn = append(middleglass,middleindeed/1000))
meanearn <- mean(jobearn$earn,na.rm = TRUE)
medianearn <- median(jobearn$earn,na.rm = TRUE)
ggplot(jobearn, aes(x=earn)) +
geom_histogram(aes(y=..density..), position="identity", alpha=0.5)+
geom_density(adjust = 4)+
geom_vline(aes(xintercept = meanearn),color ="blue")+
geom_vline(aes(xintercept = medianearn),color ="green")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 22 rows containing non-finite values (stat_bin).
## Warning: Removed 22 rows containing non-finite values (stat_density).
as we can see here the predictions are different between our two datasets
who you’ll work for:
glasstemp <- str_extract(glassdoor$Company.Name, ".*\n")
glassdoorcompanies <- ifelse(is.na(glasstemp),glassdoor$Company.Name,substr(glasstemp,1,nchar(glasstemp)-1))
glassdoor <- glassdoor %>% mutate(company.Name.f = glassdoorcompanies)
wordcompanies <- append(indeed$Company,glassdoorcompanies)
wordcompaniesdf <- data.frame(wordcompanies)
wordcompaniesdf <- count(wordcompaniesdf, wordcompanies)
wordcompaniesdf <- wordcompaniesdf[-1,]
#Take a look at the top ten:
head(wordcompaniesdf[order(-wordcompaniesdf$n),], 10)
Lets take this and get the company types from it:
wordcompaniesdf <- left_join(wordcompaniesdf,indeed,by = c("wordcompanies" = "Company")) %>% select(wordcompanies,n,Company_Industry)%>% distinct()
wordcompaniesdf <- left_join(wordcompaniesdf,glassdoor,by = c("wordcompanies" = "company.Name.f")) %>% select(wordcompanies,n,Company_Industry,Industry,Sector)%>% distinct()
summary(wordcompaniesdf)
## wordcompanies n Company_Industry Industry
## Length:6595 Min. : 1.000 Length:6595 Length:6595
## Class :character 1st Qu.: 1.000 Class :character Class :character
## Mode :character Median : 1.000 Mode :character Mode :character
## Mean : 2.746
## 3rd Qu.: 3.000
## Max. :242.000
## Sector
## Length:6595
## Class :character
## Mode :character
##
##
##
who’s hiring:
wordcloud(words = wordcompaniesdf$wordcompanies, freq = wordcompaniesdf$n, min.freq = 4,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
It looks like there are a lot of consulting jobs out there. lets look deeper at word companies:
sector <- wordcompaniesdf %>% filter((!is.na(Company_Industry ) & Company_Industry != "")|(!is.na(Sector)&Sector!="-1"))
sector <- sector[,-4]
sector$merge <- ifelse((sector$Company_Industry == ""|is.na(sector$Company_Industry)),sector$Sector,sector$Company_Industry)
sector$clean <- ifelse(sector$merge %in% c("Industrial ManufacturingAgriculture and Extraction","Industrial ManufacturingConstruction","Industrial ManufacturingConsumer Goods and Services"),"Industrial Manufacturing",sector$merge)
sector$clean <- ifelse(sector$merge %in% c("Restaurants, Travel and Leisure","Restaurants, Travel and LeisureConsulting and Business Services"),"Restaurants, Bars & Food Services",sector$clean)
sector$clean <- ifelse(sector$merge == "Aerospace and Defense","Aerospace & Defense",sector$clean)
sector$clean <- ifelse(sector$merge == "Agriculture and Extraction","Agriculture & Forestry",sector$clean)
sector$clean <- ifelse(sector$merge == "Banks and Financial Services","Finance",sector$clean)
sector$clean <- ifelse(sector$merge == "Education and Schools","Education",sector$clean)
sector$clean <- ifelse(sector$merge == "Food and BeveragesConsulting and Business Services","Food and Beverages",sector$clean)
sector$clean <- ifelse(sector$merge == "InsuranceHealth Care","Insurance",sector$clean)
sector$clean <- ifelse(sector$merge == "Construction, Repair & Maintenance","Construction",sector$clean)
sector$clean <- ifelse(sector$merge == "Industrial Manufacturing","Manufacturing",sector$clean)
sector$clean <- ifelse(sector$merge == "Energy and Utilities","Oil, Gas, Energy & Utilities",sector$clean)
sector$clean <- ifelse(sector$merge == "Media, News and Publishing" ,"Media",sector$clean)
sector$clean <- ifelse(sector$merge == "Consumer Goods and Services" ,"Consumer Services",sector$clean)
sector$clean <- ifelse(sector$merge == "RetailConsumer Goods and Services" ,"Consumer Services",sector$clean)
sector$clean <- ifelse(sector$merge == "Restaurants, Bars & Food Services" ,"Food and Beverages",sector$clean)
sector$clean <- ifelse(sector$merge == "Internet and Software" ,"Information Technology",sector$clean)
sector$clean <- ifelse(sector$merge == "Computers and Electronics" ,"Information Technology",sector$clean)
sector$clean <- ifelse(sector$merge == "Transport and Freight" ,"Transportation & Logistics",sector$clean)
sector$clean <- ifelse(sector$merge == "Arts, Entertainment & Recreation" ,"Media",sector$clean)
sector$clean <- ifelse(sector$merge == "Pharmaceuticals" ,"Biotech & Pharmaceuticals",sector$clean)
sector$clean <- ifelse(sector$merge == "Business Services" ,"Consulting and Business Services",sector$clean)
sector$clean <- ifelse(sector$merge == "Industrial Manufacturing" ,"Manufacturing",sector$clean)
sector$clean <- ifelse(sector$merge == "Real EstateReal Estate" ,"Real Estate",sector$clean)
sector$clean <- ifelse(sector$merge == "Industrial Manufacturing" ,"Manufacturing",sector$clean)
sector$merge <- as.factor(sector$merge)
sector$clean <- as.factor(sector$clean)
and the agregate:
jobs.by.sector <- aggregate(sector$n, by=list(Category=sector$clean), FUN=sum)
jobs.by.sector <- arrange(jobs.by.sector,desc(x))
jobs.by.sector.top10 <- head(jobs.by.sector,9)
jobs.by.sector.rest <- tail(jobs.by.sector,20)
jobs.by.sector.top10 <- bind_rows(jobs.by.sector.top10,tibble(Category = "Other",x = sum(jobs.by.sector.rest$x)))
#jobs.by.sector.top10
ggplot(jobs.by.sector.top10, aes(x=Category,y=x)) +
geom_bar(stat="identity",fill = "orangered1")+
scale_x_discrete(limits=jobs.by.sector.top10$Category)+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
ok looking at indeed skills next then using them as searches inside the descriptions of glass door postings:
allskills <- function(column){
bigvector <- c()
for(i in 1:length(column)){
index <- column[i]
temp <- unlist(str_extract_all(index,"\\'(.*?)\\'"))
temp <- str_remove_all(temp, "\\'")
bigvector <- append(bigvector,temp)
}
dfsum <- as.data.frame(table(bigvector))
return(dfsum)
}
test function and build our model data set:
indeednumbers <- allskills(indeed$Skill)
indeednumbers <- indeednumbers[order(-indeednumbers$Freq),]
now we’re going to use the skills in indeed to extract the skills in glassdoor:
glassskill <- function(column,targetcolumn){
#going to have to bee a double loop here
emptytarget <- data.frame(bigvector = targetcolumn,Freq = 0)
emptytarget$bigvector <- as.character(emptytarget$bigvector)
#print(class(emptytarget$Freq))
for(i in 1:length(column)){
indexI <- column[i]
#print(indexI)
for(j in 1:length(targetcolumn)){
indexJ <- targetcolumn[j]
#print(indexJ)
boolean <- str_detect(paste(indexI), paste0("\\W",indexJ,"\\W"))
#print(boolean)
if(boolean == TRUE){
emptytarget[j,2] <- as.numeric(emptytarget[j,2]+1)
}
}
}
return(emptytarget)
}
short test
glassskills <- glassskill(glassdoor$Job.Description,indeednumbers$bigvector)
write.csv(glassskills,"glassskills.csv")
glassskills <- read.csv("glassskills.csv")
make a word cloud of the skills:
str(glassskills)
## 'data.frame': 464 obs. of 3 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ bigvector: chr "Python" "SQL" "Machine Learning" "R" ...
## $ Freq : int 4235 5551 720 2223 1410 1675 1664 1972 73 803 ...
#combine dataframes:
combined_skills <- indeednumbers %>% mutate(Freq = indeednumbers[,2] + glassskills[,3])
the word cloud:
skills_cloud <- wordcloud(words = combined_skills$bigvector, freq = combined_skills$Freq, min.freq = 200, max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
big or small:
indeed_b_s <- data.frame( bs = indeed$Company_Employees) %>% filter(bs != "")
table(indeed_b_s$bs)
##
## 10,000+ Less than 10,000
## 2004 1195
glass_b_s <- data.frame( bs = glassdoor$Size) %>% filter(bs != "" & bs != "Unknown" & bs != "-1" )
table(glass_b_s$bs)
##
## 1 to 50 employees 10000+ employees 1001 to 5000 employees
## 1734 2642 1936
## 201 to 500 employees 5001 to 10000 employees 501 to 1000 employees
## 1314 655 1053
## 51 to 200 employees
## 2082
glass_b_s$bs <- factor(glass_b_s$bs, c("1 to 50 employees", "51 to 200 employees", "201 to 500 employees","501 to 1000 employees","1001 to 5000 employees","5001 to 10000 employees","10000+ employees"))
#str(glass_b_s$bs)
#will use glassdoor alone for this as indeed's info is sparse
ggplot(glass_b_s, aes(bs)) +
geom_bar(aes(y=..prop.., group = 1), fill = "steelblue2") +
scale_y_continuous(labels = percent_format())
A function for the finding of education given the description:
edu <- function(description){
finalvector <- rep(NA, length(description))
for(i in 1:length(description)){
if(grepl(paste0("\\W","no experience","\\W"),description[i], ignore.case = TRUE)){
finalvector[i] = 0
}
else{
finalvector[i] = 5
}
if(grepl(paste0("\\W","college","\\W"),description[i], ignore.case = TRUE)){
finalvector[i] = 1
}
if(grepl(paste0("\\W","Bachelor"),description[i], ignore.case = TRUE)){
finalvector[i] = 2
}
if(grepl(paste0("\\W","Master"),description[i], ignore.case = TRUE)){
finalvector[i] = 3
}
if(grepl(paste0("\\W","PHD","\\W"),description[i],ignore.case = TRUE)|grepl(paste0("\\W","Doctoral","\\W"),description[i],ignore.case = TRUE)){
finalvector[i] = 4
}
}
return(finalvector)
}
build the EDU column:
glassedu <- edu(glassdoor$Job.Description)
indeededu <- edu(indeed$Description)
max.edu.req <- data.frame(edu = append(glassedu,indeededu))
table(max.edu.req$edu)
##
## 0 1 2 3 4 5
## 5 275 5102 3262 1779 7674
a dataframe of educational achievement in the same format:
survey.edu.req <- data.frame(edu = factor(industry_survey$Q4))
#levels(survey.edu.req$edu)
survey.edu.req$edu2 <- ifelse(survey.edu.req$edu == "No formal education past high school",0,
ifelse(survey.edu.req$edu == ""|survey.edu.req$edu == "I prefer not to answer",5,
ifelse(survey.edu.req$edu == "Some college/university study without earning a bachelor’s degree",1,
ifelse(survey.edu.req$edu == "Bachelor’s degree",2,
ifelse(survey.edu.req$edu == "Master’s degree",3,
ifelse(survey.edu.req$edu == "Doctoral degree"|survey.edu.req$edu == "Professional degree",4,5))))))
And the table:
table(survey.edu.req$edu2)
##
## 0 1 2 3 4 5
## 233 837 5993 8549 3378 728
indeed$remote <- ifelse(grepl(paste0("Remote"),indeed$Description,ignore.case = TRUE),1,0)
glassdoor$remote <- ifelse(grepl(paste0("Remote"),glassdoor$Job.Description,ignore.case = TRUE),1,0)
table(indeed$remote)
##
## 0 1
## 5576 139
table(glassdoor$remote)
##
## 0 1
## 11119 1263
remote.porp <- data.frame(remote = append(glassdoor$remote,indeed$remote))
remote.porp$id <- ifelse(remote.porp$remote == 1,"Remote","Unspecified")
remote.porp <- remote.porp %>%
group_by(id) %>%
count()
bp<- ggplot(remote.porp, aes(x="", y=n, fill=id))+
geom_bar(width = 1, stat = "identity")+
coord_polar("y", start=0)
bp
and the plots:
ggplot(survey.edu.req, aes(edu2)) +
geom_bar(aes(y=..prop.., group = 1), fill = "steelblue2") +
scale_y_continuous(labels = percent_format())+
coord_flip()+
ylim(0,.5)
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
ggplot(max.edu.req, aes(x = edu)) +
geom_bar(aes(y=..prop.., group = 1), fill = "steelblue2") +
scale_y_continuous(labels = percent_format())+
coord_flip()+
ylim(.5,0)
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
Finally lets look at the demographics in the data sciecne community:
age <- data.frame(age = as.factor(industry_survey[-1,2])) %>% group_by(age) %>% summarize(n = n())
## `summarise()` ungrouping output (override with `.groups` argument)
bp<- ggplot(age, aes(x=age, y=n))+
geom_bar(stat = "identity")
bp
sex
sex <- data.frame(sex = as.factor(industry_survey[-1,3])) %>% group_by(sex) %>% summarize(n = n())
## `summarise()` ungrouping output (override with `.groups` argument)
bp <- ggplot(sex, aes(x="", y=n, fill=sex))+
geom_bar(width = 1, stat = "identity")+
coord_polar("y", start=0)
bp
lets look at location next:
#summary(as.factor(glassdoor$Location))
glassdoor$State <- sapply(glassdoor$Location, function(x) strsplit(x, ", ")[[1]][2])
glassdoor$State <- ifelse(glassdoor$State == "Arapahoe", "CO",glassdoor$State)
summary(as.factor(glassdoor$State))
## AZ CA CO DE FL
## 931 3309 96 74 224
## GA IL IN KS Los Angeles
## 4 1055 23 3 1
## NC NJ NY OH PA
## 90 231 1100 212 886
## SC TX United Kingdom UT VA
## 3 4001 4 33 48
## WA
## 54
statemap <- data.frame(state = append(glassdoor$State,indeed$Location))
statems <- data.frame(state = c("MS","AK"),n = c(0,0))
statemap <- count(statemap, state)
statemap <- bind_rows(statemap,statems)
and filter out the non states:
anywhere <- statemap %>% filter((state %in% c('USA','REMOTE')))
statemap <- statemap %>% filter(!(state %in% c('USA','REMOTE','Los Angeles','United Kingdom','')))
state.pop <- read.csv("states pop 2019.csv")
names(state.pop)
## [1] "ï..State" "X2019"
state.pop <- left_join(state.pop,statemap,by = c("ï..State" = "state")) %>% mutate(porp = n/X2019*100)
#included but not shown as it will fill with loading symbols
#states <- states(cb=T)
states_merged <- geo_join(states, statemap, "STUSPS", "state")
pal <- colorNumeric("Greens", domain=states_merged$n)
popup_sb <- paste0("Total: ", as.character(states_merged$n))
leaflet() %>%
addProviderTiles("CartoDB.Positron") %>%
setView(-98.483330, 38.712046, zoom = 4) %>%
addPolygons(data = states_merged ,
fillColor = ~pal(states_merged$n),
fillOpacity = 0.7,
weight = 0.2,
smoothFactor = 0.2,
popup = ~popup_sb) %>%
addLegend(pal = pal,
values = states_merged$n,
position = "bottomright",
title = "Jobs")