library(tidyverse)
library(wordcloud)
library(RColorBrewer)
library(leaflet)
## Warning: package 'leaflet' was built under R version 4.0.4
library(tigris)
## Warning: package 'tigris' was built under R version 4.0.5
library(stringr)
library(scales)

Load Data:

glassdoor <- read.csv("Glass_door_DataAnalyst.csv")
glassdoorde <- read.csv("Glass_door_DataEngineer.csv")
glassdoords <- read.csv("Glass_door_DataScientist.csv")
glassdoorba <- read.csv("Glass_door_BusinessAnalyst.csv")
indeed <- read.csv("indeed_job_dataset.csv")
industry_survey <- read.csv("multiple_choice_responses.csv")

For this we’ll want to explore the data and answer specific questions:

here we’ll bind the extra datasets:

glassdoor <- glassdoor[,-1]
glassdoorba <- glassdoorba[,-c(1,2)]
glassdoords <- glassdoords[,-c(1,2)]
glassdoorba$Rating <- as.numeric(glassdoorba$Rating)
## Warning: NAs introduced by coercion
glassdoorba <- glassdoorba %>% filter(!is.na(Rating))
glassdoorba$Founded <- as.numeric(glassdoorba$Founded)
glassdoorba <- glassdoorba %>% filter(!is.na(Founded))

glassdoords$Rating <- as.numeric(glassdoords$Rating)
glassdoords <- glassdoords %>% filter(!is.na(Rating))
glassdoords$Founded <- as.numeric(glassdoords$Founded)
glassdoords <- glassdoords%>% filter(!is.na(Founded))

check the columns are the same:

all(names(glassdoor) == names(glassdoorde))
all(names(glassdoorde) == names(glassdoords))
all(names(glassdoords) == names(glassdoorba))
all(names(glassdoorba) == names(glassdoor))

all(glassdoor %>% summarise_all(class) == glassdoorba %>% summarise_all(class))
all(glassdoorba %>% summarise_all(class) == glassdoorde %>% summarise_all(class))
all(glassdoorde %>% summarise_all(class) == glassdoords %>% summarise_all(class))
all(glassdoords %>% summarise_all(class) == glassdoor %>% summarise_all(class))

add column descriptor:

glassdoor <- glassdoor %>% add_column(searchpram = "DA")
glassdoorde <- glassdoorde %>% add_column(searchpram = "DE")
glassdoords <- glassdoords %>% add_column(searchpram = "DS")
glassdoorba <- glassdoorba %>% add_column(searchpram = "BA")

bind all the rows:

glassdoor <- bind_rows(glassdoor,glassdoorde,glassdoords,glassdoorba)

How much do data scientists make:

low <- str_extract(glassdoor$Salary.Estimate, "\\$\\d*K")
high <- str_extract(glassdoor$Salary.Estimate, "-\\$\\d*K")
low <- extract_numeric(low)
## extract_numeric() is deprecated: please use readr::parse_number() instead
high <- extract_numeric(high)*-1
## extract_numeric() is deprecated: please use readr::parse_number() instead
middleglass <- (high+low)/2

glassdoor jobs:

hist(middleglass)

look at earnings in survey:

earnings_survey <- industry_survey[-1,21] %>%
  str_remove_all("[$,]")
survey_low <- str_extract(earnings_survey, "\\d*-") %>% str_remove_all("[-]")
survey_high <- str_extract(earnings_survey, "-\\d*") %>% str_remove_all("[-]")
survey_mid <- data.frame(earn = (as.numeric(survey_low)+as.numeric(survey_high))/2000, role = as.factor(industry_survey[-1,7]), country =  as.factor(industry_survey[-1,5])) 
  
survey_mid <- subset(survey_mid,!(role %in% c("","Student","Research Scientist","Not employed","Other","Statistician"))) %>% filter( country == "United States of America")
levels(survey_mid$role)
##  [1] ""                        "Business Analyst"       
##  [3] "Data Analyst"            "Data Engineer"          
##  [5] "Data Scientist"          "DBA/Database Engineer"  
##  [7] "Not employed"            "Other"                  
##  [9] "Product/Project Manager" "Research Scientist"     
## [11] "Software Engineer"       "Statistician"           
## [13] "Student"
hist(survey_mid$earn)

And now ggplot histogram for current job holders:

meanearn <- mean(survey_mid$earn,na.rm = TRUE)
medianearn <- median(survey_mid$earn,na.rm = TRUE)
ggplot(survey_mid, aes(x=earn)) +
geom_histogram(aes(y=..density..), position="identity", alpha=0.5)+
geom_density(adjust = 2)+
geom_vline(aes(xintercept = meanearn),color ="blue")+
geom_vline(aes(xintercept = medianearn),color ="green")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 215 rows containing non-finite values (stat_bin).
## Warning: Removed 215 rows containing non-finite values (stat_density).

indeed jobs:

summary(as.factor(indeed$Queried_Salary))
##        <80000       >160000 100000-119999 120000-139999 140000-159999 
##           788           415          1394          1292           873 
##   80000-99999 
##           953
middleindeed <- indeed$Queried_Salary
middleindeed <- ifelse(middleindeed == "<80000",80000,
                  ifelse(middleindeed == ">160000",160000,     
                    ifelse(middleindeed == "100000-119999",109999.5,
                      ifelse(middleindeed == "120000-139999",129999.5,
                        ifelse(middleindeed == "140000-159999",149999.5,
                          ifelse(middleindeed == "80000-99999",89999.5,0))))))
hist(middleindeed)

And now ggplot histogram for posted jobs:

jobearn <- data.frame(earn = append(middleglass,middleindeed/1000))
meanearn <- mean(jobearn$earn,na.rm = TRUE)
medianearn <- median(jobearn$earn,na.rm = TRUE)
ggplot(jobearn, aes(x=earn)) +
geom_histogram(aes(y=..density..), position="identity", alpha=0.5)+
geom_density(adjust = 4)+
geom_vline(aes(xintercept = meanearn),color ="blue")+
geom_vline(aes(xintercept = medianearn),color ="green") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 22 rows containing non-finite values (stat_bin).
## Warning: Removed 22 rows containing non-finite values (stat_density).

as we can see here the predictions are different between our two datasets

who you’ll work for:

glasstemp <- str_extract(glassdoor$Company.Name, ".*\n")
glassdoorcompanies <- ifelse(is.na(glasstemp),glassdoor$Company.Name,substr(glasstemp,1,nchar(glasstemp)-1))
glassdoor <- glassdoor %>% mutate(company.Name.f = glassdoorcompanies)
wordcompanies <- append(indeed$Company,glassdoorcompanies)
wordcompaniesdf <- data.frame(wordcompanies)
wordcompaniesdf <- count(wordcompaniesdf, wordcompanies)
wordcompaniesdf <- wordcompaniesdf[-1,]
#Take a look at the top ten:
head(wordcompaniesdf[order(-wordcompaniesdf$n),], 10)

Lets take this and get the company types from it:

wordcompaniesdf <- left_join(wordcompaniesdf,indeed,by = c("wordcompanies" = "Company")) %>% select(wordcompanies,n,Company_Industry)%>% distinct()

wordcompaniesdf <- left_join(wordcompaniesdf,glassdoor,by = c("wordcompanies" = "company.Name.f")) %>% select(wordcompanies,n,Company_Industry,Industry,Sector)%>% distinct()
summary(wordcompaniesdf)
##  wordcompanies            n           Company_Industry     Industry        
##  Length:6595        Min.   :  1.000   Length:6595        Length:6595       
##  Class :character   1st Qu.:  1.000   Class :character   Class :character  
##  Mode  :character   Median :  1.000   Mode  :character   Mode  :character  
##                     Mean   :  2.746                                        
##                     3rd Qu.:  3.000                                        
##                     Max.   :242.000                                        
##     Sector         
##  Length:6595       
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

who’s hiring:

wordcloud(words = wordcompaniesdf$wordcompanies, freq = wordcompaniesdf$n, min.freq = 4,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))

It looks like there are a lot of consulting jobs out there. lets look deeper at word companies:

sector <- wordcompaniesdf %>% filter((!is.na(Company_Industry ) & Company_Industry != "")|(!is.na(Sector)&Sector!="-1"))
sector <- sector[,-4]

sector$merge <- ifelse((sector$Company_Industry == ""|is.na(sector$Company_Industry)),sector$Sector,sector$Company_Industry)

sector$clean <- ifelse(sector$merge %in% c("Industrial ManufacturingAgriculture and Extraction","Industrial ManufacturingConstruction","Industrial ManufacturingConsumer Goods and Services"),"Industrial Manufacturing",sector$merge)

sector$clean <- ifelse(sector$merge %in% c("Restaurants, Travel and Leisure","Restaurants, Travel and LeisureConsulting and Business Services"),"Restaurants, Bars & Food Services",sector$clean)

sector$clean <- ifelse(sector$merge == "Aerospace and Defense","Aerospace & Defense",sector$clean)

sector$clean <- ifelse(sector$merge == "Agriculture and Extraction","Agriculture & Forestry",sector$clean)

sector$clean <- ifelse(sector$merge == "Banks and Financial Services","Finance",sector$clean)

sector$clean <- ifelse(sector$merge == "Education and Schools","Education",sector$clean)

sector$clean <- ifelse(sector$merge == "Food and BeveragesConsulting and Business Services","Food and Beverages",sector$clean)

sector$clean <- ifelse(sector$merge == "InsuranceHealth Care","Insurance",sector$clean)

sector$clean <- ifelse(sector$merge == "Construction, Repair & Maintenance","Construction",sector$clean)

sector$clean <- ifelse(sector$merge == "Industrial Manufacturing","Manufacturing",sector$clean)

sector$clean <- ifelse(sector$merge == "Energy and Utilities","Oil, Gas, Energy & Utilities",sector$clean)

sector$clean <- ifelse(sector$merge == "Media, News and Publishing" ,"Media",sector$clean)

sector$clean <- ifelse(sector$merge == "Consumer Goods and Services" ,"Consumer Services",sector$clean)

sector$clean <- ifelse(sector$merge == "RetailConsumer Goods and Services" ,"Consumer Services",sector$clean)

sector$clean <- ifelse(sector$merge == "Restaurants, Bars & Food Services" ,"Food and Beverages",sector$clean)

sector$clean <- ifelse(sector$merge == "Internet and Software" ,"Information Technology",sector$clean)
sector$clean <- ifelse(sector$merge == "Computers and Electronics" ,"Information Technology",sector$clean)

sector$clean <- ifelse(sector$merge == "Transport and Freight" ,"Transportation & Logistics",sector$clean)

sector$clean <- ifelse(sector$merge == "Arts, Entertainment & Recreation" ,"Media",sector$clean)

sector$clean <- ifelse(sector$merge == "Pharmaceuticals" ,"Biotech & Pharmaceuticals",sector$clean)

sector$clean <- ifelse(sector$merge == "Business Services" ,"Consulting and Business Services",sector$clean)

sector$clean <- ifelse(sector$merge == "Industrial Manufacturing" ,"Manufacturing",sector$clean)

sector$clean <- ifelse(sector$merge == "Real EstateReal Estate" ,"Real Estate",sector$clean)

sector$clean <- ifelse(sector$merge == "Industrial Manufacturing" ,"Manufacturing",sector$clean)

sector$merge <- as.factor(sector$merge)
sector$clean <- as.factor(sector$clean)

and the agregate:

jobs.by.sector <- aggregate(sector$n, by=list(Category=sector$clean), FUN=sum)
jobs.by.sector <- arrange(jobs.by.sector,desc(x))
jobs.by.sector.top10 <- head(jobs.by.sector,9)
jobs.by.sector.rest <- tail(jobs.by.sector,20)
jobs.by.sector.top10 <- bind_rows(jobs.by.sector.top10,tibble(Category = "Other",x = sum(jobs.by.sector.rest$x)))
#jobs.by.sector.top10
ggplot(jobs.by.sector.top10, aes(x=Category,y=x)) + 
    geom_bar(stat="identity",fill = "orangered1")+
  scale_x_discrete(limits=jobs.by.sector.top10$Category)+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

ok looking at indeed skills next then using them as searches inside the descriptions of glass door postings:

allskills <- function(column){
  bigvector <- c()
  for(i in 1:length(column)){
    index <- column[i]
    temp <- unlist(str_extract_all(index,"\\'(.*?)\\'"))
    temp <- str_remove_all(temp, "\\'")
    bigvector <- append(bigvector,temp)
  }

dfsum <- as.data.frame(table(bigvector))
return(dfsum)
}

test function and build our model data set:

indeednumbers <- allskills(indeed$Skill)
indeednumbers <- indeednumbers[order(-indeednumbers$Freq),]

now we’re going to use the skills in indeed to extract the skills in glassdoor:

glassskill <- function(column,targetcolumn){
  #going to have to bee a double loop here
  emptytarget <- data.frame(bigvector = targetcolumn,Freq = 0)
  emptytarget$bigvector <- as.character(emptytarget$bigvector)
  #print(class(emptytarget$Freq))
  for(i in 1:length(column)){
    indexI <- column[i]
    #print(indexI)
    for(j in 1:length(targetcolumn)){
    indexJ <- targetcolumn[j]
    #print(indexJ)
    boolean <- str_detect(paste(indexI), paste0("\\W",indexJ,"\\W"))
    #print(boolean)
    if(boolean == TRUE){
      emptytarget[j,2] <- as.numeric(emptytarget[j,2]+1)
    }
    }
  }
  return(emptytarget)
}

short test

glassskills <- glassskill(glassdoor$Job.Description,indeednumbers$bigvector)
write.csv(glassskills,"glassskills.csv")
glassskills <- read.csv("glassskills.csv")

make a word cloud of the skills:

str(glassskills)
## 'data.frame':    464 obs. of  3 variables:
##  $ X        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ bigvector: chr  "Python" "SQL" "Machine Learning" "R" ...
##  $ Freq     : int  4235 5551 720 2223 1410 1675 1664 1972 73 803 ...
#combine dataframes:
combined_skills <- indeednumbers %>% mutate(Freq = indeednumbers[,2] + glassskills[,3]) 

the word cloud:

skills_cloud <- wordcloud(words = combined_skills$bigvector, freq = combined_skills$Freq, min.freq = 200, max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))

big or small:

indeed_b_s <- data.frame( bs = indeed$Company_Employees) %>% filter(bs != "")
table(indeed_b_s$bs)
## 
##          10,000+ Less than 10,000 
##             2004             1195
glass_b_s <- data.frame( bs = glassdoor$Size) %>% filter(bs != "" & bs != "Unknown" & bs != "-1" )
table(glass_b_s$bs)
## 
##       1 to 50 employees        10000+ employees  1001 to 5000 employees 
##                    1734                    2642                    1936 
##    201 to 500 employees 5001 to 10000 employees   501 to 1000 employees 
##                    1314                     655                    1053 
##     51 to 200 employees 
##                    2082
glass_b_s$bs <- factor(glass_b_s$bs, c("1 to 50 employees", "51 to 200 employees", "201 to 500 employees","501 to 1000 employees","1001 to 5000 employees","5001 to 10000 employees","10000+ employees"))
#str(glass_b_s$bs)
#will use glassdoor alone for this as indeed's info is sparse
ggplot(glass_b_s, aes(bs)) + 
    geom_bar(aes(y=..prop.., group = 1), fill =  "steelblue2") + 
    scale_y_continuous(labels = percent_format())

A function for the finding of education given the description:

edu <- function(description){
  finalvector <- rep(NA, length(description))
  for(i in 1:length(description)){
    if(grepl(paste0("\\W","no experience","\\W"),description[i], ignore.case = TRUE)){
    finalvector[i] = 0
    }
    else{
      finalvector[i] = 5
    }
    if(grepl(paste0("\\W","college","\\W"),description[i], ignore.case = TRUE)){
    finalvector[i] = 1
    }
    if(grepl(paste0("\\W","Bachelor"),description[i], ignore.case = TRUE)){
    finalvector[i] = 2
    }
    if(grepl(paste0("\\W","Master"),description[i], ignore.case = TRUE)){
    finalvector[i] = 3 
    }
    if(grepl(paste0("\\W","PHD","\\W"),description[i],ignore.case = TRUE)|grepl(paste0("\\W","Doctoral","\\W"),description[i],ignore.case = TRUE)){
    finalvector[i] = 4
    }
  }
  return(finalvector)
}

build the EDU column:

glassedu <- edu(glassdoor$Job.Description)
indeededu <- edu(indeed$Description)
max.edu.req <- data.frame(edu = append(glassedu,indeededu))
table(max.edu.req$edu)
## 
##    0    1    2    3    4    5 
##    5  275 5102 3262 1779 7674

a dataframe of educational achievement in the same format:

survey.edu.req <- data.frame(edu = factor(industry_survey$Q4))
#levels(survey.edu.req$edu)
survey.edu.req$edu2 <- ifelse(survey.edu.req$edu == "No formal education past high school",0,
                      ifelse(survey.edu.req$edu == ""|survey.edu.req$edu == "I prefer not to answer",5,
                      ifelse(survey.edu.req$edu == "Some college/university study without earning a bachelor’s degree",1,
                      ifelse(survey.edu.req$edu == "Bachelor’s degree",2,
                      ifelse(survey.edu.req$edu == "Master’s degree",3,
                      ifelse(survey.edu.req$edu == "Doctoral degree"|survey.edu.req$edu == "Professional degree",4,5))))))

And the table:

table(survey.edu.req$edu2)
## 
##    0    1    2    3    4    5 
##  233  837 5993 8549 3378  728
indeed$remote <- ifelse(grepl(paste0("Remote"),indeed$Description,ignore.case = TRUE),1,0)
glassdoor$remote <- ifelse(grepl(paste0("Remote"),glassdoor$Job.Description,ignore.case = TRUE),1,0)
table(indeed$remote)
## 
##    0    1 
## 5576  139
table(glassdoor$remote)
## 
##     0     1 
## 11119  1263
remote.porp <- data.frame(remote = append(glassdoor$remote,indeed$remote))
remote.porp$id <- ifelse(remote.porp$remote == 1,"Remote","Unspecified")

remote.porp <-   remote.porp %>%
  group_by(id) %>% 
  count()
bp<- ggplot(remote.porp, aes(x="", y=n, fill=id))+
geom_bar(width = 1, stat = "identity")+
  coord_polar("y", start=0)
bp

and the plots:

ggplot(survey.edu.req, aes(edu2)) + 
    geom_bar(aes(y=..prop.., group = 1), fill =  "steelblue2") + 
    scale_y_continuous(labels = percent_format())+
    coord_flip()+
    ylim(0,.5)
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.

ggplot(max.edu.req, aes(x = edu)) + 
    geom_bar(aes(y=..prop.., group = 1), fill =  "steelblue2") + 
    scale_y_continuous(labels = percent_format())+
    coord_flip()+
    ylim(.5,0)
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.

Finally lets look at the demographics in the data sciecne community:

age <- data.frame(age = as.factor(industry_survey[-1,2])) %>% group_by(age) %>% summarize(n = n())
## `summarise()` ungrouping output (override with `.groups` argument)
bp<- ggplot(age, aes(x=age, y=n))+
geom_bar(stat = "identity")
bp

sex

sex <- data.frame(sex = as.factor(industry_survey[-1,3])) %>% group_by(sex) %>% summarize(n = n())
## `summarise()` ungrouping output (override with `.groups` argument)
bp <- ggplot(sex, aes(x="", y=n, fill=sex))+
geom_bar(width = 1, stat = "identity")+
  coord_polar("y", start=0)
bp

lets look at location next:

#summary(as.factor(glassdoor$Location))
glassdoor$State <- sapply(glassdoor$Location, function(x) strsplit(x, ", ")[[1]][2])
glassdoor$State <- ifelse(glassdoor$State == "Arapahoe", "CO",glassdoor$State)
summary(as.factor(glassdoor$State))
##             AZ             CA             CO             DE             FL 
##            931           3309             96             74            224 
##             GA             IL             IN             KS    Los Angeles 
##              4           1055             23              3              1 
##             NC             NJ             NY             OH             PA 
##             90            231           1100            212            886 
##             SC             TX United Kingdom             UT             VA 
##              3           4001              4             33             48 
##             WA 
##             54
statemap <- data.frame(state = append(glassdoor$State,indeed$Location))
statems <- data.frame(state = c("MS","AK"),n = c(0,0))
statemap <- count(statemap, state)
statemap <- bind_rows(statemap,statems)

and filter out the non states:

anywhere <- statemap %>% filter((state %in% c('USA','REMOTE')))
statemap <- statemap %>% filter(!(state %in% c('USA','REMOTE','Los Angeles','United Kingdom','')))
state.pop <- read.csv("states pop 2019.csv")
names(state.pop)
## [1] "ï..State" "X2019"
state.pop <- left_join(state.pop,statemap,by = c("ï..State" = "state")) %>% mutate(porp = n/X2019*100)
#included but not shown as it will fill with loading symbols
#states <- states(cb=T)
states_merged <- geo_join(states, statemap, "STUSPS", "state")
pal <- colorNumeric("Greens", domain=states_merged$n)
popup_sb <- paste0("Total: ", as.character(states_merged$n))

leaflet() %>%
  addProviderTiles("CartoDB.Positron") %>%
  setView(-98.483330, 38.712046, zoom = 4) %>% 
  addPolygons(data = states_merged , 
              fillColor = ~pal(states_merged$n), 
              fillOpacity = 0.7, 
              weight = 0.2, 
              smoothFactor = 0.2, 
              popup = ~popup_sb) %>%
  addLegend(pal = pal, 
            values = states_merged$n, 
            position = "bottomright", 
            title = "Jobs")