Clean New York and San Francisco processed resumes and extract skills

Heather Geiger - March 21, 2018

Load libraries.

library(stringr)    #For string operations
library(rvest)      #For web scraping
## Loading required package: xml2
## Warning: package 'xml2' was built under R version 3.4.3
library(tokenizers) #For tokenizing text
library(tidyverse)  #For the core tidyverse packages (ggplot2, dplyr, tidyr, etc.)
## ── Attaching packages ─────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1     ✔ readr   1.1.1
## ✔ tibble  1.4.2     ✔ purrr   0.2.4
## ✔ tidyr   0.7.2     ✔ dplyr   0.7.4
## ✔ ggplot2 2.2.1     ✔ forcats 0.3.0
## Warning: package 'tibble' was built under R version 3.4.3
## Warning: package 'forcats' was built under R version 3.4.3
## ── Conflicts ────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()         masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag()            masks stats::lag()
## ✖ purrr::pluck()          masks rvest::pluck()
library(RCurl)      #For HTTP requests and downloads
## Warning: package 'RCurl' was built under R version 3.4.3
## Loading required package: bitops
## 
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
## 
##     complete
library(dplyr)      #For manipulating data frames
library(DT)         #For interactive data tables
## Warning: package 'DT' was built under R version 3.4.3
library(curl)       #For curl-based downloads
## Warning: package 'curl' was built under R version 3.4.3
## 
## Attaching package: 'curl'
## The following object is masked from 'package:readr':
## 
##     parse_date
library(RJSONIO)    #For reading and writing JSON

Load Rdata files.

Load both Rdata files from the previous step.

First, load the file for New York, then the one for San Francisco. Delete everything except the three objects we really need, which are:

  1. job_titles_and_descriptions_across_resumes (data frame)
  2. skills_per_resume (data frame)
  3. executive_summaries (vector)

Next, we will want to combine all of this information into one long format data frame.

The combined data frame will have four columns:

  1. City
  2. Resume.num
  3. Resume.section - whether the item is a job title, job description, skill, or executive summary
  4. Text - the actual text value

load("resumes_processed.Rdata")
rm(list=setdiff(ls(),c("job_titles_and_descriptions_across_resumes","skills_per_resume","executive_summaries")))
ls()
## [1] "executive_summaries"                       
## [2] "job_titles_and_descriptions_across_resumes"
## [3] "skills_per_resume"
city <- "New York"

job_titles_and_descriptions_across_resumes <- gather(job_titles_and_descriptions_across_resumes,Resume.section,Text,-Resume.num)
job_titles_and_descriptions_across_resumes <- data.frame(City = city,job_titles_and_descriptions_across_resumes,stringsAsFactors=FALSE)
#head(job_titles_and_descriptions_across_resumes);tail(job_titles_and_descriptions_across_resumes)
skills_per_resume <- data.frame(City = city,Resume.num = skills_per_resume$Resume.num,
                Resume.section = "Skills",
                Text = skills_per_resume$Skill,stringsAsFactors=FALSE)
#head(skills_per_resume);tail(skills_per_resume)
executive_summaries <- data.frame(City = city,
            Resume.num = setdiff(1:1000,c(185,200,763,786,795,815,66,111,213,290,294,505,627,837)), #Exclude resume numbers not present in executive_summaries (dropped in the previous step)
            Resume.section = "Executive.summary",
            Text = executive_summaries,stringsAsFactors=FALSE)
#head(executive_summaries);tail(executive_summaries)

new_york_resumes <- rbind(job_titles_and_descriptions_across_resumes,skills_per_resume,executive_summaries)
new_york_resumes <- new_york_resumes %>% arrange(Resume.num)

#head(new_york_resumes);tail(new_york_resumes)

rm(list=setdiff(ls(),"new_york_resumes"))
ls()
## [1] "new_york_resumes"
load("resumes_processed_san_francisco.Rdata")
rm(list=setdiff(ls(),c("job_titles_and_descriptions_across_resumes","skills_per_resume","executive_summaries","valid_json","descriptions_for_every_job","new_york_resumes")))
ls()
## [1] "descriptions_for_every_job"                
## [2] "executive_summaries"                       
## [3] "job_titles_and_descriptions_across_resumes"
## [4] "new_york_resumes"                          
## [5] "skills_per_resume"                         
## [6] "valid_json"
city <- "San Francisco"

job_titles_and_descriptions_across_resumes <- gather(job_titles_and_descriptions_across_resumes,Resume.section,Text,-Resume.num)        
job_titles_and_descriptions_across_resumes <- data.frame(City = city,job_titles_and_descriptions_across_resumes,stringsAsFactors=FALSE)
skills_per_resume <- data.frame(City = city,Resume.num = skills_per_resume$Resume.num,
                                Resume.section = "Skills",
                                Text = skills_per_resume$Skill,stringsAsFactors=FALSE)
executive_summaries <- data.frame(City = city,
                Resume.num = which(valid_json == TRUE & descriptions_for_every_job == TRUE),
                Resume.section = "Executive.summary",
                Text = executive_summaries,stringsAsFactors=FALSE)
head(executive_summaries);tail(executive_summaries)
##            City Resume.num    Resume.section
## 1 San Francisco          1 Executive.summary
## 2 San Francisco          2 Executive.summary
## 3 San Francisco          3 Executive.summary
## 4 San Francisco          4 Executive.summary
## 5 San Francisco          5 Executive.summary
## 6 San Francisco          6 Executive.summary
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     Text
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
## 2 • 8+ years of Data Science experience building interpretable machine learning models, and building end to end data pipelines which included extracting, transforming and combine all incoming data with the goal of discovering hidden insight, with an eye to improve business processes, address business problems or result in cost savings\\n\\n• Experience working with large data and metadata sources; interpret and communicate insights and findings from analysis and experiments to both technical and non-technical audiences in ad, service, and business\\n\\n• Expert knowledge in breadth of machine learning algorithms and love to find the best approach to a specific problem. Implemented several supervised and unsupervised learning algorithms such as Ensemble Methods (Random forests), Logistic Regression, Regularized Linear Regression, SVMs, Deep Neural Networks, Extreme Gradient Boosting, Decision Trees, KMeans, Gaussian Mixture Models, Hierarchical models, and time series models (ARIMA, GARCH, VARCH etc.)\\n\\n• Experience with applied statistical techniques and machine learning, including Bayesian methods, time-series modeling, classification, regression, mixture models, clustering, dimensionality reduction, model selection, feature extraction, experimental design, and choice modeling\\n\\n• Led independent research and experimentation of new methodologies to discover insights, improvements for problems. Delivered findings and actionable results to management team through data visualization, presentation, or training sessions. Proactively involved in roadmap discussions, data science initiatives and the optimal approach to apply the underlying algorithms\\n\\n• Fluent and well-versed writing production quality code in SQL, R, Python, Spark and Scala\\n\\n• Hands on experience building regression, classification, and recommender systems with large datasets in distributed systems and constrained environments\\n\\n• Domain expertise in architecting and building comprehensive analytical solutions in Marketing, Sales and Operations functions across Technology, Retail and Banking industries\\n\\n• Hands on experience communicating business insights by dashboarding in Tableau. Developed automated tableau dashboards that helped evaluate and evolve existing user data strategies, which include user metrics, measurement frameworks, and methods to measurement\\n\\n• Strong track record of contributing to successful end-to-end analytic solutions (clarifying business objectives and hypotheses, communicating project deliverables and timelines, and informing action based on findings)\\n\\n• Developed and deployed dashboards in Tableau and RShiny to identify trends and opportunities, surface actionable insights, and help teams set goals, forecasts and prioritization of initiatives\\n\\n• Experienced in Data Modeling retaining concepts of RDBMS, Logical and Physical Data Modeling until 3NormalForm (3NF) and Multidimensional Data Modeling Schema (Star schema, Snow-Flake Modeling, Facts and dimensions)\\n\\n• Professional working experience in writing spark streaming and spark batch jobs using spark MLlib\\n\\n• Hands on experience in optimizing the SQL Queries and database performance tuning in Oracle, SQL Server and Teradata databases
## 3                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
## 4                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
## 5                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
## 6
##              City Resume.num    Resume.section
## 566 San Francisco        645 Executive.summary
## 567 San Francisco        646 Executive.summary
## 568 San Francisco        647 Executive.summary
## 569 San Francisco        648 Executive.summary
## 570 San Francisco        649 Executive.summary
## 571 San Francisco        650 Executive.summary
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              Text
## 566                                                                                                                                                                                                                                                       • Over 6 years of experience in laboratory environments, including cGMP\\u002FcGLP compliant environments.\\n• Familiar with operations, techniques and equipment used in standard Biologic platforms pertaining to cell culture, purification and fill\\u002Ffinish.\\n• Experience using manufacturing execution system (MES), laboratory information management system (LIMS), system applications products (SAP) and cGMP electronic document management systems for electronic batch record, batch production record, QC sampling results, and standard operating procedure reviews.\\n• Five years of hands-on experience as an assistant researcher in various academic research laboratories, including cell culture and purification techniques. Used mice models and mammalian cell cultures in diabetes studies and enzyme characterization.
## 567                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
## 568                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
## 569                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
## 570 • Understanding of cGMP and aseptic behavior for the manufacture of Biological therapies.\\n• Familiarity with operations, techniques and equipment used in standard Biologic platforms pertaining to cell culture, purification and fill\\u002Ffinish.\\n• Five years of hands-on experience as an assistant researcher in various academic research laboratories, including cell culture and purification techniques.\\n• Experience using manufacturing execution system (MES), laboratory information management system (LIMS), system applications products (SAP) and cGMP electronic document management systems for electronic batch record, batch production record, QC sampling results, and standard operating procedure reviews.\\n• Experience with planning and determining capacity for a genome sequencing facility to ensure proper product turnaround time.\\n• Used mice models and mammalian cell cultures in diabetes studies and enzyme characterization.\\n• Gained experience in next generation sequencing using Illumina 2500 platforms in quality control and library construction.
## 571
san_francisco_resumes <- rbind(job_titles_and_descriptions_across_resumes,skills_per_resume,executive_summaries)
san_francisco_resumes <- san_francisco_resumes %>% arrange(Resume.num)

head(san_francisco_resumes);tail(san_francisco_resumes)
##            City Resume.num    Resume.section
## 1 San Francisco          1         Job.title
## 2 San Francisco          1         Job.title
## 3 San Francisco          1   Job.description
## 4 San Francisco          1   Job.description
## 5 San Francisco          1 Executive.summary
## 6 San Francisco          2         Job.title
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   Text
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       Data Scientist
## 2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       Data Scientist
## 3                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    Developed a deep similarity network using TensorFlow based on VGG and ResNet to solve in-class\\nsimilarity problem from two groups of real world images, tuned the model to reach an f1 score close to 0.85 by using image augmentation and up-sampling.\\n• Built parallel crawlers with proxy network to efficiently crawl 98% of all public listings (about 100\\nmillion pages) from several short-term rental platforms and uploaded data into AWS RDS\\n• Built a a price calculator to recommend the best rental price for Pillow's customers\\n• Use LDA to automatic generate a large set of labeled images from their descriptions\\n• Perform sentiment analysis for review data and rank the reviews
## 4 • Built automatic decision making system for retail stores(Verizon) to make management decisions with location based sensor data, increased the sales of accessories by 30%\\n• Machine learning: Developed Location based Model to predict user behavior over 500,000 minutes of raw\\nsensor data and 400 days of sales data with ENN, achieving AUC 0.72\\n• Feature engineering: Performed detailed feature engineering for data sources from customer's position,\\npath and interactions with sales representatives to address significant factors for transitions.\\nCourses & Projects\\nSelf-Driving Car Engineer Nano Degree - Udacity\\nLearned and deployed code to control a real self-driving car in a test track\\nSkills: Tensorflow, Keras, Yolo, ROS, UKF, Particle Filter, PID, behavior planning, FCNN\\nData Analyst Nano Degree - Udacity\\nData wrangle Open street Maps Data use MongoDB, visualization with D3, A\\u002FB test\\nSkills: Python, R, MongoDB, Machine Learning, A\\u002FB Test, HTML, CSS, D3\\n\\n• Database Management                Created online post forum 'Quill And Inkpot' with MySQL\\n• Operating System                   Implemented Priority Scheduling in Nachos using Java\\n• Distributed System                 Implemented a secure distributed ATM systems with Java\\n• Computer Vision                    Compared image compression algorithm using structural similarity\\n\\nHonors and Awards\\nOct. 2009          Champion, RoboGame Competition, USTC (rank 1)\\nSep. 2007          First Prize, Chinese Physics Olympiad (provincial level), China (top 0.1%))
## 5                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     
## 6                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       Data Scientist
##               City Resume.num    Resume.section
## 8795 San Francisco        650   Job.description
## 8796 San Francisco        650   Job.description
## 8797 San Francisco        650   Job.description
## 8798 San Francisco        650   Job.description
## 8799 San Francisco        650   Job.description
## 8800 San Francisco        650 Executive.summary
##                                                                                                                                                                                                                                                                                                                                                       Text
## 8795 • Generated several Bayesian models of attention with Monte Carlo simulations\\n• Programmed and ran experiments using visual displays, eye-tracking and psychophysics\\n• Generated code for image processing of several hundred images\\n• Co-supervision of undergraduate students (5-8 per year) and graduate students\\n• ANOVA\\u002FRegression
## 8796                                                                                                                • Generated a Bayesian model of attention with Monte Carlo simulations\\n• Generated code for image processing of medical images (mammograms)\\n• Programmed and ran experiments using visual displays, eye-tracking and psychophysics
## 8797                                                                                                                                                                                                                  • Developed code for running experiments, data processing\\n• Ran experiments using visual displays, eye-tracking, and psychophysics
## 8798                                                                                                                                                                                                                         • Developed code for running experiments, data processing\\n• Developed code to calculate color coordinates of surface colors
## 8799                                                                                                                                                                 • Manager of laboratory facilities, including programming and statistical analyses\\n• Developed code for reading in and cleaning lung volume and EKG data, including graphical input
## 8800
rm(list=setdiff(ls(),c("new_york_resumes","san_francisco_resumes")))
ls()
## [1] "new_york_resumes"      "san_francisco_resumes"

Now we can combine new_york_resumes and san_francisco_resumes and proceed with clean-up and analysis.

resumes_across_cities <- rbind(new_york_resumes,san_francisco_resumes)
rm(new_york_resumes);rm(san_francisco_resumes)

resumes_across_cities <- resumes_across_cities %>% arrange(Resume.num,City)

Replace special encoding “\u002F” with a “/”.

resumes_across_cities$Text <- str_replace_all(resumes_across_cities$Text,pattern='\\\\u002F',replace='/')
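
To make the escaping explicit: the four backslashes in the R source become two characters in the pattern string, which the regex engine reads as a single literal backslash followed by u002F. A quick check on a made-up string should return the slash-separated form:

str_replace_all("fill\\u002Ffinish",pattern='\\\\u002F',replace='/')
## [1] "fill/finish"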

Now, for each resume (unique combination of City and Resume.num), we want to check for the occurrence of various strings.

Read in a table with the skill heading plus synonyms.

All in lowercase.
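
The exact contents of keywords.txt are not reproduced here. Based on how the file is read and used below (and on the Type column that appears in later output), a few illustrative tab-separated rows might look like the following. The column order and the example rows are hypothetical, except that “led” and “leading” really are synonyms used for the leadership skill:

Skill               Type        Synonyms       Other.notes
leadership          soft        led,leading    None
machine learning    technical   None           None
python              technical   None           None

Rows whose Other.notes field contains “This is probably too tough” are dropped before the keyword list is built.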

We should require these to be bounded on either side by a word boundary (which covers the start/end of the string and adjacent punctuation), or by whitespace or a comma.

Start by getting a list of keywords, with a vector of keywords per skill.

keywords <- read.table("keywords.txt",header=TRUE,check.names=FALSE,stringsAsFactors=FALSE,sep="\t")
keywords <- keywords[grep('This is probably too tough',keywords$Other.notes,invert=TRUE),]

keyword_list <- vector("list",length=nrow(keywords))

for(i in 1:nrow(keywords))
{
    keywords_this_row <- keywords$Skill[i]
    if(keywords$Synonyms[i] != "None"){
        keywords_this_row <- c(keywords_this_row,unlist(strsplit(keywords$Synonyms[i],",")[[1]]))
    }
    keyword_list[[i]] <- keywords_this_row
}

Write a function that builds a pattern for a single keyword, requiring a word boundary, comma, or space on each side.

Then, if a skill has multiple keywords, paste the individual patterns together with a pipe.

Finally, run this function for every item in keyword_list.

#Couldn't figure out a single regex class covering a space, comma, or word boundary. Instead, combine separate space-or-comma and word-boundary patterns below.
space_or_comma <- "[[:space:],]"
word_boundary <- "\\b"

pattern_for_one_keyword <- function(keyword){
    regexes <- paste0(space_or_comma,keyword,space_or_comma)
    regexes <- c(regexes,paste0(word_boundary,keyword,word_boundary))
    regexes <- c(regexes,paste0(word_boundary,keyword,space_or_comma))
    regexes <- c(regexes,paste0(space_or_comma,keyword,word_boundary))
    return(paste0(regexes,collapse="|"))
}

pattern_for_multiple_keywords <- function(keyword_vector){
    if(length(keyword_vector) == 1){return(pattern_for_one_keyword(keyword_vector))}
    if(length(keyword_vector) > 1){
        individual_regexes <- c()
        for(i in 1:length(keyword_vector))
        {
            individual_regexes <- c(individual_regexes,pattern_for_one_keyword(keyword_vector[i]))
        }
        return(paste0(individual_regexes,collapse="|"))
    }
}

keyword_regexes <- unlist(lapply(keyword_list,function(x)pattern_for_multiple_keywords(x)))
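
As a quick check of what these helpers produce, calling the single-keyword version on “sql” (chosen purely as an illustration) should print something like the four bounded forms joined by pipes:

pattern_for_one_keyword("sql")
## [1] "[[:space:],]sql[[:space:],]|\\bsql\\b|\\bsql[[:space:],]|[[:space:],]sql\\b"

One caveat: keywords containing regex metacharacters (for example “c++” or “d3.js”) are inserted into the patterns unescaped, so they are interpreted as regular expressions rather than literal strings. If we wanted to match them literally, a hypothetical helper (not used for the counts below) could backslash-escape them first:

escape_keyword <- function(keyword){
    #Backslash-escape common regex metacharacters so the keyword matches literally.
    gsub("([][{}()+*^$|\\\\?.])","\\\\\\1",keyword)
}
escape_keyword("c++")
## [1] "c\\+\\+"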

We can now use keyword_regexes along with str_detect to give a TRUE/FALSE value for whether the text contains the pattern.

num_resumes_per_skill <- c()

for(i in 1:length(keyword_regexes))
{
    skill <- keyword_regexes[i]
    skill_in_text <- str_detect(tolower(resumes_across_cities$Text),skill)
    resumes_across_cities_incl_this_skill <- resumes_across_cities[skill_in_text,]
    num_resumes_this_skill <- length(unique(paste0(resumes_across_cities_incl_this_skill$City,resumes_across_cities_incl_this_skill$Resume.num)))
    num_resumes_per_skill <- c(num_resumes_per_skill,num_resumes_this_skill)
    print(paste0("Skill ",keywords$Skill[i]," found in ",num_resumes_this_skill," resumes"))
}
## [1] "Skill algorithm found in 375 resumes"
## [1] "Skill appengine found in 0 resumes"
## [1] "Skill aws found in 264 resumes"
## [1] "Skill big data found in 322 resumes"
## [1] "Skill c++ found in 423 resumes"
## [1] "Skill collaboration found in 405 resumes"
## [1] "Skill communication found in 352 resumes"
## [1] "Skill prediction found in 772 resumes"
## [1] "Skill couchdb found in 0 resumes"
## [1] "Skill creativity found in 65 resumes"
## [1] "Skill critical thinking found in 8 resumes"
## [1] "Skill customer service found in 55 resumes"
## [1] "Skill data manipulation found in 156 resumes"
## [1] "Skill data wrangling found in 26 resumes"
## [1] "Skill data mining found in 479 resumes"
## [1] "Skill d3.js found in 66 resumes"
## [1] "Skill decision making found in 97 resumes"
## [1] "Skill decision tree found in 209 resumes"
## [1] "Skill ecl found in 1 resumes"
## [1] "Skill flare found in 3 resumes"
## [1] "Skill google visualization api found in 0 resumes"
## [1] "Skill hadoop found in 402 resumes"
## [1] "Skill java found in 431 resumes"
## [1] "Skill leadership found in 554 resumes"
## [1] "Skill machine learning found in 800 resumes"
## [1] "Skill matlab found in 313 resumes"
## [1] "Skill microsoft excel found in 71 resumes"
## [1] "Skill mining social media found in 1 resumes"
## [1] "Skill modeling found in 990 resumes"
## [1] "Skill perl found in 65 resumes"
## [1] "Skill powerpoint found in 104 resumes"
## [1] "Skill presentation found in 392 resumes"
## [1] "Skill problem solving found in 54 resumes"
## [1] "Skill python found in 1000 resumes"
## [1] "Skill r found in 873 resumes"
## [1] "Skill raphael.js found in 0 resumes"
## [1] "Skill risk modeling found in 12 resumes"
## [1] "Skill sas found in 384 resumes"
## [1] "Skill scripting languages found in 92 resumes"
## [1] "Skill sql found in 901 resumes"
## [1] "Skill statistics found in 861 resumes"
## [1] "Skill tableau found in 383 resumes"
## [1] "Skill a/b testing found in 84 resumes"
## [1] "Skill data visualization found in 529 resumes"

Let’s remove a few skills that were found in very few resumes.

Then, let’s redo the counts, this time also recording which city each resume comes from.

num_resumes_per_skill <- data.frame(Skill = keywords$Skill,
                Num.resumes = num_resumes_per_skill,
                stringsAsFactors=FALSE)

head(num_resumes_per_skill[order(num_resumes_per_skill$Num.resumes),],n=10)
##                       Skill Num.resumes
## 2                 appengine           0
## 9                   couchdb           0
## 21 google visualization api           0
## 36               raphael.js           0
## 19                      ecl           1
## 28      mining social media           1
## 20                    flare           3
## 11        critical thinking           8
## 37            risk modeling          12
## 14           data wrangling          26

It seems reasonable to look only at skills found in at least 10 resumes.

keywords <- keywords[num_resumes_per_skill$Num.resumes >= 10,]
keyword_regexes <- keyword_regexes[num_resumes_per_skill$Num.resumes >= 10]
## [1] "Skill algorithm found in 375 resumes"
## [1] "Skill aws found in 264 resumes"
## [1] "Skill big data found in 322 resumes"
## [1] "Skill c++ found in 423 resumes"
## [1] "Skill collaboration found in 405 resumes"
## [1] "Skill communication found in 352 resumes"
## [1] "Skill prediction found in 772 resumes"
## [1] "Skill creativity found in 65 resumes"
## [1] "Skill customer service found in 55 resumes"
## [1] "Skill data manipulation found in 156 resumes"
## [1] "Skill data wrangling found in 26 resumes"
## [1] "Skill data mining found in 479 resumes"
## [1] "Skill d3.js found in 66 resumes"
## [1] "Skill decision making found in 97 resumes"
## [1] "Skill decision tree found in 209 resumes"
## [1] "Skill hadoop found in 402 resumes"
## [1] "Skill java found in 431 resumes"
## [1] "Skill leadership found in 554 resumes"
## [1] "Skill machine learning found in 800 resumes"
## [1] "Skill matlab found in 313 resumes"
## [1] "Skill microsoft excel found in 71 resumes"
## [1] "Skill modeling found in 990 resumes"
## [1] "Skill perl found in 65 resumes"
## [1] "Skill powerpoint found in 104 resumes"
## [1] "Skill presentation found in 392 resumes"
## [1] "Skill problem solving found in 54 resumes"
## [1] "Skill python found in 1000 resumes"
## [1] "Skill r found in 873 resumes"
## [1] "Skill risk modeling found in 12 resumes"
## [1] "Skill sas found in 384 resumes"
## [1] "Skill scripting languages found in 92 resumes"
## [1] "Skill sql found in 901 resumes"
## [1] "Skill statistics found in 861 resumes"
## [1] "Skill tableau found in 383 resumes"
## [1] "Skill a/b testing found in 84 resumes"
## [1] "Skill data visualization found in 529 resumes"
## 
##      New York San Francisco 
##            36            36
##       Skill      Type          City Num.resumes
## 1 algorithm technical      New York         242
## 2 algorithm technical San Francisco         133
## 3       aws technical      New York         159
## 4       aws technical San Francisco         105
## 5  big data technical      New York         216
## 6  big data technical San Francisco         106
##                 Skill      Type          City Num.resumes
## 67            tableau technical      New York         252
## 68            tableau technical San Francisco         131
## 69        a/b testing technical      New York          48
## 70        a/b testing technical San Francisco          36
## 71 data visualization technical      New York         369
## 72 data visualization technical San Francisco         160

Finally, add a column Percent.of.resumes.this.city, which expresses the percentage of resumes from each city that contain the skill.

cities_plus_resumes <- resumes_across_cities[!(duplicated(paste0(resumes_across_cities$City,resumes_across_cities$Resume.num))),]

resumes_per_city <- data.frame(table(cities_plus_resumes$City))
resumes_per_city[,1] <- as.vector(resumes_per_city[,1])

colnames(resumes_per_city) <- c("City","Total.resumes.this.city")

num_resumes_per_skill <- merge(num_resumes_per_skill,resumes_per_city,"City") %>% mutate(Percent.of.resumes.this.city = round(Num.resumes*100/Total.resumes.this.city,digits=2))

head(num_resumes_per_skill);tail(num_resumes_per_skill)
##       City             Skill      Type Num.resumes Total.resumes.this.city
## 1 New York         algorithm technical         242                     986
## 2 New York        creativity      soft          47                     986
## 3 New York               aws technical         159                     986
## 4 New York  customer service      soft          31                     986
## 5 New York          big data technical         216                     986
## 6 New York data manipulation technical         108                     986
##   Percent.of.resumes.this.city
## 1                        24.54
## 2                         4.77
## 3                        16.13
## 4                         3.14
## 5                        21.91
## 6                        10.95
##             City              Skill      Type Num.resumes
## 67 San Francisco             python technical         367
## 68 San Francisco            tableau technical         131
## 69 San Francisco                  r technical         304
## 70 San Francisco        a/b testing technical          36
## 71 San Francisco      risk modeling technical           4
## 72 San Francisco data visualization technical         160
##    Total.resumes.this.city Percent.of.resumes.this.city
## 67                     571                        64.27
## 68                     571                        22.94
## 69                     571                        53.24
## 70                     571                         6.30
## 71                     571                         0.70
## 72                     571                        28.02
ggplot(num_resumes_per_skill,aes(x=Skill,y=Percent.of.resumes.this.city,fill=City)) + geom_bar(stat="identity",position="dodge") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + facet_wrap(~Type,scales="free")

ggplot(num_resumes_per_skill,aes(x=Skill,y=Percent.of.resumes.this.city,fill=City)) + 
geom_bar(stat="identity",position="dodge") + 
theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
facet_wrap(~Type,scales="free") +
ggtitle("Percent of resumes in each city listing a skill,\nincluding soft + technical skills")

At first glance, it looks like data science job seekers in New York may be somewhat more likely to include soft-skill keywords like collaboration, communication, and presentation in their resumes.

San Francisco data science job seekers seem a bit more likely to include leadership keywords. However, since I included “led” and “leading” as keywords for this skill, it’s also possible that San Francisco simply has more data science job seekers who held management roles, rather than more who wanted to highlight leadership as a soft skill.

It’s a bit hard to see the technical skills in detail, as that panel has many more bars. Let’s plot just the technical skills now.

Also remove “risk modeling” and “data wrangling”; even when just “wrangling” or “wrangled” is allowed as a keyword, very few resumes list these skills.

Also, order skills by their maximum percentage across the two cities, and create two panels for lower- vs. higher-frequency skills.

technical_skills_to_plot <- num_resumes_per_skill[num_resumes_per_skill$Type == "technical" & !(num_resumes_per_skill$Skill %in% c("data wrangling","risk modeling")),]
max_percent_of_resumes_per_city_per_skill <- aggregate(Percent.of.resumes.this.city ~ Skill,technical_skills_to_plot,max)
max_percent_of_resumes_per_city_per_skill <- max_percent_of_resumes_per_city_per_skill %>% arrange(Percent.of.resumes.this.city)

technical_skills_to_plot <- data.frame(technical_skills_to_plot,Frequency.level = rep(NA,times=nrow(technical_skills_to_plot)),stringsAsFactors=FALSE)

technical_skills_to_plot$Frequency.level <- ifelse(technical_skills_to_plot$Skill %in% max_percent_of_resumes_per_city_per_skill$Skill[max_percent_of_resumes_per_city_per_skill[,2] < 25],"Lower frequency skills","Higher frequency skills")

technical_skills_to_plot$Skill <- factor(technical_skills_to_plot$Skill,levels=max_percent_of_resumes_per_city_per_skill$Skill)

ggplot(technical_skills_to_plot,
aes(x=Skill,y=Percent.of.resumes.this.city,fill=City)) +
geom_bar(stat="identity",position="dodge") +
ggtitle("Technical") +
facet_wrap(~Frequency.level,scales="free") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))

Still a bit hard to read! Let’s use a table to look at the city differences instead.

technical_skills_spread <- technical_skills_to_plot %>% 
            select(setdiff(colnames(technical_skills_to_plot),c("Num.resumes","Total.resumes.this.city"))) %>%
            spread(City,Percent.of.resumes.this.city)
colnames(technical_skills_spread)[4:5] <- c("New.York","San.Francisco")

technical_skills_spread[,c(1,4,5)] %>% mutate(City.difference = New.York - San.Francisco) %>% arrange(desc(abs(City.difference)))
##                  Skill New.York San.Francisco City.difference
## 1   data visualization    37.42         28.02            9.40
## 2                  sas    26.57         21.37            5.20
## 3          data mining    32.45         27.85            4.60
## 4                    r    57.71         53.24            4.47
## 5        decision tree    15.01         10.68            4.33
## 6     machine learning    49.80         54.12           -4.32
## 7           prediction    51.12         46.94            4.18
## 8             big data    21.91         18.56            3.35
## 9              tableau    25.56         22.94            2.62
## 10              hadoop    26.77         24.17            2.60
## 11   data manipulation    10.95          8.41            2.54
## 12                 aws    16.13         18.39           -2.26
## 13          powerpoint     7.51          5.25            2.26
## 14          statistics    54.67         56.39           -1.72
## 15 scripting languages     5.38          6.83           -1.45
## 16                perl     3.65          5.08           -1.43
## 17         a/b testing     4.87          6.30           -1.43
## 18     microsoft excel     5.07          3.68            1.39
## 19                java    27.18         28.55           -1.37
## 20                 c++    26.67         28.02           -1.35
## 21           algorithm    24.54         23.29            1.25
## 22            modeling    64.00         62.87            1.13
## 23                 sql    57.51         58.49           -0.98
## 24              matlab    19.78         20.67           -0.89
## 25               d3.js     4.16          4.38           -0.22
## 26              python    64.20         64.27           -0.07

We find that New York data science job seekers appear substantially more likely to include data visualization keywords in their resumes.

Some other skills have smaller differences. For example, San Francisco data science job seekers had machine learning keywords in about 4 percentage points more of their resumes.

Now, take the mean across cities and plot that, coloring by skill for clearer viewing.

technical_skills_to_plot <- technical_skills_spread %>% 
            mutate(Mean.percent.of.resumes.across.cities = (New.York + San.Francisco)/2)

mycol <- c("#004949","#009292","#FF6DB6","#FFB677","#490092","#006DDB","#B66DFF","#6DB6FF","#B6DBFF","#920000","#924900","#DBD100","#24FF24","#FFFF6D","#000000") #Set up colorblind friendly vector. 

ggplot(technical_skills_to_plot,
aes(x=Skill,y=Mean.percent.of.resumes.across.cities,fill=Skill)) +
geom_bar(stat="identity") +
ggtitle("Technical skills found by keywords") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_fill_manual(values=rep(mycol,times=2)) +
ylab("Mean percent of resumes across cities")