library(stringr) #For string operations
library(rvest) #For screen scraping
## Loading required package: xml2
## Warning: package 'xml2' was built under R version 3.4.3
library(tokenizers) #For tokenizing text
library(tidyverse) #For the tidyverse suite (ggplot2, dplyr, tidyr, etc.)
## ── Attaching packages ─────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ readr 1.1.1
## ✔ tibble 1.4.2 ✔ purrr 0.2.4
## ✔ tidyr 0.7.2 ✔ dplyr 0.7.4
## ✔ forcats 0.3.0
## Warning: package 'tibble' was built under R version 3.4.3
## Warning: package 'forcats' was built under R version 3.4.3
## ── Conflicts ────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::pluck() masks rvest::pluck()
library(RCurl) #For fetching data over URLs
## Warning: package 'RCurl' was built under R version 3.4.3
## Loading required package: bitops
##
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
##
## complete
library(dplyr) #For manipulating data frames
library(DT) #For interactive data tables
## Warning: package 'DT' was built under R version 3.4.3
library(curl) #For downloading files
## Warning: package 'curl' was built under R version 3.4.3
##
## Attaching package: 'curl'
## The following object is masked from 'package:readr':
##
## parse_date
library(RJSONIO) #For parsing JSON
Load both Rdata files from the previous step.
First, load the file for New York, then the one for San Francisco. Delete everything but the three objects we really need: job_titles_and_descriptions_across_resumes, skills_per_resume, and executive_summaries.
Next, we will combine all of this information into one long-format data frame.
The first column, City, gives the city. The second, Resume.num, gives the resume number. The third, Resume.section, says whether the item comes from job titles, job descriptions, skills, or executive summaries. The final column, Text, gives the actual text.
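As a hypothetical illustration of the target layout (these rows are made up, not taken from the data):
#Hypothetical rows illustrating the target long format:
tibble::tribble(
  ~City,      ~Resume.num, ~Resume.section,     ~Text,
  "New York", 1,           "Job.title",         "Data Scientist",
  "New York", 1,           "Skills",            "python",
  "New York", 1,           "Executive.summary", "5+ years of experience in ..."
)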
load("resumes_processed.Rdata")
rm(list=setdiff(ls(),c("job_titles_and_descriptions_across_resumes","skills_per_resume","executive_summaries")))
ls()
## [1] "executive_summaries"
## [2] "job_titles_and_descriptions_across_resumes"
## [3] "skills_per_resume"
city <- "New York"
job_titles_and_descriptions_across_resumes <- gather(job_titles_and_descriptions_across_resumes,Resume.section,Text,-Resume.num)
job_titles_and_descriptions_across_resumes <- data.frame(City = city,job_titles_and_descriptions_across_resumes,stringsAsFactors=FALSE)
#head(job_titles_and_descriptions_across_resumes);tail(job_titles_and_descriptions_across_resumes)
skills_per_resume <- data.frame(City = city,Resume.num = skills_per_resume$Resume.num,
Resume.section = "Skills",
Text = skills_per_resume$Skill,stringsAsFactors=FALSE)
#head(skills_per_resume);tail(skills_per_resume)
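#Note: the hardcoded numbers excluded below are presumably the New York
#resumes dropped in the previous step (compare the valid_json /
#descriptions_for_every_job filter used for San Francisco further down),
#leaving the 986 resumes that executive_summaries covers.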
executive_summaries <- data.frame(City = city,
Resume.num = setdiff(1:1000,c(185,200,763,786,795,815,66,111,213,290,294,505,627,837)),
Resume.section = "Executive.summary",
Text = executive_summaries,stringsAsFactors=FALSE)
#head(executive_summaries);tail(executive_summaries)
new_york_resumes <- rbind(job_titles_and_descriptions_across_resumes,skills_per_resume,executive_summaries)
new_york_resumes <- new_york_resumes %>% arrange(Resume.num)
#head(new_york_resumes);tail(new_york_resumes)
rm(list=setdiff(ls(),"new_york_resumes"))
ls()
## [1] "new_york_resumes"
load("resumes_processed_san_francisco.Rdata")
rm(list=setdiff(ls(),c("job_titles_and_descriptions_across_resumes","skills_per_resume","executive_summaries","valid_json","descriptions_for_every_job","new_york_resumes")))
ls()
## [1] "descriptions_for_every_job"
## [2] "executive_summaries"
## [3] "job_titles_and_descriptions_across_resumes"
## [4] "new_york_resumes"
## [5] "skills_per_resume"
## [6] "valid_json"
city <- "San Francisco"
job_titles_and_descriptions_across_resumes <- gather(job_titles_and_descriptions_across_resumes,Resume.section,Text,-Resume.num)
job_titles_and_descriptions_across_resumes <- data.frame(City = city,job_titles_and_descriptions_across_resumes,stringsAsFactors=FALSE)
skills_per_resume <- data.frame(City = city,Resume.num = skills_per_resume$Resume.num,
Resume.section = "Skills",
Text = skills_per_resume$Skill,stringsAsFactors=FALSE)
executive_summaries <- data.frame(City = city,
Resume.num = which(valid_json == TRUE & descriptions_for_every_job == TRUE),
Resume.section = "Executive.summary",
Text = executive_summaries,stringsAsFactors=FALSE)
head(executive_summaries);tail(executive_summaries)
## City Resume.num Resume.section
## 1 San Francisco 1 Executive.summary
## 2 San Francisco 2 Executive.summary
## 3 San Francisco 3 Executive.summary
## 4 San Francisco 4 Executive.summary
## 5 San Francisco 5 Executive.summary
## 6 San Francisco 6 Executive.summary
## Text
## 1
## 2 • 8+ years of Data Science experience building interpretable machine learning models, and building end to end data pipelines which included extracting, transforming and combine all incoming data with the goal of discovering hidden insight, with an eye to improve business processes, address business problems or result in cost savings\\n\\n• Experience working with large data and metadata sources; interpret and communicate insights and findings from analysis and experiments to both technical and non-technical audiences in ad, service, and business\\n\\n• Expert knowledge in breadth of machine learning algorithms and love to find the best approach to a specific problem. Implemented several supervised and unsupervised learning algorithms such as Ensemble Methods (Random forests), Logistic Regression, Regularized Linear Regression, SVMs, Deep Neural Networks, Extreme Gradient Boosting, Decision Trees, KMeans, Gaussian Mixture Models, Hierarchical models, and time series models (ARIMA, GARCH, VARCH etc.)\\n\\n• Experience with applied statistical techniques and machine learning, including Bayesian methods, time-series modeling, classification, regression, mixture models, clustering, dimensionality reduction, model selection, feature extraction, experimental design, and choice modeling\\n\\n• Led independent research and experimentation of new methodologies to discover insights, improvements for problems. Delivered findings and actionable results to management team through data visualization, presentation, or training sessions. Proactively involved in roadmap discussions, data science initiatives and the optimal approach to apply the underlying algorithms\\n\\n• Fluent and well-versed writing production quality code in SQL, R, Python, Spark and Scala\\n\\n• Hands on experience building regression, classification, and recommender systems with large datasets in distributed systems and constrained environments\\n\\n• Domain expertise in architecting and building comprehensive analytical solutions in Marketing, Sales and Operations functions across Technology, Retail and Banking industries\\n\\n• Hands on experience communicating business insights by dashboarding in Tableau. Developed automated tableau dashboards that helped evaluate and evolve existing user data strategies, which include user metrics, measurement frameworks, and methods to measurement\\n\\n• Strong track record of contributing to successful end-to-end analytic solutions (clarifying business objectives and hypotheses, communicating project deliverables and timelines, and informing action based on findings)\\n\\n• Developed and deployed dashboards in Tableau and RShiny to identify trends and opportunities, surface actionable insights, and help teams set goals, forecasts and prioritization of initiatives\\n\\n• Experienced in Data Modeling retaining concepts of RDBMS, Logical and Physical Data Modeling until 3NormalForm (3NF) and Multidimensional Data Modeling Schema (Star schema, Snow-Flake Modeling, Facts and dimensions)\\n\\n• Professional working experience in writing spark streaming and spark batch jobs using spark MLlib\\n\\n• Hands on experience in optimizing the SQL Queries and database performance tuning in Oracle, SQL Server and Teradata databases
## 3
## 4
## 5
## 6
## City Resume.num Resume.section
## 566 San Francisco 645 Executive.summary
## 567 San Francisco 646 Executive.summary
## 568 San Francisco 647 Executive.summary
## 569 San Francisco 648 Executive.summary
## 570 San Francisco 649 Executive.summary
## 571 San Francisco 650 Executive.summary
## Text
## 566 • Over 6 years of experience in laboratory environments, including cGMP\\u002FcGLP compliant environments.\\n• Familiar with operations, techniques and equipment used in standard Biologic platforms pertaining to cell culture, purification and fill\\u002Ffinish.\\n• Experience using manufacturing execution system (MES), laboratory information management system (LIMS), system applications products (SAP) and cGMP electronic document management systems for electronic batch record, batch production record, QC sampling results, and standard operating procedure reviews.\\n• Five years of hands-on experience as an assistant researcher in various academic research laboratories, including cell culture and purification techniques. Used mice models and mammalian cell cultures in diabetes studies and enzyme characterization.
## 567
## 568
## 569
## 570 • Understanding of cGMP and aseptic behavior for the manufacture of Biological therapies.\\n• Familiarity with operations, techniques and equipment used in standard Biologic platforms pertaining to cell culture, purification and fill\\u002Ffinish.\\n• Five years of hands-on experience as an assistant researcher in various academic research laboratories, including cell culture and purification techniques.\\n• Experience using manufacturing execution system (MES), laboratory information management system (LIMS), system applications products (SAP) and cGMP electronic document management systems for electronic batch record, batch production record, QC sampling results, and standard operating procedure reviews.\\n• Experience with planning and determining capacity for a genome sequencing facility to ensure proper product turnaround time.\\n• Used mice models and mammalian cell cultures in diabetes studies and enzyme characterization.\\n• Gained experience in next generation sequencing using Illumina 2500 platforms in quality control and library construction.
## 571
san_francisco_resumes <- rbind(job_titles_and_descriptions_across_resumes,skills_per_resume,executive_summaries)
san_francisco_resumes <- san_francisco_resumes %>% arrange(Resume.num)
head(san_francisco_resumes);tail(san_francisco_resumes)
## City Resume.num Resume.section
## 1 San Francisco 1 Job.title
## 2 San Francisco 1 Job.title
## 3 San Francisco 1 Job.description
## 4 San Francisco 1 Job.description
## 5 San Francisco 1 Executive.summary
## 6 San Francisco 2 Job.title
## Text
## 1 Data Scientist
## 2 Data Scientist
## 3 Developed a deep similarity network using TensorFlow based on VGG and ResNet to solve in-class\\nsimilarity problem from two groups of real world images, tuned the model to reach an f1 score close to 0.85 by using image augmentation and up-sampling.\\n• Built parallel crawlers with proxy network to efficiently crawl 98% of all public listings (about 100\\nmillion pages) from several short-term rental platforms and uploaded data into AWS RDS\\n• Built a a price calculator to recommend the best rental price for Pillow's customers\\n• Use LDA to automatic generate a large set of labeled images from their descriptions\\n• Perform sentiment analysis for review data and rank the reviews
## 4 • Built automatic decision making system for retail stores(Verizon) to make management decisions with location based sensor data, increased the sales of accessories by 30%\\n• Machine learning: Developed Location based Model to predict user behavior over 500,000 minutes of raw\\nsensor data and 400 days of sales data with ENN, achieving AUC 0.72\\n• Feature engineering: Performed detailed feature engineering for data sources from customer's position,\\npath and interactions with sales representatives to address significant factors for transitions.\\nCourses & Projects\\nSelf-Driving Car Engineer Nano Degree - Udacity\\nLearned and deployed code to control a real self-driving car in a test track\\nSkills: Tensorflow, Keras, Yolo, ROS, UKF, Particle Filter, PID, behavior planning, FCNN\\nData Analyst Nano Degree - Udacity\\nData wrangle Open street Maps Data use MongoDB, visualization with D3, A\\u002FB test\\nSkills: Python, R, MongoDB, Machine Learning, A\\u002FB Test, HTML, CSS, D3\\n\\n• Database Management Created online post forum 'Quill And Inkpot' with MySQL\\n• Operating System Implemented Priority Scheduling in Nachos using Java\\n• Distributed System Implemented a secure distributed ATM systems with Java\\n• Computer Vision Compared image compression algorithm using structural similarity\\n\\nHonors and Awards\\nOct. 2009 Champion, RoboGame Competition, USTC (rank 1)\\nSep. 2007 First Prize, Chinese Physics Olympiad (provincial level), China (top 0.1%))
## 5
## 6 Data Scientist
## City Resume.num Resume.section
## 8795 San Francisco 650 Job.description
## 8796 San Francisco 650 Job.description
## 8797 San Francisco 650 Job.description
## 8798 San Francisco 650 Job.description
## 8799 San Francisco 650 Job.description
## 8800 San Francisco 650 Executive.summary
## Text
## 8795 • Generated several Bayesian models of attention with Monte Carlo simulations\\n• Programmed and ran experiments using visual displays, eye-tracking and psychophysics\\n• Generated code for image processing of several hundred images\\n• Co-supervision of undergraduate students (5-8 per year) and graduate students\\n• ANOVA\\u002FRegression
## 8796 • Generated a Bayesian model of attention with Monte Carlo simulations\\n• Generated code for image processing of medical images (mammograms)\\n• Programmed and ran experiments using visual displays, eye-tracking and psychophysics
## 8797 • Developed code for running experiments, data processing\\n• Ran experiments using visual displays, eye-tracking, and psychophysics
## 8798 • Developed code for running experiments, data processing\\n• Developed code to calculate color coordinates of surface colors
## 8799 • Manager of laboratory facilities, including programming and statistical analyses\\n• Developed code for reading in and cleaning lung volume and EKG data, including graphical input
## 8800
rm(list=setdiff(ls(),c("new_york_resumes","san_francisco_resumes")))
ls()
## [1] "new_york_resumes" "san_francisco_resumes"
Now we can combine new_york_resumes and san_francisco_resumes and proceed with clean-up and analysis.
resumes_across_cities <- rbind(new_york_resumes,san_francisco_resumes)
rm(new_york_resumes);rm(san_francisco_resumes)
resumes_across_cities <- resumes_across_cities %>% arrange(Resume.num,City)
Replace special encoding “\u002F” with a “/”.
resumes_across_cities$Text <- str_replace_all(resumes_across_cities$Text,pattern='\\\\u002F',replace='/')
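As a quick sanity check (a hypothetical example, not from the data), note that the four backslashes in the R pattern become the regex \\u002F, which matches a literal backslash followed by u002F:
#Hypothetical check of the escaping:
str_replace_all("fill\\u002Ffinish",pattern='\\\\u002F',replacement='/')
#Should give "fill/finish"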
Now, for each resume (unique combination of City and Resume.num), we want to check for the occurrence of various strings.
Read in a table with each skill heading plus its synonyms, all in lowercase.
We should require that these be bounded on either side by the start/end of the string, whitespace, or punctuation.
Start by getting a list of keywords, with a vector of keywords per skill.
keywords <- read.table("keywords.txt",header=TRUE,check.names=FALSE,stringsAsFactors=FALSE,sep="\t")
keywords <- keywords[grep('This is probably too tough',keywords$Other.notes,invert=TRUE),]
keyword_list <- vector("list",length=nrow(keywords))
for(i in 1:nrow(keywords))
{
keywords_this_row <- keywords$Skill[i]
if(keywords$Synonyms[i] != "None"){
keywords_this_row <- c(keywords_this_row,unlist(strsplit(keywords$Synonyms[i],",")[[1]]))
}
keyword_list[[i]] <- keywords_this_row
}
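As a hypothetical illustration of one entry (the actual keywords.txt is not shown here), a row with Skill “machine learning” and Synonyms “ml,deep learning” would give:
c("machine learning",unlist(strsplit("ml,deep learning",",")))
#Gives: "machine learning" "ml" "deep learning"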
Write a function that builds a pattern for a keyword with a word boundary, comma, or space required on each side.
If there are multiple keywords for a skill, paste their patterns together with a pipe.
Finally, run this function on every item in keyword_list.
#Couldn't figure out a single regex for a space, comma, or word boundary. However, [[:space:],] matches either a space or a comma, so we combine it with \\b across four alternatives below.
space_or_comma <- "[[:space:],]"
word_boundary <- "\\b"
pattern_for_one_keyword <- function(keyword){
regexes <- paste0(space_or_comma,keyword,space_or_comma)
regexes <- c(regexes,paste0(word_boundary,keyword,word_boundary))
regexes <- c(regexes,paste0(word_boundary,keyword,space_or_comma))
regexes <- c(regexes,paste0(space_or_comma,keyword,word_boundary))
return(paste0(regexes,collapse="|"))
}
pattern_for_multiple_keywords <- function(keyword_vector){
if(length(keyword_vector) == 1){return(pattern_for_one_keyword(keyword_vector))}
if(length(keyword_vector) > 1){
individual_regexes <- c()
for(i in 1:length(keyword_vector))
{
individual_regexes <- c(individual_regexes,pattern_for_one_keyword(keyword_vector[i]))
}
return(paste0(individual_regexes,collapse="|"))
}
}
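As an aside, the four alternatives can likely be collapsed into one pattern, since a word boundary and a character class can be combined by alternation inside a single group. An untested sketch, not used below:
#Untested sketch: one group alternating a word boundary with a space or comma
boundary_class <- "(\\b|[[:space:],])"
pattern_for_one_keyword_alt <- function(keyword){
  paste0(boundary_class,keyword,boundary_class)
}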
keyword_regexes <- unlist(lapply(keyword_list,function(x)pattern_for_multiple_keywords(x)))
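As a quick sanity check (hypothetical, not part of the original analysis), a single-letter skill like “r” should match only when properly bounded:
#Hypothetical sanity check: "r" should not match inside a longer word
r_regex <- keyword_regexes[which(keywords$Skill == "r")]
str_detect("experienced in spark",r_regex) #Expect FALSE
str_detect("r, python, sql",r_regex) #Expect TRUE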
We can now use keyword_regexes along with str_detect to give a TRUE/FALSE value for whether the text contains the pattern.
num_resumes_per_skill <- c()
for(i in 1:length(keyword_regexes))
{
skill_regex <- keyword_regexes[i] #The regex for this skill (not the skill name)
skill_in_text <- str_detect(tolower(resumes_across_cities$Text),skill_regex)
resumes_across_cities_incl_this_skill <- resumes_across_cities[skill_in_text,]
num_resumes_this_skill <- length(unique(paste0(resumes_across_cities_incl_this_skill$City,resumes_across_cities_incl_this_skill$Resume.num)))
num_resumes_per_skill <- c(num_resumes_per_skill,num_resumes_this_skill)
print(paste0("Skill ",keywords$Skill[i]," found in ",num_resumes_this_skill," resumes"))
}
## [1] "Skill algorithm found in 375 resumes"
## [1] "Skill appengine found in 0 resumes"
## [1] "Skill aws found in 264 resumes"
## [1] "Skill big data found in 322 resumes"
## [1] "Skill c++ found in 423 resumes"
## [1] "Skill collaboration found in 405 resumes"
## [1] "Skill communication found in 352 resumes"
## [1] "Skill prediction found in 772 resumes"
## [1] "Skill couchdb found in 0 resumes"
## [1] "Skill creativity found in 65 resumes"
## [1] "Skill critical thinking found in 8 resumes"
## [1] "Skill customer service found in 55 resumes"
## [1] "Skill data manipulation found in 156 resumes"
## [1] "Skill data wrangling found in 26 resumes"
## [1] "Skill data mining found in 479 resumes"
## [1] "Skill d3.js found in 66 resumes"
## [1] "Skill decision making found in 97 resumes"
## [1] "Skill decision tree found in 209 resumes"
## [1] "Skill ecl found in 1 resumes"
## [1] "Skill flare found in 3 resumes"
## [1] "Skill google visualization api found in 0 resumes"
## [1] "Skill hadoop found in 402 resumes"
## [1] "Skill java found in 431 resumes"
## [1] "Skill leadership found in 554 resumes"
## [1] "Skill machine learning found in 800 resumes"
## [1] "Skill matlab found in 313 resumes"
## [1] "Skill microsoft excel found in 71 resumes"
## [1] "Skill mining social media found in 1 resumes"
## [1] "Skill modeling found in 990 resumes"
## [1] "Skill perl found in 65 resumes"
## [1] "Skill powerpoint found in 104 resumes"
## [1] "Skill presentation found in 392 resumes"
## [1] "Skill problem solving found in 54 resumes"
## [1] "Skill python found in 1000 resumes"
## [1] "Skill r found in 873 resumes"
## [1] "Skill raphael.js found in 0 resumes"
## [1] "Skill risk modeling found in 12 resumes"
## [1] "Skill sas found in 384 resumes"
## [1] "Skill scripting languages found in 92 resumes"
## [1] "Skill sql found in 901 resumes"
## [1] "Skill statistics found in 861 resumes"
## [1] "Skill tableau found in 383 resumes"
## [1] "Skill a/b testing found in 84 resumes"
## [1] "Skill data visualization found in 529 resumes"
Let’s remove the few skills that were found in very few resumes.
Let’s also redo the counts, this time recording which city each resume comes from.
num_resumes_per_skill <- data.frame(Skill = keywords$Skill,
Num.resumes = num_resumes_per_skill,
stringsAsFactors=FALSE)
head(num_resumes_per_skill[order(num_resumes_per_skill$Num.resumes),],n=10)
## Skill Num.resumes
## 2 appengine 0
## 9 couchdb 0
## 21 google visualization api 0
## 36 raphael.js 0
## 19 ecl 1
## 28 mining social media 1
## 20 flare 3
## 11 critical thinking 8
## 37 risk modeling 12
## 14 data wrangling 26
It seems reasonable to look only at skills found in at least 10 resumes.
keywords <- keywords[num_resumes_per_skill$Num.resumes >= 10,]
keyword_regexes <- keyword_regexes[num_resumes_per_skill$Num.resumes >= 10]
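The code chunk that regenerated the counts was not echoed above; here is a sketch of what it likely looked like, assuming the keywords table also carries a Type column (it appears in the head/tail output below):
#Sketch of the unechoed per-city counting code (assumes keywords$Type exists):
num_resumes_per_skill <- data.frame()
for(i in 1:length(keyword_regexes))
{
skill_in_text <- str_detect(tolower(resumes_across_cities$Text),keyword_regexes[i])
resumes_with_skill <- unique(resumes_across_cities[skill_in_text,c("City","Resume.num")])
print(paste0("Skill ",keywords$Skill[i]," found in ",nrow(resumes_with_skill)," resumes"))
counts_per_city <- data.frame(table(resumes_with_skill$City))
num_resumes_per_skill <- rbind(num_resumes_per_skill,
data.frame(Skill = keywords$Skill[i],
Type = keywords$Type[i],
City = as.vector(counts_per_city$Var1),
Num.resumes = counts_per_city$Freq,
stringsAsFactors=FALSE))
}
table(num_resumes_per_skill$City)
head(num_resumes_per_skill);tail(num_resumes_per_skill)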
## [1] "Skill algorithm found in 375 resumes"
## [1] "Skill aws found in 264 resumes"
## [1] "Skill big data found in 322 resumes"
## [1] "Skill c++ found in 423 resumes"
## [1] "Skill collaboration found in 405 resumes"
## [1] "Skill communication found in 352 resumes"
## [1] "Skill prediction found in 772 resumes"
## [1] "Skill creativity found in 65 resumes"
## [1] "Skill customer service found in 55 resumes"
## [1] "Skill data manipulation found in 156 resumes"
## [1] "Skill data wrangling found in 26 resumes"
## [1] "Skill data mining found in 479 resumes"
## [1] "Skill d3.js found in 66 resumes"
## [1] "Skill decision making found in 97 resumes"
## [1] "Skill decision tree found in 209 resumes"
## [1] "Skill hadoop found in 402 resumes"
## [1] "Skill java found in 431 resumes"
## [1] "Skill leadership found in 554 resumes"
## [1] "Skill machine learning found in 800 resumes"
## [1] "Skill matlab found in 313 resumes"
## [1] "Skill microsoft excel found in 71 resumes"
## [1] "Skill modeling found in 990 resumes"
## [1] "Skill perl found in 65 resumes"
## [1] "Skill powerpoint found in 104 resumes"
## [1] "Skill presentation found in 392 resumes"
## [1] "Skill problem solving found in 54 resumes"
## [1] "Skill python found in 1000 resumes"
## [1] "Skill r found in 873 resumes"
## [1] "Skill risk modeling found in 12 resumes"
## [1] "Skill sas found in 384 resumes"
## [1] "Skill scripting languages found in 92 resumes"
## [1] "Skill sql found in 901 resumes"
## [1] "Skill statistics found in 861 resumes"
## [1] "Skill tableau found in 383 resumes"
## [1] "Skill a/b testing found in 84 resumes"
## [1] "Skill data visualization found in 529 resumes"
##
## New York San Francisco
## 36 36
## Skill Type City Num.resumes
## 1 algorithm technical New York 242
## 2 algorithm technical San Francisco 133
## 3 aws technical New York 159
## 4 aws technical San Francisco 105
## 5 big data technical New York 216
## 6 big data technical San Francisco 106
## Skill Type City Num.resumes
## 67 tableau technical New York 252
## 68 tableau technical San Francisco 131
## 69 a/b testing technical New York 48
## 70 a/b testing technical San Francisco 36
## 71 data visualization technical New York 369
## 72 data visualization technical San Francisco 160
Finally, add a column Percent.of.resumes.this.city, which expresses the percentage of resumes from each city that contain the skill (e.g., algorithm appears in 242 of the 986 New York resumes, i.e. 24.54%).
cities_plus_resumes <- resumes_across_cities[!(duplicated(paste0(resumes_across_cities$City,resumes_across_cities$Resume.num))),]
resumes_per_city <- data.frame(table(cities_plus_resumes$City))
resumes_per_city[,1] <- as.vector(resumes_per_city[,1])
colnames(resumes_per_city) <- c("City","Total.resumes.this.city")
num_resumes_per_skill <- merge(num_resumes_per_skill,resumes_per_city,"City") %>% mutate(Percent.of.resumes.this.city = round(Num.resumes*100/Total.resumes.this.city,digits=2))
head(num_resumes_per_skill);tail(num_resumes_per_skill)
## City Skill Type Num.resumes Total.resumes.this.city
## 1 New York algorithm technical 242 986
## 2 New York creativity soft 47 986
## 3 New York aws technical 159 986
## 4 New York customer service soft 31 986
## 5 New York big data technical 216 986
## 6 New York data manipulation technical 108 986
## Percent.of.resumes.this.city
## 1 24.54
## 2 4.77
## 3 16.13
## 4 3.14
## 5 21.91
## 6 10.95
## City Skill Type Num.resumes
## 67 San Francisco python technical 367
## 68 San Francisco tableau technical 131
## 69 San Francisco r technical 304
## 70 San Francisco a/b testing technical 36
## 71 San Francisco risk modeling technical 4
## 72 San Francisco data visualization technical 160
## Total.resumes.this.city Percent.of.resumes.this.city
## 67 571 64.27
## 68 571 22.94
## 69 571 53.24
## 70 571 6.30
## 71 571 0.70
## 72 571 28.02
ggplot(num_resumes_per_skill,aes(x=Skill,y=Percent.of.resumes.this.city,fill=City)) +
geom_bar(stat="identity",position="dodge") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
facet_wrap(~Type,scales="free") +
ggtitle("Percent of resumes in each city listing a skill,\nincluding soft + technical skills")
At first glance, it looks like data science job seekers in New York may be somewhat more likely to include soft-skill keywords like collaboration, communication, and presentation in their resumes.
San Francisco data science job seekers seem a bit more likely to include leadership keywords. However, since I included “led” and “leading” as keywords for this skill, it’s also possible that San Francisco simply has more data science job seekers who held management roles, rather than more who wanted to highlight leadership as a soft skill.
The technical skills are a bit hard to see in detail, as this plot has more bars. Let’s plot just the technical skills now.
Also remove “risk modeling” and “data wrangling”: even after also allowing just “wrangling” or “wrangled”, very few resumes list these skills under our set of keywords.
Finally, order the skills by their maximum across the two cities, and create two panels for lower- vs. higher-frequency skills.
technical_skills_to_plot <- num_resumes_per_skill[num_resumes_per_skill$Type == "technical" & !(num_resumes_per_skill$Skill %in% c("data wrangling","risk modeling")),]
max_percent_of_resumes_per_city_per_skill <- aggregate(Percent.of.resumes.this.city ~ Skill,technical_skills_to_plot,max)
max_percent_of_resumes_per_city_per_skill <- max_percent_of_resumes_per_city_per_skill %>% arrange(Percent.of.resumes.this.city)
technical_skills_to_plot <- data.frame(technical_skills_to_plot,Frequency.level = rep(NA,times=nrow(technical_skills_to_plot)),stringsAsFactors=FALSE)
technical_skills_to_plot$Frequency.level <- ifelse(technical_skills_to_plot$Skill %in% max_percent_of_resumes_per_city_per_skill$Skill[max_percent_of_resumes_per_city_per_skill[,2] < 25],"Lower frequency skills","Higher frequency skills")
technical_skills_to_plot$Skill <- factor(technical_skills_to_plot$Skill,levels=max_percent_of_resumes_per_city_per_skill$Skill)
ggplot(technical_skills_to_plot,
aes(x=Skill,y=Percent.of.resumes.this.city,fill=City)) +
geom_bar(stat="identity",position="dodge") +
ggtitle("Technical") +
facet_wrap(~Frequency.level,scales="free") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Still a bit hard to read! Let’s use a table to look at the city differences instead.
technical_skills_spread <- technical_skills_to_plot %>%
select(setdiff(colnames(technical_skills_to_plot),c("Num.resumes","Total.resumes.this.city"))) %>%
spread(City,Percent.of.resumes.this.city)
colnames(technical_skills_spread)[4:5] <- c("New.York","San.Francisco")
technical_skills_spread[,c(1,4,5)] %>% mutate(City.difference = New.York - San.Francisco) %>% arrange(desc(abs(City.difference)))
## Skill New.York San.Francisco City.difference
## 1 data visualization 37.42 28.02 9.40
## 2 sas 26.57 21.37 5.20
## 3 data mining 32.45 27.85 4.60
## 4 r 57.71 53.24 4.47
## 5 decision tree 15.01 10.68 4.33
## 6 machine learning 49.80 54.12 -4.32
## 7 prediction 51.12 46.94 4.18
## 8 big data 21.91 18.56 3.35
## 9 tableau 25.56 22.94 2.62
## 10 hadoop 26.77 24.17 2.60
## 11 data manipulation 10.95 8.41 2.54
## 12 aws 16.13 18.39 -2.26
## 13 powerpoint 7.51 5.25 2.26
## 14 statistics 54.67 56.39 -1.72
## 15 scripting languages 5.38 6.83 -1.45
## 16 perl 3.65 5.08 -1.43
## 17 a/b testing 4.87 6.30 -1.43
## 18 microsoft excel 5.07 3.68 1.39
## 19 java 27.18 28.55 -1.37
## 20 c++ 26.67 28.02 -1.35
## 21 algorithm 24.54 23.29 1.25
## 22 modeling 64.00 62.87 1.13
## 23 sql 57.51 58.49 -0.98
## 24 matlab 19.78 20.67 -0.89
## 25 d3.js 4.16 4.38 -0.22
## 26 python 64.20 64.27 -0.07
We find that New York data science job seekers appear substantially more likely to include data visualization keywords in their resumes (37.42% vs. 28.02%).
Some other skills show smaller differences. For example, San Francisco data science job seekers had machine learning keywords in around 4 percentage points more of their resumes.
Now, take the mean across the two cities and use that for a final plot, coloring by skill for easier viewing.
technical_skills_to_plot <- technical_skills_spread %>%
mutate(Mean.percent.of.resumes.across.cities = (New.York + San.Francisco)/2)
mycol <- c("#004949","#009292","#FF6DB6","#FFB677","#490092","#006DDB","#B66DFF","#6DB6FF","#B6DBFF","#920000","#924900","#DBD100","#24FF24","#FFFF6D","#000000") #Set up a colorblind-friendly color vector
ggplot(technical_skills_to_plot,
aes(x=Skill,y=Mean.percent.of.resumes.across.cities,fill=Skill)) +
geom_bar(stat="identity") +
ggtitle("Technical skills found by keywords") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_fill_manual(values=rep(mycol,times=2)) +
xlab("Mean percent of resumes across cities")