All the project files, code, and graphics are available on GitHub at https://github.com/omerozeren/DATA607/tree/master/PROJECT_3.
First we’ll set a few variables to use in our scraping. We’ve used a smaller set of cities, since this is just meant to demonstrate how the approach works.
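The write-up never lists its imports; based on the functions used below, a minimal setup would be:
library(rvest)         # read_html, html_nodes, html_attr, html_text
library(dplyr)         # %>%, group_by, summarise, arrange
library(stringr)       # str_detect
library(knitr)         # kable
library(wordcloud)     # wordcloud
library(RColorBrewer)  # brewer.pal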
cities <- c("New+York+NY", "Seattle+WA", "San+Francisco+CA",
"Washington+DC","Atlanta+GA","Boston+MA", "Austin+TX",
"Los+Angeles+CA")
target.job <- "data+scientist"
base.url <- "https://www.indeed.com/"
max.results <- 100
# create a data frame to hold everything that we collect
jobs.data <- data.frame(matrix(ncol = 4, nrow = 0))
colnames(jobs.data) <- c("city", "job.title", "company.name", "description")
# Harvest data for each city and parse the results with XPath expressions to extract the fields of interest
for (city in cities){
  print(paste("Downloading data for:", city))
  for (start in seq(0, max.results, 10)){  # Indeed paginates results ten at a time (seq(), not Python's range())
    url <- paste(base.url, "jobs?q=", target.job, "&l=", city, "&start=", start, sep = "")
    page <- read_html(url)
    Sys.sleep(1)  # pause between requests to be polite to the server
    # get the links to the individual postings
    links <- page %>%
      html_nodes("div") %>%
      html_nodes(xpath = '//*[@data-tn-element="jobTitle"]') %>%
      html_attr("href")
    # get the job titles
    job.title <- page %>%
      html_nodes("div") %>%
      html_nodes(xpath = '//*[@data-tn-element="jobTitle"]') %>%
      html_attr("title")
    # get the company names
    company.name <- page %>%
      html_nodes("span") %>%
      html_nodes(xpath = '//*[@class="company"]') %>%
      html_text() %>%
      trimws()
    # get the short descriptions
    description <- page %>%
      html_nodes("span") %>%
      html_nodes(xpath = '//*[@class="summary"]') %>%
      html_text() %>%
      trimws()
    # add this page of results to jobs.data (inside the loop, so every page is kept, not just the last one)
    jobs.data <- rbind(jobs.data, data.frame(city,
                                             job.title,
                                             company.name,
                                             description))
  }
}
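The `links` collected above point at each posting's own page, which could be used to pull the full description rather than the short summary. A minimal sketch, assuming the posting body sits in a node with id `jobDescriptionText` (an assumption about Indeed's markup, not verified here):
# sketch: fetch each posting's page and extract the full description
full.description <- sapply(links, function(l){
  posting <- read_html(paste(base.url, l, sep = ""))
  Sys.sleep(1)  # stay polite between requests
  posting %>%
    html_node("#jobDescriptionText") %>%  # assumed selector; adjust if the markup differs
    html_text() %>%
    trimws()
})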
Write the results to a CSV file for GitHub. This has already been done, so the call is commented out here.
# write.csv(jobs.data, file = "C:/Users/OMERO/Documents/GitHub/Data607/PROJECT_3/Indeed_Job_Search.csv")
Now, load the data into a dataframe and view the first few rows:
url <- "https://raw.githubusercontent.com/omerozeren/DATA607/master/PROJECT_3/Indeed_Job_Search.csv"
Indeed_data <- read.csv(url)
kable(head(Indeed_data))
| X | city | job.title | company.name | description |
|---|------|-----------|--------------|-------------|
| 1 | New+York+NY | Data Scientist | Capital One | Bachelor’s Degree plus 2 years of experience in data analytics, or Master’s Degree plus 1 year of experience in data analytics, or PhD…. |
| 2 | New+York+NY | Data Scientist, Licensing Operations | Spotify | Partner with other licensing data scientists to drive business insights on labels and publishers. Partner with licensing data scientists and the broader… |
| 3 | New+York+NY | Data Engineering Manager, Analytics (Instagram) | | Manage data warehouse plans across a product vertical. Build cross-functional relationships with Data Scientists, Product Managers and Software Engineers to… |
| 4 | New+York+NY | Technical Trainer, Data and Machine Learning, Google Cloud | | Deliver technical training on Data and Machine Learning to diverse audiences (e.g., partners, customers, Googlers, etc)…. |
| 5 | New+York+NY | Intern Machine Learning & Design Research | Autodesk | Our software is used by everyone - from design professionals, engineers and architects to digital scientists, students and hobbyists…. |
| 6 | New+York+NY | Data Scientist, Accounting | Spotify | You have worked in data analytics for a minimum of 2 years with relevant experience with accounting and financial data analysis…. |
Alright, here are the top 10 soft skills listed by Indeed.com, the same site we scraped the data from:
https://www.indeed.com/career-advice/resumes-cover-letters/hard-skills-vs-soft-skills
So, let’s create a list of words corresponding to each skill, using Thesaurus.com to find synonyms.
integrity_keywords <- c("integrity", "honesty", "honest", "sincere", "sincerity")
dependability_keywords <- c("dependable", "punctual", "punctuality")
communication_keywords <- c("communication", "well spoken", "well-spoken", "personable")
open_mindedness_keywords <- c("open-mind", "open mind", "openness")
teamwork_keywords <- c("teamwork", "team player", "team-oriented", "collaborate")
creativity_keywords <- c("creative", "creativity")
problem_solving_keywords <- c("problem-solver", "problem solving", "problem-oriented", "problem oriented")
critical_thinking_keywords <- c("critical thinker", "critical thinking", "critical-think")
adaptability_keywords <- c("adapt", "adaptability", "flexible", "flexibility")
organization_keywords <- c("organization", "organized")
Create columns in the dataframe, one indicator per soft skill:
Indeed_data$integrity <- 0
Indeed_data$dependability <- 0
Indeed_data$communication <- 0
Indeed_data$open_mindedness <- 0
Indeed_data$teamwork <- 0
Indeed_data$creativity <- 0
Indeed_data$problem_solving <- 0
Indeed_data$critical_thinking <- 0
Indeed_data$adaptability <- 0
Indeed_data$organization <- 0
Convert all description text to lowercase so the stringr matching below is effectively case-insensitive:
Indeed_data$description <- tolower(Indeed_data$description)
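Alternatively, stringr can match case-insensitively without mutating the column, via regex(..., ignore_case = TRUE). A minimal sketch using the teamwork keywords:
# sketch: case-insensitive detection without lowercasing the column first
Indeed_data$teamwork <- str_detect(Indeed_data$description,
                                   regex(paste(teamwork_keywords, collapse = "|"),
                                         ignore_case = TRUE))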
Populate the columns just created above
# collapse each keyword vector into a single regex so every keyword is checked against every description
Indeed_data$integrity <- str_detect(Indeed_data$description, paste(integrity_keywords, collapse = "|"))
Indeed_data$dependability <- str_detect(Indeed_data$description, paste(dependability_keywords, collapse = "|"))
Indeed_data$communication <- str_detect(Indeed_data$description, paste(communication_keywords, collapse = "|"))
Indeed_data$open_mindedness <- str_detect(Indeed_data$description, paste(open_mindedness_keywords, collapse = "|"))
Indeed_data$teamwork <- str_detect(Indeed_data$description, paste(teamwork_keywords, collapse = "|"))
Indeed_data$creativity <- str_detect(Indeed_data$description, paste(creativity_keywords, collapse = "|"))
Indeed_data$problem_solving <- str_detect(Indeed_data$description, paste(problem_solving_keywords, collapse = "|"))
Indeed_data$critical_thinking <- str_detect(Indeed_data$description, paste(critical_thinking_keywords, collapse = "|"))
Indeed_data$adaptability <- str_detect(Indeed_data$description, paste(adaptability_keywords, collapse = "|"))
Indeed_data$organization <- str_detect(Indeed_data$description, paste(organization_keywords, collapse = "|"))
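With the indicator columns populated, a quick summary of how often each soft skill appears across all postings could look like this (a sketch; skill.cols is introduced here for convenience):
skill.cols <- c("integrity", "dependability", "communication", "open_mindedness",
                "teamwork", "creativity", "problem_solving", "critical_thinking",
                "adaptability", "organization")
# count the TRUE flags per skill and sort from most to least common
skill.counts <- sort(colSums(Indeed_data[, skill.cols], na.rm = TRUE), decreasing = TRUE)
kable(skill.counts)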
Summarize most common job titles:
title <- Indeed_data %>%
group_by(city, job.title) %>%
summarise(count = n()) %>%
arrange(desc(count))
kable(head(title))
| city | job.title | count |
|------|-----------|-------|
| Washington+DC | Data Scientist | 9 |
| New+York+NY | Data Scientist | 3 |
| Boston+MA | Computer Scientist | 2 |
| Boston+MA | Data Scientist | 2 |
| Los+Angeles+CA | Data Scientist | 2 |
| San+Francisco+CA | Data Scientist | 2 |
set.seed(1234)
wordcloud(words = title$job.title, freq = title$count, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
Bar plot of the frequency for the top 10 titles
barplot(title[1:10,]$count, las = 2,
names.arg = title[1:10,]$job.title,
col ="lightblue", main ="Most frequent job Titles",
ylab = "Job frequencies")
Summarize most common Cities:
title <- Indeed_data %>%
group_by(city, job.title) %>%
summarise(count = n()) %>%
arrange(desc(count))
kable(head(title))
| city | job.title | count |
|------|-----------|-------|
| Washington+DC | Data Scientist | 9 |
| New+York+NY | Data Scientist | 3 |
| Boston+MA | Computer Scientist | 2 |
| Boston+MA | Data Scientist | 2 |
| Los+Angeles+CA | Data Scientist | 2 |
| San+Francisco+CA | Data Scientist | 2 |
set.seed(1234)
wordcloud(words = title$city, freq = title$count, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
Bar plot of the frequency for the top 10 cities
barplot(title[1:10,]$count, las = 2,
names.arg = title[1:10,]$city,
col ="lightgreen", main ="Most frequent Cities",
ylab = "City frequencies")
companies <- Indeed_data %>%
group_by(city, company.name) %>%
summarise(count = n()) %>%
arrange(desc(count))
kable(head(companies))
| city | company.name | count |
|------|--------------|-------|
| Austin+TX | Indeed | 3 |
| Austin+TX | University of Texas at Austin | 3 |
| New+York+NY | | 3 |
| Boston+MA | Federal Bureau of Investigation | 2 |
| New+York+NY | Spotify | 2 |
| San+Francisco+CA | Stitch Fix | 2 |
set.seed(1234)
wordcloud(words = companies$company.name, freq = companies$count, min.freq = 0,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
Bar plot of the frequency for the top 10 Companies
barplot(companies[1:10,]$count, las = 2,
names.arg = companies[1:10,]$company.name,
col ="red", main ="Most frequent Companies",
ylab = "Company frequencies")
In conclusion, we were able to build a job-postings table from indeed.com and use text mining to answer three main questions: (1) what skills companies are looking for, (2) which companies are looking for Data Scientists, and (3) which cities are hiring the most Data Scientists. In our sample, the companies posting the most Data Scientist roles include Indeed, the University of Texas at Austin, the Federal Bureau of Investigation, Spotify, and Stitch Fix. Companies also call for non-technical skills, which we flagged with the keyword lists above (communication, teamwork, adaptability, and so on). Finally, most postings are titled simply “Data Scientist,” though a few variants appear as well (Computer Scientist, Data Engineering Manager, and the like).