Project 3
The goal of project 3 is to find key skills for data scientists and put these into a relational database as well as perform some baisic data analysis.
Loading Libraries.
library(RCurl)
library(xml2)
library(rvest)
library(knitr)
library(tidyr)
library(dplyr)
library(readr)
library(mongolite)
library(dummies)
library(ggplot2)Function to find multiple URLs for all cities:
# Function to generate URLS
itter <- c("", "&start=10", "&start=20", "&start=30")
IndeedCities <- function(jobTitle, city, state){
url <- c() #To make a list with multiple returns.
for(i in itter){ # create urls
i <- paste0("https://www.indeed.com/jobs?q=",
gsub(" ", replacement = "+", jobTitle),
"&l=",
gsub(" ", replacement = "+", city),
"%2C+",
state,
i,
sep = "")
url <- c(url, i) #Update list of urls
}
return(url)
}The cities we will search over.
# This is easier than updating a datafram from a function and using a loop.
nyc <- IndeedCities("Data Scientist", "New York", "NY")
bost <- IndeedCities("Data Scientist", "Boston", "MA")
sanFran <- IndeedCities("Data Scientist", "San Francisco", "CA")
hust <- IndeedCities("Data Scientist", "Houston", "TX")
sea <- IndeedCities("Data Scientist", "Seattle", "WA")Function to create the city data.frames
df1 <- data.frame(CompanyName = as.character(),
TitleName = as.character(),
City = as.character(),
ExperienceList = as.character())
IndeedData <- function(x){
for(i in c(1:4)){
ineeedPage <- read_html(x[i]) # reading to make a list.
#Creating columns of the dataframe:
ExperienceList <- ineeedPage %>%
html_nodes(".experienceList") %>%
html_text()
TitleName <- ineeedPage %>%
html_nodes(".jobtitle") %>%
html_text()
CompanyName <- ineeedPage %>%
html_nodes(".company") %>%
html_text()
City <- ineeedPage %>%
html_nodes(".location") %>%
html_text()
# Make a list of all the text in the webpage.
test <- ineeedPage %>%
html_nodes(".result") %>%
html_text()
# Make sure that experience is actually included.
for(j in 1:length(test)) {
if (grepl("Desired Experience:",x = test[j])) { # Double checking to make sure we don't skip NAs
} else {
ExperienceList <- append(ExperienceList, NA, after = (j))
}
}
df <- data.frame(CompanyName,TitleName, City, ExperienceList)
df1 <- rbind(df1,df)
}
return(df1)
}Create the city dfs
nycDF <- IndeedData(nyc)
bostDF <- IndeedData(bost)
sanFranDF <- IndeedData(sanFran)
hustDF <- IndeedData(hust)
seaDF <- IndeedData(sea)Create one single df.
df <- rbind(nycDF, bostDF, sanFranDF, hustDF, seaDF)
df$City <- gsub("\\d.*","",df$City)
df$CompanyName <- gsub("\\n","",df$CompanyName)
df$TitleName <- gsub("\\n","",df$TitleName)
df$City <- trimws(df$City) # Removes white space
df$ExperienceList <- trimws(df$ExperienceList) # Removes white space on either end.
df %>% head() %>% kable()| CompanyName | TitleName | City | ExperienceList |
|---|---|---|---|
| Wade & Wendy | Senior Data Scientist | New York, NY | AI, Machine Learning, Scala, Clojure, Java, Natural Language Processing, Python |
| KPMG | Data Scientist | New York, NY | Discharge, Machine Learning, R, Natural Language Processing, MATLAB, Data Mining, Ruby, Hadoop, Scala, Sas, Tableau, Java, Spark, Python |
| Cigna | Director of Software Engineering - Machine Learning & Artifi… | New York, NY | TensorFlow, AI, Data Mining, Machine Learning, Scala, Image Processing, C/C++, Natural Language Processing, MATLAB, Python, Lua |
| Data Scientist/Quantitative Analyst, Engineering | New York, NY | Machine Learning, R, MATLAB, Android | |
| Capital One | Data Scientist | New York, NY | Machine Learning, R, Big Data, Hadoop, Scala, Tableau, Spark, Python, AWS |
| Columbia University | Research Scientist - All Levels (Applied Data Scientist) | New York, NY | Machine Learning, R, C/C++, Big Data, MATLAB, SPSS, Scala, Sas, Java, Data Science, Python |
# Explained in the presentation rmd.
skillCount <- paste(df$ExperienceList, collapse = ",")
skillCount <- strsplit(skillCount, ",")[[1]]
skillCount <- gsub(" ", "", skillCount)
skillTable <- table(skillCount) %>% as.data.frame()ggplot(skillTable, aes(x = reorder(skillCount, Freq), y = Freq)) +
geom_bar(stat = "identity") +
ggtitle("Percentage of Non- Missing Values") +
coord_flip() Set up mongodb:
# SEtting up MongoDB
m <- mongo(url = "mongodb://127.0.0.1:27017" ) # Database ports
jsonlite::stream_out(df, file("skillsDF.json"), verbose = FALSE)
mt <- mongo("skillsDF")
mt$import(file("skillsDF.json"))
mt$find() %>% head(30) %>% kable()| CompanyName | TitleName | City | ExperienceList |
|---|---|---|---|
| Magid | Sr. Data Scientist | New York, NY | Machine Learning, Nurse Practitioner, R, Health, Dental Insurance, Git |
| KPMG | Associate, Data Scientist - Optimization | New York, NY | Discharge, Machine Learning, R, MATLAB, SVN, Ruby, Sas, Tableau, Java, Git, Python |
| M.Gemi | Data Scientist | New York, NY | Python |
| Data Scientist/Quantitative Analyst, Engineering | New York, NY | Machine Learning, R, MATLAB, Android | |
| Columbia University | Research Scientist - All Levels (Applied Data Scientist) | New York, NY | Machine Learning, R, C/C++, Big Data, MATLAB, SPSS, Scala, Sas, Java, Data Science, Python |
| Dstillery | Data Scientist | New York, NY | Machine Learning, R, Java, Data Science, Python |
| Squarespace | Data Scientist | New York, NY | Machine Learning, Hadoop, Health, Paid Time Off, Spark, PostgresSQL, Data Science, Python |
| Data Scientist, Analytics | New York, NY | Hive, Perl, R, MySQL, PHP, Data Mining, Hadoop, Oracle, Sas, Data Warehouse, Python | |
| Diaspark | Data Scientist | Manhattan, NY | Agile, Machine Learning, Hadoop, R, Azure, Java, Spark, Python, Data Science, AWS |
| Cognizant | Data Scientist | New York, NY | Machine Learning, R, Data Science |
| Machine Learning | New York, NY | Big Data, Natural Language Processing, Spark, Machine Learning, Hadoop | |
| Meetup | Data Scientist | New York, NY | Hive, BI, Experimental Design, Hadoop, Scala, R, Spark, Data Warehouse, Python |
| MetroPlus Health Plan | Data Scientist | New York, NY | Machine Learning, Data Mining, Sas, Data Warehouse, Data Science |
| WorkFusion | Deep Learning Data Scientist | New York, NY | AI, Machine Learning, Image Processing, Big Data |
| Wade & Wendy | Senior Data Scientist | New York, NY | AI, Machine Learning, Scala, Clojure, Java, Natural Language Processing, Python |
| Wade & Wendy | Senior Data Scientist | New York, NY | AI, Machine Learning, Scala, Clojure, Java, Natural Language Processing, Python |
| KPMG | Associate, Data Scientist - Optimization | New York, NY | Discharge, Machine Learning, R, MATLAB, SVN, Ruby, Sas, Tableau, Java, Git, Python |
| MetroPlus Health Plan | Data Scientist | New York, NY | Machine Learning, Data Mining, Sas, Data Warehouse, Data Science |
| Amazon Web Services, Inc. | Data Scientist | New York, NY | Data Mining, Machine Learning, R, C/C++, Big Data, Java, Python, AWS |
| Research Scientist, Google Brain (United States) | New York, NY | AI, Data Mining, Machine Learning, Image Processing, C/C++, Natural Language Processing, Python | |
| IFG Companies | Predictive Modeling Analyst | New York, NY | Machine Learning, Vision, Life, R, Sas, Health, Paid Time Off, Dental Insurance, Data Warehouse, Triage, Disability Insurance |
| Delos | Data Scientist | New York, NY | Machine Learning, R, C/C++, Big Data, MATLAB, JavaScript, AI, Data Mining, Hadoop, Scala, Java, Spark, Data Science, Python |
| Uber | Data Scientist - Global Intelligence (New York) | New York, NY | Hive, Rideshare, Nurse Practitioner, R, Data Science |
| Pivotal Software | Data Scientist | New York, NY | Agile, Hive, Machine Learning, R, C/C++, MATLAB, Pentaho, BI, Hadoop, Sas, Tableau, Java, Python |
| Komodo Health | Data Scientist | New York, NY | Machine Learning, Vision, R, MATLAB, Hadoop, Scala, Flexible Schedule, Java, Health, Dental Insurance, Spark, Data Science, Python |
| NBCUniversal | Data Scientist | New York, NY | Hive, Machine Learning, R, Hadoop, Scala, Sas, Image Processing, Java, Spark, Data Science, Python |
| Foursquare | Data Scientist | New York, NY | Machine Learning, R, Big Data, Scala, Data Science, Python |
| M.Gemi | Data Scientist | New York, NY | Python |
| WorkFusion | Deep Learning Data Scientist | New York, NY | AI, Machine Learning, Image Processing, Big Data |
| Magid | Sr. Data Scientist | New York, NY | Machine Learning, Nurse Practitioner, R, Health, Dental Insurance, Git |
Future work
- Find cities highest skill
- Find city with the most diverse requirements
- Find city with least requirements
- K means cluster
- Try
df$ExperienceList <- as.character(df$ExperienceList)
# Not
dfKNN <- df %>%
mutate(ID = 1:n()) %>%
mutate(ExperienceList = strsplit(ExperienceList, split = ",")) %>%
unnest() %>%
mutate(Value = 1) %>%
spread(ExperienceList, Value, fill = 0) rownames(dfKNN) <- dfKNN$ID
dfKNN <- select(dfKNN, -ID)
dfDum <- dfKNN %>%
select(-c(CompanyName, TitleName, City ))km.out = kmeans(dfDum,2, nstart =20)library(dummies)
test <- dummy( df$CompanyName, sep = ",")K Means cluster
clusterDF <- dfKNN[, 1:3]
clusterDF$Cluster <- km.out$cluster
clusterDF %>% head() %>% kable()| CompanyName | TitleName | City | Cluster | |
|---|---|---|---|---|
| 264 | A Place for Mom | Data Scientist | Seattle, WA | 2 |
| 56 | AdTheorent | Data Scientist | New York, NY | 2 |
| 93 | Akamai | 2018 Summer Intern - Data Scientist | Cambridge, MA | 2 |
| 71 | Akamai | Data Scientist.Associate. | Cambridge, MA | 2 |
| 183 | alliantgroup | Engineering Consultant | Houston, TX | 2 |
| 208 | alliantgroup | Engineering Consultant | Houston, TX | 2 |
dfCity <- dfKNN %>%
select(-c(CompanyName, TitleName)) %>%
group_by(City) %>%
summarise_all(funs(mean(., na.rm = TRUE)))
dfCityClean <- dfCity %>% mutate(sumVar = rowSums(.[2:ncol(dfCity)])) %>%
select(City, sumVar) %>%
arrange(-sumVar)
dfCityClean %>% head() %>% kable()| City | sumVar |
|---|---|
| Emeryville, CA | 13.000000 |
| San Bruno, CA | 11.000000 |
| Redmond, WA | 10.500000 |
| Manhattan, NY | 10.000000 |
| Kent, WA | 9.000000 |
| San Francisco, CA | 8.865385 |
dfCityClean %>% tail() %>% kable()| City | sumVar |
|---|---|
| Framingham, MA | 6.000000 |
| Waltham, MA | 6.000000 |
| Houston, TX | 5.583333 |
| Tukwila, WA | 5.000000 |
| Somerville, MA | 4.000000 |
| Natick, MA | 3.000000 |
The most diverse skills by city.
dfCityClean <- dfCity %>% mutate(sumVar = rowSums(.[2:ncol(dfCity)])) %>%
select(City, sumVar) %>%
arrange(-sumVar)
dfCityClean %>% head() %>% kable()| City | sumVar |
|---|---|
| Emeryville, CA | 13.000000 |
| San Bruno, CA | 11.000000 |
| Redmond, WA | 10.500000 |
| Manhattan, NY | 10.000000 |
| Kent, WA | 9.000000 |
| San Francisco, CA | 8.865385 |
dfCityClean %>% tail() %>% kable()| City | sumVar |
|---|---|
| Framingham, MA | 6.000000 |
| Waltham, MA | 6.000000 |
| Houston, TX | 5.583333 |
| Tukwila, WA | 5.000000 |
| Somerville, MA | 4.000000 |
| Natick, MA | 3.000000 |
trimws(dfKNN$City) %>% unique()## [1] "Seattle, WA" "New York, NY" "Cambridge, MA"
## [4] "Houston, TX" "Boston, MA" "San Francisco, CA"
## [7] "Framingham, MA" "Bellevue, WA" "Manhattan, NY"
## [10] "Redmond, WA" "Waltham, MA" "Somerville, MA"
## [13] "Natick, MA" "Emeryville, CA" "Kent, WA"
## [16] "Tukwila, WA" "San Bruno, CA"