Webscraping to Find Key DS Skills

Dream Team

2017-10-12

Project 3

The goal of Project 3 is to find key skills for data scientists, store them in a database (MongoDB here), and perform some basic data analysis.

Loading Libraries.

library(RCurl)
library(xml2)
library(rvest)
library(knitr)
library(tidyr)
library(dplyr)
library(readr)
library(mongolite)
library(dummies)
library(ggplot2)

Function to find multiple URLs for all cities:

# Function to generate Indeed search URLs (the first four result pages per city)
itter <- c("", "&start=10", "&start=20", "&start=30")
IndeedCities <- function(jobTitle, city, state){
  url <- c() # Collect one URL per results page
  for(i in itter){
    page <- paste0("https://www.indeed.com/jobs?q=",
                   gsub(" ", replacement = "+", jobTitle),
                   "&l=",
                   gsub(" ", replacement = "+", city),
                   "%2C+",
                   state,
                   i)
    url <- c(url, page) # Append this page's URL
  }
  return(url)
}

The cities we will search over.

# This is easier than updating a data frame from a function inside a loop.
nyc <-  IndeedCities("Data Scientist", "New York", "NY")
bost <- IndeedCities("Data Scientist", "Boston", "MA")
sanFran <- IndeedCities("Data Scientist", "San Francisco", "CA")
hust <- IndeedCities("Data Scientist", "Houston", "TX")
sea <- IndeedCities("Data Scientist", "Seattle", "WA")
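
As a quick sanity check, the first generated URL for New York should look like the string below (reconstructed from the paste0 call above, not re-scraped):

nyc[1]
# "https://www.indeed.com/jobs?q=Data+Scientist&l=New+York%2C+NY"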

Function to create the city data.frames

df1 <- data.frame(CompanyName = as.character(),
                  TitleName = as.character(),
                  City = as.character(),
                  ExperienceList = as.character())
IndeedData <- function(x){
  for(i in 1:4){
    indeedPage <- read_html(x[i]) # Parse one results page

    # Pull each field out of the job cards:

    ExperienceList <- indeedPage %>%
      html_nodes(".experienceList") %>%
      html_text()

    TitleName <- indeedPage %>%
      html_nodes(".jobtitle") %>%
      html_text()

    CompanyName <- indeedPage %>%
      html_nodes(".company") %>%
      html_text()

    City <- indeedPage %>%
      html_nodes(".location") %>%
      html_text()

    # Full text of every result, used to detect postings with no skill list
    test <- indeedPage %>%
      html_nodes(".result") %>%
      html_text()

    # Insert NA wherever a posting has no "Desired Experience" block,
    # so ExperienceList stays aligned with the other columns
    for(j in 1:length(test)) {
      if (!grepl("Desired Experience:", x = test[j])) {
        ExperienceList <- append(ExperienceList, NA, after = j - 1)
      }
    }
    df <- data.frame(CompanyName, TitleName, City, ExperienceList)
    df1 <- rbind(df1, df)
  }
  return(df1)
}

Create the city dfs

nycDF <- IndeedData(nyc) 
bostDF <- IndeedData(bost)
sanFranDF <- IndeedData(sanFran)
hustDF <- IndeedData(hust)
seaDF <- IndeedData(sea)

Combine the city data frames into one and clean up the text fields.

df <- rbind(nycDF, bostDF, sanFranDF, hustDF, seaDF)
df$City <- gsub("\\d.*","",df$City)
df$CompanyName <- gsub("\\n","",df$CompanyName)
df$TitleName <- gsub("\\n","",df$TitleName)
df$City <- trimws(df$City) # Removes white space
df$ExperienceList <- trimws(df$ExperienceList) # Removes white space on either end. 
df %>% head() %>% kable()
CompanyName | TitleName | City | ExperienceList
Wade & Wendy | Senior Data Scientist | New York, NY | AI, Machine Learning, Scala, Clojure, Java, Natural Language Processing, Python
KPMG | Data Scientist | New York, NY | Discharge, Machine Learning, R, Natural Language Processing, MATLAB, Data Mining, Ruby, Hadoop, Scala, Sas, Tableau, Java, Spark, Python
Cigna | Director of Software Engineering - Machine Learning & Artifi… | New York, NY | TensorFlow, AI, Data Mining, Machine Learning, Scala, Image Processing, C/C++, Natural Language Processing, MATLAB, Python, Lua
Google | Data Scientist/Quantitative Analyst, Engineering | New York, NY | Machine Learning, R, MATLAB, Android
Capital One | Data Scientist | New York, NY | Machine Learning, R, Big Data, Hadoop, Scala, Tableau, Spark, Python, AWS
Columbia University | Research Scientist - All Levels (Applied Data Scientist) | New York, NY | Machine Learning, R, C/C++, Big Data, MATLAB, SPSS, Scala, Sas, Java, Data Science, Python
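
Before looking at skills, a quick count of postings per location gives a sense of how much each search returned (a small check with dplyr, not part of the original run):

# Number of scraped postings per location string
df %>%
  count(City, sort = TRUE) %>%
  kable()
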
# Explained in the presentation rmd.
skillCount <- paste(df$ExperienceList, collapse = ",")
skillCount <- strsplit(skillCount, ",")[[1]]
skillCount <- gsub(" ", "", skillCount)
skillTable <- table(skillCount) %>% as.data.frame()
ggplot(skillTable, aes(x = reorder(skillCount, Freq), y = Freq)) +
  geom_bar(stat = "identity") +
  ggtitle("Skill Frequency Across All Postings") +
  coord_flip()
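
The same counts can be read off numerically; for example, a top-10 table built from the skillTable above (a minimal sketch, not part of the original run):

# Ten most frequently listed skills
skillTable %>%
  arrange(desc(Freq)) %>%
  head(10) %>%
  kable()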

Set up MongoDB:

# Setting up MongoDB
m <- mongo(url = "mongodb://127.0.0.1:27017") # Connect to the default local MongoDB port

jsonlite::stream_out(df, file("skillsDF.json"), verbose = FALSE)
mt <- mongo("skillsDF")
mt$import(file("skillsDF.json"))
mt$find() %>% head(30) %>% kable()
CompanyName | TitleName | City | ExperienceList
Magid | Sr. Data Scientist | New York, NY | Machine Learning, Nurse Practitioner, R, Health, Dental Insurance, Git
KPMG | Associate, Data Scientist - Optimization | New York, NY | Discharge, Machine Learning, R, MATLAB, SVN, Ruby, Sas, Tableau, Java, Git, Python
M.Gemi | Data Scientist | New York, NY | Python
Google | Data Scientist/Quantitative Analyst, Engineering | New York, NY | Machine Learning, R, MATLAB, Android
Columbia University | Research Scientist - All Levels (Applied Data Scientist) | New York, NY | Machine Learning, R, C/C++, Big Data, MATLAB, SPSS, Scala, Sas, Java, Data Science, Python
Dstillery | Data Scientist | New York, NY | Machine Learning, R, Java, Data Science, Python
Squarespace | Data Scientist | New York, NY | Machine Learning, Hadoop, Health, Paid Time Off, Spark, PostgresSQL, Data Science, Python
Facebook | Data Scientist, Analytics | New York, NY | Hive, Perl, R, MySQL, PHP, Data Mining, Hadoop, Oracle, Sas, Data Warehouse, Python
Diaspark | Data Scientist | Manhattan, NY | Agile, Machine Learning, Hadoop, R, Azure, Java, Spark, Python, Data Science, AWS
Cognizant | Data Scientist | New York, NY | Machine Learning, R, Data Science
Twitter | Machine Learning | New York, NY | Big Data, Natural Language Processing, Spark, Machine Learning, Hadoop
Meetup | Data Scientist | New York, NY | Hive, BI, Experimental Design, Hadoop, Scala, R, Spark, Data Warehouse, Python
MetroPlus Health Plan | Data Scientist | New York, NY | Machine Learning, Data Mining, Sas, Data Warehouse, Data Science
WorkFusion | Deep Learning Data Scientist | New York, NY | AI, Machine Learning, Image Processing, Big Data
Wade & Wendy | Senior Data Scientist | New York, NY | AI, Machine Learning, Scala, Clojure, Java, Natural Language Processing, Python
Wade & Wendy | Senior Data Scientist | New York, NY | AI, Machine Learning, Scala, Clojure, Java, Natural Language Processing, Python
KPMG | Associate, Data Scientist - Optimization | New York, NY | Discharge, Machine Learning, R, MATLAB, SVN, Ruby, Sas, Tableau, Java, Git, Python
MetroPlus Health Plan | Data Scientist | New York, NY | Machine Learning, Data Mining, Sas, Data Warehouse, Data Science
Amazon Web Services, Inc. | Data Scientist | New York, NY | Data Mining, Machine Learning, R, C/C++, Big Data, Java, Python, AWS
Google | Research Scientist, Google Brain (United States) | New York, NY | AI, Data Mining, Machine Learning, Image Processing, C/C++, Natural Language Processing, Python
IFG Companies | Predictive Modeling Analyst | New York, NY | Machine Learning, Vision, Life, R, Sas, Health, Paid Time Off, Dental Insurance, Data Warehouse, Triage, Disability Insurance
Delos | Data Scientist | New York, NY | Machine Learning, R, C/C++, Big Data, MATLAB, JavaScript, AI, Data Mining, Hadoop, Scala, Java, Spark, Data Science, Python
Uber | Data Scientist - Global Intelligence (New York) | New York, NY | Hive, Rideshare, Nurse Practitioner, R, Data Science
Pivotal Software | Data Scientist | New York, NY | Agile, Hive, Machine Learning, R, C/C++, MATLAB, Pentaho, BI, Hadoop, Sas, Tableau, Java, Python
Komodo Health | Data Scientist | New York, NY | Machine Learning, Vision, R, MATLAB, Hadoop, Scala, Flexible Schedule, Java, Health, Dental Insurance, Spark, Data Science, Python
NBCUniversal | Data Scientist | New York, NY | Hive, Machine Learning, R, Hadoop, Scala, Sas, Image Processing, Java, Spark, Data Science, Python
Foursquare | Data Scientist | New York, NY | Machine Learning, R, Big Data, Scala, Data Science, Python
M.Gemi | Data Scientist | New York, NY | Python
WorkFusion | Deep Learning Data Scientist | New York, NY | AI, Machine Learning, Image Processing, Big Data
Magid | Sr. Data Scientist | New York, NY | Machine Learning, Nurse Practitioner, R, Health, Dental Insurance, Git
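
One benefit of keeping the data in MongoDB is that it can be filtered at query time rather than in R. As an illustration (a sketch, not run as part of the project), postings whose skill list mentions Python:

# Case-insensitive regex match on the ExperienceList field
mt$find(query = '{"ExperienceList": {"$regex": "Python", "$options": "i"}}') %>%
  head() %>%
  kable()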

Future work

  • Find each city's most in-demand skill
  • Find the city with the most diverse skill requirements
  • Find the city with the fewest requirements
  • Try a k-means clustering of the one-hot encoded skills (attempted below)
df$ExperienceList <- as.character(df$ExperienceList)
# One-hot encode the skills: one column per skill, 1 if the posting lists it
dfKNN <- df %>%
  mutate(ID = 1:n()) %>%
  mutate(ExperienceList = strsplit(ExperienceList, split = ",")) %>%
  unnest() %>%
  mutate(Value = 1) %>%
  spread(ExperienceList, Value, fill = 0)
rownames(dfKNN) <- dfKNN$ID
dfKNN <- select(dfKNN, -ID)

# Drop the text columns so only the skill indicators go into k-means
dfDum <- dfKNN %>%
  select(-c(CompanyName, TitleName, City))
km.out <- kmeans(dfDum, 2, nstart = 20)
test <- dummy(df$CompanyName, sep = ",") # dummies package, loaded above
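
Picking two clusters above was arbitrary. One way to sanity-check that choice is an elbow plot of the total within-cluster sum of squares over a range of k (a sketch reusing dfDum; not part of the original analysis):

# Elbow heuristic: total within-cluster sum of squares for k = 1..10
set.seed(1)
wss <- sapply(1:10, function(k) kmeans(dfDum, centers = k, nstart = 20)$tot.withinss)
plot(1:10, wss, type = "b",
     xlab = "Number of clusters k",
     ylab = "Total within-cluster sum of squares")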

K-means cluster assignments

clusterDF <- dfKNN[, 1:3]
clusterDF$Cluster <- km.out$cluster
clusterDF %>% head() %>% kable()
  | CompanyName | TitleName | City | Cluster
264 | A Place for Mom | Data Scientist | Seattle, WA | 2
56 | AdTheorent | Data Scientist | New York, NY | 2
93 | Akamai | 2018 Summer Intern - Data Scientist | Cambridge, MA | 2
71 | Akamai | Data Scientist.Associate. | Cambridge, MA | 2
183 | alliantgroup | Engineering Consultant | Houston, TX | 2
208 | alliantgroup | Engineering Consultant | Houston, TX | 2
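
To see whether the two clusters track anything interpretable, they can be tabulated against city (a quick sketch; output not shown):

# Cluster sizes overall, then the split by city
clusterDF %>% count(Cluster) %>% kable()
clusterDF %>%
  count(City, Cluster) %>%
  spread(Cluster, n, fill = 0) %>%
  kable()
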
dfCity <- dfKNN %>% 
  select(-c(CompanyName, TitleName)) %>% 
  group_by(City) %>% 
  summarise_all(funs(mean(., na.rm = TRUE)))


The most diverse skill requirements by city (sumVar is the average number of skills listed per posting).

dfCityClean <- dfCity %>% mutate(sumVar = rowSums(.[2:ncol(dfCity)])) %>% 
  select(City, sumVar) %>% 
  arrange(-sumVar)
dfCityClean %>% head() %>% kable()
City | sumVar
Emeryville, CA | 13.000000
San Bruno, CA | 11.000000
Redmond, WA | 10.500000
Manhattan, NY | 10.000000
Kent, WA | 9.000000
San Francisco, CA | 8.865385
dfCityClean %>% tail() %>% kable()
City | sumVar
Framingham, MA | 6.000000
Waltham, MA | 6.000000
Houston, TX | 5.583333
Tukwila, WA | 5.000000
Somerville, MA | 4.000000
Natick, MA | 3.000000
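
The "most in-demand skill per city" idea from the future-work list could start from the same dfCity summary; a sketch (not run here) that keeps each city's single most common skill:

# For each city, the skill with the highest share of postings listing it
dfCity %>%
  gather(Skill, Share, -City) %>%
  group_by(City) %>%
  top_n(1, Share) %>%
  arrange(desc(Share))
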
trimws(dfKNN$City) %>% unique()
##  [1] "Seattle, WA"       "New York, NY"      "Cambridge, MA"    
##  [4] "Houston, TX"       "Boston, MA"        "San Francisco, CA"
##  [7] "Framingham, MA"    "Bellevue, WA"      "Manhattan, NY"    
## [10] "Redmond, WA"       "Waltham, MA"       "Somerville, MA"   
## [13] "Natick, MA"        "Emeryville, CA"    "Kent, WA"         
## [16] "Tukwila, WA"       "San Bruno, CA"