Webscraping to Find Key DS Skills

Dream Team

2017-10-12

Project 3

The goal of Project 3 is to find key skills for data scientists, store them in a database (MongoDB here), and perform some basic data analysis.

Loading Libraries.

library(RCurl)
library(xml2)
library(rvest)
library(knitr)
library(tidyr)
library(dplyr)
library(readr)
library(mongolite)
library(dummies)
library(ggplot2)

Function to find multiple URLs for all cities:

# Function to generate Indeed search URLs (the first four result pages per city)
itter <- c("", "&start=10", "&start=20", "&start=30")
IndeedCities <- function(jobTitle, city, state){
  url <- c() # Collect one URL per results page
  for(i in itter){
    page <- paste0("https://www.indeed.com/jobs?q=",
                   gsub(" ", replacement = "+", jobTitle),
                   "&l=",
                   gsub(" ", replacement = "+", city),
                   "%2C+",
                   state,
                   i)
    url <- c(url, page) # Append this page's URL
  }
  return(url)
}

The cities we will search over.

# This is easier than updating a data frame from a function inside a loop.
nyc <-  IndeedCities("Data Scientist", "New York", "NY")
bost <- IndeedCities("Data Scientist", "Boston", "MA")
sanFran <- IndeedCities("Data Scientist", "San Francisco", "CA")
hust <- IndeedCities("Data Scientist", "Houston", "TX")
sea <- IndeedCities("Data Scientist", "Seattle", "WA")
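
As a quick sanity check, the first generated URL for New York should look like the string below (reconstructed from the paste0 call above, not re-scraped):

nyc[1]
# "https://www.indeed.com/jobs?q=Data+Scientist&l=New+York%2C+NY"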

Function to create the city data.frames

df1 <- data.frame(CompanyName = as.character(),
                  TitleName = as.character(),
                  City = as.character(),
                  ExperienceList = as.character())
IndeedData <- function(x){
  for(i in 1:4){
    indeedPage <- read_html(x[i]) # Parse one results page

    # Pull each field out of the job cards:

    ExperienceList <- indeedPage %>%
      html_nodes(".experienceList") %>%
      html_text()

    TitleName <- indeedPage %>%
      html_nodes(".jobtitle") %>%
      html_text()

    CompanyName <- indeedPage %>%
      html_nodes(".company") %>%
      html_text()

    City <- indeedPage %>%
      html_nodes(".location") %>%
      html_text()

    # Full text of every result, used to detect postings with no skill list
    test <- indeedPage %>%
      html_nodes(".result") %>%
      html_text()

    # Insert NA wherever a posting has no "Desired Experience" block,
    # so ExperienceList stays aligned with the other columns
    for(j in 1:length(test)) {
      if (!grepl("Desired Experience:", x = test[j])) {
        ExperienceList <- append(ExperienceList, NA, after = j - 1)
      }
    }
    df <- data.frame(CompanyName, TitleName, City, ExperienceList)
    df1 <- rbind(df1, df)
  }
  return(df1)
}

Create the city dfs

nycDF <- IndeedData(nyc) 
bostDF <- IndeedData(bost)
sanFranDF <- IndeedData(sanFran)
hustDF <- IndeedData(hust)
seaDF <- IndeedData(sea)

Combine the city data frames into one and clean up the text fields.

df <- rbind(nycDF, bostDF, sanFranDF, hustDF, seaDF)
df$City <- gsub("\\d.*","",df$City)
df$CompanyName <- gsub("\\n","",df$CompanyName)
df$TitleName <- gsub("\\n","",df$TitleName)
df$City <- trimws(df$City) # Removes white space
df$ExperienceList <- trimws(df$ExperienceList) # Removes white space on either end. 
df %>% head() %>% kable()
CompanyName | TitleName | City | ExperienceList
Wade & Wendy | Senior Data Scientist | New York, NY | AI, Machine Learning, Scala, Clojure, Java, Natural Language Processing, Python
KPMG | Data Scientist | New York, NY | Discharge, Machine Learning, R, Natural Language Processing, MATLAB, Data Mining, Ruby, Hadoop, Scala, Sas, Tableau, Java, Spark, Python
Cigna | Director of Software Engineering - Machine Learning & Artifi… | New York, NY | TensorFlow, AI, Data Mining, Machine Learning, Scala, Image Processing, C/C++, Natural Language Processing, MATLAB, Python, Lua
Google | Data Scientist/Quantitative Analyst, Engineering | New York, NY | Machine Learning, R, MATLAB, Android
Capital One | Data Scientist | New York, NY | Machine Learning, R, Big Data, Hadoop, Scala, Tableau, Spark, Python, AWS
Columbia University | Research Scientist - All Levels (Applied Data Scientist) | New York, NY | Machine Learning, R, C/C++, Big Data, MATLAB, SPSS, Scala, Sas, Java, Data Science, Python
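
Before looking at skills, a quick count of postings per location gives a sense of how much each search returned (a small check with dplyr, not part of the original run):

# Number of scraped postings per location string
df %>%
  count(City, sort = TRUE) %>%
  kable()
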
# Explained in the presentation rmd.
skillCount <- paste(df$ExperienceList, collapse = ",")
skillCount <- strsplit(skillCount, ",")[[1]]
skillCount <- gsub(" ", "", skillCount)
skillTable <- table(skillCount) %>% as.data.frame()
ggplot(skillTable, aes(x = reorder(skillCount, Freq), y = Freq)) +
  geom_bar(stat = "identity") +
  ggtitle("Skill Frequency Across All Postings") +
  coord_flip()
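
The same counts can be read off numerically; for example, a top-10 table built from the skillTable above (a minimal sketch, not part of the original run):

# Ten most frequently listed skills
skillTable %>%
  arrange(desc(Freq)) %>%
  head(10) %>%
  kable()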

Set up MongoDB:

# Setting up MongoDB
m <- mongo(url = "mongodb://127.0.0.1:27017") # Connect to the default local MongoDB port

jsonlite::stream_out(df, file("skillsDF.json"), verbose = FALSE)
mt <- mongo("skillsDF")
mt$import(file("skillsDF.json"))
mt$find() %>% head(30) %>% kable()
CompanyName | TitleName | City | ExperienceList
Magid | Sr. Data Scientist | New York, NY | Machine Learning, Nurse Practitioner, R, Health, Dental Insurance, Git
KPMG | Associate, Data Scientist - Optimization | New York, NY | Discharge, Machine Learning, R, MATLAB, SVN, Ruby, Sas, Tableau, Java, Git, Python
M.Gemi | Data Scientist | New York, NY | Python
Google | Data Scientist/Quantitative Analyst, Engineering | New York, NY | Machine Learning, R, MATLAB, Android
Columbia University | Research Scientist - All Levels (Applied Data Scientist) | New York, NY | Machine Learning, R, C/C++, Big Data, MATLAB, SPSS, Scala, Sas, Java, Data Science, Python
Dstillery | Data Scientist | New York, NY | Machine Learning, R, Java, Data Science, Python
Squarespace | Data Scientist | New York, NY | Machine Learning, Hadoop, Health, Paid Time Off, Spark, PostgresSQL, Data Science, Python
Facebook | Data Scientist, Analytics | New York, NY | Hive, Perl, R, MySQL, PHP, Data Mining, Hadoop, Oracle, Sas, Data Warehouse, Python
Diaspark | Data Scientist | Manhattan, NY | Agile, Machine Learning, Hadoop, R, Azure, Java, Spark, Python, Data Science, AWS
Cognizant | Data Scientist | New York, NY | Machine Learning, R, Data Science
Twitter | Machine Learning | New York, NY | Big Data, Natural Language Processing, Spark, Machine Learning, Hadoop
Meetup | Data Scientist | New York, NY | Hive, BI, Experimental Design, Hadoop, Scala, R, Spark, Data Warehouse, Python
MetroPlus Health Plan | Data Scientist | New York, NY | Machine Learning, Data Mining, Sas, Data Warehouse, Data Science
WorkFusion | Deep Learning Data Scientist | New York, NY | AI, Machine Learning, Image Processing, Big Data
Wade & Wendy | Senior Data Scientist | New York, NY | AI, Machine Learning, Scala, Clojure, Java, Natural Language Processing, Python
Wade & Wendy | Senior Data Scientist | New York, NY | AI, Machine Learning, Scala, Clojure, Java, Natural Language Processing, Python
KPMG | Associate, Data Scientist - Optimization | New York, NY | Discharge, Machine Learning, R, MATLAB, SVN, Ruby, Sas, Tableau, Java, Git, Python
MetroPlus Health Plan | Data Scientist | New York, NY | Machine Learning, Data Mining, Sas, Data Warehouse, Data Science
Amazon Web Services, Inc. | Data Scientist | New York, NY | Data Mining, Machine Learning, R, C/C++, Big Data, Java, Python, AWS
Google | Research Scientist, Google Brain (United States) | New York, NY | AI, Data Mining, Machine Learning, Image Processing, C/C++, Natural Language Processing, Python
IFG Companies | Predictive Modeling Analyst | New York, NY | Machine Learning, Vision, Life, R, Sas, Health, Paid Time Off, Dental Insurance, Data Warehouse, Triage, Disability Insurance
Delos | Data Scientist | New York, NY | Machine Learning, R, C/C++, Big Data, MATLAB, JavaScript, AI, Data Mining, Hadoop, Scala, Java, Spark, Data Science, Python
Uber | Data Scientist - Global Intelligence (New York) | New York, NY | Hive, Rideshare, Nurse Practitioner, R, Data Science
Pivotal Software | Data Scientist | New York, NY | Agile, Hive, Machine Learning, R, C/C++, MATLAB, Pentaho, BI, Hadoop, Sas, Tableau, Java, Python
Komodo Health | Data Scientist | New York, NY | Machine Learning, Vision, R, MATLAB, Hadoop, Scala, Flexible Schedule, Java, Health, Dental Insurance, Spark, Data Science, Python
NBCUniversal | Data Scientist | New York, NY | Hive, Machine Learning, R, Hadoop, Scala, Sas, Image Processing, Java, Spark, Data Science, Python
Foursquare | Data Scientist | New York, NY | Machine Learning, R, Big Data, Scala, Data Science, Python
M.Gemi | Data Scientist | New York, NY | Python
WorkFusion | Deep Learning Data Scientist | New York, NY | AI, Machine Learning, Image Processing, Big Data
Magid | Sr. Data Scientist | New York, NY | Machine Learning, Nurse Practitioner, R, Health, Dental Insurance, Git
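
One benefit of keeping the data in MongoDB is that it can be filtered at query time rather than in R. As an illustration (a sketch, not run as part of the project), postings whose skill list mentions Python:

# Case-insensitive regex match on the ExperienceList field
mt$find(query = '{"ExperienceList": {"$regex": "Python", "$options": "i"}}') %>%
  head() %>%
  kable()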

Future work

  • Find each city's most in-demand skill
  • Find the city with the most diverse skill requirements
  • Find the city with the fewest requirements
  • Try a k-means clustering of the one-hot encoded skills (attempted below)
df$ExperienceList <- as.character(df$ExperienceList)
# One-hot encode the skills: one column per skill, 1 if the posting lists it
dfKNN <- df %>%
  mutate(ID = 1:n()) %>%
  mutate(ExperienceList = strsplit(ExperienceList, split = ",")) %>%
  unnest() %>%
  mutate(Value = 1) %>%
  spread(ExperienceList, Value, fill = 0)
rownames(dfKNN) <- dfKNN$ID
dfKNN <- select(dfKNN, -ID)

# Drop the text columns so only the skill indicators go into k-means
dfDum <- dfKNN %>%
  select(-c(CompanyName, TitleName, City))
km.out <- kmeans(dfDum, 2, nstart = 20)
test <- dummy(df$CompanyName, sep = ",") # dummies package, loaded above
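
Picking two clusters above was arbitrary. One way to sanity-check that choice is an elbow plot of the total within-cluster sum of squares over a range of k (a sketch reusing dfDum; not part of the original analysis):

# Elbow heuristic: total within-cluster sum of squares for k = 1..10
set.seed(1)
wss <- sapply(1:10, function(k) kmeans(dfDum, centers = k, nstart = 20)$tot.withinss)
plot(1:10, wss, type = "b",
     xlab = "Number of clusters k",
     ylab = "Total within-cluster sum of squares")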

K-means cluster assignments

clusterDF <- dfKNN[, 1:3]
clusterDF$Cluster <- km.out$cluster
clusterDF %>% head() %>% kable()
  | CompanyName | TitleName | City | Cluster
264 | A Place for Mom | Data Scientist | Seattle, WA | 2
56 | AdTheorent | Data Scientist | New York, NY | 2
93 | Akamai | 2018 Summer Intern - Data Scientist | Cambridge, MA | 2
71 | Akamai | Data Scientist.Associate. | Cambridge, MA | 2
183 | alliantgroup | Engineering Consultant | Houston, TX | 2
208 | alliantgroup | Engineering Consultant | Houston, TX | 2
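
To see whether the two clusters track anything interpretable, they can be tabulated against city (a quick sketch; output not shown):

# Cluster sizes overall, then the split by city
clusterDF %>% count(Cluster) %>% kable()
clusterDF %>%
  count(City, Cluster) %>%
  spread(Cluster, n, fill = 0) %>%
  kable()
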
dfCity <- dfKNN %>% 
  select(-c(CompanyName, TitleName)) %>% 
  group_by(City) %>% 
  summarise_all(funs(mean(., na.rm = TRUE)))


The most diverse skill requirements by city (sumVar is the average number of skills listed per posting).

dfCityClean <- dfCity %>% mutate(sumVar = rowSums(.[2:ncol(dfCity)])) %>% 
  select(City, sumVar) %>% 
  arrange(-sumVar)
dfCityClean %>% head() %>% kable()
City | sumVar
Emeryville, CA | 13.000000
San Bruno, CA | 11.000000
Redmond, WA | 10.500000
Manhattan, NY | 10.000000
Kent, WA | 9.000000
San Francisco, CA | 8.865385
dfCityClean %>% tail() %>% kable()
City | sumVar
Framingham, MA | 6.000000
Waltham, MA | 6.000000
Houston, TX | 5.583333
Tukwila, WA | 5.000000
Somerville, MA | 4.000000
Natick, MA | 3.000000
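
The "most in-demand skill per city" idea from the future-work list could start from the same dfCity summary; a sketch (not run here) that keeps each city's single most common skill:

# For each city, the skill with the highest share of postings listing it
dfCity %>%
  gather(Skill, Share, -City) %>%
  group_by(City) %>%
  top_n(1, Share) %>%
  arrange(desc(Share))
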
trimws(dfKNN$City) %>% unique()
##  [1] "Seattle, WA"       "New York, NY"      "Cambridge, MA"    
##  [4] "Houston, TX"       "Boston, MA"        "San Francisco, CA"
##  [7] "Framingham, MA"    "Bellevue, WA"      "Manhattan, NY"    
## [10] "Redmond, WA"       "Waltham, MA"       "Somerville, MA"   
## [13] "Natick, MA"        "Emeryville, CA"    "Kent, WA"         
## [16] "Tukwila, WA"       "San Bruno, CA"