Scrape the Indeed.com search-result pages for Data Scientist salary data in NYC, San Fran, Boston, and Chicago:
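The loops below lean on a few packages (rvest for the scraping, dplyr for bind_rows, stringr for the text cleanup, ggplot2 for the plots) and on four URL variables that point at the Indeed salary-search pages for each metro. A minimal setup sketch is shown here; the URLs are placeholders standing in for the actual search queries, which are not reproduced in this post.
# packages used throughout the script
library(rvest)    # read_html, html_nodes, html_text
library(dplyr)    # bind_rows, %>%
library(stringr)  # str_split_fixed, str_extract_all, str_replace_all
library(ggplot2)  # qplot
# placeholder search URLs -- substitute the real Indeed salary-search pages for each metro
url_nyc  <- "https://www.indeed.com/salaries/Data-Scientist-Salaries,-New-York-NY"
url_sf   <- "https://www.indeed.com/salaries/Data-Scientist-Salaries,-San-Francisco-CA"
url_bost <- "https://www.indeed.com/salaries/Data-Scientist-Salaries,-Boston-MA"
url_chi  <- "https://www.indeed.com/salaries/Data-Scientist-Salaries,-Chicago-IL"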
# lists to hold salary results and compensation results from webpages
sal_nyc <- list()
comp_nyc <- list()
sal_sf <- list()
comp_sf <- list()
sal_bost <- list()
comp_bost <- list()
sal_chi <- list()
comp_chi <- list()
# for loop to go through all the search result pages on Indeed.com
# each search result page holds about 10 listings of average salaries
# which explains why the counter 'i' that gets attached to the url
# jumps by 10 each time
# First loop for NYC
# j will act as an index for the list
j <- 1
for (i in seq(0, 70, 10)) {
  # first page of results doesn't have a counter in the url
  if (i == 0) link <- url_nyc else link <- paste0(url_nyc, "?start=", i)
  # the HTML/CSS is messy, but I found the CSS tags associated with
  # salary and average compensation
  pg_sal <- read_html(link) %>% html_nodes('.cmp-sal-summary') %>% html_text()
  pg_comp <- read_html(link) %>% html_nodes('.cmp-sal-links') %>% html_text()
  # save each table of search results as a list within a list
  sal_nyc[j] <- list(pg_sal)
  comp_nyc[j] <- list(pg_comp)
  # increase our index by 1 each time
  j <- j + 1
}
# Second loop for San Fran
# j will act as an index for the list
j <- 1
for (i in seq(0, 50, 10)) {
  # first page of results doesn't have a counter in the url
  if (i == 0) link <- url_sf else link <- paste0(url_sf, "?start=", i)
  # the HTML/CSS is messy, but I found the CSS tags associated with
  # salary and average compensation
  pg_sal <- read_html(link) %>% html_nodes('.cmp-sal-summary') %>% html_text()
  pg_comp <- read_html(link) %>% html_nodes('.cmp-sal-links') %>% html_text()
  # save each table of search results as a list within a list
  sal_sf[j] <- list(pg_sal)
  comp_sf[j] <- list(pg_comp)
  # increase our index by 1 each time
  j <- j + 1
}
# Third loop for Boston
# j will act as an index for the list
j <- 1
for (i in seq(0, 50, 10)) {
  # first page of results doesn't have a counter in the url
  if (i == 0) link <- url_bost else link <- paste0(url_bost, "?start=", i)
  # the HTML/CSS is messy, but I found the CSS tags associated with
  # salary and average compensation
  pg_sal <- read_html(link) %>% html_nodes('.cmp-sal-summary') %>% html_text()
  pg_comp <- read_html(link) %>% html_nodes('.cmp-sal-links') %>% html_text()
  # save each table of search results as a list within a list
  sal_bost[j] <- list(pg_sal)
  comp_bost[j] <- list(pg_comp)
  # increase our index by 1 each time
  j <- j + 1
}
# Fourth loop for Chicago
# j will act as an index for the list
j <- 1
for (i in seq(0, 20, 10)) {
  # first page of results doesn't have a counter in the url
  if (i == 0) link <- url_chi else link <- paste0(url_chi, "?start=", i)
  # the HTML/CSS is messy, but I found the CSS tags associated with
  # salary and average compensation
  pg_sal <- read_html(link) %>% html_nodes('.cmp-sal-summary') %>% html_text()
  pg_comp <- read_html(link) %>% html_nodes('.cmp-sal-links') %>% html_text()
  # save each table of search results as a list within a list
  sal_chi[j] <- list(pg_sal)
  comp_chi[j] <- list(pg_comp)
  # increase our index by 1 each time
  j <- j + 1
}
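The four loops differ only in the base URL, the highest page offset, and the lists they fill, so they could be collapsed into one helper. A rough sketch under that assumption (the name scrape_city and its arguments are illustrative, not part of the original script):
# scrape_city: hypothetical helper that walks one metro's result pages and
# returns the raw salary and company text, one element per page
scrape_city <- function(base_url, last_offset) {
  sal <- list()
  comp <- list()
  j <- 1
  for (i in seq(0, last_offset, 10)) {
    link <- if (i == 0) base_url else paste0(base_url, "?start=", i)
    pg <- read_html(link)  # read each page once instead of twice
    sal[[j]]  <- pg %>% html_nodes('.cmp-sal-summary') %>% html_text()
    comp[[j]] <- pg %>% html_nodes('.cmp-sal-links') %>% html_text()
    j <- j + 1
  }
  list(salary = sal, company = comp)
}
# e.g. nyc_raw <- scrape_city(url_nyc, 70) should reproduce sal_nyc and comp_nyc
Reading each page once also halves the number of requests sent to the site.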
Now that the data has been scraped from the web, it's time to clean it up.
####################################################
# Cleanup NYC Data
####################################################
# the following two lines turn the nested list into a data frame
dfs <- lapply(comp_nyc, data.frame, stringsAsFactors = FALSE)
y <- bind_rows(dfs)
# ugly solution to clean up the extra text after the company name...
# split at the first '-' and keep only the name half, then split off the trailing ' Jobs ...' text
colnames(y) <- "company"
a <- str_split_fixed(y$company,'-',2)
colnames(a) <- c('name','misc')
a <- a[,-2]
a <- str_split_fixed(a,' Jobs',2)
# the following two lines turn the nested list into a data frame
dfs2 <- lapply(sal_nyc, data.frame, stringsAsFactors = FALSE)
z <- bind_rows(dfs2)
colnames(z) <- "salary"
# remove the nationwide-average row that appears on every search result page
z2 <- data.frame(z[!grepl("Average", z$salary),])
colnames(z2) <- "salary"
# combine company and Data Scientist salary into a 2-column data frame
nyc <- data.frame(cbind(a[,1], z2))
colnames(nyc) <- c('company','salary')
# strip any leading digits left over in the scraped salary text
nyc$salary <- gsub('^[0-9]+','',as.character(nyc$salary))
nyc[] <- lapply(nyc, as.character)
# pull out the number itself (digits, commas, periods), drop the commas, and convert to numeric
nyc$salary <- as.numeric(unlist(str_replace_all(str_extract_all(nyc$salary, '[[0-9]+,.]{2,}'),',','')))
# write.csv(nyc,file="nyc_company_salary.csv")
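The same cleanup is repeated verbatim for San Fran, Boston, and Chicago below. For reference, here is a hedged sketch of how those steps could be wrapped in a single helper (the name clean_city is hypothetical, and it assumes, as the per-city code does, that the company and salary vectors line up row for row):
# clean_city: hypothetical helper mirroring the per-city cleanup above
clean_city <- function(comp_list, sal_list) {
  # flatten the nested company list, then strip the text after the
  # first '-' and the trailing ' Jobs ...' portion
  company <- bind_rows(lapply(comp_list, data.frame, stringsAsFactors = FALSE))[[1]]
  name <- str_split_fixed(company, '-', 2)[, 1]
  name <- str_split_fixed(name, ' Jobs', 2)[, 1]
  # flatten the salary list, drop the nationwide-average row repeated on
  # every page, strip leading digits, then pull out the number itself
  salary <- bind_rows(lapply(sal_list, data.frame, stringsAsFactors = FALSE))[[1]]
  salary <- salary[!grepl("Average", salary)]
  salary <- gsub('^[0-9]+', '', salary)
  salary <- as.numeric(str_replace_all(str_extract(salary, '[[0-9]+,.]{2,}'), ',', ''))
  data.frame(company = name, salary = salary, stringsAsFactors = FALSE)
}
# e.g. nyc <- clean_city(comp_nyc, sal_nyc)
Hourly figures, like the ones annualized by hand for San Fran and Chicago below, would still need the 40 x 52 adjustment afterwards.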
####################################################
# Now same cleanup for San Fran
####################################################
# the following two lines turn the nested list into a data frame
dfs <- lapply(comp_sf, data.frame, stringsAsFactors = FALSE)
y <- bind_rows(dfs)
# ugly solution to clean up the extra text after the company name...
colnames(y) <- "company"
a <- str_split_fixed(y$company,'-',2)
colnames(a) <- c('name','misc')
a <- a[,-2]
a <- str_split_fixed(a,' Jobs',2)
# the following two lines turn the nested list into a data frame
dfs2 <- lapply(sal_sf, data.frame, stringsAsFactors = FALSE)
z <- bind_rows(dfs2)
colnames(z) <- "salary"
# remove the nationwide-average row that appears on every search result page
z2 <- data.frame(z[!grepl("Average", z$salary),])
colnames(z2) <- "salary"
# combine company and Data Scientist salary into a 2-column data frame
sf <- data.frame(cbind(a[,1], z2))
colnames(sf) <- c('company','salary')
sf$salary <- gsub('^[0-9]+','',as.character(sf$salary))
sf[] <- lapply(sf, as.character)
sf$salary <- as.numeric(unlist(str_replace_all(str_extract_all(sf$salary, '[[0-9]+,.]{2,}'),',','')))
# Annualize the per-hour compensation in rows 48 and 50 by assuming a 40-hour work week
# and 52 weeks per year (e.g. $60/hr becomes 60 * 40 * 52 = $124,800)
sf$salary[48] <- sf$salary[48]*40*52
sf$salary[50] <- sf$salary[50]*40*52
# write.csv(sf,file="sf_company_salary.csv")
####################################################
# Now same cleanup for Boston
####################################################
# the following two lines turn the nested list into a data frame
dfs <- lapply(comp_bost, data.frame, stringsAsFactors = FALSE)
y <- bind_rows(dfs)
# ugly solution to clean up the extra text after the company name...
colnames(y) <- "company"
a <- str_split_fixed(y$company,'-',2)
colnames(a) <- c('name','misc')
a <- a[,-2]
a <- str_split_fixed(a,' Jobs',2)
# the following two lines turn the nested list into a data frame
dfs2 <- lapply(sal_bost, data.frame, stringsAsFactors = FALSE)
z <- bind_rows(dfs2)
colnames(z) <- "salary"
# remove the nationwide-average row that appears on every search result page
z2 <- data.frame(z[!grepl("Average", z$salary),])
colnames(z2) <- "salary"
# combine company and Data Scientist salary into a 2-column data frame
bost <- data.frame(cbind(a[,1], z2))
colnames(bost) <- c('company','salary')
bost$salary <- gsub('^[0-9]+','',as.character(bost$salary))
bost[] <- lapply(bost, as.character)
bost$salary <- as.numeric(unlist(str_replace_all(str_extract_all(bost$salary, '[[0-9]+,.]{2,}'),',','')))
# write.csv(bost,file="bost_company_salary.csv")
####################################################
# Now same cleanup for Chicago
####################################################
# the following two lines turn the nested list into a data frame
dfs <- lapply(comp_chi, data.frame, stringsAsFactors = FALSE)
y <- bind_rows(dfs)
# ugly solution to clean up the extra text after the company name...
colnames(y) <- "company"
a <- str_split_fixed(y$company,'-',2)
colnames(a) <- c('name','misc')
a <- a[,-2]
a <- str_split_fixed(a,' Jobs',2)
# the following two lines turn the nested list into a data frame
dfs2 <- lapply(sal_chi, data.frame, stringsAsFactors = FALSE)
z <- bind_rows(dfs2)
colnames(z) <- "salary"
# remove the nationwide-average row that appears on every search result page
z2 <- data.frame(z[!grepl("Average", z$salary),])
colnames(z2) <- "salary"
# combine company and Data Scientist salary into a 2-column data frame
chi <- data.frame(cbind(a[,1], z2))
colnames(chi) <- c('company','salary')
chi[] <- lapply(chi, as.character)
chi$salary <- as.numeric(unlist(str_replace_all(str_extract_all(chi$salary, '[[0-9]+,.]{2,}'),',','')))
# Annualize the per-hour compensation in row 18 by assuming a 40-hour work week and 52 weeks per year
chi$salary[18] <- chi$salary[18]*40*52
# write.csv(chi,file="chi_company_salary.csv")
The histograms below show the wide dispersion in Data Scientist salaries across these four metros.
# keep the salary values on the axes out of scientific notation
options(scipen = 9)
qplot(nyc$salary, geom="histogram", xlab="Salaries",
main = "Histogram for NYC Data Scientist Salaries", fill=I("blue"),
col=I("black"), xlim=c(min(nyc$salary)-10000,max(nyc$salary)+10000),
breaks=seq(min(nyc$salary)-10000, max(nyc$salary)+10000,by=5000))

qplot(sf$salary, geom="histogram", xlab="Salaries",
main = "Histogram for San Fran Data Scientist Salaries", fill=I("blue"),
col=I("black"), xlim=c(min(sf$salary)-10000,max(sf$salary)+10000),
breaks=seq(min(sf$salary)-10000, max(sf$salary)+10000,by=10000))

qplot(bost$salary, geom="histogram", xlab="Salaries",
main = "Histogram for Boston Data Scientist Salaries", fill=I("blue"),
col=I("black"), xlim=c(min(bost$salary)-10000,max(bost$salary)+10000),
breaks=seq(min(bost$salary)-10000, max(bost$salary)+10000,by=10000))

qplot(chi$salary, geom="histogram", xlab="Salaries",
main = "Histogram for Chicago Data Scientist Salaries", fill=I("blue"),
col=I("black"), xlim=c(min(chi$salary)-10000,max(chi$salary)+10000),
breaks=seq(min(chi$salary)-10000, max(chi$salary)+10000,by=10000))
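For a quick side-by-side comparison, the four data frames could also be stacked with a metro label and drawn as a single faceted histogram; a rough sketch (the shared 10,000 bin width and the 2x2 facet layout are choices made here, not taken from the plots above):
# stack the four cleaned data frames and label each row with its metro
all_metros <- bind_rows(
  mutate(nyc,  metro = "NYC"),
  mutate(sf,   metro = "San Fran"),
  mutate(bost, metro = "Boston"),
  mutate(chi,  metro = "Chicago")
)
# one histogram per metro on a common salary axis
ggplot(all_metros, aes(x = salary)) +
  geom_histogram(binwidth = 10000, fill = "blue", colour = "black") +
  facet_wrap(~ metro, ncol = 2) +
  labs(x = "Salaries", title = "Data Scientist Salaries by Metro")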
