library(dplyr)
library(tidyr)
library(ggplot2)
library(DT)
library(stringr)
library(zoo)
library(knitr)
library(ggpubr)
library(rvest)
library(tidyverse)
library(data.table)
library(magrittr)
library(digest)
library(RPostgreSQL)
library(tidytext)
library(config)

From Sunny:
raw_nyc_df <- read.csv('https://raw.githubusercontent.com/mehtablocker/cuny_607/master/project_3/nyc-jobs.csv')
nyc_jobs_df <- raw_nyc_df
names(nyc_jobs_df) <- names(nyc_jobs_df) %>% tolower() %>% gsub("\\.", "_", .)
names(nyc_jobs_df)[names(nyc_jobs_df)=="x__of_positions"] <- "n_of_positions"
nyc_jobs_df <- nyc_jobs_df %>% select(-posting_type) %>% unique()

From Sunny:
data_jobs_df <- nyc_jobs_df %>% filter(grepl("data|analytics", business_title, ignore.case = T))
other_jobs_df <- nyc_jobs_df %>% filter(!grepl("data|analytics", business_title, ignore.case = T))

Remove hourly salaries:
salary.thresh <- 20000
data_jobs_df <- subset(data_jobs_df, salary_range_to >= salary.thresh & salary_range_from >= salary.thresh)
other_jobs_df <- subset(other_jobs_df, salary_range_to >= salary.thresh & salary_range_from >= salary.thresh)

Sunny provided some interesting results on the salaries, and I wanted to do some statistical analysis on them. Below are the box plots produced by Sunny for the higher end of the salary range.
summary(data_jobs_df$salary_range_to)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  40629   74740   86347   88736  101854  161497
summary(other_jobs_df$salary_range_to)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  32621   62815   80000   88985  105000  230000
par(mfrow=c(1,2))
boxplot(data_jobs_df$salary_range_to, xlab="Data Jobs", ylab="Salary in Dollars", ylim=c(0, 200000))
boxplot(other_jobs_df$salary_range_to, xlab="Non-Data Jobs", ylab="Salary in Dollars", ylim=c(0, 200000))

I conducted a two-sample t-test to compare the two groups. The null and alternative hypotheses are shown below:
\(H_{0}: \mu_{A}=\mu_{B}\)
\(H_{a}: \mu_{A}>\mu_{B}\)
where \(A\) in this case represents the data group, and \(B\) represents the non-data group.
The t-statistic can be calculated using Welch's method, since the variances of the two groups do not appear to be equal. The equation is as follows:
\(t = \frac{\bar{x}_{A}-\bar{x}_{B}}{\sqrt{\frac{s_{A}^{2}}{n_{A}}+\frac{s_{B}^{2}}{n_{B}}}}\)
The degrees of freedom, \(df\), are calculated using the following equation:
\(df = \frac{\left(\frac{s_{A}^{2}}{n_{A}}+\frac{s_{B}^{2}}{n_{B}}\right)^{2}}{\frac{s_{A}^{4}}{n_{A}^{2}(n_{A}-1)}+\frac{s_{B}^{4}}{n_{B}^{2}(n_{B}-1)}}\)
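These formulas can be checked by hand against the t.test() output shown below. A minimal sketch, using the same salary_range_to vectors and only base R (the helper names x_a, x_b, and so on are just for illustration):
#compute the Welch t-statistic, degrees of freedom, and one-sided p-value from the formulas above
x_a <- data_jobs_df$salary_range_to
x_b <- other_jobs_df$salary_range_to
n_a <- length(x_a); n_b <- length(x_b)
s2_a <- var(x_a); s2_b <- var(x_b)
t_stat <- (mean(x_a) - mean(x_b)) / sqrt(s2_a/n_a + s2_b/n_b)
df_welch <- (s2_a/n_a + s2_b/n_b)^2 / (s2_a^2/(n_a^2*(n_a - 1)) + s2_b^2/(n_b^2*(n_b - 1)))
p_val <- pt(t_stat, df_welch, lower.tail = FALSE)   #one-sided p-value for the "greater" alternative
c(t = t_stat, df = df_welch, p = p_val)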
The t-test shows that we fail to reject the null hypothesis (\(p = 0.531\)), so there is no significant evidence that the upper end of the salary range is higher for data jobs.
t.test(data_jobs_df$salary_range_to, other_jobs_df$salary_range_to, alternative="greater", var.equal=FALSE)
Welch Two Sample t-test
data: data_jobs_df$salary_range_to and other_jobs_df$salary_range_to
t = -0.077608, df = 71.146, p-value = 0.5308
alternative hypothesis: true difference in means is greater than 0
95 percent confidence interval:
-5588.974 Inf
sample estimates:
mean of x mean of y
88735.88 88984.57
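Since the ggpubr package is loaded, the same one-sided comparison could also be annotated directly on a boxplot with stat_compare_means(). A sketch, assuming the two salary vectors are first stacked into a long data frame (salary_long and its column names are illustrative, and the direction of the one-sided test follows the ordering of the group levels):
#stack the upper salary ranges into long format for plotting
salary_long <- bind_rows(
  data.frame(group = "Data", salary_to = data_jobs_df$salary_range_to),
  data.frame(group = "Non-data", salary_to = other_jobs_df$salary_range_to)
)
ggboxplot(salary_long, x = "group", y = "salary_to", ylab = "Salary in Dollars") +
  stat_compare_means(method = "t.test", method.args = list(alternative = "greater", var.equal = FALSE))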
However, if we conduct the same t-test on the lower salary range, the \(p\)-value is below 0.05 (\(p = 0.005\)). There is therefore sufficient evidence to reject the null hypothesis in favor of the alternative: the lower end of the posted salary range is higher for data jobs than for non-data jobs. The lower end may also be the more meaningful figure, since many job postings advertise a wide range but tend to offer closer to the lower number.
t.test(data_jobs_df$salary_range_from, other_jobs_df$salary_range_from, alternative="greater", var.equal=FALSE)
Welch Two Sample t-test
data: data_jobs_df$salary_range_from and other_jobs_df$salary_range_from
t = 2.639, df = 68.408, p-value = 0.005144
alternative hypothesis: true difference in means is greater than 0
95 percent confidence interval:
2067.422 Inf
sample estimates:
mean of x mean of y
67193.67 61578.12
numbreaks <- 10
up.lim <- 200000
par(mfrow=c(2,2))
hist(data_jobs_df$salary_range_from,breaks=numbreaks,xlim=c(0,up.lim),main="Lower Limit Data Jobs",xlab="Salary ($)")
hist(other_jobs_df$salary_range_from,breaks=numbreaks,xlim=c(0,up.lim),main="Lower Limit Other Jobs",xlab="Salary ($)")
hist(data_jobs_df$salary_range_to,breaks=numbreaks,xlim=c(0,up.lim),main="Upper Limit Data Jobs",xlab="Salary ($)")
hist(other_jobs_df$salary_range_to,breaks=numbreaks,xlim=c(0,up.lim),main="Upper Limit Other Jobs",xlab="Salary ($)")

To obtain the latest available information about data science job postings, 100 pages of Indeed search results are scraped with the rvest package. The resulting dataset contains the following fields: job title, company, location, job summary, and link. To identify the HTML nodes that return these fields, the Chrome extension SelectorGadget was used in conjunction with the browser's Inspect Element tool. Regular expressions are applied to each field to remove blank spaces, new lines, and unnecessary information.
#empty data frame to collect one row per scraped posting
listings <- data.frame(title=character(),
                       company=character(),
                       location=character(),
                       summary=character(),
                       link=character(),
                       stringsAsFactors=FALSE)
for (i in seq(0, 990, 10)){
url_ds <- paste0('https://www.indeed.com/jobs?q=data+scientist&l=all&start=',i)
var <- read_html(url_ds)
#job title
title <- var %>%
html_nodes('#resultsCol .jobtitle') %>%
html_text() %>%
str_extract("(\\w+.+)+")
#company
company <- var %>%
html_nodes('#resultsCol .company') %>%
html_text() %>%
str_extract("(\\w+).+")
#location
location <- var %>%
html_nodes('#resultsCol .location') %>%
html_text() %>%
str_extract("(\\w+.)+,.[A-Z]{2}")
#summary
summary <- var %>%
html_nodes('#resultsCol .summary') %>%
html_text() %>%
str_extract(".+")
#link
link <- var %>%
html_nodes('#resultsCol .jobtitle .turnstileLink, #resultsCol a.jobtitle') %>%
html_attr('href')
link <- paste0("https://www.indeed.com",link)
#append this page's results (one row per posting)
listings <- rbind(listings, data.frame(title,
                                       company,
                                       location,
                                       summary,
                                       link,
                                       stringsAsFactors=FALSE))
}

We have successfully scraped job title, company, location, job summary, and job link from 100 pages of Indeed job postings.
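Because paginated search results can repeat the same posting, it is worth a quick look at the scraped table before summarizing it. A small sketch; the counts will vary with whatever Indeed returns at scrape time:
#size of the scraped table and a rough count of repeated postings
dim(listings)
sum(duplicated(listings[, c("title", "company", "location")]))
head(listings[, c("title", "company", "location")], 3)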
Get listings by city and state:
listings.city <- listings[,1:3]
listings.state <- listings.city %>% separate(location, c("City","State"), sep = ",")

Create frequency tables for both:
locations.city <- as.data.frame(table(listings.city$location))
locations.state <- as.data.frame(table(listings.state$State))
head(locations.city[order(locations.city$Freq,decreasing = TRUE),],10)
                Var1 Freq
12     Princeton, NJ  210
11      New York, NY  119
3      Charlotte, NC  108
5         Dallas, TX  107
2      Chantilly, VA  102
8     Montpelier, VT  101
9   Mount Laurel, NJ   99
13 San Francisco, CA   96
6     Fort Meade, MD   42
1       Bellevue, WA   41
head(locations.state[order(locations.state$Freq,decreasing = TRUE),],10)
   Var1 Freq
24   NJ  379
3    CA  220
26   NY  144
33   TX  139
35   VA  125
22   NC  119
36   VT  102
37   WA   80
16   MA   74
17   MD   61
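As a visual companion to the table above, the top states could also be plotted with the ggplot2 package loaded earlier. A sketch using the Var1/Freq column names produced by table():
#bar chart of the ten states with the most postings
top_states <- head(locations.state[order(locations.state$Freq, decreasing = TRUE), ], 10)
ggplot(top_states, aes(x = reorder(Var1, -Freq), y = Freq)) +
  geom_col() +
  labs(x = "State", y = "Number of postings")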