Loading Libraries

library(dplyr)        # data manipulation
library(tidyr)        # data reshaping (separate, etc.)
library(ggplot2)      # plotting
library(DT)           # interactive tables
library(stringr)      # string handling (str_extract)
library(zoo)          # time-series helpers
library(knitr)        # report rendering
library(ggpubr)       # publication-ready plot helpers
library(rvest)        # web scraping (read_html, html_nodes)
library(tidyverse)    # core tidyverse; re-attaches dplyr, tidyr, ggplot2, stringr
library(data.table)   # fast data frames
library(magrittr)     # extra pipe operators
library(digest)       # hashing
library(RPostgreSQL)  # PostgreSQL database access
library(tidytext)     # text mining
library(config)       # configuration management

Sunny's Initial Data Clean-Up

From Sunny:

# Read the raw NYC Jobs dataset from GitHub
raw_nyc_df <- read.csv('https://raw.githubusercontent.com/mehtablocker/cuny_607/master/project_3/nyc-jobs.csv')
nyc_jobs_df <- raw_nyc_df

# Lower-case the column names and replace dots with underscores
names(nyc_jobs_df) <- names(nyc_jobs_df) %>% tolower() %>% gsub("\\.", "_", .)
names(nyc_jobs_df)[names(nyc_jobs_df) == "x__of_positions"] <- "n_of_positions"

# Drop the posting_type column and de-duplicate rows
nyc_jobs_df <- nyc_jobs_df %>% select(-posting_type) %>% unique()

From Sunny:

# Split postings into "data" jobs and everything else, based on the business title
data_jobs_df <- nyc_jobs_df %>% filter(grepl("data|analytics", business_title, ignore.case = TRUE))
other_jobs_df <- nyc_jobs_df %>% filter(!grepl("data|analytics", business_title, ignore.case = TRUE))
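
A quick count of the two groups serves as a sanity check (the exact numbers depend on the snapshot of the dataset):

nrow(data_jobs_df)    # postings with "data" or "analytics" in the title
nrow(other_jobs_df)   # all other postings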

Remove hourly salaries (any posting whose range falls below a $20,000 threshold is assumed to be an hourly rate rather than an annual figure):

# Drop postings whose salary range falls below the annual-salary threshold
salary.thresh <- 20000
data_jobs_df <- subset(data_jobs_df, salary_range_to >= salary.thresh & salary_range_from >= salary.thresh)
other_jobs_df <- subset(other_jobs_df, salary_range_to >= salary.thresh & salary_range_from >= salary.thresh)

Statistical Analysis

Sunny provided some interesting results on the salaries, and I wanted to run some statistical analysis on them. Below are the summary statistics and box plots, based on Sunny's cleaned data, for the upper end of the salary range.

summary(data_jobs_df$salary_range_to)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  40629   74740   86347   88736  101854  161497 
summary(other_jobs_df$salary_range_to)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  32621   62815   80000   88985  105000  230000 
par(mfrow=c(1,2))
boxplot(data_jobs_df$salary_range_to, xlab="Data Jobs", ylab="Salary in Dollars", ylim=c(0, 200000))
boxplot(other_jobs_df$salary_range_to, xlab="Non-Data Jobs", ylab="Salary in Dollars", ylim=c(0, 200000))

I used base R's t.test() function to conduct a two-sample t-test. The null and alternative hypotheses are shown below:

\(H_{0}: \mu_{A}=\mu_{B}\)

\(H_{a}: \mu_{A}>\mu_{B}\)

where \(A\) in this case represents the data group, and \(B\) represents the non-data group.

The t-statistic is calculated using Welch's method, since the variances of the two groups do not appear to be equal. The equation is as follows:

\(t = \frac{\bar{x}_{A}-\bar{x}_{B}}{\sqrt{\frac{s_{A}^{2}}{n_{A}}+\frac{s_{B}^{2}}{n_{B}}}}\)

The degrees of freedom, \(df\), are calculated using the following equation:

\(df = \frac{\left(\frac{s_{A}^{2}}{n_{A}}+\frac{s_{B}^{2}}{n_{B}}\right)^{2}}{\frac{s_{A}^{4}}{n_{A}^{2}(n_{A}-1)}+\frac{s_{B}^{4}}{n_{B}^{2}(n_{B}-1)}}\)
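
As a sanity check on t.test(), the two formulas above can be computed directly. The helper below, welch_stats(), is my own sketch (not part of any package) and assumes the salary vectors defined earlier:

# Hand-compute Welch's t-statistic, degrees of freedom, and one-sided p-value
welch_stats <- function(a, b) {
  va <- var(a) / length(a)   # s_A^2 / n_A
  vb <- var(b) / length(b)   # s_B^2 / n_B
  t_stat <- (mean(a) - mean(b)) / sqrt(va + vb)
  df <- (va + vb)^2 / (va^2 / (length(a) - 1) + vb^2 / (length(b) - 1))
  c(t = t_stat, df = df, p = pt(t_stat, df, lower.tail = FALSE))
}
welch_stats(data_jobs_df$salary_range_to, other_jobs_df$salary_range_to)

The p-value uses the upper tail because the alternative hypothesis is one-sided (\(\mu_{A} > \mu_{B}\)).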

The t-test shows that we fail to reject the null hypothesis in favor of the alternative (\(p = 0.531\)), so we cannot say there is significant evidence that data jobs pay better at the upper end of the salary range.

t.test(data_jobs_df$salary_range_to, other_jobs_df$salary_range_to, alternative="greater", var.equal=FALSE)

    Welch Two Sample t-test

data:  data_jobs_df$salary_range_to and other_jobs_df$salary_range_to
t = -0.077608, df = 71.146, p-value = 0.5308
alternative hypothesis: true difference in means is greater than 0
95 percent confidence interval:
 -5588.974       Inf
sample estimates:
mean of x mean of y 
 88735.88  88984.57 

However, if we conduct the same t-test on the lower end of the salary range, the \(p\)-value (0.005) is less than 0.05. There is therefore sufficient evidence to reject the null hypothesis in favor of the alternative, and we can say that the lower end of the advertised salary range is higher for data jobs than for non-data jobs. The lower bound may also be the more meaningful comparison: many postings advertise a wide range to attract applicants but ultimately offer something closer to the lower number.

t.test(data_jobs_df$salary_range_from, other_jobs_df$salary_range_from, alternative="greater", var.equal=FALSE)

    Welch Two Sample t-test

data:  data_jobs_df$salary_range_from and other_jobs_df$salary_range_from
t = 2.639, df = 68.408, p-value = 0.005144
alternative hypothesis: true difference in means is greater than 0
95 percent confidence interval:
 2067.422      Inf
sample estimates:
mean of x mean of y 
 67193.67  61578.12 
Histograms of the lower and upper salary limits for both groups:

numbreaks <- 10
up.lim <- 200000
par(mfrow=c(2,2))
hist(data_jobs_df$salary_range_from, breaks=numbreaks, xlim=c(0,up.lim), main="Lower Limit Data Jobs", xlab="Salary ($)")
hist(other_jobs_df$salary_range_from, breaks=numbreaks, xlim=c(0,up.lim), main="Lower Limit Other Jobs", xlab="Salary ($)")
hist(data_jobs_df$salary_range_to, breaks=numbreaks, xlim=c(0,up.lim), main="Upper Limit Data Jobs", xlab="Salary ($)")
hist(other_jobs_df$salary_range_to, breaks=numbreaks, xlim=c(0,up.lim), main="Upper Limit Other Jobs", xlab="Salary ($)")

Mary Anna's Scrape

To obtain the latest available information about data science job postings, 100 pages of recent Indeed job postings are scraped with the rvest package. The resulting dataset contains the following fields: job title, company, location, job summary, and link. To identify the CSS selectors that return these fields, the Chrome extension SelectorGadget was used in conjunction with the browser's Inspect Element tool. Regular expressions are then applied to each field to remove blank spaces, new lines, and unnecessary information.

# Empty data frame to accumulate one row per posting
# (the original initialized an extra "description" column that was never
# scraped, which makes rbind() fail on mismatched columns; it is dropped here)
listings <- data.frame(title = character(),
                       company = character(),
                       location = character(),
                       summary = character(),
                       link = character(),
                       stringsAsFactors = FALSE)

# Each Indeed results page holds 10 postings; start = 0, 10, ..., 990 covers 100 pages
for (i in seq(0, 990, 10)){
  url_ds <- paste0('https://www.indeed.com/jobs?q=data+scientist&l=all&start=', i)
  var <- read_html(url_ds)
  
  # job title
  title <- var %>% 
    html_nodes('#resultsCol .jobtitle') %>%
    html_text() %>%
    str_extract("(\\w+.+)+") 
  
  # company
  company <- var %>% 
    html_nodes('#resultsCol .company') %>%
    html_text() %>%
    str_extract("(\\w+).+") 
  
  # location ("City, ST")
  location <- var %>%
    html_nodes('#resultsCol .location') %>%
    html_text() %>%
    str_extract("(\\w+.)+,.[A-Z]{2}")
  
  # summary
  summary <- var %>%
    html_nodes('#resultsCol .summary') %>%
    html_text() %>%
    str_extract(".+")
  
  # link (relative href, so prepend the domain)
  link <- var %>%
    html_nodes('#resultsCol .jobtitle .turnstileLink, #resultsCol a.jobtitle') %>%
    html_attr('href') 
  link <- paste0("https://www.indeed.com", link)
  
  # data.frame() keeps the columns as character vectors
  # (as.data.frame(cbind(...)) would coerce them to factors)
  listings <- rbind(listings, data.frame(title,
                                         company,
                                         location,
                                         summary,
                                         link,
                                         stringsAsFactors = FALSE))
}

We have successfully scraped job title, company, location, job summary, and job link from 100 pages of Indeed job postings.
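
Since scraping 100 pages is slow and Indeed's markup changes over time, it may be worth caching the scraped results locally. This is only a sketch, and the file name is my own choice:

# Cache the scraped listings so the analysis can be re-run without re-scraping
write.csv(listings, "indeed_listings.csv", row.names = FALSE)
# listings <- read.csv("indeed_listings.csv", stringsAsFactors = FALSE)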

Get listings by city and state:

# Keep title, company, and location, then split "City, ST" into two columns
# (splitting on ", " rather than "," avoids a leading space in State)
listings.city <- listings[, 1:3]
listings.state <- listings.city %>% separate(location, c("City", "State"), sep = ", ")

Create frequency tables for both:

locations.city <- as.data.frame(table(listings.city$location))
locations.state <- as.data.frame(table(listings.state$State))
head(locations.city[order(locations.city$Freq,decreasing = TRUE),],10)
                Var1 Freq
12     Princeton, NJ  210
11      New York, NY  119
3      Charlotte, NC  108
5         Dallas, TX  107
2      Chantilly, VA  102
8     Montpelier, VT  101
9   Mount Laurel, NJ   99
13 San Francisco, CA   96
6     Fort Meade, MD   42
1       Bellevue, WA   41
head(locations.state[order(locations.state$Freq,decreasing = TRUE),],10)
   Var1 Freq
24   NJ  379
3    CA  220
26   NY  144
33   TX  139
35   VA  125
22   NC  119
36   VT  102
37   WA   80
16   MA   74
17   MD   61
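
To visualize the same frequencies, here is a minimal ggplot2 sketch of the top ten states (the Var1 and Freq column names come from the table() conversion above):

# Bar chart of the ten states with the most postings, ordered by frequency
top.states <- head(locations.state[order(locations.state$Freq, decreasing = TRUE), ], 10)
ggplot(top.states, aes(x = reorder(Var1, -Freq), y = Freq)) +
  geom_col() +
  labs(x = "State", y = "Number of Postings", title = "Top States for Data Scientist Postings")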