Data_Scientist

Links to Indeed.com to do a salary search for Data Scientist jobs by metro

# Indeed.com "Data Scientist" salary-search pages, one per metro.
# All four share the same prefix and differ only in the city/state suffix.
indeed_salary_prefix <- "https://www.indeed.com/salaries/Data-Scientist-Salaries,-"
url_nyc <- paste0(indeed_salary_prefix, "New-York-NY")
url_sf <- paste0(indeed_salary_prefix, "San-Francisco-CA")
url_bost <- paste0(indeed_salary_prefix, "Boston-MA")
url_chi <- paste0(indeed_salary_prefix, "Chicago-IL")

Scrape webpage results for salary data in NYC, San Fran, Boston, and Chicago:

# Scrape every search result page for one metro and collect, per page, the
# salary text and the company text.
#
# Each search result page holds about 10 listings of average salaries, which
# is why the `start` counter attached to the url jumps by 10 each time.
#
# base_url:   url of the first result page for the metro
# last_start: value of the `start` counter on the last result page to fetch
#
# Returns a list with two elements, `sal` and `comp`; each is a list holding
# one character vector per result page, in page order.
scrape_salary_pages <- function(base_url, last_start) {
  offsets <- seq(0, last_start, 10)

  pages <- lapply(offsets, function(offset) {
    # first page of results doesn't have a counter in the url
    link <- if (offset == 0) base_url else paste0(base_url, "?start=", offset)

    # download and parse the page ONCE, then query the parsed document for
    # both node sets (the CSS classes associated with salary and company)
    pg <- read_html(link)
    list(
      sal = pg %>% html_nodes('.cmp-sal-summary') %>% html_text(),
      comp = pg %>% html_nodes('.cmp-sal-links') %>% html_text()
    )
  })

  # regroup from "one entry per page" into "one list per kind of text"
  list(
    sal = lapply(pages, `[[`, "sal"),
    comp = lapply(pages, `[[`, "comp")
  )
}

# NYC: start counters 0, 10, ..., 70
nyc_pages <- scrape_salary_pages(url_nyc, 70)
sal_nyc <- nyc_pages$sal
comp_nyc <- nyc_pages$comp

# San Fran: start counters 0, 10, ..., 50
sf_pages <- scrape_salary_pages(url_sf, 50)
sal_sf <- sf_pages$sal
comp_sf <- sf_pages$comp

# Boston: start counters 0, 10, ..., 50
bost_pages <- scrape_salary_pages(url_bost, 50)
sal_bost <- bost_pages$sal
comp_bost <- bost_pages$comp

# Chicago: start counters 0, 10, 20
chi_pages <- scrape_salary_pages(url_chi, 20)
sal_chi <- chi_pages$sal
comp_chi <- chi_pages$comp

Now that the data has been scraped from the web, it’s time to clean it up

####################################################
# Cleanup: one (company, salary) data frame per metro
####################################################

# Build a two-column data frame from the scraped page text of one metro.
#
# comp_pages:  list of character vectors (one per result page) with the
#              company text
# sal_pages:   list of character vectors (one per result page) with the
#              salary text
# hourly_rows: row numbers whose figure is multiplied by 40 * 52 to annualize
#              a per-hour rate (40-hour work week, 52 weeks)
#
# Returns a data.frame with a character `company` column and a numeric
# `salary` column. Stops if the company and salary vectors do not line up or
# if `hourly_rows` points outside the data.
clean_city <- function(comp_pages, sal_pages, hourly_rows = integer(0)) {
  # flatten the list of per-page vectors into one vector each
  company_raw <- as.character(unlist(comp_pages, use.names = FALSE))
  salary_raw <- as.character(unlist(sal_pages, use.names = FALSE))

  # drop the extra text after the company name: keep what precedes the first
  # '-', then what precedes ' Jobs'
  company <- str_split_fixed(company_raw, '-', 2)[, 1]
  company <- str_split_fixed(company, ' Jobs', 2)[, 1]

  # remove the nation-wide average entry that appears on every result page
  salary_raw <- salary_raw[!grepl("Average", salary_raw)]

  if (length(company) != length(salary_raw)) {
    stop("scraped ", length(company), " companies but ", length(salary_raw),
         " salaries; the two cannot be paired row by row", call. = FALSE)
  }

  # strip any leading digits, take the first number-like token (digits,
  # commas, periods), and drop the thousands separators before converting.
  # The leading-digit strip is applied to every metro, including Chicago.
  salary_txt <- gsub('^[0-9]+', '', salary_raw)
  salary <- as.numeric(
    str_replace_all(str_extract(salary_txt, '[[0-9]+,.]{2,}'), ',', '')
  )

  if (length(hourly_rows) > 0 &&
      (any(hourly_rows < 1) || any(hourly_rows > length(salary)))) {
    stop("hourly_rows refers to rows outside 1..", length(salary),
         call. = FALSE)
  }
  # Annualize per hour compensation rate by assuming a 40-hour work week
  salary[hourly_rows] <- salary[hourly_rows] * 40 * 52

  data.frame(company = company, salary = salary, stringsAsFactors = FALSE)
}

# NOTE(review): the hourly row numbers below are tied to the result order of
# the original scrape; re-check them whenever the pages are scraped again.

nyc <- clean_city(comp_nyc, sal_nyc)
# write.csv(nyc,file="nyc_company_salary.csv")

sf <- clean_city(comp_sf, sal_sf, hourly_rows = c(48, 50))
# write.csv(sf,file="sf_company_salary.csv")

bost <- clean_city(comp_bost, sal_bost)
# write.csv(bost,file="bost_company_salary.csv")

chi <- clean_city(comp_chi, sal_chi, hourly_rows = 18)
# write.csv(chi,file="chi_company_salary.csv")

The histogram charts below show the wide dispersion of Data Scientist salaries in these four metros

# Penalize scientific notation so salary axis values print in fixed notation
options(scipen = 9)

# Blue histogram of one metro's salaries.
#
# salary:    numeric vector of salaries
# metro:     metro label inserted into the plot title
# bin_width: spacing between histogram breaks
#
# The x axis and the breaks both run from 10000 below the smallest salary to
# 10000 above the largest one.
salary_hist <- function(salary, metro, bin_width) {
  x_lo <- min(salary) - 10000
  x_hi <- max(salary) + 10000
  qplot(salary, geom = "histogram", xlab = "Salaries",
        main = paste0("Histogram for ", metro, " Data Scientist Salaries"),
        fill = I("blue"), col = I("black"),
        xlim = c(x_lo, x_hi),
        breaks = seq(x_lo, x_hi, by = bin_width))
}

salary_hist(nyc$salary, "NYC", 5000)

salary_hist(sf$salary, "San Fran", 10000)

salary_hist(bost$salary, "Boston", 10000)

salary_hist(chi$salary, "Chicago", 10000)

Preliminary Analysis of Salary Data

To make a fairer comparison of salaries we must take into account the cost of living in each city. We will first gather the cost of living indexes from https://www.numbeo.com/cost-of-living/region_rankings.jsp?title=2017-mid&region=019. Note that web scraping the entire table is costly in terms of time and computer resources, so we will hand-pick the cities we need: New York City, San Francisco, Boston, and Chicago. Also, the numbers are reported as percentages, so we will take the additional step of dividing by 100 to put them into decimal format.

# Cost-of-living index per metro, hand-picked from the Numbeo table.
# The source reports percentages, so each value is divided by 100.
# Built directly as a data frame so the index stays numeric throughout
# (no detour through a character matrix and back).
col_index <- data.frame(
  City = c('NYC', 'SanFran', 'Boston', 'Chicago'),
  COL.Index = c(100.00, 101.94, 90.23, 84.39) / 100,
  stringsAsFactors = FALSE
)
col_index

##      City COL.Index
## 1     NYC    1.0000
## 2 SanFran    1.0194
## 3  Boston    0.9023
## 4 Chicago    0.8439

Confidence Intervals Unadjusted for Cost of Living

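We will begin our comparison by calculating 95% Confidence Intervals without adjusting for cost of living. Each interval is computed as the sample mean ± 1.96 sample standard deviations; because the standard deviation (not the standard error) is used, the interval describes the approximate range of individual salaries rather than the uncertainty in the city's mean.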

New York City

# Sample mean of the NYC salaries (the outer parentheses print the value)
(nyc_mean <- mean(nyc$salary))

## [1] 144193.9

# Sample standard deviation of the NYC salaries
(nyc_sd <- sd(nyc$salary))

## [1] 32537.78

# Interval of mean +/- 1.96 standard deviations.
# NOTE(review): this uses the sd itself, not sd / sqrt(n), so it spans
# individual salaries rather than being a confidence interval for the mean.
nyc_half_width <- 1.96 * nyc_sd
nyc_lower <- nyc_mean - nyc_half_width
nyc_upper <- nyc_mean + nyc_half_width
(nyc_ci <- c(nyc_lower, nyc_upper))

## [1]  80419.87 207967.97

San Fransisco

# Sample mean of the San Francisco salaries (the outer parentheses print it)
(sf_mean <- mean(sf$salary))

## [1] 140620.4

# Sample standard deviation of the San Francisco salaries
(sf_sd <- sd(sf$salary))

## [1] 38754.2

# Interval of mean +/- 1.96 standard deviations.
# NOTE(review): this uses the sd itself, not sd / sqrt(n), so it spans
# individual salaries rather than being a confidence interval for the mean.
sf_half_width <- 1.96 * sf_sd
sf_lower <- sf_mean - sf_half_width
sf_upper <- sf_mean + sf_half_width
(sf_ci <- c(sf_lower, sf_upper))

## [1]  64662.14 216578.59

Boston

# Sample mean of the Boston salaries (the outer parentheses print the value)
(bost_mean <- mean(bost$salary))

## [1] 119875.1

# Sample standard deviation of the Boston salaries
(bost_sd <- sd(bost$salary))

## [1] 14607.43

# Interval of mean +/- 1.96 standard deviations.
# NOTE(review): this uses the sd itself, not sd / sqrt(n), so it spans
# individual salaries rather than being a confidence interval for the mean.
bost_half_width <- 1.96 * bost_sd
bost_lower <- bost_mean - bost_half_width
bost_upper <- bost_mean + bost_half_width
(bost_ci <- c(bost_lower, bost_upper))

## [1]  91244.55 148505.70

Chicago

# Sample mean of the Chicago salaries (the outer parentheses print the value)
(chi_mean <- mean(chi$salary))

## [1] 124425.8

# Sample standard deviation of the Chicago salaries
(chi_sd <- sd(chi$salary))

## [1] 24935.52

# Interval of mean +/- 1.96 standard deviations.
# NOTE(review): this uses the sd itself, not sd / sqrt(n), so it spans
# individual salaries rather than being a confidence interval for the mean.
chi_half_width <- 1.96 * chi_sd
chi_lower <- chi_mean - chi_half_width
chi_upper <- chi_mean + chi_half_width
(chi_ci <- c(chi_lower, chi_upper))

## [1]  75552.23 173299.47

There is quite a bit of overlap between all cities, so we expect that they should be consistent with each other regarding a Student’s t-test.

Student t-tests and K-S tests of the Salary Data Unadjusted for Cost of Living Index

Student’s t-tests are used to check if the means of two samples are different from each other. The null hypothesis is that the true difference in means of the populations the samples are drawn from is 0. The alternative hypothesis is that the difference in the population means is not zero. If the null is rejected you can infer that the two populations are different. For the classic t-test to be valid the samples must have a Normal Distribution and have similar variances. (The output below shows that R ran the Welch variant of the test, which does not require the variances to be similar.)

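Kolmogorov-Smirnov (KS) tests address the same question as the Student’s t-test — whether the two samples come from the same population — but the null hypothesis is that both samples share one distribution, not merely one mean. The KS test analyzes differences between the empirical Cumulative Distribution Functions (CDFs) of the two samples. Unlike the t-test, you do not have to assume that the samples are normally distributed.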

We need to analyze pairs out of 4 cities; this involves 6 combinations:

# Enumerate every unordered pair drawn from 4 items; each column of the
# result is one pair of city numbers to compare
combn(4,2)

##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,]    1    1    1    2    2    3
## [2,]    2    3    4    3    4    4

New York and San Fransisco t-test

# Two-sample t-test on unadjusted NYC vs San Francisco salaries
# (R reports the Welch variant, which does not assume equal variances)
t.test(nyc$salary,sf$salary)

## 
##  Welch Two Sample t-test
## 
## data:  nyc$salary and sf$salary
## t = 0.54377, df = 97.351, p-value = 0.5878
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -9469.17 16616.28
## sample estimates:
## mean of x mean of y 
##  144193.9  140620.4

According to Student’s t-test New York’s and San Fransisco’s salaries are similar before accounting for cost of living.

New York and Boston t-test

# Two-sample t-test on unadjusted NYC vs Boston salaries
# (R reports the Welch variant, which does not assume equal variances)
t.test(nyc$salary,bost$salary)

## 
##  Welch Two Sample t-test
## 
## data:  nyc$salary and bost$salary
## t = 5.7025, df = 107.29, p-value = 0.0000001052
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  15865.07 32772.51
## sample estimates:
## mean of x mean of y 
##  144193.9  119875.1

According to Student’s t-test New York’s and Boston’s salaries are different before accounting for cost of living.

New York and Chicago t-test

# Two-sample t-test on unadjusted NYC vs Chicago salaries
# (R reports the Welch variant, which does not assume equal variances)
t.test(nyc$salary,chi$salary)

## 
##  Welch Two Sample t-test
## 
## data:  nyc$salary and chi$salary
## t = 3.1975, df = 56.887, p-value = 0.002265
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   7387.595 32148.550
## sample estimates:
## mean of x mean of y 
##  144193.9  124425.8

According to Student’s t-test New York’s and Chicago’s salaries are different before accounting for cost of living.

San Fransisco and Boston t-test

# Two-sample t-test on unadjusted San Francisco vs Boston salaries
# (R reports the Welch variant, which does not assume equal variances)
t.test(sf$salary,bost$salary)

## 
##  Welch Two Sample t-test
## 
## data:  sf$salary and bost$salary
## t = 3.6244, df = 64.522, p-value = 0.0005718
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   9312.386 32178.090
## sample estimates:
## mean of x mean of y 
##  140620.4  119875.1

According to Student’s t-test San Fransisco’s and Boston’s salaries are different before accounting for cost of living.

San Fransisco and Chicago t-test

# Two-sample t-test on unadjusted San Francisco vs Chicago salaries
# (R reports the Welch variant, which does not assume equal variances)
t.test(sf$salary,chi$salary)

## 
##  Welch Two Sample t-test
## 
## data:  sf$salary and chi$salary
## t = 2.2288, df = 71.05, p-value = 0.02899
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   1706.386 30682.652
## sample estimates:
## mean of x mean of y 
##  140620.4  124425.8

According to Student’s t-test San Fransisco’s and Chicago’s salaries are different before accounting for cost of living.

Boston and Chicago t-test

# Two-sample t-test on unadjusted Boston vs Chicago salaries
# (R reports the Welch variant, which does not assume equal variances)
t.test(bost$salary,chi$salary)

## 
##  Welch Two Sample t-test
## 
## data:  bost$salary and chi$salary
## t = -0.86318, df = 33.363, p-value = 0.3942
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -15272.30   6170.86
## sample estimates:
## mean of x mean of y 
##  119875.1  124425.8

Boston and Chicago’s Salaries are similar under Student’s t-test.

# par(mfrow = c(4,1))
# boxplot(nyc$salary, horizontal = TRUE)
# boxplot(sf$salary, horizontal = TRUE)
# boxplot(bost$salary, horizontal = TRUE)
# boxplot(chi$salary, horizontal = TRUE)

New York and San Fransisco KS-test

# Two-sample Kolmogorov-Smirnov test on unadjusted NYC vs San Francisco
# salaries. exact = TRUE requests an exact p-value; the recorded run warned
# that it cannot be computed because the data contain ties.
ks.test(nyc$salary,sf$salary, exact = TRUE)

## Warning in ks.test(nyc$salary, sf$salary, exact = TRUE): cannot compute
## exact p-value with ties

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  nyc$salary and sf$salary
## D = 0.15177, p-value = 0.4826
## alternative hypothesis: two-sided

According to the KS test New York’s and San Fransisco’s salaries are similar before accounting for cost of living.

New York and Boston KS-test

# Two-sample Kolmogorov-Smirnov test on unadjusted NYC vs Boston salaries.
# exact = TRUE requests an exact p-value; the recorded run warned that it
# cannot be computed because the data contain ties.
ks.test(nyc$salary,bost$salary, exact = TRUE)

## Warning in ks.test(nyc$salary, bost$salary, exact = TRUE): cannot compute
## exact p-value with ties

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  nyc$salary and bost$salary
## D = 0.53857, p-value = 0.00000002249
## alternative hypothesis: two-sided

According to the KS test New York’s and Boston’s salaries are different before accounting for cost of living.

New York and Chicago KS-test

# Two-sample Kolmogorov-Smirnov test on unadjusted NYC vs Chicago salaries.
# exact = TRUE requests an exact p-value; the recorded run warned that it
# cannot be computed because the data contain ties.
ks.test(nyc$salary,chi$salary, exact = TRUE)

## Warning in ks.test(nyc$salary, chi$salary, exact = TRUE): cannot compute
## exact p-value with ties

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  nyc$salary and chi$salary
## D = 0.43139, p-value = 0.001552
## alternative hypothesis: two-sided

According to the KS test New York’s and Chicago’s salaries are different before accounting for cost of living.

San Fransisco and Boston KS-test

# Two-sample Kolmogorov-Smirnov test on unadjusted San Francisco vs Boston
# salaries. exact = TRUE requests an exact p-value; the recorded run warned
# that it cannot be computed because the data contain ties.
ks.test(sf$salary,bost$salary, exact = TRUE)

## Warning in ks.test(sf$salary, bost$salary, exact = TRUE): cannot compute
## exact p-value with ties

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  sf$salary and bost$salary
## D = 0.58951, p-value = 0.00000001709
## alternative hypothesis: two-sided

According to the KS test San Fransisco’s and Boston’s salaries are different before accounting for cost of living.

San Fransisco and Chicago KS-test

# Two-sample Kolmogorov-Smirnov test on unadjusted San Francisco vs Chicago
# salaries. exact = TRUE requests an exact p-value; the recorded run warned
# that it cannot be computed because the data contain ties.
ks.test(sf$salary,chi$salary, exact = TRUE)

## Warning in ks.test(sf$salary, chi$salary, exact = TRUE): cannot compute
## exact p-value with ties

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  sf$salary and chi$salary
## D = 0.40385, p-value = 0.007008
## alternative hypothesis: two-sided

According to the KS test San Fransisco’s and Chicago’s salaries are different before accounting for cost of living.

Boston and Chicago KS-test

# Two-sample Kolmogorov-Smirnov test on unadjusted Boston vs Chicago
# salaries. exact = TRUE requests an exact p-value; the recorded run warned
# that it cannot be computed because the data contain ties.
ks.test(bost$salary,chi$salary, exact = TRUE)

## Warning in ks.test(bost$salary, chi$salary, exact = TRUE): cannot compute
## exact p-value with ties

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  bost$salary and chi$salary
## D = 0.3972, p-value = 0.007616
## alternative hypothesis: two-sided

According to the KS test Boston’s and Chicago’s salaries are different before accounting for cost of living.

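Under both tests New York and San Francisco had similar salary distributions. Boston and Chicago were similar under the t-test (p = 0.39) but different under the KS test (p = 0.008); every other pair was different under both tests.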

Adjusting for Cost of Living

Now we will take the salary data and divide by the Cost of Living index. In this way salaries in cities with a lower cost of living (i.e., index < 1) will be increased to reflect more purchasing power per dollar, and cities with a higher cost of living (index > 1) will have their salaries decreased to reflect less purchasing power per dollar.

# Cost-of-living divisor for one metro, looked up by its label in the City
# column of col_index rather than by row position, so reordering or adding
# rows to col_index cannot silently select the wrong index.
#
# city: label as it appears in col_index$City
# Returns the matching COL.Index value; stops if the label is not present.
col_for <- function(city) {
  row <- match(city, col_index$City)
  if (is.na(row)) {
    stop("no cost-of-living index for city: ", city, call. = FALSE)
  }
  col_index$COL.Index[row]
}

head(nyc$salary)

## [1] 220501 203811 186440 174734 180758 200516

# NYC salaries divided by the NYC cost-of-living index
adj_nyc <- nyc$salary / col_for("NYC")
head(adj_nyc)

## [1] 220501 203811 186440 174734 180758 200516

head(sf$salary)

## [1] 300000 202769 189098 186021 180463 171443

# San Francisco salaries divided by the San Francisco cost-of-living index
adj_sf <- sf$salary / col_for("SanFran")
head(adj_sf)

## [1] 294290.8 198910.1 185499.3 182480.9 177028.6 168180.3

head(bost$salary)

## [1] 180080 173701 141163 138230 140000 134174

# Boston salaries divided by the Boston cost-of-living index
adj_bost <- bost$salary / col_for("Boston")
head(adj_bost)

## [1] 199578.9 192509.1 156448.0 153197.4 155159.0 148702.2

head(chi$salary)

## [1] 131089 185309 176067 154035 150994 143377

# Chicago salaries divided by the Chicago cost-of-living index
adj_chi <- chi$salary / col_for("Chicago")
head(adj_chi)

Confidence Intervals Adjusted for Cost of Living

We will continue our comparison by calculating 95% Confidence Intervals after adjusting for cost of living.

New York City

# Sample mean of the cost-of-living-adjusted NYC salaries
# (the outer parentheses print the value)
(nyc_mean_adj <- mean(adj_nyc))

## [1] 144193.9

# Sample standard deviation of the adjusted NYC salaries
(nyc_sd_adj <- sd(adj_nyc))

## [1] 32537.78

# Interval of mean +/- 1.96 standard deviations.
# NOTE(review): this uses the sd itself, not sd / sqrt(n), so it spans
# individual salaries rather than being a confidence interval for the mean.
nyc_half_width_adj <- 1.96 * nyc_sd_adj
nyc_lower_adj <- nyc_mean_adj - nyc_half_width_adj
nyc_upper_adj <- nyc_mean_adj + nyc_half_width_adj
(nyc_ci_adj <- c(nyc_lower_adj, nyc_upper_adj))

## [1]  80419.87 207967.97

San Fransisco

# Sample mean of the cost-of-living-adjusted San Francisco salaries
# (the outer parentheses print the value)
(sf_mean_adj <- mean(adj_sf))

## [1] 137944.2

# Sample standard deviation of the adjusted San Francisco salaries
(sf_sd_adj <- sd(adj_sf))

## [1] 38016.67

# Interval of mean +/- 1.96 standard deviations.
# NOTE(review): this uses the sd itself, not sd / sqrt(n), so it spans
# individual salaries rather than being a confidence interval for the mean.
sf_half_width_adj <- 1.96 * sf_sd_adj
sf_lower_adj <- sf_mean_adj - sf_half_width_adj
sf_upper_adj <- sf_mean_adj + sf_half_width_adj
(sf_ci_adj <- c(sf_lower_adj, sf_upper_adj))

## [1]  63431.57 212456.93

Boston

# Sample mean of the cost-of-living-adjusted Boston salaries
# (the outer parentheses print the value)
(bost_mean_adj <- mean(adj_bost))

## [1] 132855.1

# Sample standard deviation of the adjusted Boston salaries
(bost_sd_adj <- sd(adj_bost))

## [1] 16189.11

# Interval of mean +/- 1.96 standard deviations.
# NOTE(review): this uses the sd itself, not sd / sqrt(n), so it spans
# individual salaries rather than being a confidence interval for the mean.
bost_half_width_adj <- 1.96 * bost_sd_adj
bost_lower_adj <- bost_mean_adj - bost_half_width_adj
bost_upper_adj <- bost_mean_adj + bost_half_width_adj
(bost_ci_adj <- c(bost_lower_adj, bost_upper_adj))

## [1] 101124.4 164585.7

Chicago

# Sample mean of the cost-of-living-adjusted Chicago salaries
# (the outer parentheses print the value)
(chi_mean_adj <- mean(adj_chi))

## [1] 147441.5

# Sample standard deviation of the adjusted Chicago salaries
(chi_sd_adj <- sd(adj_chi))

## [1] 29547.96

# Interval of mean +/- 1.96 standard deviations.
# NOTE(review): this uses the sd itself, not sd / sqrt(n), so it spans
# individual salaries rather than being a confidence interval for the mean.
chi_half_width_adj <- 1.96 * chi_sd_adj
chi_lower_adj <- chi_mean_adj - chi_half_width_adj
chi_upper_adj <- chi_mean_adj + chi_half_width_adj
(chi_ci_adj <- c(chi_lower_adj, chi_upper_adj))

## [1]  89527.46 205355.45

There is quite a bit of overlap between all cities; however, we did see some statistically significant differences even when the CIs overlapped in the previous section.

Student t-tests and K-S tests of the Salary Data Adjusted for Cost of Living Index

Here we repeat the analysis from above with salary data that has been adjusted for cost of living.

New York and San Fransisco t-test

# Two-sample t-test on cost-of-living-adjusted NYC vs San Francisco salaries
# (R reports the Welch variant, which does not assume equal variances)
t.test(adj_nyc,adj_sf)

## 
##  Welch Two Sample t-test
## 
## data:  adj_nyc and adj_sf
## t = 0.96319, df = 98.74, p-value = 0.3378
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -6625.312 19124.656
## sample estimates:
## mean of x mean of y 
##  144193.9  137944.2

According to Student’s t-test New York’s and San Fransisco’s salaries are similar after accounting for cost of living.

New York and Boston t-test

# Two-sample t-test on cost-of-living-adjusted NYC vs Boston salaries
# (R reports the Welch variant, which does not assume equal variances)
t.test(adj_nyc,adj_bost)

## 
##  Welch Two Sample t-test
## 
## data:  adj_nyc and adj_bost
## t = 2.5964, df = 112.81, p-value = 0.01067
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   2686.564 19991.139
## sample estimates:
## mean of x mean of y 
##  144193.9  132855.1

According to Student’s t-test New York’s and Boston’s salaries are different after accounting for cost of living. However, the margin has dropped quite a bit.

New York and Chicago t-test

# Two-sample t-test on cost-of-living-adjusted NYC vs Chicago salaries
# (R reports the Welch variant, which does not assume equal variances)
t.test(adj_nyc,adj_chi)

## 
##  Welch Two Sample t-test
## 
## data:  adj_nyc and adj_chi
## t = -0.46929, df = 47.865, p-value = 0.641
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -17162.22  10667.14
## sample estimates:
## mean of x mean of y 
##  144193.9  147441.5

According to Student’s t-test New York’s and Chicago’s salaries are not different after accounting for cost of living.

San Fransisco and Boston t-test

# Two-sample t-test on cost-of-living-adjusted San Francisco vs Boston
# salaries (R reports the Welch variant, which does not assume equal
# variances)
t.test(adj_sf,adj_bost)

## 
##  Welch Two Sample t-test
## 
## data:  adj_sf and adj_bost
## t = 0.89189, df = 68.097, p-value = 0.3756
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -6296.742 16475.101
## sample estimates:
## mean of x mean of y 
##  137944.2  132855.1

According to Student’s t-test San Fransisco’s and Boston’s salaries are not different after accounting for cost of living.

San Fransisco and Chicago t-test

# Two-sample t-test on cost-of-living-adjusted San Francisco vs Chicago
# salaries (R reports the Welch variant, which does not assume equal
# variances)
t.test(adj_sf,adj_chi)

## 
##  Welch Two Sample t-test
## 
## data:  adj_sf and adj_chi
## t = -1.2123, df = 62.517, p-value = 0.23
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -25154.849   6160.428
## sample estimates:
## mean of x mean of y 
##  137944.2  147441.5

According to Student’s t-test San Fransisco’s and Chicago’s salaries are not different after accounting for cost of living.

Boston and Chicago t-test

# Two-sample t-test on cost-of-living-adjusted Boston vs Chicago salaries
# (R reports the Welch variant, which does not assume equal variances)
t.test(adj_bost,adj_chi)

## 
##  Welch Two Sample t-test
## 
## data:  adj_bost and adj_chi
## t = -2.3555, df = 32.298, p-value = 0.02473
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -27195.252  -1977.528
## sample estimates:
## mean of x mean of y 
##  132855.1  147441.5

Boston and Chicago’s Salaries are no longer similar under Student’s t-test when adjusting for cost of living.

New York and San Fransisco KS-test

# Two-sample Kolmogorov-Smirnov test on cost-of-living-adjusted NYC vs
# San Francisco salaries. exact = TRUE requests an exact p-value; the
# recorded run warned that it cannot be computed because of ties.
ks.test(adj_nyc,adj_sf, exact = TRUE)

## Warning in ks.test(adj_nyc, adj_sf, exact = TRUE): cannot compute exact p-
## value with ties

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  adj_nyc and adj_sf
## D = 0.17879, p-value = 0.283
## alternative hypothesis: two-sided

According to the KS test New York’s and San Fransisco’s salaries are similar after accounting for cost of living.

New York and Boston KS-test

# Two-sample Kolmogorov-Smirnov test on cost-of-living-adjusted NYC vs
# Boston salaries. exact = TRUE requests an exact p-value; the recorded run
# warned that it cannot be computed because of ties.
ks.test(adj_nyc,adj_bost, exact = TRUE)

## Warning in ks.test(adj_nyc, adj_bost, exact = TRUE): cannot compute exact
## p-value with ties

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  adj_nyc and adj_bost
## D = 0.45749, p-value = 0.000003675
## alternative hypothesis: two-sided

According to the KS test New York’s and Boston’s salaries are different after accounting for cost of living.

New York and Chicago KS-test

# Two-sample Kolmogorov-Smirnov test on cost-of-living-adjusted NYC vs
# Chicago salaries. exact = TRUE requests an exact p-value; the recorded run
# warned that it cannot be computed because of ties.
ks.test(adj_nyc,adj_chi, exact = TRUE)

## Warning in ks.test(adj_nyc, adj_chi, exact = TRUE): cannot compute exact p-
## value with ties

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  adj_nyc and adj_chi
## D = 0.22037, p-value = 0.3075
## alternative hypothesis: two-sided

According to the KS test New York’s and Chicago’s salaries are not different after accounting for cost of living.

San Fransisco and Boston KS-test

# Two-sample Kolmogorov-Smirnov test on cost-of-living-adjusted
# San Francisco vs Boston salaries. exact = TRUE requests an exact p-value;
# the recorded run warned that it cannot be computed because of ties.
ks.test(adj_sf,adj_bost, exact = TRUE)

## Warning in ks.test(adj_sf, adj_bost, exact = TRUE): cannot compute exact p-
## value with ties

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  adj_sf and adj_bost
## D = 0.3972, p-value = 0.0004347
## alternative hypothesis: two-sided

According to the KS test San Fransisco’s and Boston’s salaries are different after accounting for cost of living.

San Fransisco and Chicago KS-test

# Two-sample Kolmogorov-Smirnov test on cost-of-living-adjusted
# San Francisco vs Chicago salaries. exact = TRUE requests an exact p-value;
# the recorded run warned that it cannot be computed because of ties.
ks.test(adj_sf,adj_chi, exact = TRUE)

## Warning in ks.test(adj_sf, adj_chi, exact = TRUE): cannot compute exact p-
## value with ties

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  adj_sf and adj_chi
## D = 0.23077, p-value = 0.3145
## alternative hypothesis: two-sided

According to the KS test San Fransisco’s and Chicago’s salaries are not different after accounting for cost of living.

Boston and Chicago KS-test

# Two-sample Kolmogorov-Smirnov test on cost-of-living-adjusted Boston vs
# Chicago salaries. exact = TRUE requests an exact p-value; the recorded run
# warned that it cannot be computed because of ties.
ks.test(adj_bost,adj_chi, exact = TRUE)

## Warning in ks.test(adj_bost, adj_chi, exact = TRUE): cannot compute exact
## p-value with ties

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  adj_bost and adj_chi
## D = 0.58951, p-value = 0.000009376
## alternative hypothesis: two-sided

According to the KS test Boston’s and Chicago’s salaries are different after accounting for cost of living.

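After adjusting for cost of living, Chicago became similar to New York and San Francisco under both tests, but Boston remained different from every other city under the KS test (under the t-test it was no longer distinguishable from San Francisco).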

Data_Scientist_Salaries

Silverio Vasquez & Nathan Cooper

October 17, 2017

Links to Indeed.com to do a salary search for Data Scientist jobs by metro

Scrape webpage results for salary data in NYC, San Fran, Boston, and Chicago:

Now that the data has been scraped from the web, it’s time to clean it up

The histogram charts below show the wide dispersion of Data Scientist salaries in these four metros

Preliminary Analysis of Salary Data

Confidence Intervals Unadjusted for Cost of Living

New York City

San Fransisco

Boston

Chicago

Student t-tests and K-S tests of the Salary Data Unadjusted for Cost of Living Index

New York and San Fransisco t-test

New York and Boston t-test

New York and Chicago t-test

San Fransisco and Boston t-test

San Fransisco and Chicago t-test

Boston and Chicago t-test

New York and San Fransisco KS-test

New York and Boston KS-test

New York and Chicago KS-test

San Fransisco and Boston KS-test

San Fransisco and Chicago KS-test

Boston and Chicago KS-test

Adjusting for Cost of Living

Confidence Intervals Adjusted for Cost of Living

New York City

San Fransisco

Boston

Chicago

Student t-tests and K-S tests of the Salary Data Adjusted for Cost of Living Index

New York and San Fransisco t-test

New York and Boston t-test

New York and Chicago t-test

San Fransisco and Boston t-test

San Fransisco and Chicago t-test

Boston and Chicago t-test

New York and San Fransisco KS-test

New York and Boston KS-test

New York and Chicago KS-test

San Fransisco and Boston KS-test

San Fransisco and Chicago KS-test

Boston and Chicago KS-test