# Indeed.com salary search pages for Data Scientist roles, one per city
url_nyc  <- "https://www.indeed.com/salaries/Data-Scientist-Salaries,-New-York-NY"
url_sf   <- "https://www.indeed.com/salaries/Data-Scientist-Salaries,-San-Francisco-CA"
url_bost <- "https://www.indeed.com/salaries/Data-Scientist-Salaries,-Boston-MA"
url_chi  <- "https://www.indeed.com/salaries/Data-Scientist-Salaries,-Chicago-IL"
# Empty containers, one pair per city: sal_* collects the salary text and
# comp_* the company text scraped from each result page
sal_nyc  <- comp_nyc  <- list()
sal_sf   <- comp_sf   <- list()
sal_bost <- comp_bost <- list()
sal_chi  <- comp_chi  <- list()
# Loop over all the search result pages on Indeed.com.
# Each result page holds about 10 listings of average salaries, which is why
# the offset 'i' appended to the url jumps by 10 each time.
# First loop for NYC (offsets 0, 10, ..., 70)
# j is the position in the result lists where the current page is stored
j <- 1
for (i in seq(0, 70, 10)) {
  # the first page of results has no offset in the url
  link <- if (i == 0) url_nyc else paste0(url_nyc, "?start=", i)
  # Download and parse the page ONCE and reuse it for both selectors. The
  # original fetched every page twice, doubling the number of requests and
  # risking salary/company lists taken from two different responses.
  page <- read_html(link)
  # CSS classes the original author identified for salary and company text
  pg_sal <- page %>% html_nodes('.cmp-sal-summary') %>% html_text()
  pg_comp <- page %>% html_nodes('.cmp-sal-links') %>% html_text()
  # store each page's character vector as one element of the city lists
  sal_nyc[j] <- list(pg_sal)
  comp_nyc[j] <- list(pg_comp)
  j <- j + 1
}
# Second loop for San Fran (offsets 0, 10, ..., 50)
# j is the position in the result lists where the current page is stored
j <- 1
for (i in seq(0, 50, 10)) {
  # the first page of results has no offset in the url
  link <- if (i == 0) url_sf else paste0(url_sf, "?start=", i)
  # Download and parse the page ONCE and reuse it for both selectors. The
  # original fetched every page twice, doubling the number of requests and
  # risking salary/company lists taken from two different responses.
  page <- read_html(link)
  # CSS classes the original author identified for salary and company text
  pg_sal <- page %>% html_nodes('.cmp-sal-summary') %>% html_text()
  pg_comp <- page %>% html_nodes('.cmp-sal-links') %>% html_text()
  # store each page's character vector as one element of the city lists
  sal_sf[j] <- list(pg_sal)
  comp_sf[j] <- list(pg_comp)
  j <- j + 1
}
# Third loop for Boston (offsets 0, 10, ..., 50)
# j is the position in the result lists where the current page is stored
j <- 1
for (i in seq(0, 50, 10)) {
  # the first page of results has no offset in the url
  link <- if (i == 0) url_bost else paste0(url_bost, "?start=", i)
  # Download and parse the page ONCE and reuse it for both selectors. The
  # original fetched every page twice, doubling the number of requests and
  # risking salary/company lists taken from two different responses.
  page <- read_html(link)
  # CSS classes the original author identified for salary and company text
  pg_sal <- page %>% html_nodes('.cmp-sal-summary') %>% html_text()
  pg_comp <- page %>% html_nodes('.cmp-sal-links') %>% html_text()
  # store each page's character vector as one element of the city lists
  sal_bost[j] <- list(pg_sal)
  comp_bost[j] <- list(pg_comp)
  j <- j + 1
}
# Fourth loop for Chicago (offsets 0, 10, 20)
# j is the position in the result lists where the current page is stored
j <- 1
for (i in seq(0, 20, 10)) {
  # the first page of results has no offset in the url
  link <- if (i == 0) url_chi else paste0(url_chi, "?start=", i)
  # Download and parse the page ONCE and reuse it for both selectors. The
  # original fetched every page twice, doubling the number of requests and
  # risking salary/company lists taken from two different responses.
  page <- read_html(link)
  # CSS classes the original author identified for salary and company text
  pg_sal <- page %>% html_nodes('.cmp-sal-summary') %>% html_text()
  pg_comp <- page %>% html_nodes('.cmp-sal-links') %>% html_text()
  # store each page's character vector as one element of the city lists
  sal_chi[j] <- list(pg_sal)
  comp_chi[j] <- list(pg_comp)
  j <- j + 1
}
####################################################
# Cleanup NYC Data
####################################################
# Flatten the per-page scrape results into one character vector each
company_raw <- unlist(comp_nyc, use.names = FALSE)
salary_raw <- unlist(sal_nyc, use.names = FALSE)
# Company name: keep the text before the first '-', then cut at ' Jobs'
company <- str_split_fixed(company_raw, '-', 2)[, 1]
company <- str_split_fixed(company, ' Jobs', 2)[, 1]
# Drop the entries containing "Average" (per the original notes, the
# nationwide average that appears on every search result page)
salary_raw <- salary_raw[!grepl("Average", salary_raw)]
# Two-column data frame: company and offered Data Scientist salary
nyc <- data.frame(company = company, salary = salary_raw,
                  stringsAsFactors = FALSE)
# Strip any digits at the very start of the salary text
nyc$salary <- gsub('^[0-9]+', '', nyc$salary)
# Pull out the runs of two or more digit / '+' / ',' / '.' characters,
# remove the thousands separators and convert to numeric
amounts <- str_extract_all(nyc$salary, '[[0-9]+,.]{2,}')
nyc$salary <- as.numeric(unlist(str_replace_all(amounts, ',', '')))
# write.csv(nyc,file="nyc_company_salary.csv")
####################################################
# Now same cleanup for San Fran
####################################################
# Flatten the per-page scrape results into one character vector each
company_raw <- unlist(comp_sf, use.names = FALSE)
salary_raw <- unlist(sal_sf, use.names = FALSE)
# Company name: keep the text before the first '-', then cut at ' Jobs'
company <- str_split_fixed(company_raw, '-', 2)[, 1]
company <- str_split_fixed(company, ' Jobs', 2)[, 1]
# Drop the entries containing "Average" (per the original notes, the
# nationwide average that appears on every search result page)
salary_raw <- salary_raw[!grepl("Average", salary_raw)]
# Two-column data frame: company and offered Data Scientist salary
sf <- data.frame(company = company, salary = salary_raw,
                 stringsAsFactors = FALSE)
# Strip any digits at the very start of the salary text
sf$salary <- gsub('^[0-9]+', '', sf$salary)
# Pull out the runs of two or more digit / '+' / ',' / '.' characters,
# remove the thousands separators and convert to numeric
amounts <- str_extract_all(sf$salary, '[[0-9]+,.]{2,}')
sf$salary <- as.numeric(unlist(str_replace_all(amounts, ',', '')))
# Annualize per hour compensation rates by assuming a 40-hour work week.
# The original hard-coded rows 48 and 50, which is only right for one
# particular scrape. Hourly rows are instead detected by value: anything
# below the cutoff cannot be an annual salary. which() also skips NA rows.
# NOTE(review): the cutoff is a heuristic; weekly or monthly rates, if the
# site ever lists them, are not handled (nor were they originally).
hourly_cutoff <- 1000
hourly_rows <- which(sf$salary < hourly_cutoff)
sf$salary[hourly_rows] <- sf$salary[hourly_rows] * 40 * 52
# write.csv(sf,file="sf_company_salary.csv")
####################################################
# Now same cleanup for Boston
####################################################
# Flatten the per-page scrape results into one character vector each
company_raw <- unlist(comp_bost, use.names = FALSE)
salary_raw <- unlist(sal_bost, use.names = FALSE)
# Company name: keep the text before the first '-', then cut at ' Jobs'
company <- str_split_fixed(company_raw, '-', 2)[, 1]
company <- str_split_fixed(company, ' Jobs', 2)[, 1]
# Drop the entries containing "Average" (per the original notes, the
# nationwide average that appears on every search result page)
salary_raw <- salary_raw[!grepl("Average", salary_raw)]
# Two-column data frame: company and offered Data Scientist salary
bost <- data.frame(company = company, salary = salary_raw,
                   stringsAsFactors = FALSE)
# Strip any digits at the very start of the salary text
bost$salary <- gsub('^[0-9]+', '', bost$salary)
# Pull out the runs of two or more digit / '+' / ',' / '.' characters,
# remove the thousands separators and convert to numeric
amounts <- str_extract_all(bost$salary, '[[0-9]+,.]{2,}')
bost$salary <- as.numeric(unlist(str_replace_all(amounts, ',', '')))
# write.csv(bost,file="bost_company_salary.csv")
####################################################
# Now same cleanup for Chicago
####################################################
# Flatten the per-page scrape results into one character vector each
company_raw <- unlist(comp_chi, use.names = FALSE)
salary_raw <- unlist(sal_chi, use.names = FALSE)
# Company name: keep the text before the first '-', then cut at ' Jobs'
company <- str_split_fixed(company_raw, '-', 2)[, 1]
company <- str_split_fixed(company, ' Jobs', 2)[, 1]
# Drop the entries containing "Average" (per the original notes, the
# nationwide average that appears on every search result page)
salary_raw <- salary_raw[!grepl("Average", salary_raw)]
# Two-column data frame: company and offered Data Scientist salary
chi <- data.frame(company = company, salary = salary_raw,
                  stringsAsFactors = FALSE)
# Strip any digits at the very start of the salary text. The NYC, San Fran
# and Boston cleanups all do this; Chicago originally skipped it, so a leading
# number of two or more digits would have produced a second match below and
# turned the row into NA. It is a no-op when the text has no leading digits.
chi$salary <- gsub('^[0-9]+', '', chi$salary)
# Pull out the runs of two or more digit / '+' / ',' / '.' characters,
# remove the thousands separators and convert to numeric
amounts <- str_extract_all(chi$salary, '[[0-9]+,.]{2,}')
chi$salary <- as.numeric(unlist(str_replace_all(amounts, ',', '')))
# Annualize per hour compensation rates by assuming a 40-hour work week.
# The original hard-coded row 18, which is only right for one particular
# scrape. Hourly rows are instead detected by value: anything below the
# cutoff cannot be an annual salary. which() also skips NA rows.
# NOTE(review): the cutoff is a heuristic; weekly or monthly rates, if the
# site ever lists them, are not handled (nor were they originally).
hourly_cutoff <- 1000
hourly_rows <- which(chi$salary < hourly_cutoff)
chi$salary[hourly_rows] <- chi$salary[hourly_rows] * 40 * 52
# write.csv(chi,file="chi_company_salary.csv")
# Bias number formatting towards fixed notation so the salary axis is not
# printed in scientific notation
options(scipen = 9)
# One histogram per city. Each plot spans [min - 10000, max + 10000] of that
# city's salaries; NYC uses 5000-wide bins, the other cities 10000-wide bins.
nyc_span <- range(nyc$salary) + c(-10000, 10000)
qplot(nyc$salary, geom = "histogram", xlab = "Salaries",
      main = "Histogram for NYC Data Scientist Salaries",
      fill = I("blue"), col = I("black"), xlim = nyc_span,
      breaks = seq(nyc_span[1], nyc_span[2], by = 5000))
sf_span <- range(sf$salary) + c(-10000, 10000)
qplot(sf$salary, geom = "histogram", xlab = "Salaries",
      main = "Histogram for San Fran Data Scientist Salaries",
      fill = I("blue"), col = I("black"), xlim = sf_span,
      breaks = seq(sf_span[1], sf_span[2], by = 10000))
bost_span <- range(bost$salary) + c(-10000, 10000)
qplot(bost$salary, geom = "histogram", xlab = "Salaries",
      main = "Histogram for Boston Data Scientist Salaries",
      fill = I("blue"), col = I("black"), xlim = bost_span,
      breaks = seq(bost_span[1], bost_span[2], by = 10000))
chi_span <- range(chi$salary) + c(-10000, 10000)
qplot(chi$salary, geom = "histogram", xlab = "Salaries",
      main = "Histogram for Chicago Data Scientist Salaries",
      fill = I("blue"), col = I("black"), xlim = chi_span,
      breaks = seq(chi_span[1], chi_span[2], by = 10000))
To make a fairer comparison of salaries we must take into account the cost of living for each city. We will first gather the cost of living indexes from https://www.numbeo.com/cost-of-living/region_rankings.jsp?title=2017-mid&region=019. Note that web scraping the entire table is costly in terms of time and computer resources, so we will hand-pick the cities we need: New York City, San Francisco, Boston, and Chicago. Also, the numbers are reported as percentages, so we will take the additional step of dividing by 100 to put them into decimal format.
# Cost-of-living index per city, hand-picked from the Numbeo table and divided
# by 100 to convert from percent to a decimal factor (NYC is the 1.00 baseline).
# The columns are built directly with their final types. The original packed
# names and numbers into one character matrix and parsed the numbers back.
# Row order (NYC, SanFran, Boston, Chicago) matters: later code indexes rows
# by position.
col_index <- data.frame(
  City = c("NYC", "SanFran", "Boston", "Chicago"),
  COL.Index = c(100.00, 101.94, 90.23, 84.39) / 100,
  stringsAsFactors = FALSE
)
col_index
## City COL.Index
## 1 NYC 1.0000
## 2 SanFran 1.0194
## 3 Boston 0.9023
## 4 Chicago 0.8439
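We will begin our comparison by calculating approximate 95% intervals (mean ± 1.96 standard deviations) without adjusting for cost of living. Note that, because the standard deviation rather than the standard error of the mean is used, these intervals describe the spread of individual salaries rather than the uncertainty in each city’s mean.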
# Mean, standard deviation and mean +/- 1.96 SD interval of the NYC salaries.
# NOTE(review): the interval uses the SD, not the standard error
# (SD / sqrt(n)), so it is not a confidence interval for the mean.
nyc_mean <- mean(nyc$salary)
nyc_mean
## [1] 144193.9
nyc_sd <- sd(nyc$salary)
nyc_sd
## [1] 32537.78
nyc_ci <- nyc_mean + c(-1.96, 1.96) * nyc_sd
nyc_lower <- nyc_ci[1]
nyc_upper <- nyc_ci[2]
nyc_ci
## [1] 80419.87 207967.97
# Mean, standard deviation and mean +/- 1.96 SD interval of the San Fran
# salaries.
# NOTE(review): the interval uses the SD, not the standard error
# (SD / sqrt(n)), so it is not a confidence interval for the mean.
sf_mean <- mean(sf$salary)
sf_mean
## [1] 140620.4
sf_sd <- sd(sf$salary)
sf_sd
## [1] 38754.2
sf_ci <- sf_mean + c(-1.96, 1.96) * sf_sd
sf_lower <- sf_ci[1]
sf_upper <- sf_ci[2]
sf_ci
## [1] 64662.14 216578.59
# Mean, standard deviation and mean +/- 1.96 SD interval of the Boston
# salaries.
# NOTE(review): the interval uses the SD, not the standard error
# (SD / sqrt(n)), so it is not a confidence interval for the mean.
bost_mean <- mean(bost$salary)
bost_mean
## [1] 119875.1
bost_sd <- sd(bost$salary)
bost_sd
## [1] 14607.43
bost_ci <- bost_mean + c(-1.96, 1.96) * bost_sd
bost_lower <- bost_ci[1]
bost_upper <- bost_ci[2]
bost_ci
## [1] 91244.55 148505.70
# Mean, standard deviation and mean +/- 1.96 SD interval of the Chicago
# salaries.
# NOTE(review): the interval uses the SD, not the standard error
# (SD / sqrt(n)), so it is not a confidence interval for the mean.
chi_mean <- mean(chi$salary)
chi_mean
## [1] 124425.8
chi_sd <- sd(chi$salary)
chi_sd
## [1] 24935.52
chi_ci <- chi_mean + c(-1.96, 1.96) * chi_sd
chi_lower <- chi_ci[1]
chi_upper <- chi_ci[2]
chi_ci
## [1] 75552.23 173299.47
There is quite a bit of overlap between the intervals of all cities, so we might expect the cities to be consistent with each other under a Student’s t-test.
Student’s t-tests are used to check whether the means of two samples are different from each other. The null hypothesis is that the true difference in means of the populations the samples are drawn from is 0. The alternative hypothesis is that the difference in the population means is not zero. If the null is rejected you can infer that the two populations are different. For the classical t-test to be valid the samples must be approximately normally distributed and have similar variances; R’s t.test() defaults to Welch’s variant (as the output below shows), which does not require equal variances.
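Kolmogorov-Smirnov (KS) tests address a related but broader question than the Student’s t-test: the null hypothesis is that the two samples are drawn from the same distribution, and the alternative is that they are not. The KS test analyzes differences in the empirical Cumulative Distribution Functions (CDFs) of the two samples. Unlike the t-test, it does not assume that the samples follow any particular distribution (such as the Normal).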
We need to analyze pairs out of 4 cities; this involves 6 combinations:
# Enumerate all pairs of the 4 cities (one pair per column)
combn(x = 4, m = 2)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1 1 1 2 2 3
## [2,] 2 3 4 3 4 4
# Welch two-sample t-test (t.test default): NYC vs. San Fran, raw salaries
t.test(x = nyc$salary, y = sf$salary)
##
## Welch Two Sample t-test
##
## data: nyc$salary and sf$salary
## t = 0.54377, df = 97.351, p-value = 0.5878
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -9469.17 16616.28
## sample estimates:
## mean of x mean of y
## 144193.9 140620.4
According to Student’s t-test New York’s and San Francisco’s salaries are similar before accounting for cost of living.
# Welch two-sample t-test (t.test default): NYC vs. Boston, raw salaries
t.test(x = nyc$salary, y = bost$salary)
##
## Welch Two Sample t-test
##
## data: nyc$salary and bost$salary
## t = 5.7025, df = 107.29, p-value = 0.0000001052
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 15865.07 32772.51
## sample estimates:
## mean of x mean of y
## 144193.9 119875.1
According to Student’s t-test New York’s and Boston’s salaries are different before accounting for cost of living.
# Welch two-sample t-test (t.test default): NYC vs. Chicago, raw salaries
t.test(x = nyc$salary, y = chi$salary)
##
## Welch Two Sample t-test
##
## data: nyc$salary and chi$salary
## t = 3.1975, df = 56.887, p-value = 0.002265
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 7387.595 32148.550
## sample estimates:
## mean of x mean of y
## 144193.9 124425.8
According to Student’s t-test New York’s and Chicago’s salaries are different before accounting for cost of living.
# Welch two-sample t-test (t.test default): San Fran vs. Boston, raw salaries
t.test(x = sf$salary, y = bost$salary)
##
## Welch Two Sample t-test
##
## data: sf$salary and bost$salary
## t = 3.6244, df = 64.522, p-value = 0.0005718
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 9312.386 32178.090
## sample estimates:
## mean of x mean of y
## 140620.4 119875.1
According to Student’s t-test San Francisco’s and Boston’s salaries are different before accounting for cost of living.
# Welch two-sample t-test (t.test default): San Fran vs. Chicago, raw salaries
t.test(x = sf$salary, y = chi$salary)
##
## Welch Two Sample t-test
##
## data: sf$salary and chi$salary
## t = 2.2288, df = 71.05, p-value = 0.02899
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1706.386 30682.652
## sample estimates:
## mean of x mean of y
## 140620.4 124425.8
According to Student’s t-test San Francisco’s and Chicago’s salaries are different before accounting for cost of living.
# Welch two-sample t-test (t.test default): Boston vs. Chicago, raw salaries
t.test(x = bost$salary, y = chi$salary)
##
## Welch Two Sample t-test
##
## data: bost$salary and chi$salary
## t = -0.86318, df = 33.363, p-value = 0.3942
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -15272.30 6170.86
## sample estimates:
## mean of x mean of y
## 119875.1 124425.8
Boston and Chicago’s Salaries are similar under Student’s t-test.
# par(mfrow = c(4,1))
# boxplot(nyc$salary, horizontal = TRUE)
# boxplot(sf$salary, horizontal = TRUE)
# boxplot(bost$salary, horizontal = TRUE)
# boxplot(chi$salary, horizontal = TRUE)
# Two-sample Kolmogorov-Smirnov test: NYC vs. San Fran, raw salaries
ks.test(x = nyc$salary, y = sf$salary, exact = TRUE)
## Warning in ks.test(nyc$salary, sf$salary, exact = TRUE): cannot compute
## exact p-value with ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: nyc$salary and sf$salary
## D = 0.15177, p-value = 0.4826
## alternative hypothesis: two-sided
According to the KS test New York’s and San Francisco’s salaries are similar before accounting for cost of living.
# Two-sample Kolmogorov-Smirnov test: NYC vs. Boston, raw salaries
ks.test(x = nyc$salary, y = bost$salary, exact = TRUE)
## Warning in ks.test(nyc$salary, bost$salary, exact = TRUE): cannot compute
## exact p-value with ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: nyc$salary and bost$salary
## D = 0.53857, p-value = 0.00000002249
## alternative hypothesis: two-sided
According to the KS test New York’s and Boston’s salaries are different before accounting for cost of living.
# Two-sample Kolmogorov-Smirnov test: NYC vs. Chicago, raw salaries
ks.test(x = nyc$salary, y = chi$salary, exact = TRUE)
## Warning in ks.test(nyc$salary, chi$salary, exact = TRUE): cannot compute
## exact p-value with ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: nyc$salary and chi$salary
## D = 0.43139, p-value = 0.001552
## alternative hypothesis: two-sided
According to the KS test New York’s and Chicago’s salaries are different before accounting for cost of living.
# Two-sample Kolmogorov-Smirnov test: San Fran vs. Boston, raw salaries
ks.test(x = sf$salary, y = bost$salary, exact = TRUE)
## Warning in ks.test(sf$salary, bost$salary, exact = TRUE): cannot compute
## exact p-value with ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: sf$salary and bost$salary
## D = 0.58951, p-value = 0.00000001709
## alternative hypothesis: two-sided
According to the KS test San Francisco’s and Boston’s salaries are different before accounting for cost of living.
# Two-sample Kolmogorov-Smirnov test: San Fran vs. Chicago, raw salaries
ks.test(x = sf$salary, y = chi$salary, exact = TRUE)
## Warning in ks.test(sf$salary, chi$salary, exact = TRUE): cannot compute
## exact p-value with ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: sf$salary and chi$salary
## D = 0.40385, p-value = 0.007008
## alternative hypothesis: two-sided
According to the KS test San Francisco’s and Chicago’s salaries are different before accounting for cost of living.
# Two-sample Kolmogorov-Smirnov test: Boston vs. Chicago, raw salaries
ks.test(x = bost$salary, y = chi$salary, exact = TRUE)
## Warning in ks.test(bost$salary, chi$salary, exact = TRUE): cannot compute
## exact p-value with ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: bost$salary and chi$salary
## D = 0.3972, p-value = 0.007616
## alternative hypothesis: two-sided
According to the KS test Boston’s and Chicago’s salaries are different before accounting for cost of living.
Under the KS test, only New York and San Francisco had similar salary distributions; the t-test additionally found no significant difference between Boston’s and Chicago’s mean salaries.
Now we will take the salary data and divide by the Cost of Living index. In this way salaries in cities with a lower cost of living (index < 1) will be increased to reflect more purchasing power per dollar, and cities with a higher cost of living (index > 1) will have their salaries decreased to reflect less purchasing power per dollar.
# Divide each city's salaries by its cost-of-living factor (second column of
# col_index). Rows are addressed by position: 1 = NYC, 2 = SanFran,
# 3 = Boston, 4 = Chicago. head() shows the first values before and after.
head(nyc$salary)
## [1] 220501 203811 186440 174734 180758 200516
adj_nyc <- nyc$salary / col_index[[2]][1]
head(adj_nyc)
## [1] 220501 203811 186440 174734 180758 200516
head(sf$salary)
## [1] 300000 202769 189098 186021 180463 171443
adj_sf <- sf$salary / col_index[[2]][2]
head(adj_sf)
## [1] 294290.8 198910.1 185499.3 182480.9 177028.6 168180.3
head(bost$salary)
## [1] 180080 173701 141163 138230 140000 134174
adj_bost <- bost$salary / col_index[[2]][3]
head(adj_bost)
## [1] 199578.9 192509.1 156448.0 153197.4 155159.0 148702.2
head(chi$salary)
## [1] 131089 185309 176067 154035 150994 143377
adj_chi <- chi$salary / col_index[[2]][4]
head(adj_chi)
## [1] 155337.1 219586.4 208634.9 182527.6 178924.0 169898.1
We will begin our comparison by calculating approximate 95% intervals (mean ± 1.96 standard deviations), this time after adjusting for cost of living.
# Mean, standard deviation and mean +/- 1.96 SD interval of the
# cost-of-living-adjusted NYC salaries.
# NOTE(review): the interval uses the SD, not the standard error
# (SD / sqrt(n)), so it is not a confidence interval for the mean.
nyc_mean_adj <- mean(adj_nyc)
nyc_mean_adj
## [1] 144193.9
nyc_sd_adj <- sd(adj_nyc)
nyc_sd_adj
## [1] 32537.78
nyc_ci_adj <- nyc_mean_adj + c(-1.96, 1.96) * nyc_sd_adj
nyc_lower_adj <- nyc_ci_adj[1]
nyc_upper_adj <- nyc_ci_adj[2]
nyc_ci_adj
## [1] 80419.87 207967.97
# Mean, standard deviation and mean +/- 1.96 SD interval of the
# cost-of-living-adjusted San Fran salaries.
# NOTE(review): the interval uses the SD, not the standard error
# (SD / sqrt(n)), so it is not a confidence interval for the mean.
sf_mean_adj <- mean(adj_sf)
sf_mean_adj
## [1] 137944.2
sf_sd_adj <- sd(adj_sf)
sf_sd_adj
## [1] 38016.67
sf_ci_adj <- sf_mean_adj + c(-1.96, 1.96) * sf_sd_adj
sf_lower_adj <- sf_ci_adj[1]
sf_upper_adj <- sf_ci_adj[2]
sf_ci_adj
## [1] 63431.57 212456.93
# Mean, standard deviation and mean +/- 1.96 SD interval of the
# cost-of-living-adjusted Boston salaries.
# NOTE(review): the interval uses the SD, not the standard error
# (SD / sqrt(n)), so it is not a confidence interval for the mean.
bost_mean_adj <- mean(adj_bost)
bost_mean_adj
## [1] 132855.1
bost_sd_adj <- sd(adj_bost)
bost_sd_adj
## [1] 16189.11
bost_ci_adj <- bost_mean_adj + c(-1.96, 1.96) * bost_sd_adj
bost_lower_adj <- bost_ci_adj[1]
bost_upper_adj <- bost_ci_adj[2]
bost_ci_adj
## [1] 101124.4 164585.7
# Mean, standard deviation and mean +/- 1.96 SD interval of the
# cost-of-living-adjusted Chicago salaries.
# NOTE(review): the interval uses the SD, not the standard error
# (SD / sqrt(n)), so it is not a confidence interval for the mean.
chi_mean_adj <- mean(adj_chi)
chi_mean_adj
## [1] 147441.5
chi_sd_adj <- sd(adj_chi)
chi_sd_adj
## [1] 29547.96
chi_ci_adj <- chi_mean_adj + c(-1.96, 1.96) * chi_sd_adj
chi_lower_adj <- chi_ci_adj[1]
chi_upper_adj <- chi_ci_adj[2]
chi_ci_adj
## [1] 89527.46 205355.45
There is quite a bit of overlap between all cities; however, we did see some statistically significant differences even when the intervals overlapped in the previous section.
Here we repeat the analysis from above with salary data that has been adjusted for cost of living.
# Welch two-sample t-test (t.test default): NYC vs. San Fran, adjusted salaries
t.test(x = adj_nyc, y = adj_sf)
##
## Welch Two Sample t-test
##
## data: adj_nyc and adj_sf
## t = 0.96319, df = 98.74, p-value = 0.3378
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -6625.312 19124.656
## sample estimates:
## mean of x mean of y
## 144193.9 137944.2
According to Student’s t-test New York’s and San Francisco’s salaries are similar after accounting for cost of living.
# Welch two-sample t-test (t.test default): NYC vs. Boston, adjusted salaries
t.test(x = adj_nyc, y = adj_bost)
##
## Welch Two Sample t-test
##
## data: adj_nyc and adj_bost
## t = 2.5964, df = 112.81, p-value = 0.01067
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 2686.564 19991.139
## sample estimates:
## mean of x mean of y
## 144193.9 132855.1
According to Student’s t-test New York’s and Boston’s salaries are different after accounting for cost of living. However, the margin has dropped quite a bit.
# Welch two-sample t-test (t.test default): NYC vs. Chicago, adjusted salaries
t.test(x = adj_nyc, y = adj_chi)
##
## Welch Two Sample t-test
##
## data: adj_nyc and adj_chi
## t = -0.46929, df = 47.865, p-value = 0.641
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -17162.22 10667.14
## sample estimates:
## mean of x mean of y
## 144193.9 147441.5
According to Student’s t-test New York’s and Chicago’s salaries are not different after accounting for cost of living.
# Welch two-sample t-test (t.test default): San Fran vs. Boston, adjusted salaries
t.test(x = adj_sf, y = adj_bost)
##
## Welch Two Sample t-test
##
## data: adj_sf and adj_bost
## t = 0.89189, df = 68.097, p-value = 0.3756
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -6296.742 16475.101
## sample estimates:
## mean of x mean of y
## 137944.2 132855.1
According to Student’s t-test San Francisco’s and Boston’s salaries are not different after accounting for cost of living.
# Welch two-sample t-test (t.test default): San Fran vs. Chicago, adjusted salaries
t.test(x = adj_sf, y = adj_chi)
##
## Welch Two Sample t-test
##
## data: adj_sf and adj_chi
## t = -1.2123, df = 62.517, p-value = 0.23
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -25154.849 6160.428
## sample estimates:
## mean of x mean of y
## 137944.2 147441.5
According to Student’s t-test San Francisco’s and Chicago’s salaries are not different after accounting for cost of living.
# Welch two-sample t-test (t.test default): Boston vs. Chicago, adjusted salaries
t.test(x = adj_bost, y = adj_chi)
##
## Welch Two Sample t-test
##
## data: adj_bost and adj_chi
## t = -2.3555, df = 32.298, p-value = 0.02473
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -27195.252 -1977.528
## sample estimates:
## mean of x mean of y
## 132855.1 147441.5
Boston and Chicago’s Salaries are no longer similar under Student’s t-test when adjusting for cost of living.
# Two-sample Kolmogorov-Smirnov test: NYC vs. San Fran, adjusted salaries
ks.test(x = adj_nyc, y = adj_sf, exact = TRUE)
## Warning in ks.test(adj_nyc, adj_sf, exact = TRUE): cannot compute exact p-
## value with ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: adj_nyc and adj_sf
## D = 0.17879, p-value = 0.283
## alternative hypothesis: two-sided
According to the KS test New York’s and San Francisco’s salaries are similar after accounting for cost of living.
# Two-sample Kolmogorov-Smirnov test: NYC vs. Boston, adjusted salaries
ks.test(x = adj_nyc, y = adj_bost, exact = TRUE)
## Warning in ks.test(adj_nyc, adj_bost, exact = TRUE): cannot compute exact
## p-value with ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: adj_nyc and adj_bost
## D = 0.45749, p-value = 0.000003675
## alternative hypothesis: two-sided
According to the KS test New York’s and Boston’s salaries are different after accounting for cost of living.
# Two-sample Kolmogorov-Smirnov test: NYC vs. Chicago, adjusted salaries
ks.test(x = adj_nyc, y = adj_chi, exact = TRUE)
## Warning in ks.test(adj_nyc, adj_chi, exact = TRUE): cannot compute exact p-
## value with ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: adj_nyc and adj_chi
## D = 0.22037, p-value = 0.3075
## alternative hypothesis: two-sided
According to the KS test New York’s and Chicago’s salaries are not different after accounting for cost of living.
# Two-sample Kolmogorov-Smirnov test: San Fran vs. Boston, adjusted salaries
ks.test(x = adj_sf, y = adj_bost, exact = TRUE)
## Warning in ks.test(adj_sf, adj_bost, exact = TRUE): cannot compute exact p-
## value with ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: adj_sf and adj_bost
## D = 0.3972, p-value = 0.0004347
## alternative hypothesis: two-sided
According to the KS test San Francisco’s and Boston’s salaries are different after accounting for cost of living.
# Two-sample Kolmogorov-Smirnov test: San Fran vs. Chicago, adjusted salaries
ks.test(x = adj_sf, y = adj_chi, exact = TRUE)
## Warning in ks.test(adj_sf, adj_chi, exact = TRUE): cannot compute exact p-
## value with ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: adj_sf and adj_chi
## D = 0.23077, p-value = 0.3145
## alternative hypothesis: two-sided
According to the KS test San Francisco’s and Chicago’s salaries are not different after accounting for cost of living.
# Two-sample Kolmogorov-Smirnov test: Boston vs. Chicago, adjusted salaries
ks.test(x = adj_bost, y = adj_chi, exact = TRUE)
## Warning in ks.test(adj_bost, adj_chi, exact = TRUE): cannot compute exact
## p-value with ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: adj_bost and adj_chi
## D = 0.58951, p-value = 0.000009376
## alternative hypothesis: two-sided
According to the KS test Boston’s and Chicago’s salaries are different after accounting for cost of living.
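After adjusting for cost of living, Chicago became similar to New York and San Francisco under both tests, but Boston remained different from the other cities under the KS test (the t-test no longer separates Boston from San Francisco).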