Salaries Analysis Exercise (NPCA)

library(ggplot2)
library(tidyverse)
library(scales)
library(psych)
library(ggpubr)
library(car)
library(knitr)
library(rmarkdown)

Five Questions

Which industries are the top 10% of earners associated with?
What are the characteristics of the top 10% of earners in each industry?
Are there statistically significant annual salary differences between genders in each industry? Overall?
Which industries pay the highest at entry level?
Which states have the highest salaries? What about just among nonprofits?

Data import and more cleanup

Subset to only keep data from the United States where the currency is USD. Convert the relevant columns to factors for plotting. I also create a new column to note whether or not a respondent is white or nonwhite and then subset the dataset to only include industries with at least 100 respondents.

salaries <- read.csv("clean_salaries.csv")

# remove "X" index column
salaries <- (subset(salaries, select = -c(X)))

# remove where country is not United States
salaries <- salaries[salaries$country == 'United States',]

# remove where currency is not USD
salaries <- salaries[salaries$currency == 'USD',]

# drop other currency column
salaries <- (subset(salaries, select = -c(other_currency)))

# convert categorical variables to factors
salaries$gender <- as.factor(salaries$gender)
salaries$race <- as.factor(salaries$race)
salaries$multiracial <- as.factor(salaries$multiracial)
salaries$state <- as.factor(salaries$state)
salaries$multistate <- as.factor(salaries$multistate)
salaries$industry <- as.factor(salaries$industry)
salaries$age_range <- as.factor(salaries$age_range)
salaries$all_experience <- as.factor(salaries$all_experience)
salaries$field_experience <- as.factor(salaries$field_experience)
salaries$job <- as.factor(salaries$job)
salaries$education.level <- as.factor(salaries$education.level) # column naming?

# Look at white vs. non-white - create categories 
# Would keep the more granular data if I was doing a more granular analysis

# if white and not multiracial = white, otherwise = nonwhite
white_salaries <- subset(salaries, (multiracial== "No"))
white_salaries <- subset(white_salaries, !(race != "White"))

# create new column for white vs. nonwhite
salaries <- transform(
  salaries, whiteness = ifelse(id %in% white_salaries$id, "white", "nonwhite"))
salaries$whiteness <- as.factor(salaries$whiteness)

# drop the original race column
salaries <- (subset(salaries, select = -c(race)))

# drop records associated with more than one state (not looking into this aspect of the data and this will simplify de-duplication)
salaries <- salaries[!(salaries$multistate=="Yes"),]
# drop column for multistate
salaries <- (subset(salaries, select = -c(multistate)))

# drop context columns
salaries <- (subset(salaries, select = -c(income_context)))
salaries <- (subset(salaries, select = -c(job_context)))

# Select industries where there are at least 100 people (meaningful sample of respondents)
# ** Here we create salaries2 **
industry_count <- table(salaries$industry)
salaries2 <- subset(salaries, industry %in% names(industry_count[industry_count > 100]))

# How many industries do we have now? (originally 70)
# length(unique(salaries2$industry)) # down to 7

# remove duplicates
salaries <- salaries[!duplicated(salaries$id), ]

# double check for duplicates
n_occur <- data.frame(table(salaries$id))
# n_occur[n_occur$Freq > 1,] # no duplicates left

Outlier detection and removal

Outliers can actually be really useful so we’ll identify them and store that information.

# Start with visual inspection 

# build pretty palette:
ppretty <- c("#9e0142", "#d53e4f", "#fdae61", "#ffd86b", "#66c2a5", "#3288bd", "#5e4fa2")

# create boxplot
bp <- ggplot(salaries2, aes(x="", y=annual_salary, group=industry)) + 
  geom_boxplot(aes(fill=industry)) + theme_minimal()
bp <- bp + scale_y_continuous(labels = label_comma())
bp <- bp + facet_grid(. ~ industry)
bp <- bp + scale_fill_manual(values=ppretty)
bp <- bp + theme(legend.position="none") # Remove legend
bp <- bp + theme(text = element_text(size=16), axis.title=element_text(size=16)) 
bp <- bp + labs(title = "US Salaries by Industry", x= "Respondents", y= "Salary")
bp

In ggplot2, an observation is defined as an outlier if it meets one of the following two requirements: * The observation is 1.5 times the interquartile range less than the first quartile (Q1) * The observation is 1.5 times the interquartile range greater than the third quartile (Q3).

# create a function to replicate outlier conditions
find_outlier <- function(x) {
  return(x < quantile(x, .25) - 1.5*IQR(x) | x > quantile(x, .75) + 1.5*IQR(x))
}

# create column to indicate suspected outliers
salaries2 <- salaries2 %>%
        group_by(industry) %>%
        mutate(outlier = ifelse(find_outlier(annual_salary), "Yes", "No"))

# create subset to work with where the outliers are removed
# ** Here we create salaries3 **
salaries3 <- salaries2[salaries2$outlier == 'No',]

# create a new boxplot using data without outliers
bp2 <- ggplot(salaries3, aes(x="", y=annual_salary, group=industry)) + 
  geom_boxplot(aes(fill=industry)) + theme_minimal()
bp2 <- bp2 + scale_y_continuous(labels = label_comma())
bp2 <- bp2 + facet_grid(. ~ industry)
bp2 <- bp2 + scale_fill_manual(values=ppretty)
bp2 <- bp2 + theme(legend.position="none") # Remove legend
bp2 <- bp2 + theme(text = element_text(size=16), axis.title=element_text(size=16)) 
bp2 <- bp2 + labs(title = "US Salaries by Industry", x= "Industry", y= "Salary")
bp2

1. Which industries are the top 10% of earners associated with?

# top 10% overall (independent of industry)
n <- 10
top_10p <- salaries3[salaries3$annual_salary > quantile(salaries3$annual_salary, prob=1-n/100),]

kable(unique(top_10p$industry))

x
Manufacturing
Financial
Tech
Government
Health care

2. What are the characteristics of the top 10% of earners in each industry?

# find the top 10% of earners within each industry
by_industry <- salaries3 %>% group_by(industry)
top_10p_industry <- by_industry[by_industry$annual_salary > quantile(by_industry$annual_salary, prob=1-n/100),]

Education

# remove records without answers
top_10p_industry_plot1  <- top_10p_industry[!(top_10p_industry$education.level=="NaN"),]

# new pretty palette
ppretty2 <- c("#9e0142", "#fdae61", "#66c2a5", "#3288bd", "#5e4fa2")

h1 <- ggplot(top_10p_industry_plot1, aes(x=education.level, fill=industry)) + geom_histogram(stat="count", color="lightgrey")
h1 <- h1 + facet_grid(industry ~ .) + coord_flip() + theme_minimal()
h1 <- h1 + scale_fill_manual(values=ppretty2)
h1 <- h1 + theme(legend.position="none") # Remove legend
h1 <- h1 + theme(text = element_text(size=16), axis.title=element_text(size=16)) 
h1 <- h1 + labs(title = "Education of Top 10% of Earners", x= "Education", y= "Count")
h1

Age

h2 <- ggplot(top_10p_industry, aes(x=age_range, fill=industry)) + geom_histogram(stat="count", color="lightgrey")
h2 <- h2 + facet_grid(industry ~ .) + coord_flip() + theme_minimal()
h2 <- h2 + scale_fill_manual(values=ppretty2)
h2 <- h2 + theme(legend.position="none") # Remove legend
h2 <- h2 + theme(text = element_text(size=16), axis.title=element_text(size=16)) 
h2 <- h2 + labs(title = "Age of Top 10% of Earners", x= "Age", y= "Count")
h2

Experience

h3 <- ggplot(top_10p_industry, aes(x=field_experience, fill=industry)) + geom_histogram(stat="count", color="lightgrey")
h3 <- h3 + facet_grid(industry ~ .) + coord_flip() + theme_minimal()
h3 <- h3 + scale_fill_manual(values=ppretty2)
h3 <- h3 + theme(legend.position="none") # Remove legend
h3 <- h3 + theme(text = element_text(size=16), axis.title=element_text(size=16)) 
h3 <- h3 + labs(title = "Experience of Top 10% of Earners", x= "Experience in the Field", y= "Count")
h3

Gender

# remove records without answers
top_10p_industry_plot2  <- top_10p_industry[!(top_10p_industry$gender=="NaN" | top_10p_industry$gender=="Other or prefer not to answer"),]

h4 <- ggplot(top_10p_industry_plot2, aes(x=gender, fill=industry)) + geom_histogram(stat="count", color="lightgrey")
h4 <- h4 + facet_grid(industry ~ .) + coord_flip() + theme_minimal()
h4 <- h4 + scale_fill_manual(values=ppretty2)
h4 <- h4 + theme(legend.position="none") # Remove legend
h4 <- h4 + theme(text = element_text(size=16), axis.title=element_text(size=16)) 
h4 <- h4 + labs(title = "Gender of Top 10% of Earners", x= "Gender", y= "Count")
h4

Whiteness

h5 <- ggplot(top_10p_industry, aes(x=whiteness, fill=industry)) + geom_histogram(stat="count", color="lightgrey")
h5 <- h5 + facet_grid(industry ~ .) + coord_flip() + theme_minimal()
h5 <- h5 + scale_fill_manual(values=ppretty2)
h5 <- h5 + theme(legend.position="none") # Remove legend
h5 <- h5 + theme(text = element_text(size=16), axis.title=element_text(size=16)) 
h5 <- h5 + labs(title = "Whiteness Among Top 10% of Earners", x= "Race", y= "Count")
h5

States

# remove records without answers
top_10p_industry_plot3  <- top_10p_industry[!(top_10p_industry$state=="NaN"),]

s1 <- ggplot(top_10p_industry_plot3, aes(x=state, fill=industry, color=industry)) + geom_point(stat="count") + coord_flip()
s1 <- s1 + scale_color_manual(values=ppretty2) + theme_minimal()
s1 <- s1 + theme(text = element_text(size=16), axis.title=element_text(size=16)) 
s1 <- s1 + labs(title = "States of Top 10% of Earners", x= "State", y= "Count")
s1

3. Are there statistically significant annual salary differences between genders in each industry? Overall?

# Go back to using salaries3 (industries with at least 100 records + outliers removed)
# remove records without answers

salaries3_gender  <- salaries3[!(salaries3$gender=="NaN" | salaries3$gender=="Other or prefer not to answer" | salaries3$gender=="Non-binary"),] # drop non-binary for two sample t-test

Can we do a between groups t-test? The data needs to be normally distributed with no significant difference in variance

# check for normal distribution visually

qq1 <- ggqqplot(salaries3_gender, x = "annual_salary",
   color = "gender", palette = c("#9e0142", "#fdae61")) # not normally distributed
qq1 <- qq1 + theme(legend.position="bottom")
qq1 <- qq1 + theme(text = element_text(size=16), axis.title=element_text(size=16)) 
qq1 <- qq1 + labs(title = "Q-Q Plot for Industries with at least 100 responses")
qq1

The plot shows that the data are not normally distributed. Run Shapiro-Wilk test to confirm.

shapiro.test(salaries3_gender$annual_salary)

## 
##  Shapiro-Wilk normality test
## 
## data:  salaries3_gender$annual_salary
## W = 0.93185, p-value < 2.2e-16

From the output, the p-value is < 0.05 implying that the distribution of the data are significantly different from a normal distribution. We therefore cannot assume the normality.

We’ll run an F-test to check the variance and further confirm that we can’t do a t-test.

# use F-test to see if variances are equal between groups
res.ftest <- var.test(annual_salary ~ gender, data = salaries3_gender)
res.ftest # way less than the significance level 0.05, therefore there is a significant difference between the two variances.

## 
##  F test to compare two variances
## 
## data:  annual_salary by gender
## F = 1.845, num df = 332, denom df = 1129, p-value = 2.625e-13
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  1.557904 2.203197
## sample estimates:
## ratio of variances 
##           1.844997

The p-value of F-test is much less than the significance level 0.05, so there is a significant difference between the two variances.

We can’t use the t-test for the subgroup of industries with at least 100 responses, but can we do it for the full dataset?

# Go back to using salaries (contains all industries and still contains outliers)
salaries_gender  <- salaries[!(salaries$gender=="NaN" | salaries$gender=="Other or prefer not to answer" | salaries$gender=="Non-binary"),] # drop non-binary for two sample t-test

# Need to remove outliers
# use previously defined outlier function
# create column to indicate suspected outliers
salaries_outliers_removed <- salaries_gender %>%
        mutate(outlier = ifelse(find_outlier(annual_salary), "Yes", "No"))

# create subset to work with where the outliers are removed
salaries_outliers_removed <- salaries_outliers_removed[salaries_outliers_removed$outlier == 'No',]

Check for normal distribution visually

qq2 <- ggqqplot(salaries_outliers_removed, x = "annual_salary",
   color = "gender", palette = c("#9e0142", "#fdae61")) # not normally distributed
qq2 <- qq2 + theme(legend.position="bottom") 
qq2 <- qq2 + theme(text = element_text(size=16), axis.title=element_text(size=16)) 
qq2 <- qq2 + labs(title = "Q-Q Plot for all industries")
qq2

Distriburion is definitely not normal, but run another Shapiro-Wilk to confirm and a F-test to assess variance for the full dataset.

shapiro.test(salaries_outliers_removed$annual_salary)

## 
##  Shapiro-Wilk normality test
## 
## data:  salaries_outliers_removed$annual_salary
## W = 0.9548, p-value < 2.2e-16

The distribution of the data are significantly different from a normal distribution.

# F-test
res.ftest2 <- var.test(annual_salary ~ gender, data = salaries_outliers_removed)
res.ftest2 # variance differs still, so we can't do a statistical examination

## 
##  F test to compare two variances
## 
## data:  annual_salary by gender
## F = 1.32, num df = 402, denom df = 1786, p-value = 0.000238
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  1.136774 1.544294
## sample estimates:
## ratio of variances 
##           1.319987

F-test confirms that there is a significant difference between the two variances in the full dataset.

We can still take a look to see if there are differences in pay by industry, but we are limited in our ability to attribute any observable differences to gender from a statistical standpoint. If we had been able to do a t-test or more in-depth analysis we would want to further control for years of experience and education. Unfortunately the data significantly limits our ability to do this because we only have year ranges (which are categorical) and it’s not clear that those ranges are representative of the thresholds at which you would be able to detect a significant difference using regression techniques. For instance, does pay start to be impacted at 13 years of experience or 17? We only have a range “11-20 years” so we can’t test this. The same is true with the age ranges. These limitations would also be relevant in a granular examination of race or education within this dataset.

# comparing just men and women within industries with at least 100 respondents
bp3 <- ggplot(salaries3_gender, aes(x=gender, y=annual_salary, fill=gender)) + 
    geom_boxplot(width=0.4) + theme_minimal()
bp3 <- bp3 + theme(text = element_text(size=16), axis.title=element_text(size=16)) 
bp3 <- bp3 + scale_y_continuous(labels = label_comma()) 
bp3 <- bp3 + facet_wrap(industry ~ ., ncol=4) 
bp3 <- bp3 + scale_fill_manual(values=ppretty2)
bp3 <- bp3 + labs(title = "Pay Differences by Industry", x= "Gender", y= "Salary")
bp3 <- bp3 + theme(legend.position="none")
bp3

Averages for men are higher in all industries.

# table to list averages by industry
mean_salaries <- salaries3_gender %>% group_by(industry,gender) %>% 
  summarise(mean_salary = mean(annual_salary),
            .groups = 'drop') %>%
  as.data.frame()
kable(mean_salaries)

industry	gender	mean_salary
Financial	Man	99421.07
Financial	Woman	81311.93
Government	Man	87377.27
Government	Woman	82309.44
Health care	Man	74880.83
Health care	Woman	73954.14
Higher Education	Man	67268.00
Higher Education	Woman	65438.85
Manufacturing	Man	105322.86
Manufacturing	Woman	94814.13
Nonprofit	Man	72176.47
Nonprofit	Woman	67674.13
Tech	Man	133547.42
Tech	Woman	113505.27

# create a table to describe the pay gap by industry
pay_gap <- mean_salaries %>% group_by(industry) %>% mutate(pay_gap = mean_salary-lag(mean_salary,default=first(mean_salary)))
pay_gap <- (subset(pay_gap, select = c(industry, pay_gap)))
pay_gap <- pay_gap[!(pay_gap$pay_gap==0),]
kable(pay_gap)

industry	pay_gap
Financial	-18109.1438
Government	-5067.8317
Health care	-926.6914
Higher Education	-1829.1467
Manufacturing	-10508.7256
Nonprofit	-4502.3435
Tech	-20042.1452

The largest pay gap between men and women is seen in the Tech industry.

### What about salaries across the whole dataset?
stat <- salaries_outliers_removed %>% group_by(gender) %>% summarise(mean_annual_salary = mean(annual_salary)) 

bp4 <- ggplot(salaries_outliers_removed, aes(x=gender, y=annual_salary, fill=gender)) + 
    geom_boxplot(width=0.15) + 
    stat_summary(fun.y=mean, geom="point", color="white", fill="white") + 
    geom_text(data = stat, 
            aes(label = round(mean_annual_salary, 2), x = gender, y = mean_annual_salary - 0.1),
            vjust = 0, nudge_x = 0.22) +
    theme_minimal()
bp4 <- bp4 + theme(text = element_text(size=16), axis.title=element_text(size=16)) 
bp4 <- bp4 + scale_y_continuous(labels = label_comma()) 
bp4 <- bp4 + scale_fill_manual(values=ppretty2)
bp4 <- bp4 + labs(title = "US Salaries by Gender", x= "Gender", y= "Salary")
bp4 <- bp4 + theme(legend.position="none")
bp4

What’s the difference in average pay between men and women across all industries?

pay_gap_all <- stat %>% mutate(pay_gap_all = mean_annual_salary-lag(mean_annual_salary,default=first(mean_annual_salary)))
pay_gap_all <- (subset(pay_gap_all, select = pay_gap_all))
pay_gap_all <- pay_gap_all[!(pay_gap_all$pay_gap_all==0),]
kable(pay_gap_all)

pay_gap_all
-20693.8

On average women make $20,693.80 less than men (20.58% less).

4. Which industries pay the highest at entry level?

# back to salaries2 (not subset by industries with at least 100 respondents)
# remove outliers
salaries2_outliers_removed <- salaries2 %>%
        mutate(outlier = ifelse(find_outlier(annual_salary), "Yes", "No"))
salaries2_outliers_removed <- salaries2_outliers_removed[salaries2_outliers_removed$outlier == 'No',]

# subset to only entry-level salaries
entry_salaries <- salaries2_outliers_removed[!(salaries2_outliers_removed$field_experience != "1 year or less"),]

# top 10% of entry salaries
n <- 10
entry_top_10p <- entry_salaries[entry_salaries$annual_salary > quantile(entry_salaries$annual_salary, prob=1-n/100),]

kable(unique(entry_top_10p$industry))

x
Tech
Financial
Health care

Who are these high earning entry-level employees?

entry_top_10p <- (subset(entry_top_10p, select = -c(timestamp, country, currency, multiracial, job, outlier, add_compensation)))
kable(entry_top_10p)

id	age_range	industry	annual_salary	state	all_experience	field_experience	education.level	gender	whiteness
385	25-34	Tech	105000	Washington	2 - 4 years	1 year or less	Master’s degree	Woman	nonwhite
1416	25-34	Financial	135000	Colorado	5-7 years	1 year or less	College degree	Woman	white
1532	18-24	Tech	142000	California	1 year or less	1 year or less	College degree	Woman	nonwhite
2038	35-44	Tech	131000	California	21 - 30 years	1 year or less	Some college	Woman	nonwhite
2241	25-34	Health care	125000	Massachusetts	5-7 years	1 year or less	Master’s degree	Woman	white
2416	25-34	Tech	110000	New York	2 - 4 years	1 year or less	College degree	Woman	nonwhite
2767	18-24	Tech	116000	Colorado	1 year or less	1 year or less	College degree	Man	white

5. Which states have the highest salaries?

salaries2 <- salaries2[!(salaries2$state=="NaN"),]

# remove outliers
salaries2_outliers_removed <- salaries2 %>%
        mutate(outlier = ifelse(find_outlier(annual_salary), "Yes", "No"))
salaries2_outliers_removed <- salaries2_outliers_removed[salaries2_outliers_removed$outlier == 'No',]

# plot the results
bp5 <- ggplot(salaries2_outliers_removed, aes(x=state, y=annual_salary, fill=state)) + 
  geom_boxplot() + theme_minimal() + coord_flip()
bp5 <- bp5 + scale_y_continuous(labels = label_comma())
bp5 <- bp5 + theme(legend.position="none") # Remove legend
bp5 <- bp5 + theme(text = element_text(size=12), axis.title=element_text(size=16)) 
bp5 <- bp5 + labs(title = "US Salaries by State", x= "State", y= "Salary")
bp5 <- bp5 + theme(plot.title = element_text(size=18))
bp5

high_paying_states <- head(arrange(salaries2_outliers_removed,desc(annual_salary)), n = 50) #top 50 highest paying jobs
high_paying_states_list <- as.data.frame(unique(high_paying_states$state))
colnames(high_paying_states_list) <- c("High Paying States") 
kable(high_paying_states_list)

High Paying States
Connecticut
New York
Illinois
Georgia
Colorado
California
Washington
Oregon
Texas
Arizona
Massachusetts

What about pay within just the nonprofit industry?

nonprofit_data <- filter(salaries2, industry == "Nonprofit")

nonprofit_data_outliers_removed <- nonprofit_data %>%
        mutate(outlier = ifelse(find_outlier(annual_salary), "Yes", "No"))

# create subset to work with where the outliers are removed
nonprofit_data_outliers_removed <- nonprofit_data_outliers_removed[nonprofit_data_outliers_removed$outlier == 'No',]

# plot the results
bp6 <- ggplot(nonprofit_data_outliers_removed, aes(x=state, y=annual_salary, fill=state)) + 
  geom_boxplot() + theme_minimal() + coord_flip()
bp6 <- bp6 + scale_y_continuous(labels = label_comma())
bp6 <- bp6 + theme(legend.position="none") # Remove legend
bp6 <- bp6 + theme(text = element_text(size=12), axis.title=element_text(size=16)) 
bp6 <- bp6 + labs(title = "US Nonprofit Salaries by State", x= "State", y= "Salary")
bp6 <- bp6 + theme(plot.title = element_text(size=18))
bp6

high_paying_states_nonprofit <- head(arrange(nonprofit_data_outliers_removed,desc(annual_salary)), n = 50) #top 50 highest paying jobs
high_paying_states_nonprofit <- as.data.frame(unique(high_paying_states_nonprofit$state))
colnames(high_paying_states_nonprofit) <- c("High Paying States (Non-Profits)") 
kable(high_paying_states_nonprofit)

High Paying States (Non-Profits)
District of Columbia
California
Maryland
New York
Texas
Massachusetts
North Carolina
Florida
Arizona
Washington
Illinois
Oregon
Ohio
Missouri