library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#library(httr)
#library(jsonlite)
#library(lubridate)
library(readr)
library(dplyr)
#library(testthat)
#library(psych)
library(dlookr)
## Imported Arial Narrow fonts.
##
## Attaching package: 'dlookr'
## The following object is masked from 'package:tidyr':
##
## extract
## The following object is masked from 'package:base':
##
## transform
#library(Hmisc)
library(corrplot)
## corrplot 0.92 loaded
# Read the 2020 employee salary CSV; the file is declared as UTF-8 with a
# byte-order mark via fileEncoding.
salaries <- read.csv(
  file = "Employee_Salaries_-_2020.csv",
  header = TRUE,
  fileEncoding = "UTF-8-BOM"
)
# Show the column names and types of the raw data.
str(salaries)
## 'data.frame': 9958 obs. of 8 variables:
## $ Department : chr "ABS" "ABS" "ABS" "ABS" ...
## $ Department.Name : chr "Alcohol Beverage Services" "Alcohol Beverage Services" "Alcohol Beverage Services" "Alcohol Beverage Services" ...
## $ Division : chr "Wholesale Administration" "Administrative Services" "Administration" "Wholesale Operations" ...
## $ Gender : chr "F" "F" "M" "F" ...
## $ Base.Salary : num 78902 35926 167345 90848 78902 ...
## $ X2020.Overtime.Pay : num 199 0 0 0 205 ...
## $ X2020.Longevity.Pay: num 0 4039 0 5718 2460 ...
## $ Grade : chr "18" "16" "M2" "21" ...
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
# Standardise the column names (e.g. Base.Salary becomes base_salary).
salaries1 <- clean_names(salaries)
#salaries1
# Descriptive statistics of base_salary, one row per gender.
describe(group_by(salaries1, gender), base_salary)
## # A tibble: 2 x 27
## variable gender n na mean sd se_mean IQR skewness kurtosis
## <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 base_salary F 4091 0 76764. 30956. 484. 39488. 0.615 0.803
## 2 base_salary M 5867 0 80171. 29502. 385. 36892. 0.942 1.79
## # ... with 17 more variables: p00 <dbl>, p01 <dbl>, p05 <dbl>, p10 <dbl>,
## # p20 <dbl>, p25 <dbl>, p30 <dbl>, p40 <dbl>, p50 <dbl>, p60 <dbl>,
## # p70 <dbl>, p75 <dbl>, p80 <dbl>, p90 <dbl>, p95 <dbl>, p99 <dbl>,
## # p100 <dbl>
# Descriptive statistics of base_salary over all employees
# (grouping by department_name is intentionally left disabled here).
describe(salaries1, base_salary)
## # A tibble: 1 x 26
## variable n na mean sd se_mean IQR skewness kurtosis p00
## <chr> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 base_salary 9958 0 78771. 30153. 302. 37674. 0.785 1.36 11147.
## # ... with 16 more variables: p01 <dbl>, p05 <dbl>, p10 <dbl>, p20 <dbl>,
## # p25 <dbl>, p30 <dbl>, p40 <dbl>, p50 <dbl>, p60 <dbl>, p70 <dbl>,
## # p75 <dbl>, p80 <dbl>, p90 <dbl>, p95 <dbl>, p99 <dbl>, p100 <dbl>
# Normality test of base_salary within each gender.
normality(group_by(salaries1, gender), base_salary)
## # A tibble: 2 x 5
## variable gender statistic p_value sample
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 base_salary F 0.976 5.88e-26 4091
## 2 base_salary M 0.952 9.04e-38 5000
# Normality test of the log-transformed base salary within each department.
salaries1 %>%
  mutate(log_base_salary = log(base_salary)) %>%
  group_by(department_name) %>%
  normality(log_base_salary)
## # A tibble: 40 x 5
## variable department_name statistic p_value sample
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 log_base_salary Alcohol Beverage Services 0.977 1.71e-6 438
## 2 log_base_salary Board of Appeals Department 0.963 6.31e-1 3
## 3 log_base_salary Board of Elections 0.936 7.69e-2 29
## 4 log_base_salary Community Engagement Cluster 0.928 1.71e-4 83
## 5 log_base_salary Community Use of Public Facilities 0.933 9.28e-2 26
## 6 log_base_salary Correction and Rehabilitation 0.981 2.18e-6 525
## 7 log_base_salary County Attorney's Office 0.902 3.00e-5 74
## 8 log_base_salary County Council 0.969 2.10e-2 99
## 9 log_base_salary Department of Environmental Prote~ 0.945 6.17e-6 161
## 10 log_base_salary Department of Finance 0.968 6.04e-3 119
## # ... with 30 more rows
# filter(p_value > 0.01)
# Normality test of the log-transformed base salary within each gender.
salaries1 %>%
  mutate(log_base_salary = log(base_salary)) %>%
  group_by(gender) %>%
  normality(log_base_salary)
## # A tibble: 2 x 5
## variable gender statistic p_value sample
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 log_base_salary F 0.959 2.16e-32 4091
## 2 log_base_salary M 0.983 9.58e-24 5000
library(dlookr)
# Normality diagnostic plots for the data set.
salaries1 %>% plot_normality()
# Pairwise correlation coefficients between the numeric pay columns.
salaries1 %>% correlate()
## # A tibble: 6 x 3
## var1 var2 coef_corr
## <fct> <fct> <dbl>
## 1 x2020_overtime_pay base_salary 0.0488
## 2 x2020_longevity_pay base_salary 0.255
## 3 base_salary x2020_overtime_pay 0.0488
## 4 x2020_longevity_pay x2020_overtime_pay 0.117
## 5 base_salary x2020_longevity_pay 0.255
## 6 x2020_overtime_pay x2020_longevity_pay 0.117
# Visualise the pairwise correlations of the data set.
salaries1 %>% plot_correlate()
# Treat gender as a categorical variable, then confirm the resulting class.
salaries1[["gender"]] <- as.factor(salaries1[["gender"]])
class(salaries1[["gender"]])
## [1] "factor"
# Declare base_salary as the target variable, then relate it to gender.
target_base_salary <- salaries1 %>%
  target_by(base_salary)
cat_categor <- target_base_salary %>%
  relate(gender)
# Print the resulting analysis-of-variance table.
cat_categor
## Analysis of Variance Table
##
## Response: base_salary
## Df Sum Sq Mean Sq F value Pr(>F)
## gender 1 2.7987e+10 2.7987e+10 30.874 2.825e-08 ***
## Residuals 9956 9.0251e+12 9.0649e+08
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Coefficient-level summary of the base_salary ~ gender relationship
# (printed as a linear-model summary with estimates and R-squared).
summary(cat_categor)
##
## Call:
## lm(formula = formula(formula_str), data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -69024 -21883 -3160 16666 199829
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 76763.8 470.7 163.076 < 2e-16 ***
## genderM 3407.5 613.3 5.556 2.82e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30110 on 9956 degrees of freedom
## Multiple R-squared: 0.003091, Adjusted R-squared: 0.002991
## F-statistic: 30.87 on 1 and 9956 DF, p-value: 2.825e-08
According to these results, gender does contribute to salary amount (p-value < 0.05).
Although the coefficient for the predictor gender is significant in the model, gender explains only around 0.3% of the variance in salary in this model.
# Plot the base_salary ~ gender relationship object built with relate().
# NOTE(review): the exact chart type depends on dlookr's plot method for this
# object - confirm against the rendered figure.
plot(cat_categor)
# Box plot of base salary by gender.
# The palette, legend title and legend labels are all set on ONE fill scale.
# Adding a second fill scale replaces the first one, which previously
# discarded the "Female"/"Male" labels and produced a
# "Scale for 'fill' is already present" message.
plot1 <- salaries1 %>%
  ggplot(aes(gender, base_salary, fill = gender)) +
  geom_boxplot() +
  scale_fill_brewer(
    palette = "Set1",
    name = "Gender",
    # Label order follows the factor levels of gender ("F", then "M").
    labels = c("Female", "Male")
  ) +
  ggtitle("Distribution of MoCo Employees Salary by Gender") +
  xlab("Gender") +
  ylab("Salary") +
  theme_minimal()
plot1
It appears that the average salaries of the two genders are slightly different. Is this actually true, or could it have happened just by chance in our sample of data?
This is going to be an ANOVA test and we will use α=0.05.
Ho: μ_F = μ_M (In other words, there is no relationship between salary and gender)
Ha: μ_F ≠ μ_M (There is a relationship between salary and gender)
# One-way ANOVA of base salary on gender.
# NOTE(review): the formula refers to the columns through `salaries1$`, which
# is why the term is labelled `salaries1$gender` in the printed table; using
# bare column names with `data = salaries1` would give cleaner labels.
results <- aov(salaries1$base_salary ~ salaries1$gender)
summary(results)
## Df Sum Sq Mean Sq F value Pr(>F)
## salaries1$gender 1 2.799e+10 2.799e+10 30.87 2.82e-08 ***
## Residuals 9956 9.025e+12 9.065e+08
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
P-value: 0.0000000282 < 0.05=α.
Conclusion: Reject Ho.
It appears that women have a lower salary than men. Is this true? Or did this just happen by chance with our sample?
This is going to be a two sample t-test. We will test at the 95% confidence level.
Ho: μ_F = μ_M (In other words, the salaries are the same) Ha: μ_F < μ_M (In other words, the salaries of women are less than the salaries of men)
# Split all employees by gender.
women <- filter(salaries1, gender == "F")
men <- filter(salaries1, gender == "M")
# One-sided two-sample t-test: alternative = "less" tests whether the mean
# base salary of women is below that of men.
t.test(
  x = women$base_salary,
  y = men$base_salary,
  alternative = "less",
  conf.level = 0.95
)
##
## Welch Two Sample t-test
##
## data: women$base_salary and men$base_salary
## t = -5.509, df = 8526.6, p-value = 1.857e-08
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf -2390.013
## sample estimates:
## mean of x mean of y
## 76763.83 80171.37
P-value: 0.00000001857 < 0.05 = α.
Conclusion: Reject Ho.
What about the distribution of salaries in the departments with the biggest and smallest salaries?
# List each employee's base salary with their department, lowest salary first.
# NOTE(review): the printed rows are ordered by base_salary across all
# departments, so group_by() only tags the groups here and does not sort
# within departments. The commented-out arrange(-base_salary) is the
# descending variant (largest salaries first).
salaries_big_sal <- salaries1 %>%
select(base_salary, department_name) %>%
group_by(department_name) %>%
#arrange(-base_salary)
arrange(base_salary)
salaries_big_sal
## # A tibble: 9,958 x 2
## # Groups: department_name [40]
## base_salary department_name
## <dbl> <chr>
## 1 11147. Office of Agriculture
## 2 16004 Department of Public Libraries
## 3 16004 Department of Public Libraries
## 4 17139. Department of Permitting Services
## 5 17139. Department of Public Libraries
## 6 17139. Department of Public Libraries
## 7 17158 Department of Police
## 8 17158 Department of Police
## 9 17158 Department of Police
## 10 17158 Department of Police
## # ... with 9,948 more rows
For departments with the biggest salaries -
Ho: μ_F = μ_M (In other words, the salaries are the same) Ha: μ_F < μ_M (In other words, the salaries of women are less than the salaries of men)
# Offices of the County Executive: split employees by gender.
women1 <- filter(
  salaries1,
  gender == "F",
  department_name == "Offices of the County Executive"
)
men1 <- filter(
  salaries1,
  gender == "M",
  department_name == "Offices of the County Executive"
)
# One-sided test: is the mean base salary of women below that of men?
t.test(
  x = women1$base_salary,
  y = men1$base_salary,
  alternative = "less",
  conf.level = 0.95
)
##
## Welch Two Sample t-test
##
## data: women1$base_salary and men1$base_salary
## t = -3.2818, df = 20.243, p-value = 0.001843
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf -29368.12
## sample estimates:
## mean of x mean of y
## 98387.55 160245.42
P-value: 0.001843 < 0.05 = α.
Conclusion: Reject Ho.
Ho: μ_F = μ_M (In other words, the salaries are the same) Ha: μ_F < μ_M (In other words, the salaries of women are less than the salaries of men)
# Department of Police: split employees by gender.
women2 <- filter(
  salaries1,
  gender == "F",
  department_name == "Department of Police"
)
men2 <- filter(
  salaries1,
  gender == "M",
  department_name == "Department of Police"
)
# One-sided test: is the mean base salary of women below that of men?
t.test(
  x = women2$base_salary,
  y = men2$base_salary,
  alternative = "less",
  conf.level = 0.95
)
##
## Welch Two Sample t-test
##
## data: women2$base_salary and men2$base_salary
## t = -14.716, df = 1264.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf -17058
## sample estimates:
## mean of x mean of y
## 65056.39 84262.67
P-value < 2.2e-16 < 0.05 = α.
Conclusion: Reject Ho.
Ho: μ_F = μ_M (In other words, the salaries are the same) Ha: μ_F < μ_M (In other words, the salaries of women are less than the salaries of men)
# Department of Health and Human Services: split employees by gender.
women3 <- filter(
  salaries1,
  gender == "F",
  department_name == "Department of Health and Human Services"
)
men3 <- filter(
  salaries1,
  gender == "M",
  department_name == "Department of Health and Human Services"
)
# One-sided test: is the mean base salary of women below that of men?
t.test(
  x = women3$base_salary,
  y = men3$base_salary,
  alternative = "less",
  conf.level = 0.95
)
##
## Welch Two Sample t-test
##
## data: women3$base_salary and men3$base_salary
## t = -5.0274, df = 332.14, p-value = 4.074e-07
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf -7148.714
## sample estimates:
## mean of x mean of y
## 78458.04 89097.52
P-value: 0.0000004074 < 0.05 = α.
Conclusion: Reject Ho.
For departments with the smallest salaries -
Ho: μ_F = μ_M (In other words, the salaries are the same) Ha: μ_F < μ_M (In other words, the salaries of women are less than the salaries of men)
# Office of Agriculture: split employees by gender.
women4 <- filter(
  salaries1,
  gender == "F",
  department_name == "Office of Agriculture"
)
men4 <- filter(
  salaries1,
  gender == "M",
  department_name == "Office of Agriculture"
)
# One-sided test: is the mean base salary of women below that of men?
t.test(
  x = women4$base_salary,
  y = men4$base_salary,
  alternative = "less",
  conf.level = 0.95
)
##
## Welch Two Sample t-test
##
## data: women4$base_salary and men4$base_salary
## t = -0.93743, df = 7.3873, p-value = 0.1891
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf 24622.31
## sample estimates:
## mean of x mean of y
## 65720.13 90214.90
P-value: 0.1891 > 0.05 = α.
Conclusion: Fail to reject Ho.
# Office of Agriculture again, with the samples in the opposite order.
men7 <- filter(
  salaries1,
  gender == "M",
  department_name == "Office of Agriculture"
)
women7 <- filter(
  salaries1,
  gender == "F",
  department_name == "Office of Agriculture"
)
# Reverse direction: is the mean base salary of men below that of women?
t.test(
  x = men7$base_salary,
  y = women7$base_salary,
  alternative = "less",
  conf.level = 0.95
)
##
## Welch Two Sample t-test
##
## data: men7$base_salary and women7$base_salary
## t = 0.93743, df = 7.3873, p-value = 0.8109
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf 73611.85
## sample estimates:
## mean of x mean of y
## 90214.90 65720.13
# Department of Public Libraries: split employees by gender.
women5 <- filter(
  salaries1,
  gender == "F",
  department_name == "Department of Public Libraries"
)
men5 <- filter(
  salaries1,
  gender == "M",
  department_name == "Department of Public Libraries"
)
# One-sided test: is the mean base salary of women below that of men?
t.test(
  x = women5$base_salary,
  y = men5$base_salary,
  alternative = "less",
  conf.level = 0.95
)
##
## Welch Two Sample t-test
##
## data: women5$base_salary and men5$base_salary
## t = -1.9766, df = 138.45, p-value = 0.02504
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf -1194.679
## sample estimates:
## mean of x mean of y
## 55333.15 62697.46
P-value: 0.02504 < 0.05 = α.
Conclusion: Reject Ho.
# Department of Permitting Services: split employees by gender.
women6 <- filter(
  salaries1,
  gender == "F",
  department_name == "Department of Permitting Services"
)
men6 <- filter(
  salaries1,
  gender == "M",
  department_name == "Department of Permitting Services"
)
# One-sided test: is the mean base salary of women below that of men?
t.test(
  x = women6$base_salary,
  y = men6$base_salary,
  alternative = "less",
  conf.level = 0.95
)
##
## Welch Two Sample t-test
##
## data: women6$base_salary and men6$base_salary
## t = -1.3401, df = 98.714, p-value = 0.09165
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf 1327.588
## sample estimates:
## mean of x mean of y
## 93530.15 99083.87
P-value: 0.09165 > 0.05 = α.
Conclusion: Fail to reject Ho.
# Five-number summary plus mean of base salary over all employees.
summary(salaries1[["base_salary"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 11147 56994 75290 78771 94668 280000
# Mean base salary per gender (column base_salary.mean).
# Computed with dplyr rather than plyr::ddply(): attaching plyr after dplyr
# masks dplyr's arrange(), count(), desc(), mutate(), rename(), summarise()
# and summarize(), a load order that plyr's own startup message warns is
# likely to cause problems. Not attaching plyr keeps those verbs intact for
# the rest of the script.
# as.data.frame() keeps the result a plain data.frame, as ddply() returned.
salaries2 <- salaries1 %>%
  dplyr::group_by(gender) %>%
  dplyr::summarise(base_salary.mean = mean(base_salary)) %>%
  as.data.frame()
salaries2
## gender base_salary.mean
## 1 F 76763.83
## 2 M 80171.37
The salaries are different, but gender variable only explains around 0.3% of the variance.
Which other variables in the dataset affect salary and how much?
Let’s take a brief look at salary regressed upon the variables in simple linear regression models
# Simple linear regression of base salary on department.
mod_dep <- lm(
  formula = base_salary ~ department_name,
  data = salaries1
)
summary(mod_dep)
According to these results, the department does contribute to salary amount, as expected (p-value < 0.05)
The amount of variance in salary explained by department in this model is around 18.99%
# Simple linear regression of base salary on pay grade.
mod_grade <- lm(
  formula = base_salary ~ grade,
  data = salaries1
)
summary(mod_grade)
##
## Call:
## lm(formula = base_salary ~ grade, data = salaries1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -158643 -7026 998 8638 110210
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 169790 1366 124.283 < 2e-16 ***
## grade10 -142468 1576 -90.408 < 2e-16 ***
## grade11 -122245 2816 -43.404 < 2e-16 ***
## grade12 -134124 1596 -84.047 < 2e-16 ***
## grade13 -120906 1557 -77.655 < 2e-16 ***
## grade14 -117802 1592 -74.001 < 2e-16 ***
## grade15 -116695 1413 -82.603 < 2e-16 ***
## grade16 -105927 1479 -71.639 < 2e-16 ***
## grade17 -104023 1664 -62.495 < 2e-16 ***
## grade18 -104146 1481 -70.318 < 2e-16 ***
## grade19 -97077 1548 -62.693 < 2e-16 ***
## grade20 -96423 1526 -63.169 < 2e-16 ***
## grade21 -92076 1463 -62.934 < 2e-16 ***
## grade22 -83515 1738 -48.044 < 2e-16 ***
## grade23 -83023 1473 -56.348 < 2e-16 ***
## grade24 -79517 1455 -54.635 < 2e-16 ***
## grade25 -66575 1497 -44.476 < 2e-16 ***
## grade26 -63510 1615 -39.318 < 2e-16 ***
## grade27 -57422 2431 -23.620 < 2e-16 ***
## grade28 -49064 1661 -29.536 < 2e-16 ***
## grade29 -38872 5566 -6.984 3.06e-12 ***
## grade30 -35508 3402 -10.438 < 2e-16 ***
## grade31 -24668 12143 -2.032 0.042230 *
## grade32 -29812 2228 -13.381 < 2e-16 ***
## grade33 -33227 8640 -3.846 0.000121 ***
## grade34 -10315 7099 -1.453 0.146232
## grade35 -29790 12143 -2.453 0.014171 *
## grade38 -7083 12143 -0.583 0.559719
## grade39 1560 6186 0.252 0.800842
## grade40 7366 8640 0.853 0.393922
## grade5 -139707 3230 -43.259 < 2e-16 ***
## grade7 -133548 8640 -15.456 < 2e-16 ***
## grade8 -135774 2275 -59.685 < 2e-16 ***
## grade9 -123716 2431 -50.889 < 2e-16 ***
## gradeA1 -62927 1690 -37.231 < 2e-16 ***
## gradeA2 -32641 2387 -13.675 < 2e-16 ***
## gradeA3 -11437 2913 -3.927 8.67e-05 ***
## gradeB1 -70243 1744 -40.285 < 2e-16 ***
## gradeB2 -52769 1684 -31.330 < 2e-16 ***
## gradeB2P -111073 12143 -9.147 < 2e-16 ***
## gradeB3 -35731 2773 -12.885 < 2e-16 ***
## gradeB4 -21077 3311 -6.365 2.04e-10 ***
## gradeB6 -1173 7099 -0.165 0.868747
## gradeC1 -62794 2913 -21.559 < 2e-16 ***
## gradeC2 -51376 6186 -8.306 < 2e-16 ***
## gradeC3 -121919 2732 -44.621 < 2e-16 ***
## gradeC4 -112096 2002 -55.997 < 2e-16 ***
## gradeC5 -96892 1663 -58.269 < 2e-16 ***
## gradeC6 -77762 2228 -34.903 < 2e-16 ***
## gradeD1 -68710 2533 -27.127 < 2e-16 ***
## gradeD2 -55433 3741 -14.816 < 2e-16 ***
## gradeD3 -35514 5566 -6.380 1.85e-10 ***
## gradeD4 -16009 8640 -1.853 0.063942 .
## gradeF1 -121571 3502 -34.714 < 2e-16 ***
## gradeF2 -112481 1548 -72.641 < 2e-16 ***
## gradeF3 -97410 1481 -65.793 < 2e-16 ***
## gradeF4 -79055 1592 -49.660 < 2e-16 ***
## gradeG1 -118800 5112 -23.241 < 2e-16 ***
## gradeG2 -117669 3087 -38.120 < 2e-16 ***
## gradeG3 -110114 2773 -39.709 < 2e-16 ***
## gradeG4 -87764 2002 -43.842 < 2e-16 ***
## gradeM1 11790 2732 4.315 1.61e-05 ***
## gradeM2 -12548 1726 -7.272 3.82e-13 ***
## gradeM3 -39647 1546 -25.645 < 2e-16 ***
## gradeMD2 -11047 12143 -0.910 0.362969
## gradeMD3 -1826 6186 -0.295 0.767858
## gradeMD4 -10397 6186 -1.681 0.092832 .
## gradeP1 -116404 2592 -44.907 < 2e-16 ***
## gradeP2 -115532 1920 -60.175 < 2e-16 ***
## gradeP3 -107239 1789 -59.930 < 2e-16 ***
## gradeP4 -84722 1435 -59.051 < 2e-16 ***
## gradeP4P -127041 4761 -26.686 < 2e-16 ***
## gradeP5 -72916 1979 -36.843 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12070 on 9885 degrees of freedom
## Multiple R-squared: 0.841, Adjusted R-squared: 0.8399
## F-statistic: 726.4 on 72 and 9885 DF, p-value: < 2.2e-16
According to these results, grade does contribute to salary amount, as expected (p-value < 0.05)
The amount of variance in salary explained by grade in this model is around 83.99%
Checking all variables for collinearity
library(DataExplorer)
# Correlation heatmap of the data set; columns with more than 20 categories
# are reported as ignored in the output.
salaries1 %>% plot_correlation()
## 4 features with more than 20 categories ignored!
## department: 40 categories
## department_name: 40 categories
## division: 605 categories
## grade: 73 categories
Build model with all meaningful variables
# Multiple linear regression of base salary on department and pay grade.
model_full <- lm(
  formula = base_salary ~ department_name + grade,
  data = salaries1
)
summary(model_full)
The combination of two variables (department_name + grade) has the biggest adjusted R-squared value - 87%
# Diagnostic plots for the department + grade model. The run warns that
# observations with leverage one are not plotted.
plot(model_full)
## Warning: not plotting observations with leverage one:
## 4032, 6719, 7374, 9953
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
# Average base salary per department, split by gender.
salaries1 %>%
  dplyr::group_by(department_name, gender) %>%
  # .groups = "drop" returns an ungrouped summary and avoids the
  # "grouped output" message from summarise().
  dplyr::summarize(avg = mean(base_salary, na.rm = TRUE), .groups = "drop") %>%
  ggplot(aes(x = reorder(department_name, avg), y = avg, fill = gender)) +
  # Bars are drawn side by side: stacking them would add the female and male
  # averages together, and that sum is not a meaningful salary figure.
  geom_col(position = "dodge") +
  coord_flip() +
  theme_minimal() +
  theme(legend.position = "bottom") + # set position of the legend
  scale_fill_brewer(palette = "Set1") +
  labs(
    title = "Distribution of MoCo Employees Salary by Departments in 2020",
    x = "Departments",
    y = "Salary (in dollars)",
    fill = "Gender"
  )
# Male employees only.
salaries_m <- salaries1 %>%
  dplyr::filter(gender == "M")

# Average male base salary per department, ordered by that average.
salaries_m %>%
  dplyr::group_by(department_name, gender) %>%
  # TRUE is spelled out (T is an ordinary, reassignable variable), and
  # .groups = "drop" avoids the "grouped output" message from summarise().
  dplyr::summarize(avg = mean(base_salary, na.rm = TRUE), .groups = "drop") %>%
  ggplot(aes(x = reorder(department_name, avg), y = avg, fill = gender)) +
  geom_col() +
  coord_flip() +
  theme_minimal()
# Prefer fixed notation over scientific notation for large axis values.
# This call used to be written as an extra argument inside ggplot(), where it
# is not part of the plot specification; as a standalone statement it is
# guaranteed to run before the plot is drawn.
options(scipen = 999)

# Histogram of base salary by gender, with a dashed vertical line at each
# gender's mean salary (taken from salaries2).
salaries_hist1 <- salaries1 %>%
  ggplot(aes(x = base_salary, fill = gender)) +
  geom_histogram(position = "dodge", bins = 25) +
  geom_vline(
    data = salaries2,
    aes(xintercept = base_salary.mean, colour = gender),
    linetype = "dashed",
    size = 1
  ) +
  labs(
    x = "Salary (in dollars)",
    y = "Frequency",
    title = "MoCo Employees Salary Distribution"
  ) +
  theme_minimal() +
  scale_fill_brewer(palette = "Set1")
salaries_hist1
library(treemap)
# Treemap with one tile per department, sized and coloured by base_salary.
treemap(
  dtf = salaries1,
  index = "department_name",
  vSize = "base_salary", # set size
  vColor = "base_salary", # set color
  type = "manual",
  title = "Treemap of Salary by Departments",
  palette = "RdYlBu"
)