HR2_salaries

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

#library(httr)
#library(jsonlite)
#library(lubridate)
library(readr)   
library(dplyr)
#library(testthat)
#library(psych)
library(dlookr)

## Imported Arial Narrow fonts.

## 
## Attaching package: 'dlookr'

## The following object is masked from 'package:tidyr':
## 
##     extract

## The following object is masked from 'package:base':
## 
##     transform

#library(Hmisc)
library(corrplot)

## corrplot 0.92 loaded

# Read the 2020 employee salary extract. "UTF-8-BOM" makes R drop a leading
# byte-order mark so it does not end up inside the first column name.
salaries <- read.csv(
  "Employee_Salaries_-_2020.csv",
  header = TRUE,
  fileEncoding = "UTF-8-BOM"
)

# Show the column names and storage types of the raw import.
str(salaries)

## 'data.frame':    9958 obs. of  8 variables:
##  $ Department         : chr  "ABS" "ABS" "ABS" "ABS" ...
##  $ Department.Name    : chr  "Alcohol Beverage Services" "Alcohol Beverage Services" "Alcohol Beverage Services" "Alcohol Beverage Services" ...
##  $ Division           : chr  "Wholesale Administration" "Administrative Services" "Administration" "Wholesale Operations" ...
##  $ Gender             : chr  "F" "F" "M" "F" ...
##  $ Base.Salary        : num  78902 35926 167345 90848 78902 ...
##  $ X2020.Overtime.Pay : num  199 0 0 0 205 ...
##  $ X2020.Longevity.Pay: num  0 4039 0 5718 2460 ...
##  $ Grade              : chr  "18" "16" "M2" "21" ...

library(janitor)

## 
## Attaching package: 'janitor'

## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

# Standardise the column names with janitor (e.g. "Base.Salary" becomes
# "base_salary"); the later steps work on this cleaned copy.
salaries1 <- clean_names(salaries)
#salaries1

# Descriptive statistics of base salary, one row per gender.
group_by(salaries1, gender) %>%
  describe(base_salary)

## # A tibble: 2 x 27
##   variable    gender     n    na   mean     sd se_mean    IQR skewness kurtosis
##   <chr>       <chr>  <int> <int>  <dbl>  <dbl>   <dbl>  <dbl>    <dbl>    <dbl>
## 1 base_salary F       4091     0 76764. 30956.    484. 39488.    0.615    0.803
## 2 base_salary M       5867     0 80171. 29502.    385. 36892.    0.942    1.79 
## # ... with 17 more variables: p00 <dbl>, p01 <dbl>, p05 <dbl>, p10 <dbl>,
## #   p20 <dbl>, p25 <dbl>, p30 <dbl>, p40 <dbl>, p50 <dbl>, p60 <dbl>,
## #   p70 <dbl>, p75 <dbl>, p80 <dbl>, p90 <dbl>, p95 <dbl>, p99 <dbl>,
## #   p100 <dbl>

# Descriptive statistics of base salary across all employees; re-enable the
# commented grouping step to get one row per department instead.
salaries1 %>%
  # group_by(department_name) %>%
  describe(base_salary)

## # A tibble: 1 x 26
##   variable        n    na   mean     sd se_mean    IQR skewness kurtosis    p00
##   <chr>       <int> <int>  <dbl>  <dbl>   <dbl>  <dbl>    <dbl>    <dbl>  <dbl>
## 1 base_salary  9958     0 78771. 30153.    302. 37674.    0.785     1.36 11147.
## # ... with 16 more variables: p01 <dbl>, p05 <dbl>, p10 <dbl>, p20 <dbl>,
## #   p25 <dbl>, p30 <dbl>, p40 <dbl>, p50 <dbl>, p60 <dbl>, p70 <dbl>,
## #   p75 <dbl>, p80 <dbl>, p90 <dbl>, p95 <dbl>, p99 <dbl>, p100 <dbl>

# Normality test of base salary within each gender.
# NOTE(review): the printed `sample` column shows 5000 for the M group although
# that group has 5867 rows, so the test appears to run on a subsample - confirm
# against the dlookr documentation before quoting these p-values.
group_by(salaries1, gender) %>%
  normality(base_salary)

## # A tibble: 2 x 5
##   variable    gender statistic  p_value sample
##   <chr>       <chr>      <dbl>    <dbl>  <dbl>
## 1 base_salary F          0.976 5.88e-26   4091
## 2 base_salary M          0.952 9.04e-38   5000

# Repeat the normality test on the natural log of base salary, one row per
# department.
mutate(salaries1, log_base_salary = log(base_salary)) %>%
  group_by(department_name) %>%
  normality(log_base_salary)

## # A tibble: 40 x 5
##    variable        department_name                    statistic   p_value sample
##    <chr>           <chr>                                  <dbl>     <dbl>  <dbl>
##  1 log_base_salary Alcohol Beverage Services              0.977   1.71e-6    438
##  2 log_base_salary Board of Appeals Department            0.963   6.31e-1      3
##  3 log_base_salary Board of Elections                     0.936   7.69e-2     29
##  4 log_base_salary Community Engagement Cluster           0.928   1.71e-4     83
##  5 log_base_salary Community Use of Public Facilities     0.933   9.28e-2     26
##  6 log_base_salary Correction and Rehabilitation          0.981   2.18e-6    525
##  7 log_base_salary County Attorney's Office               0.902   3.00e-5     74
##  8 log_base_salary County Council                         0.969   2.10e-2     99
##  9 log_base_salary Department of Environmental Prote~     0.945   6.17e-6    161
## 10 log_base_salary Department of Finance                  0.968   6.04e-3    119
## # ... with 30 more rows

 # filter(p_value > 0.01)

# Normality test on the natural log of base salary, split by gender.
# log() works element by element, so adding the column after grouping gives the
# same values as adding it before.
salaries1 %>%
  group_by(gender) %>%
  mutate(log_base_salary = log(base_salary)) %>%
  normality(log_base_salary)

## # A tibble: 2 x 5
##   variable        gender statistic  p_value sample
##   <chr>           <chr>      <dbl>    <dbl>  <dbl>
## 1 log_base_salary F          0.959 2.16e-32   4091
## 2 log_base_salary M          0.983 9.58e-24   5000

library(dlookr)
# Normality diagnostic plots for salaries1; no columns are named, so the
# choice of variables is left to plot_normality().
salaries1 %>%
  plot_normality()

# Pairwise correlation coefficients between the columns of salaries1 that
# correlate() picks up (the three numeric pay columns in the printed result).
salaries1 %>%
  correlate()

## # A tibble: 6 x 3
##   var1                var2                coef_corr
##   <fct>               <fct>                   <dbl>
## 1 x2020_overtime_pay  base_salary            0.0488
## 2 x2020_longevity_pay base_salary            0.255 
## 3 base_salary         x2020_overtime_pay     0.0488
## 4 x2020_longevity_pay x2020_overtime_pay     0.117 
## 5 base_salary         x2020_longevity_pay    0.255 
## 6 x2020_overtime_pay  x2020_longevity_pay    0.117

# Visualise the pairwise correlations of salaries1.
salaries1 %>%
  plot_correlate()

# Store gender as a factor so the modelling steps treat it as categorical.
salaries1[["gender"]] <- as.factor(salaries1[["gender"]])

# Confirm the conversion.
class(salaries1[["gender"]])

## [1] "factor"

# Declare base_salary as the target variable, then relate it to gender.
target_base_salary <- salaries1 %>%
  target_by(base_salary)

cat_categor <- target_base_salary %>%
  relate(gender)

# Print the resulting analysis-of-variance table.
cat_categor

## Analysis of Variance Table
## 
## Response: base_salary
##             Df     Sum Sq    Mean Sq F value    Pr(>F)    
## gender       1 2.7987e+10 2.7987e+10  30.874 2.825e-08 ***
## Residuals 9956 9.0251e+12 9.0649e+08                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Coefficient table, residual spread and R-squared of the fit stored in
# cat_categor.
cat_categor %>%
  summary()

## 
## Call:
## lm(formula = formula(formula_str), data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -69024 -21883  -3160  16666 199829 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  76763.8      470.7 163.076  < 2e-16 ***
## genderM       3407.5      613.3   5.556 2.82e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30110 on 9956 degrees of freedom
## Multiple R-squared:  0.003091,   Adjusted R-squared:  0.002991 
## F-statistic: 30.87 on 1 and 9956 DF,  p-value: 2.825e-08

According to these results, gender does contribute to salary amount (p-value < 0.05).

Although the coefficient for the predictor gender is significant in the model, gender explains only around 0.3% of the variance in salary.

# Draw the default plot for the salary-by-gender relation object.
plot(x = cat_categor)

# Box plot of base salary by gender.
# The fill scale is declared exactly once. Adding scale_fill_discrete() and
# then scale_fill_brewer() made ggplot2 warn and keep only the last scale, which
# silently dropped the legend title and the "Female"/"Male" labels. The labels
# follow the factor level order of gender (F, then M).
plot1 <- salaries1 %>%
  ggplot(aes(gender, base_salary, fill = gender)) +
  geom_boxplot() +
  scale_fill_brewer(
    palette = "Set1",
    name = "Gender",
    labels = c("Female", "Male")
  ) +
  ggtitle("Distribution of MoCo Employees Salary by Gender") +
  xlab("Gender") +
  ylab("Salary") +
  theme_minimal()

plot1

It appears that the average salaries of the two genders are slightly different. Is this actually true, or could it have happened just by chance in our sample of data?

This is going to be an ANOVA test and we will use α=0.05.

$H_0: \mu_F = \mu_M$ (In other words, there is no relationship between salary and gender)

Ha: Not all μ are equal. (There is a relationship between salary and gender)

# One-way ANOVA of base salary on gender.
results <- aov(
  salaries1$base_salary ~ salaries1$gender
)

# F statistic and p-value for the gender term.
summary(results)

##                    Df    Sum Sq   Mean Sq F value   Pr(>F)    
## salaries1$gender    1 2.799e+10 2.799e+10   30.87 2.82e-08 ***
## Residuals        9956 9.025e+12 9.065e+08                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

P-value: 0.0000000282 < 0.05=α.

Conclusion: Reject Ho.

It appears that women have a lower salary than men. Is this true? Or did this just happen by chance with our sample?

This is going to be a two sample t-test. We will test at the 95% confidence level.

$H_0: \mu_F = \mu_M$ (In other words, the mean salaries of women and men are the same) $H_a: \mu_F < \mu_M$ (In other words, the mean salary of women is less than the mean salary of men)

# Split the workforce by gender.
women <- filter(salaries1, gender == "F")
men <- filter(salaries1, gender == "M")

# One-sided two-sample t-test: alternative = "less" asks whether the mean of the
# first sample (women) is lower than the mean of the second (men).
t.test(
  women$base_salary,
  men$base_salary,
  alternative = "less",
  conf.level = 0.95
)

## 
##  Welch Two Sample t-test
## 
## data:  women$base_salary and men$base_salary
## t = -5.509, df = 8526.6, p-value = 1.857e-08
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##       -Inf -2390.013
## sample estimates:
## mean of x mean of y 
##  76763.83  80171.37

P-value: 0.00000001857 < 0.05 = α.

Conclusion: Reject Ho.

What about the salary distribution in the departments with the biggest and smallest salaries?

# List individual salaries from lowest to highest next to their department.
# NOTE(review): arrange() ignores the grouping unless .by_group = TRUE, so this
# ranks single employees rather than departments; the grouping only shows up in
# the printed header. Summarise per department first if a department ranking is
# what is wanted.
salaries_big_sal <- select(salaries1, base_salary, department_name) %>%
  group_by(department_name) %>%
  # arrange(-base_salary)
  arrange(base_salary)

salaries_big_sal

## # A tibble: 9,958 x 2
## # Groups:   department_name [40]
##    base_salary department_name                  
##          <dbl> <chr>                            
##  1      11147. Office of Agriculture            
##  2      16004  Department of Public Libraries   
##  3      16004  Department of Public Libraries   
##  4      17139. Department of Permitting Services
##  5      17139. Department of Public Libraries   
##  6      17139. Department of Public Libraries   
##  7      17158  Department of Police             
##  8      17158  Department of Police             
##  9      17158  Department of Police             
## 10      17158  Department of Police             
## # ... with 9,948 more rows

For departments with the biggest salaries -

$H_0: \mu_F = \mu_M$ (In other words, the mean salaries of women and men are the same) $H_a: \mu_F < \mu_M$ (In other words, the mean salary of women is less than the mean salary of men)

# Women and men working in the Offices of the County Executive.
women1 <- filter(
  salaries1,
  gender == "F",
  department_name == "Offices of the County Executive"
)
men1 <- filter(
  salaries1,
  gender == "M",
  department_name == "Offices of the County Executive"
)

# One-sided test: is the mean salary of women lower than that of men?
t.test(
  women1$base_salary,
  men1$base_salary,
  alternative = "less",
  conf.level = 0.95
)

## 
##  Welch Two Sample t-test
## 
## data:  women1$base_salary and men1$base_salary
## t = -3.2818, df = 20.243, p-value = 0.001843
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##       -Inf -29368.12
## sample estimates:
## mean of x mean of y 
##  98387.55 160245.42

P-value: 0.001843 < 0.05 = α.

Conclusion: Reject Ho.

$H_0: \mu_F = \mu_M$ (In other words, the mean salaries of women and men are the same) $H_a: \mu_F < \mu_M$ (In other words, the mean salary of women is less than the mean salary of men)

# Women and men working in the Department of Police.
women2 <- filter(
  salaries1,
  gender == "F",
  department_name == "Department of Police"
)
men2 <- filter(
  salaries1,
  gender == "M",
  department_name == "Department of Police"
)

# One-sided test: is the mean salary of women lower than that of men?
t.test(
  women2$base_salary,
  men2$base_salary,
  alternative = "less",
  conf.level = 0.95
)

## 
##  Welch Two Sample t-test
## 
## data:  women2$base_salary and men2$base_salary
## t = -14.716, df = 1264.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##    -Inf -17058
## sample estimates:
## mean of x mean of y 
##  65056.39  84262.67

P-value < 2.2e-16, which is far below 0.05 = α.

Conclusion: Reject Ho.

$H_0: \mu_F = \mu_M$ (In other words, the mean salaries of women and men are the same) $H_a: \mu_F < \mu_M$ (In other words, the mean salary of women is less than the mean salary of men)

# Women and men working in the Department of Health and Human Services.
women3 <- filter(
  salaries1,
  gender == "F",
  department_name == "Department of Health and Human Services"
)
men3 <- filter(
  salaries1,
  gender == "M",
  department_name == "Department of Health and Human Services"
)

# One-sided test: is the mean salary of women lower than that of men?
t.test(
  women3$base_salary,
  men3$base_salary,
  alternative = "less",
  conf.level = 0.95
)

## 
##  Welch Two Sample t-test
## 
## data:  women3$base_salary and men3$base_salary
## t = -5.0274, df = 332.14, p-value = 4.074e-07
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##       -Inf -7148.714
## sample estimates:
## mean of x mean of y 
##  78458.04  89097.52

P-value: 0.0000004074 < 0.05 = α.

Conclusion: Reject Ho.

For departments with the smallest salaries -

$H_0: \mu_F = \mu_M$ (In other words, the mean salaries of women and men are the same) $H_a: \mu_F < \mu_M$ (In other words, the mean salary of women is less than the mean salary of men)

# Women and men working in the Office of Agriculture.
women4 <- filter(
  salaries1,
  gender == "F",
  department_name == "Office of Agriculture"
)
men4 <- filter(
  salaries1,
  gender == "M",
  department_name == "Office of Agriculture"
)

# One-sided test: is the mean salary of women lower than that of men?
t.test(
  women4$base_salary,
  men4$base_salary,
  alternative = "less",
  conf.level = 0.95
)

## 
##  Welch Two Sample t-test
## 
## data:  women4$base_salary and men4$base_salary
## t = -0.93743, df = 7.3873, p-value = 0.1891
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##      -Inf 24622.31
## sample estimates:
## mean of x mean of y 
##  65720.13  90214.90

P-value: 0.1891 > 0.05 = α.

Conclusion: Fail to reject Ho.

# Office of Agriculture again, with the samples passed in the opposite order.
men7 <- filter(
  salaries1,
  gender == "M",
  department_name == "Office of Agriculture"
)
women7 <- filter(
  salaries1,
  gender == "F",
  department_name == "Office of Agriculture"
)

# Men are the first sample here, so alternative = "less" now asks whether the
# mean salary of men is lower than that of women.
t.test(
  men7$base_salary,
  women7$base_salary,
  alternative = "less",
  conf.level = 0.95
)

## 
##  Welch Two Sample t-test
## 
## data:  men7$base_salary and women7$base_salary
## t = 0.93743, df = 7.3873, p-value = 0.8109
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##      -Inf 73611.85
## sample estimates:
## mean of x mean of y 
##  90214.90  65720.13

# Women and men working in the Department of Public Libraries.
women5 <- filter(
  salaries1,
  gender == "F",
  department_name == "Department of Public Libraries"
)
men5 <- filter(
  salaries1,
  gender == "M",
  department_name == "Department of Public Libraries"
)

# One-sided test: is the mean salary of women lower than that of men?
t.test(
  women5$base_salary,
  men5$base_salary,
  alternative = "less",
  conf.level = 0.95
)

## 
##  Welch Two Sample t-test
## 
## data:  women5$base_salary and men5$base_salary
## t = -1.9766, df = 138.45, p-value = 0.02504
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##       -Inf -1194.679
## sample estimates:
## mean of x mean of y 
##  55333.15  62697.46

P-value: 0.02504 < 0.05 = α.

Conclusion: Reject Ho.

# Women and men working in the Department of Permitting Services.
women6 <- filter(
  salaries1,
  gender == "F",
  department_name == "Department of Permitting Services"
)
men6 <- filter(
  salaries1,
  gender == "M",
  department_name == "Department of Permitting Services"
)

# One-sided test: is the mean salary of women lower than that of men?
t.test(
  women6$base_salary,
  men6$base_salary,
  alternative = "less",
  conf.level = 0.95
)

## 
##  Welch Two Sample t-test
## 
## data:  women6$base_salary and men6$base_salary
## t = -1.3401, df = 98.714, p-value = 0.09165
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##      -Inf 1327.588
## sample estimates:
## mean of x mean of y 
##  93530.15  99083.87

P-value: 0.09165 > 0.05 = α.

Conclusion: Fail to reject Ho.

# Minimum, quartiles, mean and maximum of base salary over all employees.
summary(salaries1[["base_salary"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   11147   56994   75290   78771   94668  280000

library(plyr)

## ------------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## ------------------------------------------------------------------------------

## 
## Attaching package: 'plyr'

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following object is masked from 'package:purrr':
## 
##     compact

# Mean base salary per gender, kept as a plain data.frame with the columns
# `gender` and `base_salary.mean`.
# The aggregation is done with dplyr, called through its namespace on purpose:
# plyr is attached after dplyr in this script and masks summarise(), and the
# plyr version does not honour group_by(). Using dplyr here also removes the
# need for ddply(), the only plyr call visible in this script.
salaries2 <- salaries1 %>%
  dplyr::group_by(gender) %>%
  dplyr::summarise(base_salary.mean = mean(base_salary)) %>%
  as.data.frame()

salaries2

##   gender base_salary.mean
## 1      F         76763.83
## 2      M         80171.37

The salaries are different, but gender variable only explains around 0.3% of the variance.

Which other variables in the dataset affect salary and how much?

Let’s take a brief look at salary regressed upon the variables in simple linear regression models

# Simple linear regression of base salary on department name.
mod_dep <- lm(
  formula = base_salary ~ department_name,
  data = salaries1
)

# Coefficients, significance and R-squared of the department model.
summary(mod_dep)

According to these results, the name of department does contribute to salary amount as expected (p-value < 0.05)

The amount of variance in salary explained by department name in this model is around 18.99%

# Simple linear regression of base salary on pay grade.
mod_grade <- lm(
  formula = base_salary ~ grade,
  data = salaries1
)

# Coefficients, significance and R-squared of the grade model.
summary(mod_grade)

## 
## Call:
## lm(formula = base_salary ~ grade, data = salaries1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -158643   -7026     998    8638  110210 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   169790       1366 124.283  < 2e-16 ***
## grade10      -142468       1576 -90.408  < 2e-16 ***
## grade11      -122245       2816 -43.404  < 2e-16 ***
## grade12      -134124       1596 -84.047  < 2e-16 ***
## grade13      -120906       1557 -77.655  < 2e-16 ***
## grade14      -117802       1592 -74.001  < 2e-16 ***
## grade15      -116695       1413 -82.603  < 2e-16 ***
## grade16      -105927       1479 -71.639  < 2e-16 ***
## grade17      -104023       1664 -62.495  < 2e-16 ***
## grade18      -104146       1481 -70.318  < 2e-16 ***
## grade19       -97077       1548 -62.693  < 2e-16 ***
## grade20       -96423       1526 -63.169  < 2e-16 ***
## grade21       -92076       1463 -62.934  < 2e-16 ***
## grade22       -83515       1738 -48.044  < 2e-16 ***
## grade23       -83023       1473 -56.348  < 2e-16 ***
## grade24       -79517       1455 -54.635  < 2e-16 ***
## grade25       -66575       1497 -44.476  < 2e-16 ***
## grade26       -63510       1615 -39.318  < 2e-16 ***
## grade27       -57422       2431 -23.620  < 2e-16 ***
## grade28       -49064       1661 -29.536  < 2e-16 ***
## grade29       -38872       5566  -6.984 3.06e-12 ***
## grade30       -35508       3402 -10.438  < 2e-16 ***
## grade31       -24668      12143  -2.032 0.042230 *  
## grade32       -29812       2228 -13.381  < 2e-16 ***
## grade33       -33227       8640  -3.846 0.000121 ***
## grade34       -10315       7099  -1.453 0.146232    
## grade35       -29790      12143  -2.453 0.014171 *  
## grade38        -7083      12143  -0.583 0.559719    
## grade39         1560       6186   0.252 0.800842    
## grade40         7366       8640   0.853 0.393922    
## grade5       -139707       3230 -43.259  < 2e-16 ***
## grade7       -133548       8640 -15.456  < 2e-16 ***
## grade8       -135774       2275 -59.685  < 2e-16 ***
## grade9       -123716       2431 -50.889  < 2e-16 ***
## gradeA1       -62927       1690 -37.231  < 2e-16 ***
## gradeA2       -32641       2387 -13.675  < 2e-16 ***
## gradeA3       -11437       2913  -3.927 8.67e-05 ***
## gradeB1       -70243       1744 -40.285  < 2e-16 ***
## gradeB2       -52769       1684 -31.330  < 2e-16 ***
## gradeB2P     -111073      12143  -9.147  < 2e-16 ***
## gradeB3       -35731       2773 -12.885  < 2e-16 ***
## gradeB4       -21077       3311  -6.365 2.04e-10 ***
## gradeB6        -1173       7099  -0.165 0.868747    
## gradeC1       -62794       2913 -21.559  < 2e-16 ***
## gradeC2       -51376       6186  -8.306  < 2e-16 ***
## gradeC3      -121919       2732 -44.621  < 2e-16 ***
## gradeC4      -112096       2002 -55.997  < 2e-16 ***
## gradeC5       -96892       1663 -58.269  < 2e-16 ***
## gradeC6       -77762       2228 -34.903  < 2e-16 ***
## gradeD1       -68710       2533 -27.127  < 2e-16 ***
## gradeD2       -55433       3741 -14.816  < 2e-16 ***
## gradeD3       -35514       5566  -6.380 1.85e-10 ***
## gradeD4       -16009       8640  -1.853 0.063942 .  
## gradeF1      -121571       3502 -34.714  < 2e-16 ***
## gradeF2      -112481       1548 -72.641  < 2e-16 ***
## gradeF3       -97410       1481 -65.793  < 2e-16 ***
## gradeF4       -79055       1592 -49.660  < 2e-16 ***
## gradeG1      -118800       5112 -23.241  < 2e-16 ***
## gradeG2      -117669       3087 -38.120  < 2e-16 ***
## gradeG3      -110114       2773 -39.709  < 2e-16 ***
## gradeG4       -87764       2002 -43.842  < 2e-16 ***
## gradeM1        11790       2732   4.315 1.61e-05 ***
## gradeM2       -12548       1726  -7.272 3.82e-13 ***
## gradeM3       -39647       1546 -25.645  < 2e-16 ***
## gradeMD2      -11047      12143  -0.910 0.362969    
## gradeMD3       -1826       6186  -0.295 0.767858    
## gradeMD4      -10397       6186  -1.681 0.092832 .  
## gradeP1      -116404       2592 -44.907  < 2e-16 ***
## gradeP2      -115532       1920 -60.175  < 2e-16 ***
## gradeP3      -107239       1789 -59.930  < 2e-16 ***
## gradeP4       -84722       1435 -59.051  < 2e-16 ***
## gradeP4P     -127041       4761 -26.686  < 2e-16 ***
## gradeP5       -72916       1979 -36.843  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12070 on 9885 degrees of freedom
## Multiple R-squared:  0.841,  Adjusted R-squared:  0.8399 
## F-statistic: 726.4 on 72 and 9885 DF,  p-value: < 2.2e-16

According to these results, grade does contribute to salary amount as expected (p-value < 0.05)

The amount of variance in salary explained by grade in this model is around 83.99%

Checking all variables for collinearity

library(DataExplorer)
# Correlation heatmap of salaries1; as the printed notice shows, features with
# more than 20 categories are left out.
plot_correlation(data = salaries1)

## 4 features with more than 20 categories ignored!
## department: 40 categories
## department_name: 40 categories
## division: 605 categories
## grade: 73 categories

Build model with all meaningful variables

# Multiple linear regression of base salary on department name and pay grade.
model_full <- lm(
  formula = base_salary ~ department_name + grade,
  data = salaries1
)

# Coefficients, significance and adjusted R-squared of the combined model.
summary(model_full)

The combination of two variables (department_name + grade) has the biggest adjusted R-squared value - 87%

# Standard regression diagnostic plots for the combined model.
plot(x = model_full)

## Warning: not plotting observations with leverage one:
##   4032, 6719, 7374, 9953

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

# Average base salary per department, with women and men side by side.
# - position = "dodge": with the default stacking the female and male averages
#   were piled on top of each other, so each bar's length was the sum of two
#   means and could not be read as a salary.
# - dplyr:: prefixes: plyr is attached after dplyr earlier in the script and
#   masks summarise()/summarize().
# - na.rm = TRUE instead of T, which is an ordinary variable that can be
#   reassigned.
# Departments are ordered by the mean of their per-gender averages.
salaries1 %>%
  dplyr::group_by(department_name, gender) %>%
  dplyr::summarize(avg = mean(base_salary, na.rm = TRUE)) %>%
  ggplot(aes(x = reorder(department_name, avg), y = avg, fill = gender)) +
  geom_col(position = "dodge") +
  coord_flip() +
  scale_fill_brewer(palette = "Set1") +
  theme_minimal() +
  theme(legend.position = "bottom") + # set position of the legend
  labs(
    title = "Distribution of MoCo Employees Salary by Departments in 2020",
    x = "Departments",
    y = "Salary (in dollars)",
    fill = "Gender"
  )

## `summarise()` has grouped output by 'department_name'. You can override using the `.groups` argument.

# Keep only the male employees.
salaries_m <- filter(salaries1, gender == "M")

# Horizontal bar chart of the average male base salary per department, with
# departments ordered by that average.
salaries_m %>%
  dplyr::group_by(department_name, gender) %>%
  dplyr::summarize(avg = mean(base_salary, na.rm = TRUE)) %>%
  ggplot(aes(x = reorder(department_name, avg), y = avg, fill = gender)) +
  geom_col() +
  coord_flip() +
  theme_minimal()

## `summarise()` has grouped output by 'department_name'. You can override using the `.groups` argument.

# Histogram of base salary by gender, with a dashed vertical line at each
# gender's mean salary (read from salaries2).
# The original passed options(scipen = 999) as an extra argument to ggplot().
# ggplot() does not use that argument, so it had no dependable effect on the
# axis labels and, if evaluated, would change a global option as a hidden side
# effect. The x-axis labels are now formatted explicitly without scientific
# notation, which is local to this plot.
salaries_hist1 <- salaries1 %>%
  ggplot(aes(x = base_salary, fill = gender)) +
  geom_histogram(position = "dodge", bins = 25) +
  # scale_fill_discrete(name = "Gender", labels = c("Female", "Male")) +
  geom_vline(
    data = salaries2,
    aes(xintercept = base_salary.mean, colour = gender),
    linetype = "dashed",
    size = 1
  ) +
  scale_x_continuous(
    labels = function(x) format(x, scientific = FALSE, trim = TRUE)
  ) +
  scale_fill_brewer(palette = "Set1") +
  labs(
    x = "Salary (in dollars)",
    y = "Frequency",
    title = "MoCo Employees Salary Distribution"
  ) +
  theme_minimal()

salaries_hist1

library(treemap)
# Treemap with one tile per department; tile area and tile colour are both
# driven by base_salary.
treemap(
  salaries1,
  index = "department_name",
  vSize = "base_salary", # set size
  vColor = "base_salary", # set color
  type = "manual",
  palette = "RdYlBu",
  title = "Treemap of Salary by Departments"
)