HR2_personnel

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

#library(httr)
#library(jsonlite)
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(readr)   
library(dplyr)
#library(psych)
#library(dlookr)
library(Hmisc)

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:dplyr':
## 
##     src, summarize

## The following objects are masked from 'package:base':
## 
##     format.pval, units

library(corrplot)

## corrplot 0.92 loaded

personnel <- read.csv("PMR_MLS__Management__Report.csv", header = TRUE, fileEncoding="UTF-8-BOM")

str(personnel)

## 'data.frame':    407 obs. of  9 variables:
##  $ Generation         : chr  "Baby Boomers" "Generation X" "Baby Boomers" "Baby Boomers" ...
##  $ Age                : int  56 47 59 56 53 63 51 51 64 53 ...
##  $ Ethnic.Origin      : chr  "Unreported" "Unreported" "White (Not Hispanic or Latino)" "Black or African American (Not Hispanic or Latino)" ...
##  $ Gender             : chr  "Male" "Male" "Female" "Male" ...
##  $ Length.Of.Service  : int  6 1 14 27 24 29 32 2 31 12 ...
##  $ Job.Class          : chr  "MLS" "MLS" "MLS" "MLS" ...
##  $ Grade              : chr  "M2" "M2" "M1" "M3" ...
##  $ Assignment.Category: chr  "Fulltime-Regular" "Fulltime-Regular" "Fulltime-Regular" "Fulltime-Regular" ...
##  $ Salary.Range       : chr  "130-139K" ">=150K" ">=150K" "140-149K" ...

library(janitor)

## 
## Attaching package: 'janitor'

## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

personnel1 <- personnel %>%
  clean_names()
#personnel1

personnel1$gender <- as.factor(personnel1$gender) 
personnel1$length_of_service <- as.numeric(personnel1$length_of_service) 
personnel1$age <- as.numeric(personnel1$age) 

class(personnel1$gender)

## [1] "factor"

class(personnel1$length_of_service)

## [1] "numeric"

class(personnel1$age)

## [1] "numeric"

str(personnel1)

## 'data.frame':    407 obs. of  9 variables:
##  $ generation         : chr  "Baby Boomers" "Generation X" "Baby Boomers" "Baby Boomers" ...
##  $ age                : num  56 47 59 56 53 63 51 51 64 53 ...
##  $ ethnic_origin      : chr  "Unreported" "Unreported" "White (Not Hispanic or Latino)" "Black or African American (Not Hispanic or Latino)" ...
##  $ gender             : Factor w/ 2 levels "Female","Male": 2 2 1 2 1 2 1 2 2 1 ...
##  $ length_of_service  : num  6 1 14 27 24 29 32 2 31 12 ...
##  $ job_class          : chr  "MLS" "MLS" "MLS" "MLS" ...
##  $ grade              : chr  "M2" "M2" "M1" "M3" ...
##  $ assignment_category: chr  "Fulltime-Regular" "Fulltime-Regular" "Fulltime-Regular" "Fulltime-Regular" ...
##  $ salary_range       : chr  "130-139K" ">=150K" ">=150K" "140-149K" ...

library(dlookr)

## Imported Arial Narrow fonts.

## 
## Attaching package: 'dlookr'

## The following object is masked from 'package:Hmisc':
## 
##     describe

## The following object is masked from 'package:tidyr':
## 
##     extract

## The following object is masked from 'package:base':
## 
##     transform

personnel1 %>%
group_by(gender) %>% 
describe(length_of_service, age)

## # A tibble: 4 x 27
##   variable  gender     n    na  mean    sd se_mean   IQR skewness kurtosis   p00
##   <chr>     <fct>  <int> <int> <dbl> <dbl>   <dbl> <dbl>    <dbl>    <dbl> <dbl>
## 1 age       Female   193     0  55.3  8.53   0.614    13   -0.502   -0.394    31
## 2 age       Male     214     0  55.5  9.54   0.652    13   -0.349   -0.354    31
## 3 length_o~ Female   193     0  18.1 11.2    0.809    19    0.316   -0.843     0
## 4 length_o~ Male     214     0  15.5 10.3    0.706    16    0.491   -0.518     0
## # ... with 16 more variables: p01 <dbl>, p05 <dbl>, p10 <dbl>, p20 <dbl>,
## #   p25 <dbl>, p30 <dbl>, p40 <dbl>, p50 <dbl>, p60 <dbl>, p70 <dbl>,
## #   p75 <dbl>, p80 <dbl>, p90 <dbl>, p95 <dbl>, p99 <dbl>, p100 <dbl>

personnel1 %>%
group_by(gender) %>% 
normality(length_of_service, age)

## # A tibble: 4 x 5
##   variable          gender statistic    p_value sample
##   <chr>             <fct>      <dbl>      <dbl>  <dbl>
## 1 length_of_service Female     0.955 0.00000846    193
## 2 length_of_service Male       0.956 0.00000334    214
## 3 age               Female     0.969 0.000265      193
## 4 age               Male       0.983 0.0105        214

personnel1 %>%
  normality(length_of_service, age) %>%
  filter(p_value <= 0.01)

## # A tibble: 2 x 4
##   vars              statistic       p_value sample
##   <chr>                 <dbl>         <dbl>  <dbl>
## 1 length_of_service     0.956 0.00000000111    407
## 2 age                   0.979 0.0000158        407

personnel1 %>%
plot_normality(length_of_service, age)

salary = c("80-89K", "90-99K", "100-109K", "110-119K", "120-129K", "130-139K" , "140-149K" , ">=150K")

personnel1$salary_range <- factor(personnel1$salary_range, levels=salary, ordered = TRUE)

personnel1$salary_range

##   [1] 130-139K >=150K   >=150K   140-149K 140-149K 140-149K 110-119K >=150K  
##   [9] 140-149K 140-149K 90-99K   100-109K 130-139K 130-139K 140-149K 110-119K
##  [17] >=150K   >=150K   140-149K 140-149K >=150K   >=150K   140-149K 140-149K
##  [25] 120-129K 120-129K 140-149K 140-149K 130-139K 120-129K >=150K   130-139K
##  [33] 140-149K 140-149K 140-149K >=150K   >=150K   >=150K   130-139K >=150K  
##  [41] 110-119K 120-129K 130-139K 140-149K >=150K   120-129K 140-149K 120-129K
##  [49] >=150K   140-149K 140-149K 130-139K 140-149K >=150K   120-129K 140-149K
##  [57] 100-109K 130-139K 130-139K >=150K   120-129K 120-129K >=150K   >=150K  
##  [65] 130-139K 130-139K >=150K   110-119K 140-149K 140-149K 120-129K 140-149K
##  [73] 120-129K 120-129K >=150K   110-119K >=150K   120-129K 120-129K 130-139K
##  [81] 130-139K 110-119K 140-149K 120-129K 140-149K 120-129K 130-139K 100-109K
##  [89] 100-109K 120-129K 110-119K 140-149K 140-149K 80-89K   >=150K   >=150K  
##  [97] >=150K   >=150K   130-139K >=150K   >=150K   140-149K 130-139K >=150K  
## [105] 140-149K >=150K   >=150K   80-89K   120-129K 130-139K 140-149K 140-149K
## [113] 110-119K 130-139K 120-129K 110-119K 100-109K 90-99K   110-119K 120-129K
## [121] 130-139K 140-149K 140-149K 140-149K 140-149K >=150K   >=150K   90-99K  
## [129] 140-149K 120-129K 130-139K 140-149K >=150K   130-139K 110-119K 90-99K  
## [137] >=150K   120-129K 110-119K 90-99K   >=150K   >=150K   140-149K 140-149K
## [145] 140-149K 110-119K >=150K   >=150K   >=150K   >=150K   100-109K 120-129K
## [153] 140-149K 130-139K 120-129K >=150K   100-109K 120-129K >=150K   140-149K
## [161] 140-149K >=150K   >=150K   120-129K >=150K   110-119K 120-129K 130-139K
## [169] >=150K   >=150K   >=150K   140-149K 140-149K 130-139K >=150K   130-139K
## [177] >=150K   >=150K   140-149K >=150K   120-129K 120-129K 120-129K 140-149K
## [185] 90-99K   >=150K   130-139K 130-139K 110-119K 130-139K 140-149K 100-109K
## [193] >=150K   140-149K >=150K   140-149K 140-149K 80-89K   140-149K 140-149K
## [201] 140-149K 110-119K >=150K   140-149K 130-139K >=150K   140-149K 140-149K
## [209] >=150K   >=150K   140-149K 130-139K >=150K   130-139K 140-149K 140-149K
## [217] 120-129K 80-89K   >=150K   140-149K 100-109K >=150K   110-119K 140-149K
## [225] 110-119K >=150K   >=150K   120-129K >=150K   >=150K   120-129K >=150K  
## [233] >=150K   100-109K 100-109K 130-139K 130-139K 100-109K >=150K   140-149K
## [241] 120-129K >=150K   >=150K   >=150K   140-149K 100-109K >=150K   140-149K
## [249] 130-139K 140-149K 120-129K 140-149K >=150K   >=150K   120-129K 120-129K
## [257] 130-139K 140-149K >=150K   120-129K >=150K   130-139K 130-139K 140-149K
## [265] 120-129K >=150K   >=150K   >=150K   140-149K 110-119K >=150K   >=150K  
## [273] >=150K   100-109K 120-129K 140-149K 110-119K 110-119K >=150K   120-129K
## [281] 130-139K 140-149K >=150K   140-149K 110-119K 110-119K >=150K   130-139K
## [289] 140-149K 140-149K 120-129K 140-149K 110-119K >=150K   140-149K 140-149K
## [297] 120-129K 140-149K >=150K   >=150K   140-149K 130-139K 140-149K >=150K  
## [305] 140-149K >=150K   140-149K 120-129K 100-109K 140-149K 120-129K 140-149K
## [313] >=150K   120-129K >=150K   >=150K   >=150K   120-129K >=150K   >=150K  
## [321] >=150K   140-149K >=150K   >=150K   110-119K 130-139K 140-149K 120-129K
## [329] 120-129K >=150K   >=150K   140-149K >=150K   140-149K >=150K   110-119K
## [337] 140-149K 140-149K >=150K   140-149K >=150K   140-149K >=150K   >=150K  
## [345] 130-139K 140-149K >=150K   140-149K >=150K   120-129K 110-119K >=150K  
## [353] 140-149K 100-109K 90-99K   110-119K 120-129K <NA>     >=150K   140-149K
## [361] 110-119K 130-139K >=150K   110-119K >=150K   90-99K   140-149K 120-129K
## [369] 120-129K 130-139K 140-149K >=150K   130-139K 140-149K 110-119K 120-129K
## [377] 140-149K >=150K   140-149K 120-129K 110-119K 130-139K 120-129K 130-139K
## [385] >=150K   140-149K 140-149K 130-139K 140-149K 90-99K   >=150K   100-109K
## [393] >=150K   120-129K >=150K   >=150K   140-149K 130-139K >=150K   >=150K  
## [401] 130-139K >=150K   >=150K   140-149K >=150K   140-149K 130-139K
## 8 Levels: 80-89K < 90-99K < 100-109K < 110-119K < 120-129K < ... < >=150K

#personnel1$salary_range <- as.factor(personnel1$salary_range) 

class(personnel1$salary_range)

## [1] "ordered" "factor"

table(personnel1$salary_range)

## 
##   80-89K   90-99K 100-109K 110-119K 120-129K 130-139K 140-149K   >=150K 
##        4        9       17       32       57       52      109      126

type_counts <- table(personnel1$salary_range)
type_counts / sum(type_counts)

## 
##      80-89K      90-99K    100-109K    110-119K    120-129K    130-139K 
## 0.009852217 0.022167488 0.041871921 0.078817734 0.140394089 0.128078818 
##    140-149K      >=150K 
## 0.268472906 0.310344828

target_salary <- target_by(personnel1, salary_range)

salary_gender <- relate(target_salary, generation)
salary_gender

##             generation
## salary_range Baby Boomers Generation X Millennial Generation
##     80-89K              1            2                     1
##     90-99K              3            5                     1
##     100-109K            5            7                     4
##     110-119K            7           18                     7
##     120-129K           33           23                     1
##     130-139K           33           19                     0
##     140-149K           76           33                     0
##     >=150K             84           37                     4
##     <NA>                1            0                     0
##             generation
## salary_range Traditionalist/Silent Generation
##     80-89K                                  0
##     90-99K                                  0
##     100-109K                                1
##     110-119K                                0
##     120-129K                                0
##     130-139K                                0
##     140-149K                                0
##     >=150K                                  1
##     <NA>                                    0

summary(salary_gender)

## Call: xtabs(formula = formula_str, data = data, addNA = TRUE)
## Number of cases in table: 407 
## Number of factors: 2 
## Test for independence of all factors:
##  Chisq = 84.58, df = 24, p-value = 1.114e-08
##  Chi-squared approximation may be incorrect

salary_age <- relate(target_salary, age)
salary_age

## # A tibble: 10 x 27
##    variable salary_range     n    na  mean    sd se_mean   IQR skewness kurtosis
##    <chr>    <ord>        <int> <int> <dbl> <dbl>   <dbl> <dbl>    <dbl>    <dbl>
##  1 age      80-89K           4     0  49   17.8    8.90  20     0.671   -0.482  
##  2 age      90-99K           9     0  49   11.3    3.77  15    -0.00467 -0.903  
##  3 age      100-109K        17     0  49.8 13.1    3.19  20     0.746   -0.0199 
##  4 age      110-119K        32     0  47.5  9.36   1.66  13.5   0.307   -0.733  
##  5 age      120-129K        57     0  54.9  8.70   1.15  14    -0.131   -0.602  
##  6 age      130-139K        52     0  55.9  7.63   1.06   9.25 -0.238   -0.611  
##  7 age      140-149K       109     0  57.2  7.07   0.677  9    -0.548    0.00788
##  8 age      >=150K         126     0  57.2  8.66   0.771 11.8  -0.482   -0.334  
##  9 age      <NA>             1     0  64   NA     NA      0    NA       NA      
## 10 age      total          407     0  55.4  9.06   0.449 13    -0.406   -0.347  
## # ... with 17 more variables: p00 <dbl>, p01 <dbl>, p05 <dbl>, p10 <dbl>,
## #   p20 <dbl>, p25 <dbl>, p30 <dbl>, p40 <dbl>, p50 <dbl>, p60 <dbl>,
## #   p70 <dbl>, p75 <dbl>, p80 <dbl>, p90 <dbl>, p95 <dbl>, p99 <dbl>,
## #   p100 <dbl>

summary(salary_age)

##    variable           salary_range       n               na         mean      
##  Length:10          80-89K  :1     Min.   :  1.0   Min.   :0   Min.   :47.50  
##  Class :character   90-99K  :1     1st Qu.: 11.0   1st Qu.:0   1st Qu.:49.21  
##  Mode  :character   100-109K:1     Median : 42.0   Median :0   Median :55.14  
##                     110-119K:1     Mean   : 81.4   Mean   :0   Mean   :53.99  
##                     120-129K:1     3rd Qu.: 96.0   3rd Qu.:0   3rd Qu.:56.86  
##                     (Other) :4     Max.   :407.0   Max.   :0   Max.   :64.00  
##                     NA's    :1                                                
##        sd            se_mean            IQR            skewness       
##  Min.   : 7.066   Min.   :0.4492   Min.   : 0.000   Min.   :-0.54757  
##  1st Qu.: 8.659   1st Qu.:0.7714   1st Qu.: 9.875   1st Qu.:-0.40621  
##  Median : 9.063   Median :1.1525   Median :13.250   Median :-0.13058  
##  Mean   :10.302   Mean   :2.4018   Mean   :12.550   Mean   :-0.00938  
##  3rd Qu.:11.303   3rd Qu.:3.1880   3rd Qu.:14.750   3rd Qu.: 0.30667  
##  Max.   :17.795   Max.   :8.8976   Max.   :20.000   Max.   : 0.74637  
##  NA's   :1        NA's   :1                         NA's   :1         
##     kurtosis              p00             p01             p05       
##  Min.   :-0.902555   Min.   :31.00   Min.   :31.27   Min.   :32.35  
##  1st Qu.:-0.611234   1st Qu.:31.25   1st Qu.:33.07   1st Qu.:35.44  
##  Median :-0.481695   Median :33.50   Median :35.16   Median :39.65  
##  Mean   :-0.447106   Mean   :37.10   Mean   :38.21   Mean   :40.86  
##  3rd Qu.:-0.334441   3rd Qu.:37.50   3rd Qu.:39.34   3rd Qu.:41.95  
##  Max.   : 0.007879   Max.   :64.00   Max.   :64.00   Max.   :64.00  
##  NA's   :1                                                          
##       p10             p20             p25             p30       
##  Min.   :33.70   Min.   :36.40   Min.   :37.75   Min.   :39.10  
##  1st Qu.:36.55   1st Qu.:38.80   1st Qu.:39.88   1st Qu.:41.85  
##  Median :42.80   Median :46.60   Median :48.00   Median :50.40  
##  Mean   :43.23   Mean   :45.98   Mean   :47.40   Mean   :48.73  
##  3rd Qu.:45.75   3rd Qu.:48.85   3rd Qu.:51.94   3rd Qu.:53.20  
##  Max.   :64.00   Max.   :64.00   Max.   :64.00   Max.   :64.00  
##                                                                 
##       p40             p50             p60             p70       
##  Min.   :42.60   Min.   :46.50   Min.   :49.00   Min.   :51.70  
##  1st Qu.:44.45   1st Qu.:48.25   1st Qu.:51.00   1st Qu.:55.12  
##  Median :53.70   Median :56.50   Median :57.50   Median :59.50  
##  Mean   :51.48   Mean   :54.10   Mean   :56.12   Mean   :58.52  
##  3rd Qu.:55.85   3rd Qu.:57.75   3rd Qu.:59.75   3rd Qu.:61.00  
##  Max.   :64.00   Max.   :64.00   Max.   :64.00   Max.   :64.00  
##                                                                 
##       p75             p80             p90             p95       
##  Min.   :53.00   Min.   :56.20   Min.   :58.90   Min.   :64.00  
##  1st Qu.:58.06   1st Qu.:60.60   1st Qu.:64.45   1st Qu.:64.99  
##  Median :61.00   Median :62.90   Median :66.00   Median :67.40  
##  Mean   :59.95   Mean   :61.84   Mean   :64.99   Mean   :66.95  
##  3rd Qu.:62.00   3rd Qu.:63.85   3rd Qu.:66.00   3rd Qu.:68.56  
##  Max.   :64.00   Max.   :65.00   Max.   :67.50   Max.   :69.60  
##                                                                 
##       p99             p100     
##  Min.   :64.00   Min.   :64.0  
##  1st Qu.:66.64   1st Qu.:67.0  
##  Median :70.50   Median :71.0  
##  Mean   :69.84   Mean   :71.3  
##  3rd Qu.:71.86   3rd Qu.:73.5  
##  Max.   :77.92   Max.   :80.0  
##

plot(salary_age)

## Warning: Groups with fewer than two data points have been dropped.

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

salary_generation <- relate(target_salary, generation)
salary_generation

##             generation
## salary_range Baby Boomers Generation X Millennial Generation
##     80-89K              1            2                     1
##     90-99K              3            5                     1
##     100-109K            5            7                     4
##     110-119K            7           18                     7
##     120-129K           33           23                     1
##     130-139K           33           19                     0
##     140-149K           76           33                     0
##     >=150K             84           37                     4
##     <NA>                1            0                     0
##             generation
## salary_range Traditionalist/Silent Generation
##     80-89K                                  0
##     90-99K                                  0
##     100-109K                                1
##     110-119K                                0
##     120-129K                                0
##     130-139K                                0
##     140-149K                                0
##     >=150K                                  1
##     <NA>                                    0

summary(salary_generation)

## Call: xtabs(formula = formula_str, data = data, addNA = TRUE)
## Number of cases in table: 407 
## Number of factors: 2 
## Test for independence of all factors:
##  Chisq = 84.58, df = 24, p-value = 1.114e-08
##  Chi-squared approximation may be incorrect

According to these results, generation does contribute to salary range (p-value < 0.05%)

personnel1$ethnic_origin[personnel1$ethnic_origin == "White (Not Hispanic or Latino)"] <- "White"
personnel1$ethnic_origin[personnel1$ethnic_origin== "Asian (Not Hispanic or Latino)"] <- "Asian"
personnel1$ethnic_origin[personnel1$ethnic_origin == "Black or African American (Not Hispanic or Latino)"] <- "Black or African American"
personnel1$ethnic_origin[personnel1$ethnic_origin == "American Indian or Alaska Native (Not Hispanic or Latino)"] <- "American Indian or Alaska Native"
personnel1$ethnic_origin[personnel1$ethnic_origin == "Two or More Races (Not Hispanic or Latino)"] <- "Two or More Races"

salary_ethnic_origin <- relate(target_salary, ethnic_origin)
salary_ethnic_origin

##             ethnic_origin
## salary_range American Indian or Alaska Native (Not Hispanic or Latino)
##     80-89K                                                           0
##     90-99K                                                           0
##     100-109K                                                         0
##     110-119K                                                         0
##     120-129K                                                         0
##     130-139K                                                         1
##     140-149K                                                         0
##     >=150K                                                           0
##     <NA>                                                             0
##             ethnic_origin
## salary_range Asian (Not Hispanic or Latino)
##     80-89K                                0
##     90-99K                                0
##     100-109K                              0
##     110-119K                              0
##     120-129K                              5
##     130-139K                              0
##     140-149K                             12
##     >=150K                                6
##     <NA>                                  0
##             ethnic_origin
## salary_range Black or African American (Not Hispanic or Latino)
##     80-89K                                                    1
##     90-99K                                                    3
##     100-109K                                                  5
##     110-119K                                                  8
##     120-129K                                                 12
##     130-139K                                                  8
##     140-149K                                                 16
##     >=150K                                                   25
##     <NA>                                                      0
##             ethnic_origin
## salary_range Hispanic or Latino Two or More Races (Not Hispanic or Latino)
##     80-89K                    0                                          0
##     90-99K                    0                                          0
##     100-109K                  2                                          0
##     110-119K                  3                                          0
##     120-129K                  7                                          0
##     130-139K                  3                                          0
##     140-149K                  3                                          1
##     >=150K                    6                                          0
##     <NA>                      0                                          0
##             ethnic_origin
## salary_range Unreported White (Not Hispanic or Latino)
##     80-89K            0                              3
##     90-99K            0                              6
##     100-109K          2                              8
##     110-119K          2                             19
##     120-129K          2                             31
##     130-139K          4                             36
##     140-149K          2                             75
##     >=150K            5                             84
##     <NA>              0                              1

summary(salary_ethnic_origin)

## Call: xtabs(formula = formula_str, data = data, addNA = TRUE)
## Number of cases in table: 407 
## Number of factors: 2 
## Test for independence of all factors:
##  Chisq = 44.64, df = 48, p-value = 0.6115
##  Chi-squared approximation may be incorrect

According to these results, ethnicity does not contribute to salary range (p-value > 0.05%)

salary_length_of_service <- relate(target_salary, length_of_service)
salary_length_of_service

## # A tibble: 10 x 27
##    variable salary_range     n    na  mean    sd se_mean   IQR skewness kurtosis
##    <chr>    <ord>        <int> <int> <dbl> <dbl>   <dbl> <dbl>    <dbl>    <dbl>
##  1 length_~ 80-89K           4     0  6.75  3.77   1.89   2.25   1.13     2.23  
##  2 length_~ 90-99K           9     0  7.56  5.59   1.86  10      0.259   -1.73  
##  3 length_~ 100-109K        17     0 10.8   9.74   2.36  12      1.23     0.949 
##  4 length_~ 110-119K        32     0 11.6   9.45   1.67  16      0.806    0.0425
##  5 length_~ 120-129K        57     0 15.9  11.4    1.51  22      0.439   -0.987 
##  6 length_~ 130-139K        52     0 16.7  10.3    1.43  17      0.441   -0.318 
##  7 length_~ 140-149K       109     0 19.6   9.70   0.929 17      0.0573  -1.04  
##  8 length_~ >=150K         126     0 17.7  11.5    1.02  17.8    0.483   -0.509 
##  9 length_~ <NA>             1     0 18    NA     NA      0     NA       NA     
## 10 length_~ total          407     0 16.7  10.8    0.537 18.5    0.418   -0.686 
## # ... with 17 more variables: p00 <dbl>, p01 <dbl>, p05 <dbl>, p10 <dbl>,
## #   p20 <dbl>, p25 <dbl>, p30 <dbl>, p40 <dbl>, p50 <dbl>, p60 <dbl>,
## #   p70 <dbl>, p75 <dbl>, p80 <dbl>, p90 <dbl>, p95 <dbl>, p99 <dbl>,
## #   p100 <dbl>

summary(salary_length_of_service)

##    variable           salary_range       n               na         mean      
##  Length:10          80-89K  :1     Min.   :  1.0   Min.   :0   Min.   : 6.75  
##  Class :character   90-99K  :1     1st Qu.: 11.0   1st Qu.:0   1st Qu.:10.96  
##  Mode  :character   100-109K:1     Median : 42.0   Median :0   Median :16.31  
##                     110-119K:1     Mean   : 81.4   Mean   :0   Mean   :14.13  
##                     120-129K:1     3rd Qu.: 96.0   3rd Qu.:0   3rd Qu.:17.48  
##                     (Other) :4     Max.   :407.0   Max.   :0   Max.   :19.56  
##                     NA's    :1                                                
##        sd            se_mean           IQR           skewness      
##  Min.   : 3.775   Min.   :0.537   Min.   : 0.00   Min.   :0.05731  
##  1st Qu.: 9.446   1st Qu.:1.025   1st Qu.:10.50   1st Qu.:0.41804  
##  Median : 9.737   Median :1.512   Median :16.50   Median :0.44137  
##  Mean   : 9.146   Mean   :1.468   Mean   :13.25   Mean   :0.58524  
##  3rd Qu.:10.833   3rd Qu.:1.864   3rd Qu.:17.56   3rd Qu.:0.80574  
##  Max.   :11.501   Max.   :2.362   Max.   :22.00   Max.   :1.23314  
##  NA's   :1        NA's   :1                       NA's   :1        
##     kurtosis             p00            p01              p05        
##  Min.   :-1.73236   Min.   : 0.0   Min.   : 0.310   Min.   : 1.000  
##  1st Qu.:-0.98741   1st Qu.: 0.0   1st Qu.: 0.670   1st Qu.: 1.500  
##  Median :-0.50890   Median : 1.0   Median : 1.040   Median : 2.000  
##  Mean   :-0.22830   Mean   : 2.5   Mean   : 2.779   Mean   : 3.700  
##  3rd Qu.: 0.04251   3rd Qu.: 1.0   3rd Qu.: 1.140   3rd Qu.: 3.225  
##  Max.   : 2.22715   Max.   :18.0   Max.   :18.000   Max.   :18.000  
##  NA's   :1                                                          
##       p10             p20             p25              p30        
##  Min.   : 1.80   Min.   : 2.00   Min.   : 2.000   Min.   : 3.200  
##  1st Qu.: 2.00   1st Qu.: 3.45   1st Qu.: 3.562   1st Qu.: 5.175  
##  Median : 3.25   Median : 5.50   Median : 6.500   Median : 9.800  
##  Mean   : 4.73   Mean   : 6.40   Mean   : 7.125   Mean   : 9.190  
##  3rd Qu.: 4.05   3rd Qu.: 6.15   3rd Qu.: 8.500   3rd Qu.:11.750  
##  Max.   :18.00   Max.   :18.00   Max.   :18.000   Max.   :18.000  
##                                                                   
##       p40            p50             p60             p70             p75       
##  Min.   : 5.2   Min.   : 6.00   Min.   : 6.00   Min.   : 6.60   Min.   : 7.50  
##  1st Qu.: 6.0   1st Qu.: 7.75   1st Qu.:12.70   1st Qu.:13.90   1st Qu.:15.00  
##  Median :12.0   Median :14.00   Median :17.30   Median :19.35   Median :21.50  
##  Mean   :10.7   Mean   :12.60   Mean   :15.12   Mean   :18.31   Mean   :20.38  
##  3rd Qu.:13.3   3rd Qu.:16.75   3rd Qu.:18.00   3rd Qu.:22.30   3rd Qu.:26.44  
##  Max.   :18.0   Max.   :18.00   Max.   :20.80   Max.   :29.00   Max.   :29.00  
##                                                                                
##       p80             p90             p95             p99       
##  Min.   : 8.40   Min.   :10.20   Min.   :11.10   Min.   :11.82  
##  1st Qu.:15.00   1st Qu.:19.00   1st Qu.:20.26   1st Qu.:21.67  
##  Median :22.90   Median :27.20   Median :31.93   Median :35.38  
##  Mean   :21.74   Mean   :24.74   Mean   :27.59   Mean   :31.61  
##  3rd Qu.:29.00   3rd Qu.:31.40   3rd Qu.:34.00   3rd Qu.:40.78  
##  Max.   :30.00   Max.   :33.00   Max.   :38.50   Max.   :45.75  
##                                                                 
##       p100      
##  Min.   :12.00  
##  1st Qu.:21.75  
##  Median :38.00  
##  Mean   :33.20  
##  3rd Qu.:42.00  
##  Max.   :47.00  
##

plot(salary_length_of_service)

## Warning: Groups with fewer than two data points have been dropped.

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

# Are salary and gender related?
# Null Hypothesis: salary and gender are independent

sal_gender <- table(personnel1$salary_range, personnel1$ethnic_origin)
sal_gender

##           
##            American Indian or Alaska Native Asian Black or African American
##   80-89K                                  0     0                         1
##   90-99K                                  0     0                         3
##   100-109K                                0     0                         5
##   110-119K                                0     0                         8
##   120-129K                                0     5                        12
##   130-139K                                1     0                         8
##   140-149K                                0    12                        16
##   >=150K                                  0     6                        25
##           
##            Hispanic or Latino Two or More Races Unreported White
##   80-89K                    0                 0          0     3
##   90-99K                    0                 0          0     6
##   100-109K                  2                 0          2     8
##   110-119K                  3                 0          2    19
##   120-129K                  7                 0          2    31
##   130-139K                  3                 0          4    36
##   140-149K                  3                 1          2    75
##   >=150K                    6                 0          5    84

chisq.test(sal_gender)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  sal_gender
## X-squared = 43.988, df = 42, p-value = 0.3874

# Are salary and length_of_service related?
# Null Hypothesis: salary and length_of_service are independent
sal_years <- table(personnel1$salary_range, personnel1$length_of_service)
sal_years

##           
##            0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
##   80-89K   0 0 0 1 0 0 2 0 0 0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0
##   90-99K   0 1 2 0 0 1 1 0 0 0  1  0  1  0  0  2  0  0  0  0  0  0  0  0  0  0
##   100-109K 0 1 4 0 0 1 2 1 0 0  0  0  1  2  2  0  0  0  0  0  1  0  0  0  0  0
##   110-119K 1 2 2 4 0 4 2 0 0 1  0  1  0  3  0  1  2  0  0  2  1  1  2  1  0  0
##   120-129K 1 3 3 2 2 3 2 1 0 1  2  4  3  3  2  0  2  2  1  1  0  0  2  0  1  0
##   130-139K 1 1 1 2 1 4 1 3 0 0  0  1  2  4  1  2  3  3  4  0  2  1  1  0  2  0
##   140-149K 0 2 2 2 0 2 0 6 0 0  2  7  8  5  2  4  6  3  4  7  3  2  1  2  2  1
##   >=150K   0 5 5 3 6 5 2 3 2 3  3  3  7  5  4  2  4  8  5  2  4  3  2  1  3  2
##           
##            26 27 28 29 30 31 32 33 34 35 36 40 41 42 43 45 46 47
##   80-89K    0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   90-99K    0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   100-109K  0  0  0  0  0  1  0  1  0  0  0  0  0  0  0  0  0  0
##   110-119K  0  0  0  0  0  0  1  0  0  0  1  0  0  0  0  0  0  0
##   120-129K  1  0  1  5  2  1  1  1  1  0  2  0  0  1  0  0  0  0
##   130-139K  2  1  1  1  2  0  2  1  0  0  0  0  1  1  0  0  0  0
##   140-149K  0  1  1  7  8  7  3  2  2  2  2  1  0  0  0  0  0  0
##   >=150K    2  3  1  1  5  7  1  2  5  0  0  2  1  0  1  1  1  1

chisq.test(sal_years)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  sal_years
## X-squared = 282.67, df = 301, p-value = 0.769

# Are salary and age related?
# Null Hypothesis: salary and age are independent
sal_age <- table(personnel1$age, personnel1$age)
sal_age
chisq.test(sal_age)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

# Are salary and ethnicity related?
# Null Hypothesis: salary and ethnicity are independent
sal_ethnicity <- table(personnel1$salary_range, personnel1$ethnic_origin)
sal_ethnicity

##           
##            American Indian or Alaska Native Asian Black or African American
##   80-89K                                  0     0                         1
##   90-99K                                  0     0                         3
##   100-109K                                0     0                         5
##   110-119K                                0     0                         8
##   120-129K                                0     5                        12
##   130-139K                                1     0                         8
##   140-149K                                0    12                        16
##   >=150K                                  0     6                        25
##           
##            Hispanic or Latino Two or More Races Unreported White
##   80-89K                    0                 0          0     3
##   90-99K                    0                 0          0     6
##   100-109K                  2                 0          2     8
##   110-119K                  3                 0          2    19
##   120-129K                  7                 0          2    31
##   130-139K                  3                 0          4    36
##   140-149K                  3                 1          2    75
##   >=150K                    6                 0          5    84

chisq.test(sal_ethnicity)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  sal_ethnicity
## X-squared = 43.988, df = 42, p-value = 0.3874

plot1 <- personnel1 %>%
  ggplot(aes(x = salary_range, fill = ethnic_origin)) +
  geom_histogram(stat="count", binwidth = 5, color = "white")+
  scale_fill_discrete(name = "Ethnicity", labels = c("American Indian or Alaska Native", "Asian","Black or African American", "Hispanic or Latino", "Two or More Races", "Unreported", "White")) +
  labs(x = "Salary (in dollars)", y = "Frequency", 
       title = "MoCo Employees Salary Distribution by Ethnicity")+
  theme_minimal() +
  scale_fill_brewer(palette="Set1")

## Warning: Ignoring unknown parameters: binwidth, bins, pad

## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.

plot1

personnel1$generation[personnel1$generation == "Millennial Generation"] <- "Millennial"
personnel1$generation[personnel1$generation == "Traditionalist/Silent Generation"] <- "Silent"

plot1 <- personnel1 %>%
  ggplot(aes(x = salary_range, fill = grade)) +
  geom_histogram(stat="count", binwidth = 5, color = "white")+
  scale_fill_discrete(name = "Ethnicity", labels = c("M1", "M2", "M3")) +
  labs(x = "Salary (in dollars)", y = "Frequency", 
       title = "MoCo Employees Salary Distribution by Grade")+
  theme_minimal() +
  scale_fill_brewer(palette="Set1")

## Warning: Ignoring unknown parameters: binwidth, bins, pad

## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.

plot1

ggplot(data = personnel1) +
  geom_bar(mapping = aes(x = salary_range,
                       fill = gender)) +
  labs(x = "Salary (in dollars)",
       title = "MoCo Employees Salary Distribution by Gender")+
  theme_minimal() +
  scale_fill_brewer(palette="Set1")

{r fig.width = 7, fig.height = 4}

#library(rgdal)
#table_gender_salary <- personnel1 %>%
#dplyr::count(grade, salary_range)

#ggplot(data = table_gender_salary) +
#  geom_tile(mapping = aes(x = grade, 
#                         y = salary_range, fill = n)) +
#  scale_fill_gradientn(colors = brewer.pal(5, "RdYlGn")) +
#labs(x = "Grade", y = "Salary (in dollars)", 
#       title = "MoCo Employees Salary Distribution by Grade") +
#  theme_minimal()
#  facet_grid(.~ grade)

Is there a relationship between gender and salary range?

This is going to be a Chi-Square test. We will test using α=0.05. Ho: gender and salary range are independent. Ha: gender and salary range are dependent.

gender_salary <- table(personnel1$grade, 
                            personnel1$grade)

result5 <- chisq.test(gender_salary)

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

result5

## 
##  Pearson's Chi-squared test
## 
## data:  gender_salary
## X-squared = 814, df = 4, p-value < 2.2e-16

P-value: 0.4017 > 0.05 = α.

Conclusion: Fail to reject Ho.

result5$residuals

##     
##             M1        M2        M3
##   M1 18.786333 -2.885191 -4.213002
##   M2 -2.885191 14.176494 -8.758008
##   M3 -4.213002 -8.758008  7.385656

corrplot(result5$residuals, is.corr = FALSE)