library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#library(httr)
#library(jsonlite)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(readr)
library(dplyr)
#library(psych)
#library(dlookr)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(corrplot)
## corrplot 0.92 loaded
# Load the management personnel report. fileEncoding = "UTF-8-BOM" lets
# read.csv skip a leading byte-order mark so the first column name is clean.
personnel <- read.csv("PMR_MLS__Management__Report.csv", header = TRUE, fileEncoding="UTF-8-BOM")
# Inspect the column names and types before any cleaning.
str(personnel)
## 'data.frame': 407 obs. of 9 variables:
## $ Generation : chr "Baby Boomers" "Generation X" "Baby Boomers" "Baby Boomers" ...
## $ Age : int 56 47 59 56 53 63 51 51 64 53 ...
## $ Ethnic.Origin : chr "Unreported" "Unreported" "White (Not Hispanic or Latino)" "Black or African American (Not Hispanic or Latino)" ...
## $ Gender : chr "Male" "Male" "Female" "Male" ...
## $ Length.Of.Service : int 6 1 14 27 24 29 32 2 31 12 ...
## $ Job.Class : chr "MLS" "MLS" "MLS" "MLS" ...
## $ Grade : chr "M2" "M2" "M1" "M3" ...
## $ Assignment.Category: chr "Fulltime-Regular" "Fulltime-Regular" "Fulltime-Regular" "Fulltime-Regular" ...
## $ Salary.Range : chr "130-139K" ">=150K" ">=150K" "140-149K" ...
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
# Build the analysis data frame in one pipeline: snake_case column names
# (janitor::clean_names), gender as a factor, and the two numeric measures
# stored as doubles rather than integers.
personnel1 <- personnel %>%
  clean_names() %>%
  mutate(
    gender = as.factor(gender),
    length_of_service = as.numeric(length_of_service),
    age = as.numeric(age)
  )
class(personnel1$gender)
## [1] "factor"
class(personnel1$length_of_service)
## [1] "numeric"
class(personnel1$age)
## [1] "numeric"
str(personnel1)
## 'data.frame': 407 obs. of 9 variables:
## $ generation : chr "Baby Boomers" "Generation X" "Baby Boomers" "Baby Boomers" ...
## $ age : num 56 47 59 56 53 63 51 51 64 53 ...
## $ ethnic_origin : chr "Unreported" "Unreported" "White (Not Hispanic or Latino)" "Black or African American (Not Hispanic or Latino)" ...
## $ gender : Factor w/ 2 levels "Female","Male": 2 2 1 2 1 2 1 2 2 1 ...
## $ length_of_service : num 6 1 14 27 24 29 32 2 31 12 ...
## $ job_class : chr "MLS" "MLS" "MLS" "MLS" ...
## $ grade : chr "M2" "M2" "M1" "M3" ...
## $ assignment_category: chr "Fulltime-Regular" "Fulltime-Regular" "Fulltime-Regular" "Fulltime-Regular" ...
## $ salary_range : chr "130-139K" ">=150K" ">=150K" "140-149K" ...
library(dlookr)
## Imported Arial Narrow fonts.
##
## Attaching package: 'dlookr'
## The following object is masked from 'package:Hmisc':
##
## describe
## The following object is masked from 'package:tidyr':
##
## extract
## The following object is masked from 'package:base':
##
## transform
# Descriptive statistics (n, mean, sd, skewness, percentiles, ...) for
# length_of_service and age, computed separately for each gender.
# describe() here is dlookr's version, which masks Hmisc::describe.
personnel1 %>%
group_by(gender) %>%
describe(length_of_service, age)
## # A tibble: 4 x 27
## variable gender n na mean sd se_mean IQR skewness kurtosis p00
## <chr> <fct> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 age Female 193 0 55.3 8.53 0.614 13 -0.502 -0.394 31
## 2 age Male 214 0 55.5 9.54 0.652 13 -0.349 -0.354 31
## 3 length_o~ Female 193 0 18.1 11.2 0.809 19 0.316 -0.843 0
## 4 length_o~ Male 214 0 15.5 10.3 0.706 16 0.491 -0.518 0
## # ... with 16 more variables: p01 <dbl>, p05 <dbl>, p10 <dbl>, p20 <dbl>,
## # p25 <dbl>, p30 <dbl>, p40 <dbl>, p50 <dbl>, p60 <dbl>, p70 <dbl>,
## # p75 <dbl>, p80 <dbl>, p90 <dbl>, p95 <dbl>, p99 <dbl>, p100 <dbl>
# Normality test (dlookr::normality, Shapiro-Wilk) for length_of_service and
# age within each gender; one row per variable/gender pair.
personnel1 %>%
group_by(gender) %>%
normality(length_of_service, age)
## # A tibble: 4 x 5
## variable gender statistic p_value sample
## <chr> <fct> <dbl> <dbl> <dbl>
## 1 length_of_service Female 0.955 0.00000846 193
## 2 length_of_service Male 0.956 0.00000334 214
## 3 age Female 0.969 0.000265 193
## 4 age Male 0.983 0.0105 214
# Same normality test on the pooled data (no grouping), keeping only the
# variables whose p_value is at or below 0.01.
personnel1 %>%
normality(length_of_service, age) %>%
filter(p_value <= 0.01)
## # A tibble: 2 x 4
## vars statistic p_value sample
## <chr> <dbl> <dbl> <dbl>
## 1 length_of_service 0.956 0.00000000111 407
## 2 age 0.979 0.0000158 407
# Normality diagnostic plots (dlookr::plot_normality) for both numeric columns.
personnel1 %>%
  plot_normality(length_of_service, age)

# Salary bands listed from lowest to highest pay so the factor sorts by pay
# rather than alphabetically.
salary <- c(
  "80-89K", "90-99K", "100-109K", "110-119K",
  "120-129K", "130-139K", "140-149K", ">=150K"
)

# Encode salary_range as an ordered factor; any value that is not one of the
# bands above becomes NA.
personnel1$salary_range <- factor(
  personnel1$salary_range,
  levels = salary,
  ordered = TRUE
)
personnel1$salary_range
## [1] 130-139K >=150K >=150K 140-149K 140-149K 140-149K 110-119K >=150K
## [9] 140-149K 140-149K 90-99K 100-109K 130-139K 130-139K 140-149K 110-119K
## [17] >=150K >=150K 140-149K 140-149K >=150K >=150K 140-149K 140-149K
## [25] 120-129K 120-129K 140-149K 140-149K 130-139K 120-129K >=150K 130-139K
## [33] 140-149K 140-149K 140-149K >=150K >=150K >=150K 130-139K >=150K
## [41] 110-119K 120-129K 130-139K 140-149K >=150K 120-129K 140-149K 120-129K
## [49] >=150K 140-149K 140-149K 130-139K 140-149K >=150K 120-129K 140-149K
## [57] 100-109K 130-139K 130-139K >=150K 120-129K 120-129K >=150K >=150K
## [65] 130-139K 130-139K >=150K 110-119K 140-149K 140-149K 120-129K 140-149K
## [73] 120-129K 120-129K >=150K 110-119K >=150K 120-129K 120-129K 130-139K
## [81] 130-139K 110-119K 140-149K 120-129K 140-149K 120-129K 130-139K 100-109K
## [89] 100-109K 120-129K 110-119K 140-149K 140-149K 80-89K >=150K >=150K
## [97] >=150K >=150K 130-139K >=150K >=150K 140-149K 130-139K >=150K
## [105] 140-149K >=150K >=150K 80-89K 120-129K 130-139K 140-149K 140-149K
## [113] 110-119K 130-139K 120-129K 110-119K 100-109K 90-99K 110-119K 120-129K
## [121] 130-139K 140-149K 140-149K 140-149K 140-149K >=150K >=150K 90-99K
## [129] 140-149K 120-129K 130-139K 140-149K >=150K 130-139K 110-119K 90-99K
## [137] >=150K 120-129K 110-119K 90-99K >=150K >=150K 140-149K 140-149K
## [145] 140-149K 110-119K >=150K >=150K >=150K >=150K 100-109K 120-129K
## [153] 140-149K 130-139K 120-129K >=150K 100-109K 120-129K >=150K 140-149K
## [161] 140-149K >=150K >=150K 120-129K >=150K 110-119K 120-129K 130-139K
## [169] >=150K >=150K >=150K 140-149K 140-149K 130-139K >=150K 130-139K
## [177] >=150K >=150K 140-149K >=150K 120-129K 120-129K 120-129K 140-149K
## [185] 90-99K >=150K 130-139K 130-139K 110-119K 130-139K 140-149K 100-109K
## [193] >=150K 140-149K >=150K 140-149K 140-149K 80-89K 140-149K 140-149K
## [201] 140-149K 110-119K >=150K 140-149K 130-139K >=150K 140-149K 140-149K
## [209] >=150K >=150K 140-149K 130-139K >=150K 130-139K 140-149K 140-149K
## [217] 120-129K 80-89K >=150K 140-149K 100-109K >=150K 110-119K 140-149K
## [225] 110-119K >=150K >=150K 120-129K >=150K >=150K 120-129K >=150K
## [233] >=150K 100-109K 100-109K 130-139K 130-139K 100-109K >=150K 140-149K
## [241] 120-129K >=150K >=150K >=150K 140-149K 100-109K >=150K 140-149K
## [249] 130-139K 140-149K 120-129K 140-149K >=150K >=150K 120-129K 120-129K
## [257] 130-139K 140-149K >=150K 120-129K >=150K 130-139K 130-139K 140-149K
## [265] 120-129K >=150K >=150K >=150K 140-149K 110-119K >=150K >=150K
## [273] >=150K 100-109K 120-129K 140-149K 110-119K 110-119K >=150K 120-129K
## [281] 130-139K 140-149K >=150K 140-149K 110-119K 110-119K >=150K 130-139K
## [289] 140-149K 140-149K 120-129K 140-149K 110-119K >=150K 140-149K 140-149K
## [297] 120-129K 140-149K >=150K >=150K 140-149K 130-139K 140-149K >=150K
## [305] 140-149K >=150K 140-149K 120-129K 100-109K 140-149K 120-129K 140-149K
## [313] >=150K 120-129K >=150K >=150K >=150K 120-129K >=150K >=150K
## [321] >=150K 140-149K >=150K >=150K 110-119K 130-139K 140-149K 120-129K
## [329] 120-129K >=150K >=150K 140-149K >=150K 140-149K >=150K 110-119K
## [337] 140-149K 140-149K >=150K 140-149K >=150K 140-149K >=150K >=150K
## [345] 130-139K 140-149K >=150K 140-149K >=150K 120-129K 110-119K >=150K
## [353] 140-149K 100-109K 90-99K 110-119K 120-129K <NA> >=150K 140-149K
## [361] 110-119K 130-139K >=150K 110-119K >=150K 90-99K 140-149K 120-129K
## [369] 120-129K 130-139K 140-149K >=150K 130-139K 140-149K 110-119K 120-129K
## [377] 140-149K >=150K 140-149K 120-129K 110-119K 130-139K 120-129K 130-139K
## [385] >=150K 140-149K 140-149K 130-139K 140-149K 90-99K >=150K 100-109K
## [393] >=150K 120-129K >=150K >=150K 140-149K 130-139K >=150K >=150K
## [401] 130-139K >=150K >=150K 140-149K >=150K 140-149K 130-139K
## 8 Levels: 80-89K < 90-99K < 100-109K < 110-119K < 120-129K < ... < >=150K
#personnel1$salary_range <- as.factor(personnel1$salary_range)
class(personnel1$salary_range)
## [1] "ordered" "factor"
table(personnel1$salary_range)
##
## 80-89K 90-99K 100-109K 110-119K 120-129K 130-139K 140-149K >=150K
## 4 9 17 32 57 52 109 126
# Head-count per salary band (table() leaves out NA salaries), then the share
# of employees that each band represents.
type_counts <- table(personnel1$salary_range)
prop.table(type_counts)
##
## 80-89K 90-99K 100-109K 110-119K 120-129K 130-139K
## 0.009852217 0.022167488 0.041871921 0.078817734 0.140394089 0.128078818
## 140-149K >=150K
## 0.268472906 0.310344828
# Wrap the data with salary_range as the target variable for dlookr::relate().
target_salary <- target_by(personnel1, salary_range)
# Cross-tabulate salary band against gender. This call previously passed
# `generation`, which made salary_gender a duplicate of the salary_generation
# analysis that appears later in the report.
# NOTE: the table and test printed below were produced by the old call;
# re-run the report to refresh them.
salary_gender <- relate(target_salary, gender)
salary_gender
## generation
## salary_range Baby Boomers Generation X Millennial Generation
## 80-89K 1 2 1
## 90-99K 3 5 1
## 100-109K 5 7 4
## 110-119K 7 18 7
## 120-129K 33 23 1
## 130-139K 33 19 0
## 140-149K 76 33 0
## >=150K 84 37 4
## <NA> 1 0 0
## generation
## salary_range Traditionalist/Silent Generation
## 80-89K 0
## 90-99K 0
## 100-109K 1
## 110-119K 0
## 120-129K 0
## 130-139K 0
## 140-149K 0
## >=150K 1
## <NA> 0
summary(salary_gender)
## Call: xtabs(formula = formula_str, data = data, addNA = TRUE)
## Number of cases in table: 407
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 84.58, df = 24, p-value = 1.114e-08
## Chi-squared approximation may be incorrect
salary_age <- relate(target_salary, age)
salary_age
## # A tibble: 10 x 27
## variable salary_range n na mean sd se_mean IQR skewness kurtosis
## <chr> <ord> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 age 80-89K 4 0 49 17.8 8.90 20 0.671 -0.482
## 2 age 90-99K 9 0 49 11.3 3.77 15 -0.00467 -0.903
## 3 age 100-109K 17 0 49.8 13.1 3.19 20 0.746 -0.0199
## 4 age 110-119K 32 0 47.5 9.36 1.66 13.5 0.307 -0.733
## 5 age 120-129K 57 0 54.9 8.70 1.15 14 -0.131 -0.602
## 6 age 130-139K 52 0 55.9 7.63 1.06 9.25 -0.238 -0.611
## 7 age 140-149K 109 0 57.2 7.07 0.677 9 -0.548 0.00788
## 8 age >=150K 126 0 57.2 8.66 0.771 11.8 -0.482 -0.334
## 9 age <NA> 1 0 64 NA NA 0 NA NA
## 10 age total 407 0 55.4 9.06 0.449 13 -0.406 -0.347
## # ... with 17 more variables: p00 <dbl>, p01 <dbl>, p05 <dbl>, p10 <dbl>,
## # p20 <dbl>, p25 <dbl>, p30 <dbl>, p40 <dbl>, p50 <dbl>, p60 <dbl>,
## # p70 <dbl>, p75 <dbl>, p80 <dbl>, p90 <dbl>, p95 <dbl>, p99 <dbl>,
## # p100 <dbl>
summary(salary_age)
## variable salary_range n na mean
## Length:10 80-89K :1 Min. : 1.0 Min. :0 Min. :47.50
## Class :character 90-99K :1 1st Qu.: 11.0 1st Qu.:0 1st Qu.:49.21
## Mode :character 100-109K:1 Median : 42.0 Median :0 Median :55.14
## 110-119K:1 Mean : 81.4 Mean :0 Mean :53.99
## 120-129K:1 3rd Qu.: 96.0 3rd Qu.:0 3rd Qu.:56.86
## (Other) :4 Max. :407.0 Max. :0 Max. :64.00
## NA's :1
## sd se_mean IQR skewness
## Min. : 7.066 Min. :0.4492 Min. : 0.000 Min. :-0.54757
## 1st Qu.: 8.659 1st Qu.:0.7714 1st Qu.: 9.875 1st Qu.:-0.40621
## Median : 9.063 Median :1.1525 Median :13.250 Median :-0.13058
## Mean :10.302 Mean :2.4018 Mean :12.550 Mean :-0.00938
## 3rd Qu.:11.303 3rd Qu.:3.1880 3rd Qu.:14.750 3rd Qu.: 0.30667
## Max. :17.795 Max. :8.8976 Max. :20.000 Max. : 0.74637
## NA's :1 NA's :1 NA's :1
## kurtosis p00 p01 p05
## Min. :-0.902555 Min. :31.00 Min. :31.27 Min. :32.35
## 1st Qu.:-0.611234 1st Qu.:31.25 1st Qu.:33.07 1st Qu.:35.44
## Median :-0.481695 Median :33.50 Median :35.16 Median :39.65
## Mean :-0.447106 Mean :37.10 Mean :38.21 Mean :40.86
## 3rd Qu.:-0.334441 3rd Qu.:37.50 3rd Qu.:39.34 3rd Qu.:41.95
## Max. : 0.007879 Max. :64.00 Max. :64.00 Max. :64.00
## NA's :1
## p10 p20 p25 p30
## Min. :33.70 Min. :36.40 Min. :37.75 Min. :39.10
## 1st Qu.:36.55 1st Qu.:38.80 1st Qu.:39.88 1st Qu.:41.85
## Median :42.80 Median :46.60 Median :48.00 Median :50.40
## Mean :43.23 Mean :45.98 Mean :47.40 Mean :48.73
## 3rd Qu.:45.75 3rd Qu.:48.85 3rd Qu.:51.94 3rd Qu.:53.20
## Max. :64.00 Max. :64.00 Max. :64.00 Max. :64.00
##
## p40 p50 p60 p70
## Min. :42.60 Min. :46.50 Min. :49.00 Min. :51.70
## 1st Qu.:44.45 1st Qu.:48.25 1st Qu.:51.00 1st Qu.:55.12
## Median :53.70 Median :56.50 Median :57.50 Median :59.50
## Mean :51.48 Mean :54.10 Mean :56.12 Mean :58.52
## 3rd Qu.:55.85 3rd Qu.:57.75 3rd Qu.:59.75 3rd Qu.:61.00
## Max. :64.00 Max. :64.00 Max. :64.00 Max. :64.00
##
## p75 p80 p90 p95
## Min. :53.00 Min. :56.20 Min. :58.90 Min. :64.00
## 1st Qu.:58.06 1st Qu.:60.60 1st Qu.:64.45 1st Qu.:64.99
## Median :61.00 Median :62.90 Median :66.00 Median :67.40
## Mean :59.95 Mean :61.84 Mean :64.99 Mean :66.95
## 3rd Qu.:62.00 3rd Qu.:63.85 3rd Qu.:66.00 3rd Qu.:68.56
## Max. :64.00 Max. :65.00 Max. :67.50 Max. :69.60
##
## p99 p100
## Min. :64.00 Min. :64.0
## 1st Qu.:66.64 1st Qu.:67.0
## Median :70.50 Median :71.0
## Mean :69.84 Mean :71.3
## 3rd Qu.:71.86 3rd Qu.:73.5
## Max. :77.92 Max. :80.0
##
plot(salary_age)
## Warning: Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
salary_generation <- relate(target_salary, generation)
salary_generation
## generation
## salary_range Baby Boomers Generation X Millennial Generation
## 80-89K 1 2 1
## 90-99K 3 5 1
## 100-109K 5 7 4
## 110-119K 7 18 7
## 120-129K 33 23 1
## 130-139K 33 19 0
## 140-149K 76 33 0
## >=150K 84 37 4
## <NA> 1 0 0
## generation
## salary_range Traditionalist/Silent Generation
## 80-89K 0
## 90-99K 0
## 100-109K 1
## 110-119K 0
## 120-129K 0
## 130-139K 0
## 140-149K 0
## >=150K 1
## <NA> 0
summary(salary_generation)
## Call: xtabs(formula = formula_str, data = data, addNA = TRUE)
## Number of cases in table: 407
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 84.58, df = 24, p-value = 1.114e-08
## Chi-squared approximation may be incorrect
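According to these results, generation and salary range are not independent (p-value = 1.114e-08 < 0.05). Note that the chi-squared approximation may be unreliable here because several cells have very small counts.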
# Shorten the ethnicity labels by dropping the "(Not Hispanic or Latino)"
# qualifier; "Hispanic or Latino" and "Unreported" are left as they are.
personnel1$ethnic_origin[personnel1$ethnic_origin == "White (Not Hispanic or Latino)"] <- "White"
personnel1$ethnic_origin[personnel1$ethnic_origin == "Asian (Not Hispanic or Latino)"] <- "Asian"
personnel1$ethnic_origin[personnel1$ethnic_origin == "Black or African American (Not Hispanic or Latino)"] <- "Black or African American"
personnel1$ethnic_origin[personnel1$ethnic_origin == "American Indian or Alaska Native (Not Hispanic or Latino)"] <- "American Indian or Alaska Native"
personnel1$ethnic_origin[personnel1$ethnic_origin == "Two or More Races (Not Hispanic or Latino)"] <- "Two or More Races"
# target_salary was created before the relabelling above, so it still holds
# the long labels. Build a fresh target from the updated data frame so the
# cross-tabulation uses the short names. The existing target_salary object is
# left untouched for the later analyses.
# NOTE: the table printed below was produced before this change and still
# shows the long labels; re-run the report to refresh it.
salary_ethnic_origin <- relate(target_by(personnel1, salary_range), ethnic_origin)
salary_ethnic_origin
## ethnic_origin
## salary_range American Indian or Alaska Native (Not Hispanic or Latino)
## 80-89K 0
## 90-99K 0
## 100-109K 0
## 110-119K 0
## 120-129K 0
## 130-139K 1
## 140-149K 0
## >=150K 0
## <NA> 0
## ethnic_origin
## salary_range Asian (Not Hispanic or Latino)
## 80-89K 0
## 90-99K 0
## 100-109K 0
## 110-119K 0
## 120-129K 5
## 130-139K 0
## 140-149K 12
## >=150K 6
## <NA> 0
## ethnic_origin
## salary_range Black or African American (Not Hispanic or Latino)
## 80-89K 1
## 90-99K 3
## 100-109K 5
## 110-119K 8
## 120-129K 12
## 130-139K 8
## 140-149K 16
## >=150K 25
## <NA> 0
## ethnic_origin
## salary_range Hispanic or Latino Two or More Races (Not Hispanic or Latino)
## 80-89K 0 0
## 90-99K 0 0
## 100-109K 2 0
## 110-119K 3 0
## 120-129K 7 0
## 130-139K 3 0
## 140-149K 3 1
## >=150K 6 0
## <NA> 0 0
## ethnic_origin
## salary_range Unreported White (Not Hispanic or Latino)
## 80-89K 0 3
## 90-99K 0 6
## 100-109K 2 8
## 110-119K 2 19
## 120-129K 2 31
## 130-139K 4 36
## 140-149K 2 75
## >=150K 5 84
## <NA> 0 1
summary(salary_ethnic_origin)
## Call: xtabs(formula = formula_str, data = data, addNA = TRUE)
## Number of cases in table: 407
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 44.64, df = 48, p-value = 0.6115
## Chi-squared approximation may be incorrect
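According to these results, we fail to reject independence between ethnicity and salary range (p-value = 0.6115 > 0.05). Note that the chi-squared approximation may be unreliable here because many cells have very small counts.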
salary_length_of_service <- relate(target_salary, length_of_service)
salary_length_of_service
## # A tibble: 10 x 27
## variable salary_range n na mean sd se_mean IQR skewness kurtosis
## <chr> <ord> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 length_~ 80-89K 4 0 6.75 3.77 1.89 2.25 1.13 2.23
## 2 length_~ 90-99K 9 0 7.56 5.59 1.86 10 0.259 -1.73
## 3 length_~ 100-109K 17 0 10.8 9.74 2.36 12 1.23 0.949
## 4 length_~ 110-119K 32 0 11.6 9.45 1.67 16 0.806 0.0425
## 5 length_~ 120-129K 57 0 15.9 11.4 1.51 22 0.439 -0.987
## 6 length_~ 130-139K 52 0 16.7 10.3 1.43 17 0.441 -0.318
## 7 length_~ 140-149K 109 0 19.6 9.70 0.929 17 0.0573 -1.04
## 8 length_~ >=150K 126 0 17.7 11.5 1.02 17.8 0.483 -0.509
## 9 length_~ <NA> 1 0 18 NA NA 0 NA NA
## 10 length_~ total 407 0 16.7 10.8 0.537 18.5 0.418 -0.686
## # ... with 17 more variables: p00 <dbl>, p01 <dbl>, p05 <dbl>, p10 <dbl>,
## # p20 <dbl>, p25 <dbl>, p30 <dbl>, p40 <dbl>, p50 <dbl>, p60 <dbl>,
## # p70 <dbl>, p75 <dbl>, p80 <dbl>, p90 <dbl>, p95 <dbl>, p99 <dbl>,
## # p100 <dbl>
summary(salary_length_of_service)
## variable salary_range n na mean
## Length:10 80-89K :1 Min. : 1.0 Min. :0 Min. : 6.75
## Class :character 90-99K :1 1st Qu.: 11.0 1st Qu.:0 1st Qu.:10.96
## Mode :character 100-109K:1 Median : 42.0 Median :0 Median :16.31
## 110-119K:1 Mean : 81.4 Mean :0 Mean :14.13
## 120-129K:1 3rd Qu.: 96.0 3rd Qu.:0 3rd Qu.:17.48
## (Other) :4 Max. :407.0 Max. :0 Max. :19.56
## NA's :1
## sd se_mean IQR skewness
## Min. : 3.775 Min. :0.537 Min. : 0.00 Min. :0.05731
## 1st Qu.: 9.446 1st Qu.:1.025 1st Qu.:10.50 1st Qu.:0.41804
## Median : 9.737 Median :1.512 Median :16.50 Median :0.44137
## Mean : 9.146 Mean :1.468 Mean :13.25 Mean :0.58524
## 3rd Qu.:10.833 3rd Qu.:1.864 3rd Qu.:17.56 3rd Qu.:0.80574
## Max. :11.501 Max. :2.362 Max. :22.00 Max. :1.23314
## NA's :1 NA's :1 NA's :1
## kurtosis p00 p01 p05
## Min. :-1.73236 Min. : 0.0 Min. : 0.310 Min. : 1.000
## 1st Qu.:-0.98741 1st Qu.: 0.0 1st Qu.: 0.670 1st Qu.: 1.500
## Median :-0.50890 Median : 1.0 Median : 1.040 Median : 2.000
## Mean :-0.22830 Mean : 2.5 Mean : 2.779 Mean : 3.700
## 3rd Qu.: 0.04251 3rd Qu.: 1.0 3rd Qu.: 1.140 3rd Qu.: 3.225
## Max. : 2.22715 Max. :18.0 Max. :18.000 Max. :18.000
## NA's :1
## p10 p20 p25 p30
## Min. : 1.80 Min. : 2.00 Min. : 2.000 Min. : 3.200
## 1st Qu.: 2.00 1st Qu.: 3.45 1st Qu.: 3.562 1st Qu.: 5.175
## Median : 3.25 Median : 5.50 Median : 6.500 Median : 9.800
## Mean : 4.73 Mean : 6.40 Mean : 7.125 Mean : 9.190
## 3rd Qu.: 4.05 3rd Qu.: 6.15 3rd Qu.: 8.500 3rd Qu.:11.750
## Max. :18.00 Max. :18.00 Max. :18.000 Max. :18.000
##
## p40 p50 p60 p70 p75
## Min. : 5.2 Min. : 6.00 Min. : 6.00 Min. : 6.60 Min. : 7.50
## 1st Qu.: 6.0 1st Qu.: 7.75 1st Qu.:12.70 1st Qu.:13.90 1st Qu.:15.00
## Median :12.0 Median :14.00 Median :17.30 Median :19.35 Median :21.50
## Mean :10.7 Mean :12.60 Mean :15.12 Mean :18.31 Mean :20.38
## 3rd Qu.:13.3 3rd Qu.:16.75 3rd Qu.:18.00 3rd Qu.:22.30 3rd Qu.:26.44
## Max. :18.0 Max. :18.00 Max. :20.80 Max. :29.00 Max. :29.00
##
## p80 p90 p95 p99
## Min. : 8.40 Min. :10.20 Min. :11.10 Min. :11.82
## 1st Qu.:15.00 1st Qu.:19.00 1st Qu.:20.26 1st Qu.:21.67
## Median :22.90 Median :27.20 Median :31.93 Median :35.38
## Mean :21.74 Mean :24.74 Mean :27.59 Mean :31.61
## 3rd Qu.:29.00 3rd Qu.:31.40 3rd Qu.:34.00 3rd Qu.:40.78
## Max. :30.00 Max. :33.00 Max. :38.50 Max. :45.75
##
## p100
## Min. :12.00
## 1st Qu.:21.75
## Median :38.00
## Mean :33.20
## 3rd Qu.:42.00
## Max. :47.00
##
plot(salary_length_of_service)
## Warning: Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
# Are salary and gender related?
# Null Hypothesis: salary and gender are independent
# Rows are salary bands, columns are genders. The earlier version tabulated
# ethnic_origin here, so this section merely repeated the ethnicity test.
# NOTE: the table and test result printed below predate this fix; re-run the
# report to refresh them.
sal_gender <- table(personnel1$salary_range, personnel1$gender)
sal_gender
##
## American Indian or Alaska Native Asian Black or African American
## 80-89K 0 0 1
## 90-99K 0 0 3
## 100-109K 0 0 5
## 110-119K 0 0 8
## 120-129K 0 5 12
## 130-139K 1 0 8
## 140-149K 0 12 16
## >=150K 0 6 25
##
## Hispanic or Latino Two or More Races Unreported White
## 80-89K 0 0 0 3
## 90-99K 0 0 0 6
## 100-109K 2 0 2 8
## 110-119K 3 0 2 19
## 120-129K 7 0 2 31
## 130-139K 3 0 4 36
## 140-149K 3 1 2 75
## >=150K 6 0 5 84
chisq.test(sal_gender)
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
##
## Pearson's Chi-squared test
##
## data: sal_gender
## X-squared = 43.988, df = 42, p-value = 0.3874
# Are salary and length_of_service related?
# Null Hypothesis: salary and length_of_service are independent
# NOTE(review): length_of_service is numeric, so this table gets one column per
# distinct number of years and most cells hold 0 or 1. The chi-squared
# approximation is unreliable on a table this sparse (see the warning printed
# with the test below). A rank-based test such as kruskal.test() may suit this
# question better - confirm before relying on the p-value.
sal_years <- table(personnel1$salary_range, personnel1$length_of_service)
sal_years
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## 80-89K 0 0 0 1 0 0 2 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 90-99K 0 1 2 0 0 1 1 0 0 0 1 0 1 0 0 2 0 0 0 0 0 0 0 0 0 0
## 100-109K 0 1 4 0 0 1 2 1 0 0 0 0 1 2 2 0 0 0 0 0 1 0 0 0 0 0
## 110-119K 1 2 2 4 0 4 2 0 0 1 0 1 0 3 0 1 2 0 0 2 1 1 2 1 0 0
## 120-129K 1 3 3 2 2 3 2 1 0 1 2 4 3 3 2 0 2 2 1 1 0 0 2 0 1 0
## 130-139K 1 1 1 2 1 4 1 3 0 0 0 1 2 4 1 2 3 3 4 0 2 1 1 0 2 0
## 140-149K 0 2 2 2 0 2 0 6 0 0 2 7 8 5 2 4 6 3 4 7 3 2 1 2 2 1
## >=150K 0 5 5 3 6 5 2 3 2 3 3 3 7 5 4 2 4 8 5 2 4 3 2 1 3 2
##
## 26 27 28 29 30 31 32 33 34 35 36 40 41 42 43 45 46 47
## 80-89K 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 90-99K 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 100-109K 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0
## 110-119K 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0
## 120-129K 1 0 1 5 2 1 1 1 1 0 2 0 0 1 0 0 0 0
## 130-139K 2 1 1 1 2 0 2 1 0 0 0 0 1 1 0 0 0 0
## 140-149K 0 1 1 7 8 7 3 2 2 2 2 1 0 0 0 0 0 0
## >=150K 2 3 1 1 5 7 1 2 5 0 0 2 1 0 1 1 1 1
chisq.test(sal_years)
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
##
## Pearson's Chi-squared test
##
## data: sal_years
## X-squared = 282.67, df = 301, p-value = 0.769
# Are salary and age related?
# Null Hypothesis: salary and age are independent
# Rows are salary bands, columns are ages. The earlier version tabulated age
# against itself, which never involved salary at all.
sal_age <- table(personnel1$salary_range, personnel1$age)
sal_age
# The table has one column per distinct age, so it is sparse and the
# chi-squared approximation warning is still expected.
chisq.test(sal_age)
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
# Are salary and ethnicity related?
# Null Hypothesis: salary and ethnicity are independent
sal_ethnicity <- table(personnel1$salary_range, personnel1$ethnic_origin)
sal_ethnicity
##
## American Indian or Alaska Native Asian Black or African American
## 80-89K 0 0 1
## 90-99K 0 0 3
## 100-109K 0 0 5
## 110-119K 0 0 8
## 120-129K 0 5 12
## 130-139K 1 0 8
## 140-149K 0 12 16
## >=150K 0 6 25
##
## Hispanic or Latino Two or More Races Unreported White
## 80-89K 0 0 0 3
## 90-99K 0 0 0 6
## 100-109K 2 0 2 8
## 110-119K 3 0 2 19
## 120-129K 7 0 2 31
## 130-139K 3 0 4 36
## 140-149K 3 1 2 75
## >=150K 6 0 5 84
chisq.test(sal_ethnicity)
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
##
## Pearson's Chi-squared test
##
## data: sal_ethnicity
## X-squared = 43.988, df = 42, p-value = 0.3874
# Stacked bar chart: head-count per salary band, coloured by ethnicity.
# geom_bar() counts rows itself, so the histogram-only `binwidth` argument
# (which ggplot2 ignored with a warning) is gone. A single fill scale now
# carries both the palette and the legend title; previously a second fill
# scale replaced the first and dropped the "Ethnicity" title.
plot1 <- personnel1 %>%
  ggplot(aes(x = salary_range, fill = ethnic_origin)) +
  geom_bar(color = "white") +
  scale_fill_brewer(name = "Ethnicity", palette = "Set1") +
  labs(
    x = "Salary (in dollars)", y = "Frequency",
    title = "MoCo Employees Salary Distribution by Ethnicity"
  ) +
  theme_minimal()
plot1
personnel1$generation[personnel1$generation == "Millennial Generation"] <- "Millennial"
personnel1$generation[personnel1$generation == "Traditionalist/Silent Generation"] <- "Silent"
# Stacked bar chart: head-count per salary band, coloured by grade.
# geom_bar() replaces geom_histogram(stat = "count", binwidth = 5), whose
# binwidth was ignored with a warning. The legend is titled "Grade"; the
# earlier discrete scale was titled "Ethnicity" and was then replaced by the
# brewer scale, so neither title nor labels ever reached the plot.
plot1 <- personnel1 %>%
  ggplot(aes(x = salary_range, fill = grade)) +
  geom_bar(color = "white") +
  scale_fill_brewer(name = "Grade", palette = "Set1") +
  labs(
    x = "Salary (in dollars)", y = "Frequency",
    title = "MoCo Employees Salary Distribution by Grade"
  ) +
  theme_minimal()
plot1
# Stacked bar chart of head-count per salary band, split by gender.
personnel1 %>%
  ggplot(aes(x = salary_range, fill = gender)) +
  geom_bar() +
  labs(
    x = "Salary (in dollars)",
    title = "MoCo Employees Salary Distribution by Gender"
  ) +
  scale_fill_brewer(palette = "Set1") +
  theme_minimal()
{r fig.width = 7, fig.height = 4}
#library(rgdal)
#table_gender_salary <- personnel1 %>%
#dplyr::count(grade, salary_range)
#ggplot(data = table_gender_salary) +
# geom_tile(mapping = aes(x = grade,
# y = salary_range, fill = n)) +
# scale_fill_gradientn(colors = brewer.pal(5, "RdYlGn")) +
#labs(x = "Grade", y = "Salary (in dollars)",
# title = "MoCo Employees Salary Distribution by Grade") +
# theme_minimal()
# facet_grid(.~ grade)
Is there a relationship between gender and salary range?
This is going to be a Chi-Square test. We will test using α=0.05. Ho: gender and salary range are independent. Ha: gender and salary range are dependent.
# Contingency table of gender (rows) by salary band (columns), followed by the
# chi-squared test of independence described above. The earlier version
# cross-tabulated grade with itself, so the test said nothing about gender or
# salary.
# NOTE: the test output and residuals printed below predate this fix; re-run
# the report to refresh them.
gender_salary <- table(personnel1$gender, personnel1$salary_range)
result5 <- chisq.test(gender_salary)
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
result5
##
## Pearson's Chi-squared test
##
## data: gender_salary
## X-squared = 814, df = 4, p-value < 2.2e-16
P-value: 0.4017 > 0.05 = α.
Conclusion: Fail to reject Ho.
result5$residuals
##
## M1 M2 M3
## M1 18.786333 -2.885191 -4.213002
## M2 -2.885191 14.176494 -8.758008
## M3 -4.213002 -8.758008 7.385656
corrplot(result5$residuals, is.corr = FALSE)