library(readr)
library(tidyverse)

production <- read_csv("~/eda_r/human_resource/production_staff.csv")
core <- read_csv("~/eda_r/human_resource/core_dataset.csv")
HR <- read_csv("~/eda_r/human_resource/HRDataset_v9.csv")
core <- core[1:nrow(core)-1,]

Data Description

Data cleaning

Missing values

sapply(core, function(x) sum(is.na(x))) %>%
  sort(decreasing = TRUE) %>% 
  knitr::kable(format = 'html') 
x
Date of Termination 198
Employee Name 0
Employee Number 0
State 0
Zip 0
DOB 0
Age 0
Sex 0
MaritalDesc 0
CitizenDesc 0
Hispanic/Latino 0
RaceDesc 0
Date of Hire 0
Reason For Term 0
Employment Status 0
Department 0
Position 0
Pay Rate 0
Manager Name 0
Employee Source 0
Performance Score 0

Text issues

  • male -> Male
  • Software engineering: extra space

Age

core %>% 
  group_by(Age) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = Age, y = count, fill = factor(Age))) + geom_bar(stat = 'identity') + 
  ggtitle('Distribution of Ages among Employees') + theme(legend.position = 'none')

Gender

core$Sex <- gsub('\\<male\\>', 'Male', core$Sex)

core %>% 
  group_by(Sex) %>% 
  summarise(wage = mean(`Pay Rate`)) %>% 
  ggplot(aes(x = Sex, y = wage, fill = Sex)) + geom_bar(stat = 'identity') + 
  theme(legend.position = 'none')

core %>% 
  group_by(Sex) %>% 
  summarise(wage = mean(`Pay Rate`)) %>% 
  knitr::kable(format = 'html')
Sex wage
Female 29.11678
Male 32.90528

Race

core %>% 
  group_by(RaceDesc) %>% 
  summarise(wage = mean(`Pay Rate`)) %>% 
  ggplot(aes(x = RaceDesc, y = wage, fill = RaceDesc)) + geom_bar(stat = 'identity') + 
  theme(legend.position = 'none') + xlab('Race')

core %>% 
  group_by(RaceDesc) %>% 
  summarise(wage = mean(`Pay Rate`)) %>% 
  knitr::kable(format = 'html')
RaceDesc wage
American Indian or Alaska Native 30.37500
Asian 29.44065
Black or African American 34.62463
Hispanic 39.75000
Two or more races 31.26444
White 29.57705

Group by sex, race

Top three earning races by gender:

core %>% 
  group_by(Sex, RaceDesc) %>% 
  summarise(av.wage = mean(`Pay Rate`), av.age = mean(`Age`)) %>% 
  top_n(3, wt = av.wage) %>% 
  knitr::kable(format = 'html')
Sex RaceDesc av.wage av.age
Female Asian 26.85550 39.80000
Female Black or African American 34.38367 39.96667
Female White 28.65182 38.20000
Male American Indian or Alaska Native 36.00000 38.50000
Male Hispanic 47.33333 36.00000
Male Two or more races 40.46571 42.42857

Department

core %>% 
  group_by(Department) %>% 
  summarise(wage = mean(`Pay Rate`)) %>% 
  ggplot(aes(x = Department, y = wage, fill = Department)) + geom_bar(stat = 'identity') + 
  theme(legend.position = 'none')

Average wage by department:

core %>% 
  group_by(Department) %>% 
  summarise(wage = mean(`Pay Rate`)) %>% 
  knitr::kable(format = 'html')
Department wage
Admin Offices 31.89600
Executive Office 80.00000
IT/IS 44.79220
Production 23.08630
Sales 55.52419
Software Engineering 48.66500

Group by department, position

Top two earning positions (on average) in each department:

core %>% 
  group_by(Department, Position) %>% 
  summarise(wage = mean(`Pay Rate`)) %>% 
  top_n(n=2, wt = wage) %>%
  knitr::kable(format = 'html')
Department Position wage
Admin Offices Shared Services Manager 55.00000
Admin Offices Sr. Accountant 34.95000
Executive Office President & CEO 80.00000
IT/IS CIO 65.00000
IT/IS IT Director 65.00000
Production Director of Operations 60.00000
Production Production Manager 49.67857
Sales Director of Sales 60.00000
Sales Sales Manager 56.75000
Software Engineering Software Engineer 51.07222
Software Engineering Software Engineering Manager 27.00000

Top-earning persons for each department: - Admin Offices: Shared Services Manager - Executive Office: President & CEO - IT/IS: CIO, IT Director - Production: Director of Operations - Sales: Director of Sales - Software Engineering: Software Engineer

Recruitment tools

  • Sales seems to rely heavily on PPC recruitment - but perhaps recruits are keen to show that they are familiar with marketing tools
  • Production has the largest proportion in practically every employee source - but note that Production is also the largest department.
core %>% 
  group_by(Department) %>% 
  summarise(count = n()) %>% 
  ggplot(aes(Department, count, fill = Department)) + geom_bar(stat = 'identity') + theme(legend.position = 'none')

core$`Employee Source` <- gsub('Pay Per Click - Google|Website Banner Ads', 
                               'Pay Per Click', core$`Employee Source`)
core$`Employee Source` <- gsub('Search Engine - Google Bing Yahoo', 
                               'Internet Search', core$`Employee Source`)
core$`Employee Source` <- gsub('Monster.com|Glassdoor|Careerbuilder', 
                               'Job Sites', core$`Employee Source`)
core$`Employee Source` <- gsub('Newspager/Magazine', 'Newspaper/Magazine', 
                               core$`Employee Source`)
core$`Employee Source` <- gsub('Company Intranet - Partner|Information Session|On-line Web application', 'Other', core$`Employee Source`)

table(core$`Employee Source`, core$Department) %>%
  as.data.frame %>%
  ggplot(aes(x = Var1, y = Freq, fill = Var2)) + geom_bar(stat = 'identity') +
  coord_flip() + labs(x = 'Employee Source', fill = 'Department')

Distribution of pay rates

  • We group people according to their level in the organisation: executives, seniors/managers and others
  • This approximately matches the distribution.
core$NewPosition <- NA
core[grepl(pattern = 'Director|CIO|CEO', x = core$Position),'NewPosition'] <- 'Executives'
core[grepl(pattern = 'Sr.|Manager', x = core$Position),'NewPosition'] <- 'Senior/Manager'

core[is.na(core$NewPosition), 'NewPosition'] <- 'Others'

ggplot(core, aes(`Pay Rate`, fill = NewPosition)) + geom_histogram() + labs(fill = 'Position group')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Is there a statistically significant difference in the average pay of males and females?

  • Yes. The difference in means is 3.8 and is significant at the 5% level.
lmtest::coeftest(lm(`Pay Rate`~factor(Sex), data = core))
## 
## t test of coefficients:
## 
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      29.1168     1.1467 25.3925  < 2e-16 ***
## factor(Sex)Male   3.7885     1.7653  2.1461  0.03267 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Is there a difference in the distribution of pay between males and females?

  • Doesn’t seem to be much of a difference in the distribution for junior positions.
  • Almost all execs are females, but there’s a concentration of females among junior-level managers.
library(ggridges)
ggplot(core, aes(x = `Pay Rate`, y = Sex, fill = NewPosition)) + 
  geom_density_ridges() + labs('Position group')
## Picking joint bandwidth of 1.92

Performance ratings

Is there gender disparity in how performance ratings are assigned?

ggplot(as.data.frame(table(core$Sex, core$`Performance Score`)), aes(x=Var1, y = Freq, fill = Var2)) + geom_bar(stat="identity") + labs(x = 'Sex', fill = 'Performance score')

Time to termination within Production

Looks like people who leave Production tend to leave very soon!

production$`Date of Hire` <- as.Date(production$`Date of Hire`, '%m/%d/%Y')
production$`TermDate` <- as.Date(production$`TermDate`, '%m/%d/%Y')
production$TimeDiff <- production$TermDate - production$`Date of Hire` 
ggplot(production, aes(x = TimeDiff, fill= Position)) + geom_histogram() + labs(x = 'Time difference (days)')
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 173 rows containing non-finite values (stat_bin).

Why do they leave?

production[!grepl(pattern = 'N/A',production$`Reason for Term`) & !is.na(production$`Reason for Term`),] %>% 
  group_by(`Reason for Term`) %>%
  summarise(Freq = n()) %>% 
  ggplot(aes(x = reorder(`Reason for Term`, Freq), y = Freq, fill = `Reason for Term`)) + 
  geom_bar(stat = 'identity') + coord_flip() + labs(x = 'Reason for Termination')

Is it the pay rate?

Most of production consists of junior positions.

core$prod = ifelse(core$Department == 'Production', 'Production', 'Not production')
ggplot(core, aes(x = `Pay Rate`, y = factor(prod), fill = NewPosition)) +
  ggridges::geom_density_ridges() + labs(fill = 'Position group') + 
  theme(axis.title.y = element_blank())
## Picking joint bandwidth of 2.14