library(readr)
library(tidyverse)

production <- read_csv("~/eda_r/human_resource/production_staff.csv")
core <- read_csv("~/eda_r/human_resource/core_dataset.csv")
HR <- read_csv("~/eda_r/human_resource/HRDataset_v9.csv")
core <- core[1:nrow(core)-1,]

Data Description

Data: Human Resource: Core dataset
Observations: 302
Objective: Pay rate

Data cleaning

Missing values

sapply(core, function(x) sum(is.na(x))) %>%
  sort(decreasing = TRUE) %>% 
  knitr::kable(format = 'html')

	x
Date of Termination	198
Employee Name	0
Employee Number	0
State	0
Zip	0
DOB	0
Age	0
Sex	0
MaritalDesc	0
CitizenDesc	0
Hispanic/Latino	0
RaceDesc	0
Date of Hire	0
Reason For Term	0
Employment Status	0
Department	0
Position	0
Pay Rate	0
Manager Name	0
Employee Source	0
Performance Score	0

Text issues

male -> Male
Software engineering: extra space

Age

core %>% 
  group_by(Age) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = Age, y = count, fill = factor(Age))) + geom_bar(stat = 'identity') + 
  ggtitle('Distribution of Ages among Employees') + theme(legend.position = 'none')

Gender

core$Sex <- gsub('\\<male\\>', 'Male', core$Sex)

core %>% 
  group_by(Sex) %>% 
  summarise(wage = mean(`Pay Rate`)) %>% 
  ggplot(aes(x = Sex, y = wage, fill = Sex)) + geom_bar(stat = 'identity') + 
  theme(legend.position = 'none')

core %>% 
  group_by(Sex) %>% 
  summarise(wage = mean(`Pay Rate`)) %>% 
  knitr::kable(format = 'html')

Sex	wage
Female	29.11678
Male	32.90528

Race

core %>% 
  group_by(RaceDesc) %>% 
  summarise(wage = mean(`Pay Rate`)) %>% 
  ggplot(aes(x = RaceDesc, y = wage, fill = RaceDesc)) + geom_bar(stat = 'identity') + 
  theme(legend.position = 'none') + xlab('Race')

core %>% 
  group_by(RaceDesc) %>% 
  summarise(wage = mean(`Pay Rate`)) %>% 
  knitr::kable(format = 'html')

RaceDesc	wage
American Indian or Alaska Native	30.37500
Asian	29.44065
Black or African American	34.62463
Hispanic	39.75000
Two or more races	31.26444
White	29.57705

Group by sex, race

Females and African-Americans have higher pay rate
Males and Hispanics have higher pay rate

Top three earning races by gender:

core %>% 
  group_by(Sex, RaceDesc) %>% 
  summarise(av.wage = mean(`Pay Rate`), av.age = mean(`Age`)) %>% 
  top_n(3, wt = av.wage) %>% 
  knitr::kable(format = 'html')

Sex	RaceDesc	av.wage	av.age
Female	Asian	26.85550	39.80000
Female	Black or African American	34.38367	39.96667
Female	White	28.65182	38.20000
Male	American Indian or Alaska Native	36.00000	38.50000
Male	Hispanic	47.33333	36.00000
Male	Two or more races	40.46571	42.42857

Department

core %>% 
  group_by(Department) %>% 
  summarise(wage = mean(`Pay Rate`)) %>% 
  ggplot(aes(x = Department, y = wage, fill = Department)) + geom_bar(stat = 'identity') + 
  theme(legend.position = 'none')

Average wage by department:

core %>% 
  group_by(Department) %>% 
  summarise(wage = mean(`Pay Rate`)) %>% 
  knitr::kable(format = 'html')

Department	wage
Admin Offices	31.89600
Executive Office	80.00000
IT/IS	44.79220
Production	23.08630
Sales	55.52419
Software Engineering	48.66500

Group by department, position

Top two earning positions (on average) in each department:

core %>% 
  group_by(Department, Position) %>% 
  summarise(wage = mean(`Pay Rate`)) %>% 
  top_n(n=2, wt = wage) %>%
  knitr::kable(format = 'html')

Department	Position	wage
Admin Offices	Shared Services Manager	55.00000
Admin Offices	Sr. Accountant	34.95000
Executive Office	President & CEO	80.00000
IT/IS	CIO	65.00000
IT/IS	IT Director	65.00000
Production	Director of Operations	60.00000
Production	Production Manager	49.67857
Sales	Director of Sales	60.00000
Sales	Sales Manager	56.75000
Software Engineering	Software Engineer	51.07222
Software Engineering	Software Engineering Manager	27.00000

Top-earning persons for each department: - Admin Offices: Shared Services Manager - Executive Office: President & CEO - IT/IS: CIO, IT Director - Production: Director of Operations - Sales: Director of Sales - Software Engineering: Software Engineer

Recruitment tools

Sales seems to rely heavily on PPC recruitment - but perhaps recruits are keen to show that they are familiar with marketing tools
Production has the largest proportion in practically every employee source - but note that Production is also the largest department.

core %>% 
  group_by(Department) %>% 
  summarise(count = n()) %>% 
  ggplot(aes(Department, count, fill = Department)) + geom_bar(stat = 'identity') + theme(legend.position = 'none')

core$`Employee Source` <- gsub('Pay Per Click - Google|Website Banner Ads', 
                               'Pay Per Click', core$`Employee Source`)
core$`Employee Source` <- gsub('Search Engine - Google Bing Yahoo', 
                               'Internet Search', core$`Employee Source`)
core$`Employee Source` <- gsub('Monster.com|Glassdoor|Careerbuilder', 
                               'Job Sites', core$`Employee Source`)
core$`Employee Source` <- gsub('Newspager/Magazine', 'Newspaper/Magazine', 
                               core$`Employee Source`)
core$`Employee Source` <- gsub('Company Intranet - Partner|Information Session|On-line Web application', 'Other', core$`Employee Source`)

table(core$`Employee Source`, core$Department) %>%
  as.data.frame %>%
  ggplot(aes(x = Var1, y = Freq, fill = Var2)) + geom_bar(stat = 'identity') +
  coord_flip() + labs(x = 'Employee Source', fill = 'Department')

Distribution of pay rates

We group people according to their level in the organisation: executives, seniors/managers and others
This approximately matches the distribution.

core$NewPosition <- NA
core[grepl(pattern = 'Director|CIO|CEO', x = core$Position),'NewPosition'] <- 'Executives'
core[grepl(pattern = 'Sr.|Manager', x = core$Position),'NewPosition'] <- 'Senior/Manager'

core[is.na(core$NewPosition), 'NewPosition'] <- 'Others'

ggplot(core, aes(`Pay Rate`, fill = NewPosition)) + geom_histogram() + labs(fill = 'Position group')

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Is there a statistically significant difference in the average pay of males and females?

Yes. The difference in means is 3.8 and is significant at the 5% level.

lmtest::coeftest(lm(`Pay Rate`~factor(Sex), data = core))

## 
## t test of coefficients:
## 
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      29.1168     1.1467 25.3925  < 2e-16 ***
## factor(Sex)Male   3.7885     1.7653  2.1461  0.03267 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Is there a difference in the distribution of pay between males and females?

Doesn’t seem to be much of a difference in the distribution for junior positions.
Almost all execs are females, but there’s a concentration of females among junior-level managers.

library(ggridges)
ggplot(core, aes(x = `Pay Rate`, y = Sex, fill = NewPosition)) + 
  geom_density_ridges() + labs('Position group')

## Picking joint bandwidth of 1.92

Performance ratings

Is there gender disparity in how performance ratings are assigned?

ggplot(as.data.frame(table(core$Sex, core$`Performance Score`)), aes(x=Var1, y = Freq, fill = Var2)) + geom_bar(stat="identity") + labs(x = 'Sex', fill = 'Performance score')

Time to termination within Production

Looks like people who leave Production tend to leave very soon!

production$`Date of Hire` <- as.Date(production$`Date of Hire`, '%m/%d/%Y')
production$`TermDate` <- as.Date(production$`TermDate`, '%m/%d/%Y')
production$TimeDiff <- production$TermDate - production$`Date of Hire` 
ggplot(production, aes(x = TimeDiff, fill= Position)) + geom_histogram() + labs(x = 'Time difference (days)')

## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 173 rows containing non-finite values (stat_bin).

Why do they leave?

production[!grepl(pattern = 'N/A',production$`Reason for Term`) & !is.na(production$`Reason for Term`),] %>% 
  group_by(`Reason for Term`) %>%
  summarise(Freq = n()) %>% 
  ggplot(aes(x = reorder(`Reason for Term`, Freq), y = Freq, fill = `Reason for Term`)) + 
  geom_bar(stat = 'identity') + coord_flip() + labs(x = 'Reason for Termination')

Is it the pay rate?

Most of production consists of junior positions.

core$prod = ifelse(core$Department == 'Production', 'Production', 'Not production')
ggplot(core, aes(x = `Pay Rate`, y = factor(prod), fill = NewPosition)) +
  ggridges::geom_density_ridges() + labs(fill = 'Position group') + 
  theme(axis.title.y = element_blank())

## Picking joint bandwidth of 2.14

Exploring HR Data

Jia Xin Seng, Philip Khor

18 April 2018