library(readr)
library(tidyverse)
production <- read_csv("~/eda_r/human_resource/production_staff.csv")
core <- read_csv("~/eda_r/human_resource/core_dataset.csv")
HR <- read_csv("~/eda_r/human_resource/HRDataset_v9.csv")
core <- core[1:nrow(core)-1,]
sapply(core, function(x) sum(is.na(x))) %>%
sort(decreasing = TRUE) %>%
knitr::kable(format = 'html')
| x | |
|---|---|
| Date of Termination | 198 |
| Employee Name | 0 |
| Employee Number | 0 |
| State | 0 |
| Zip | 0 |
| DOB | 0 |
| Age | 0 |
| Sex | 0 |
| MaritalDesc | 0 |
| CitizenDesc | 0 |
| Hispanic/Latino | 0 |
| RaceDesc | 0 |
| Date of Hire | 0 |
| Reason For Term | 0 |
| Employment Status | 0 |
| Department | 0 |
| Position | 0 |
| Pay Rate | 0 |
| Manager Name | 0 |
| Employee Source | 0 |
| Performance Score | 0 |
core %>%
group_by(Age) %>%
summarise(count = n()) %>%
ggplot(aes(x = Age, y = count, fill = factor(Age))) + geom_bar(stat = 'identity') +
ggtitle('Distribution of Ages among Employees') + theme(legend.position = 'none')
core$Sex <- gsub('\\<male\\>', 'Male', core$Sex)
core %>%
group_by(Sex) %>%
summarise(wage = mean(`Pay Rate`)) %>%
ggplot(aes(x = Sex, y = wage, fill = Sex)) + geom_bar(stat = 'identity') +
theme(legend.position = 'none')
core %>%
group_by(Sex) %>%
summarise(wage = mean(`Pay Rate`)) %>%
knitr::kable(format = 'html')
| Sex | wage |
|---|---|
| Female | 29.11678 |
| Male | 32.90528 |
core %>%
group_by(RaceDesc) %>%
summarise(wage = mean(`Pay Rate`)) %>%
ggplot(aes(x = RaceDesc, y = wage, fill = RaceDesc)) + geom_bar(stat = 'identity') +
theme(legend.position = 'none') + xlab('Race')
core %>%
group_by(RaceDesc) %>%
summarise(wage = mean(`Pay Rate`)) %>%
knitr::kable(format = 'html')
| RaceDesc | wage |
|---|---|
| American Indian or Alaska Native | 30.37500 |
| Asian | 29.44065 |
| Black or African American | 34.62463 |
| Hispanic | 39.75000 |
| Two or more races | 31.26444 |
| White | 29.57705 |
Top three earning races by gender:
core %>%
group_by(Sex, RaceDesc) %>%
summarise(av.wage = mean(`Pay Rate`), av.age = mean(`Age`)) %>%
top_n(3, wt = av.wage) %>%
knitr::kable(format = 'html')
| Sex | RaceDesc | av.wage | av.age |
|---|---|---|---|
| Female | Asian | 26.85550 | 39.80000 |
| Female | Black or African American | 34.38367 | 39.96667 |
| Female | White | 28.65182 | 38.20000 |
| Male | American Indian or Alaska Native | 36.00000 | 38.50000 |
| Male | Hispanic | 47.33333 | 36.00000 |
| Male | Two or more races | 40.46571 | 42.42857 |
core %>%
group_by(Department) %>%
summarise(wage = mean(`Pay Rate`)) %>%
ggplot(aes(x = Department, y = wage, fill = Department)) + geom_bar(stat = 'identity') +
theme(legend.position = 'none')
Average wage by department:
core %>%
group_by(Department) %>%
summarise(wage = mean(`Pay Rate`)) %>%
knitr::kable(format = 'html')
| Department | wage |
|---|---|
| Admin Offices | 31.89600 |
| Executive Office | 80.00000 |
| IT/IS | 44.79220 |
| Production | 23.08630 |
| Sales | 55.52419 |
| Software Engineering | 48.66500 |
Top two earning positions (on average) in each department:
core %>%
group_by(Department, Position) %>%
summarise(wage = mean(`Pay Rate`)) %>%
top_n(n=2, wt = wage) %>%
knitr::kable(format = 'html')
| Department | Position | wage |
|---|---|---|
| Admin Offices | Shared Services Manager | 55.00000 |
| Admin Offices | Sr. Accountant | 34.95000 |
| Executive Office | President & CEO | 80.00000 |
| IT/IS | CIO | 65.00000 |
| IT/IS | IT Director | 65.00000 |
| Production | Director of Operations | 60.00000 |
| Production | Production Manager | 49.67857 |
| Sales | Director of Sales | 60.00000 |
| Sales | Sales Manager | 56.75000 |
| Software Engineering | Software Engineer | 51.07222 |
| Software Engineering | Software Engineering Manager | 27.00000 |
Top-earning persons for each department: - Admin Offices: Shared Services Manager - Executive Office: President & CEO - IT/IS: CIO, IT Director - Production: Director of Operations - Sales: Director of Sales - Software Engineering: Software Engineer
core %>%
group_by(Department) %>%
summarise(count = n()) %>%
ggplot(aes(Department, count, fill = Department)) + geom_bar(stat = 'identity') + theme(legend.position = 'none')
core$`Employee Source` <- gsub('Pay Per Click - Google|Website Banner Ads',
'Pay Per Click', core$`Employee Source`)
core$`Employee Source` <- gsub('Search Engine - Google Bing Yahoo',
'Internet Search', core$`Employee Source`)
core$`Employee Source` <- gsub('Monster.com|Glassdoor|Careerbuilder',
'Job Sites', core$`Employee Source`)
core$`Employee Source` <- gsub('Newspager/Magazine', 'Newspaper/Magazine',
core$`Employee Source`)
core$`Employee Source` <- gsub('Company Intranet - Partner|Information Session|On-line Web application', 'Other', core$`Employee Source`)
table(core$`Employee Source`, core$Department) %>%
as.data.frame %>%
ggplot(aes(x = Var1, y = Freq, fill = Var2)) + geom_bar(stat = 'identity') +
coord_flip() + labs(x = 'Employee Source', fill = 'Department')
core$NewPosition <- NA
core[grepl(pattern = 'Director|CIO|CEO', x = core$Position),'NewPosition'] <- 'Executives'
core[grepl(pattern = 'Sr.|Manager', x = core$Position),'NewPosition'] <- 'Senior/Manager'
core[is.na(core$NewPosition), 'NewPosition'] <- 'Others'
ggplot(core, aes(`Pay Rate`, fill = NewPosition)) + geom_histogram() + labs(fill = 'Position group')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
lmtest::coeftest(lm(`Pay Rate`~factor(Sex), data = core))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.1168 1.1467 25.3925 < 2e-16 ***
## factor(Sex)Male 3.7885 1.7653 2.1461 0.03267 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(ggridges)
ggplot(core, aes(x = `Pay Rate`, y = Sex, fill = NewPosition)) +
geom_density_ridges() + labs('Position group')
## Picking joint bandwidth of 1.92
Is there gender disparity in how performance ratings are assigned?
ggplot(as.data.frame(table(core$Sex, core$`Performance Score`)), aes(x=Var1, y = Freq, fill = Var2)) + geom_bar(stat="identity") + labs(x = 'Sex', fill = 'Performance score')
Looks like people who leave Production tend to leave very soon!
production$`Date of Hire` <- as.Date(production$`Date of Hire`, '%m/%d/%Y')
production$`TermDate` <- as.Date(production$`TermDate`, '%m/%d/%Y')
production$TimeDiff <- production$TermDate - production$`Date of Hire`
ggplot(production, aes(x = TimeDiff, fill= Position)) + geom_histogram() + labs(x = 'Time difference (days)')
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 173 rows containing non-finite values (stat_bin).
production[!grepl(pattern = 'N/A',production$`Reason for Term`) & !is.na(production$`Reason for Term`),] %>%
group_by(`Reason for Term`) %>%
summarise(Freq = n()) %>%
ggplot(aes(x = reorder(`Reason for Term`, Freq), y = Freq, fill = `Reason for Term`)) +
geom_bar(stat = 'identity') + coord_flip() + labs(x = 'Reason for Termination')
Most of production consists of junior positions.
core$prod = ifelse(core$Department == 'Production', 'Production', 'Not production')
ggplot(core, aes(x = `Pay Rate`, y = factor(prod), fill = NewPosition)) +
ggridges::geom_density_ridges() + labs(fill = 'Position group') +
theme(axis.title.y = element_blank())
## Picking joint bandwidth of 2.14