library(readr)
library(tidyverse)
## -- Attaching packages --------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1     v purrr   0.2.4
## v tibble  1.4.2     v dplyr   0.7.4
## v tidyr   0.8.0     v stringr 1.3.0
## v ggplot2 2.2.1     v forcats 0.3.0
## -- Conflicts ------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Production staff records; the column spec printed below shows the date
# columns arrive as character.
production <- read_csv(file = "~/eda_r/human_resource/production_staff.csv")
## Parsed with column specification:
## cols(
##   `Employee Name` = col_character(),
##   `Race Desc` = col_character(),
##   `Date of Hire` = col_character(),
##   TermDate = col_character(),
##   `Reason for Term` = col_character(),
##   `Employment Status` = col_character(),
##   Department = col_character(),
##   Position = col_character(),
##   Pay = col_character(),
##   `Manager Name` = col_character(),
##   `Performance Score` = col_character(),
##   `Abutments/Hour Wk 1` = col_integer(),
##   `Abutments/Hour Wk 2` = col_integer(),
##   `Daily Error Rate` = col_integer(),
##   `90-day Complaints` = col_integer()
## )
# Core employee dataset: the main table used for the plots and model below.
core <- read_csv(file = "~/eda_r/human_resource/core_dataset.csv")
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   Age = col_integer(),
##   `Pay Rate` = col_double()
## )
## See spec(...) for full column specifications.
# Extended HR dataset (version 9 of the file).
HR <- read_csv(file = "~/eda_r/human_resource/HRDataset_v9.csv")
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   `Employee Number` = col_integer(),
##   MarriedID = col_integer(),
##   MaritalStatusID = col_integer(),
##   GenderID = col_integer(),
##   EmpStatus_ID = col_integer(),
##   DeptID = col_integer(),
##   Perf_ScoreID = col_integer(),
##   Age = col_integer(),
##   `Pay Rate` = col_double(),
##   `Days Employed` = col_integer()
## )
## See spec(...) for full column specifications.
# Normalise the lower-case spelling "male" to "Male". The \< and \> word
# boundaries keep the "male" inside "Female" from being rewritten.
core$Sex <- gsub("\\<male\\>", "Male", core$Sex)

# Stacked bar chart: head-count per performance score, split by sex.
# as.data.frame() on a two-way table yields the columns Var1, Var2 and Freq.
table(core$`Performance Score`, core$Sex) %>%
  as.data.frame() %>%
  ggplot(aes(x = Var1, y = Freq, fill = Var2)) +
  geom_bar(stat = "identity")

# Breakdown of sample

# Horizontal stacked bars: head-count per employment status, split by sex.
table(core$`Employment Status`, core$Sex) %>%
  as.data.frame() %>%
  ggplot(aes(x = Var1, y = Freq, fill = Var2)) +
  geom_bar(stat = "identity") +
  coord_flip()

# Employee Source

# Collapse the raw recruiting-source labels into broader categories.
# The replacements run in order and act on substrings of each value.
core$`Employee Source` <- gsub("Pay Per Click - Google|Website Banner Ads",
                               "Pay Per Click", core$`Employee Source`)
core$`Employee Source` <- gsub("Search Engine - Google Bing Yahoo",
                               "Internet Search", core$`Employee Source`)
# The dot is escaped so it matches a literal "." rather than any character.
core$`Employee Source` <- gsub("Monster\\.com|Glassdoor|Careerbuilder",
                               "Job Sites", core$`Employee Source`)
# Fix the "Newspager" misspelling present in the raw data.
core$`Employee Source` <- gsub("Newspager/Magazine", "Newspaper/Magazine",
                               core$`Employee Source`)
core$`Employee Source` <- gsub(
  "Company Intranet - Partner|Information Session|On-line Web application",
  "Other", core$`Employee Source`
)

# Horizontal stacked bars: head-count per recruiting source, split by
# department.
table(core$`Employee Source`, core$Department) %>%
  as.data.frame() %>%
  ggplot(aes(x = Var1, y = Freq, fill = Var2)) +
  geom_bar(stat = "identity") +
  coord_flip()

# Pay rate

# Band job titles into three groups. Every row starts as "Others" and is then
# overridden by the pattern matches; a title matching both patterns ends up
# as "Senior/Manager" because that assignment runs last.
core$NewPosition <- "Others"
core$NewPosition[grepl("Director|CIO|CEO", core$Position)] <- "Executives"
# The dot is escaped so that only the literal abbreviation "Sr." matches,
# not "Sr" followed by an arbitrary character.
core$NewPosition[grepl("Sr\\.|Manager", core$Position)] <- "Senior/Manager"

# Distribution of pay rate, filled by position band.
ggplot(core, aes(`Pay Rate`, fill = NewPosition)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).

# Parse the month/day/year strings into Date objects.
production$`Date of Hire` <- as.Date(production$`Date of Hire`,
                                     format = "%m/%d/%Y")
production$TermDate <- as.Date(production$TermDate, format = "%m/%d/%Y")

# Tenure until termination as a difftime; it is NA where TermDate is missing.
production$TimeToTerminate <- production$TermDate - production$`Date of Hire`

# Histogram of tenure in days, filled by position. The column is referenced
# by bare name inside aes() so it is looked up in the plot's data, and the
# difftime is converted with explicit units instead of unclass().
ggplot(
  production,
  aes(x = as.numeric(TimeToTerminate, units = "days"), fill = Position)
) +
  geom_histogram() +
  labs(x = "Days from hire to termination")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 173 rows containing non-finite values (stat_bin).

# Ridge plots: distribution of pay by gender

library(ggridges)
# Ridgeline densities of pay rate: one ridge per sex, filled by position band.
core %>%
  ggplot(aes(x = `Pay Rate`, y = Sex, fill = NewPosition)) +
  geom_density_ridges()
## Picking joint bandwidth of 1.92
## Warning: Removed 1 rows containing non-finite values (stat_density_ridges).

# On average, how much less/more do women earn in this company?

# Regress pay rate on sex alone. The factor(Sex)Male coefficient is the
# unadjusted difference in mean pay rate relative to the reference level.
lm(`Pay Rate` ~ factor(Sex), data = core) %>%
  summary()
## 
## Call:
## lm(formula = `Pay Rate` ~ factor(Sex), data = core)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.905 -11.117  -6.117  12.095  50.883 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       29.117      1.147  25.393   <2e-16 ***
## factor(Sex)Male    3.788      1.765   2.146   0.0327 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.13 on 299 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.01517,    Adjusted R-squared:  0.01188 
## F-statistic: 4.606 on 1 and 299 DF,  p-value: 0.03267
library(treemapify)
# Treemap of positions: tile area is head-count and fill is mean pay rate,
# computed per Department/Position pair.
core %>%
  group_by(Department, Position) %>%
  summarise(count = n(), pay = mean(`Pay Rate`, na.rm = TRUE)) %>%
  # summarise() drops only the last grouping level; remove the rest so the
  # plot receives an ungrouped table.
  ungroup() %>%
  ggplot(aes(area = count, fill = pay, label = Position)) +
  geom_treemap() +
  # Without a text layer the `label` aesthetic is never drawn.
  geom_treemap_text()