library(readr)
library(tidyverse)
## -- Attaching packages --------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.4.2 v dplyr 0.7.4
## v tidyr 0.8.0 v stringr 1.3.0
## v ggplot2 2.2.1 v forcats 0.3.0
## -- Conflicts ------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Load the production-staff extract; readr guesses the column types.
production <- readr::read_csv(
  file = "~/eda_r/human_resource/production_staff.csv"
)
## Parsed with column specification:
## cols(
## `Employee Name` = col_character(),
## `Race Desc` = col_character(),
## `Date of Hire` = col_character(),
## TermDate = col_character(),
## `Reason for Term` = col_character(),
## `Employment Status` = col_character(),
## Department = col_character(),
## Position = col_character(),
## Pay = col_character(),
## `Manager Name` = col_character(),
## `Performance Score` = col_character(),
## `Abutments/Hour Wk 1` = col_integer(),
## `Abutments/Hour Wk 2` = col_integer(),
## `Daily Error Rate` = col_integer(),
## `90-day Complaints` = col_integer()
## )
# Load the core HR dataset; readr guesses the column types.
core <- readr::read_csv(
  file = "~/eda_r/human_resource/core_dataset.csv"
)
## Parsed with column specification:
## cols(
## .default = col_character(),
## Age = col_integer(),
## `Pay Rate` = col_double()
## )
## See spec(...) for full column specifications.
# Load the v9 HR dataset; readr guesses the column types.
HR <- readr::read_csv(
  file = "~/eda_r/human_resource/HRDataset_v9.csv"
)
## Parsed with column specification:
## cols(
## .default = col_character(),
## `Employee Number` = col_integer(),
## MarriedID = col_integer(),
## MaritalStatusID = col_integer(),
## GenderID = col_integer(),
## EmpStatus_ID = col_integer(),
## DeptID = col_integer(),
## Perf_ScoreID = col_integer(),
## Age = col_integer(),
## `Pay Rate` = col_double(),
## `Days Employed` = col_integer()
## )
## See spec(...) for full column specifications.
# Rewrite the standalone lowercase word "male" as "Male" (the \\< \\> word
# boundaries leave "Female" untouched), then draw performance-score counts
# as bars filled by sex.
core$Sex <- gsub(pattern = "\\<male\\>", replacement = "Male", x = core$Sex)
table(core$`Performance Score`, core$Sex) %>%
  as.data.frame() %>%
  ggplot(mapping = aes(x = Var1, y = Freq, fill = Var2)) +
  geom_bar(stat = "identity")
# Breakdown of sample: counts per employment status, filled by sex, drawn as
# horizontal bars.
ggplot(
  data = as.data.frame(table(core$`Employment Status`, core$Sex)),
  mapping = aes(x = Var1, y = Freq, fill = Var2)
) +
  geom_bar(stat = "identity") +
  coord_flip()
# Employee Source ----
# Collapse the detailed recruiting-source labels into broader categories and
# correct the "Newspager" misspelling, then plot counts per source filled by
# department as horizontal bars.
core$`Employee Source` <- gsub(
  "Pay Per Click - Google|Website Banner Ads",
  "Pay Per Click",
  core$`Employee Source`
)
core$`Employee Source` <- gsub(
  "Search Engine - Google Bing Yahoo",
  "Internet Search",
  core$`Employee Source`
)
# "." is a regex metacharacter: escape it so only a literal "Monster.com"
# is matched rather than "Monster" + any character + "com".
core$`Employee Source` <- gsub(
  "Monster\\.com|Glassdoor|Careerbuilder",
  "Job Sites",
  core$`Employee Source`
)
core$`Employee Source` <- gsub(
  "Newspager/Magazine",
  "Newspaper/Magazine",
  core$`Employee Source`
)
core$`Employee Source` <- gsub(
  "Company Intranet - Partner|Information Session|On-line Web application",
  "Other",
  core$`Employee Source`
)
table(core$`Employee Source`, core$Department) %>%
  as.data.frame() %>%
  ggplot(aes(x = Var1, y = Freq, fill = Var2)) +
  geom_bar(stat = "identity") +
  coord_flip()
# Bucket job titles into three coarse levels for plotting.
# Everything starts as "Others"; titles matching the executive pattern are
# relabelled, and the senior/manager pattern is applied last, so it wins when
# a title matches both patterns.
core$NewPosition <- "Others"
core$NewPosition[grepl("Director|CIO|CEO", core$Position)] <- "Executives"
# The dot in "Sr." is escaped: unescaped it is a wildcard and would match
# "Sr" followed by any character, not just the abbreviation "Sr.".
core$NewPosition[grepl("Sr\\.|Manager", core$Position)] <- "Senior/Manager"
# Pay-rate distribution, filled by the derived position level.
ggplot(core, aes(x = `Pay Rate`, fill = NewPosition)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).
# Parse the hire and termination dates from month/day/year text and derive
# the time between them. Subtracting two Date vectors yields a difftime in
# days.
production$`Date of Hire` <- as.Date(
  production$`Date of Hire`,
  format = "%m/%d/%Y"
)
production$TermDate <- as.Date(production$TermDate, format = "%m/%d/%Y")
production$TimeToTerminate <- production$TermDate - production$`Date of Hire`
# Refer to the column by bare name inside aes() rather than
# `production$...`, so the mapping is evaluated against the layer data.
# as.numeric(units = "days") converts the difftime to a plain number.
# Rows where the difference is NA are dropped by geom_histogram() with a
# warning.
ggplot(
  data = production,
  mapping = aes(
    x = as.numeric(TimeToTerminate, units = "days"),
    fill = Position
  )
) +
  geom_histogram() +
  labs(x = "Days from hire to termination")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 173 rows containing non-finite values (stat_bin).
# Ridge plots: pay-rate densities with one ridge row per sex, filled by the
# derived position level.
library(ggridges)
core %>%
  ggplot(mapping = aes(x = `Pay Rate`, y = Sex, fill = NewPosition)) +
  geom_density_ridges()
## Picking joint bandwidth of 1.92
## Warning: Removed 1 rows containing non-finite values (stat_density_ridges).
# On average, how much less/more do women earn in this company?
# Regress pay rate on sex (as a factor) and print the model summary.
lm(formula = `Pay Rate` ~ factor(Sex), data = core) %>%
  summary()
##
## Call:
## lm(formula = `Pay Rate` ~ factor(Sex), data = core)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.905 -11.117 -6.117 12.095 50.883
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.117 1.147 25.393 <2e-16 ***
## factor(Sex)Male 3.788 1.765 2.146 0.0327 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.13 on 299 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.01517, Adjusted R-squared: 0.01188
## F-statistic: 4.606 on 1 and 299 DF, p-value: 0.03267
library(treemapify)
# Treemap with one tile per Department/Position pair: tile area is the
# headcount and fill is the mean pay rate (missing pay rates ignored).
core %>%
  group_by(Department, Position) %>%
  summarise(count = n(), pay = mean(`Pay Rate`, na.rm = TRUE)) %>%
  # summarise() only peels off the last grouping level; drop the rest so no
  # grouping leaks into later steps.
  ungroup() %>%
  ggplot(aes(area = count, fill = pay, label = Position)) +
  geom_treemap() +
  # The label aesthetic is only rendered by a text geom; without this layer
  # the tiles are drawn unlabelled.
  geom_treemap_text(colour = "white", place = "centre")