pkgs00 <- c("devtools", "RCurl")
install.packages(pkgs00)
## Installing packages into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
pkg01 <- c(
"tidyverse", #For data science
"tidyquant", #Financial time series
"lime", #Explaining black-box mode
"glue", #Pasting text
"fs", #File system
"cowplot", #Handle multiple ggplot
"readxl", #read excel file
"writexl" #write excel file
)
install.packages(pkg01)
## Installing packages into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
install.packages("pacman")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
install.packages("h2o")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
install.packages("xlsx")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
h2o.no_progress() # turn off h2o progress bars
h2o.init() # launch h2o
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## /tmp/Rtmplzv4GN/file160664606b7/h2o_r1737463_started_from_r.out
## /tmp/Rtmplzv4GN/file1601a95d5ab/h2o_r1737463_started_from_r.err
##
##
## Starting H2O JVM and connecting: ... Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 2 seconds 412 milliseconds
## H2O cluster timezone: UTC
## H2O data parsing timezone: UTC
## H2O cluster version: 3.40.0.1
## H2O cluster version age: 1 month and 14 days
## H2O cluster name: H2O_started_from_R_r1737463_otf517
## H2O cluster total nodes: 1
## H2O cluster total memory: 0.24 GB
## H2O cluster total cores: 1
## H2O cluster allowed cores: 1
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.2.3 (2023-03-15)
library(readxl)
library(pacman)
p_load(tidyverse, tidyquant, lime, glue, cowplot, ggplot2,fs,readxl, writexl)
library(xlsx)
library(readr)
library(writexl)
data <- read_csv("telco_train.xlsx")
## Multiple files in zip: reading 'xl/worksheets/sheet1.xml'
## Rows: 1 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(readxl)
# Create data directory
fs::dir_create("data")
# Import telco_train.xlsx data
path_train <- read_excel("telco_train.xlsx")
library("magrittr")
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library("dplyr")
# Analysis of attrition by department
dept_job_role_tbl <- path_train %>%
select(EmployeeNumber, Department, JobRole,
PerformanceRating, Attrition)
dept_job_role_tbl %>% group_by(Attrition) %>%
summarise(n = n()) %>%
ungroup() %>%
mutate(pct = n /sum(n))
## # A tibble: 2 × 3
## Attrition n pct
## <chr> <int> <dbl>
## 1 No 1049 0.839
## 2 Yes 201 0.161
# department attrition
dept_job_role_tbl %>% group_by(Department, Attrition) %>%
summarise(n = n()) %>%
ungroup() %>%
#group_by(Department) %>%
mutate(pct = n /sum(n))
## `summarise()` has grouped output by 'Department'. You can override using the
## `.groups` argument.
## # A tibble: 6 × 4
## Department Attrition n pct
## <chr> <chr> <int> <dbl>
## 1 Human Resources No 37 0.0296
## 2 Human Resources Yes 12 0.0096
## 3 Research & Development No 721 0.577
## 4 Research & Development Yes 111 0.0888
## 5 Sales No 291 0.233
## 6 Sales Yes 78 0.0624
# JobRole attrition
dept_job_role_tbl %>% group_by(Department, JobRole, Attrition) %>%
summarise(n = n()) %>%
ungroup() %>%
group_by(Department, JobRole) %>%
mutate(pct = n /sum(n)) %>%
ungroup() %>%
filter(Attrition %in% c("Yes"))
## `summarise()` has grouped output by 'Department', 'JobRole'. You can override
## using the `.groups` argument.
## # A tibble: 10 × 5
## Department JobRole Attrition n pct
## <chr> <chr> <chr> <int> <dbl>
## 1 Human Resources Human Resources Yes 12 0.308
## 2 Research & Development Healthcare Representative Yes 8 0.0762
## 3 Research & Development Laboratory Technician Yes 49 0.219
## 4 Research & Development Manager Yes 2 0.0417
## 5 Research & Development Manufacturing Director Yes 7 0.0569
## 6 Research & Development Research Director Yes 2 0.0274
## 7 Research & Development Research Scientist Yes 43 0.166
## 8 Sales Manager Yes 2 0.0645
## 9 Sales Sales Executive Yes 50 0.183
## 10 Sales Sales Representative Yes 26 0.4
calculate_attrition_cost <- function(
# Employee
n = 1,
salary = 80000,
# Direct Costs
separation_cost = 500,
vacancy_cost = 10000,
acquisition_cost = 4900,
placement_cost = 3500,
# Productivity Costs
net_revenue_per_employee = 250000,
workdays_per_year = 240,
workdays_position_open = 40,
workdays_onboarding = 60,
onboarding_efficiency = 0.50
) {
# Direct Costs
direct_cost <- sum(separation_cost, vacancy_cost, acquisition_cost, placement_cost)
# Lost Productivity Costs
productivity_cost <- net_revenue_per_employee / workdays_per_year *
(workdays_position_open + workdays_onboarding * onboarding_efficiency)
# Savings of Salary & Benefits (Cost Reduction)
salary_benefit_reduction <- salary / workdays_per_year * workdays_position_open
# Estimated Turnover Per Employee
cost_per_employee <- direct_cost + productivity_cost - salary_benefit_reduction
# Total Cost of Employee Turnover
total_cost <- n * cost_per_employee
return(total_cost)
}