hw1

# Some key packages 
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Install devtools and RCurl only when they are not already installed, so that
# re-running this script does not reinstall them (and hit the network) each time.
pkgs00 <- c("devtools", "RCurl")
pkgs00_missing <- pkgs00[!(pkgs00 %in% rownames(installed.packages()))]
if (length(pkgs00_missing) > 0) {
  install.packages(pkgs00_missing)
}

## Installing packages into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)

# 
# Analysis packages used by this script. Only the ones that are not yet
# installed are fetched, so repeated runs skip the (slow) reinstall.
pkg01 <- c(
  "tidyverse", # For data science
  "tidyquant", # Financial time series
  "lime",      # Explaining black-box models
  "glue",      # Pasting text
  "fs",        # File system
  "cowplot",   # Handle multiple ggplot
  "readxl",    # Read excel file
  "writexl"    # Write excel file
)
pkg01_missing <- pkg01[!(pkg01 %in% rownames(installed.packages()))]
if (length(pkg01_missing) > 0) {
  install.packages(pkg01_missing)
}

## Installing packages into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)

# Install pacman only when it is missing; reinstalling on every run is
# unnecessary and requires network access.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)

# Install h2o only when it is missing; reinstalling on every run is
# unnecessary and requires network access.
# Alternative kept for reference: pacman::p_install(h2o)
if (!requireNamespace("h2o", quietly = TRUE)) {
  install.packages("h2o")
}

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)

library(h2o)

## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------

## 
## Attaching package: 'h2o'

## The following objects are masked from 'package:stats':
## 
##     cor, sd, var

## The following objects are masked from 'package:base':
## 
##     &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc

# h2o set-up: progress bars are disabled before the session is initialised
h2o.no_progress()  # turn off h2o progress bars
h2o.init()         # launch h2o (connects to the local H2O cluster)

##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         57 minutes 21 seconds 
##     H2O cluster timezone:       UTC 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.40.0.1 
##     H2O cluster version age:    1 month and 14 days 
##     H2O cluster name:           H2O_started_from_R_r1753226_his864 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   0.19 GB 
##     H2O cluster total cores:    1 
##     H2O cluster allowed cores:  1 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.2.3 (2023-03-15)

library(readxl)
library(pacman)
p_load(tidyverse, tidyquant, lime, glue, cowplot, ggplot2,fs,readxl, writexl) 

# Create data directory
fs::dir_create("data")

# Import the HR employee attrition data (CSV) into a tibble
# NOTE(review): the path is absolute and specific to this workspace, and the
# "data" directory created above is not used by this read -- confirm intended.
path_train <- "/cloud/project/WA_Fn-UseC_-HR-Employee-Attrition.csv"
train_raw_tbl <- read_csv(path_train)

## Rows: 1470 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (9): Attrition, BusinessTravel, Department, EducationField, Gender, Job...
## dbl (26): Age, DailyRate, DistanceFromHome, Education, EmployeeCount, Employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Inspect which classes the imported object carries
class(train_raw_tbl)

## [1] "spec_tbl_df" "tbl_df"      "tbl"         "data.frame"

# 1. Overall attrition
# Keep only the columns needed for the department / job-role breakdowns.
dept_job_role_tbl <- select(
  train_raw_tbl,
  EmployeeNumber, Department, JobRole, PerformanceRating, Attrition
)

# Headcount per Attrition value and its share of all employees.
dept_job_role_tbl %>%
  count(Attrition) %>%
  mutate(pct = n / sum(n))

## # A tibble: 2 × 3
##   Attrition     n   pct
##   <chr>     <int> <dbl>
## 1 No         1233 0.839
## 2 Yes         237 0.161

#2 department attrition: headcount per Department x Attrition combination
# NOTE(review): because the group_by(Department) step below is commented out,
# pct is each row's share of ALL employees, not the attrition rate within its
# own department. Re-enable that step if the within-department rate is wanted.
dept_job_role_tbl %>% group_by(Department, Attrition) %>%
                      summarise(n = n()) %>%
                      ungroup() %>%
                      #group_by(Department) %>%
                      mutate(pct = n /sum(n))

## `summarise()` has grouped output by 'Department'. You can override using the
## `.groups` argument.

## # A tibble: 6 × 4
##   Department             Attrition     n     pct
##   <chr>                  <chr>     <int>   <dbl>
## 1 Human Resources        No           51 0.0347 
## 2 Human Resources        Yes          12 0.00816
## 3 Research & Development No          828 0.563  
## 4 Research & Development Yes         133 0.0905 
## 5 Sales                  No          354 0.241  
## 6 Sales                  Yes          92 0.0626

# Attrition rate within each Department / JobRole pair.
# summarise() drops only the last grouping level (Attrition), so its result is
# still grouped by Department and JobRole; pct is therefore computed per pair.
dept_job_role_tbl %>%
  group_by(Department, JobRole, Attrition) %>%
  summarise(n = n()) %>%
  mutate(pct = n / sum(n)) %>%
  ungroup() %>%
  # Keep only the rows describing employees who left.
  filter(Attrition == "Yes")

## `summarise()` has grouped output by 'Department', 'JobRole'. You can override
## using the `.groups` argument.

## # A tibble: 10 × 5
##    Department             JobRole                   Attrition     n    pct
##    <chr>                  <chr>                     <chr>     <int>  <dbl>
##  1 Human Resources        Human Resources           Yes          12 0.231 
##  2 Research & Development Healthcare Representative Yes           9 0.0687
##  3 Research & Development Laboratory Technician     Yes          62 0.239 
##  4 Research & Development Manager                   Yes           3 0.0556
##  5 Research & Development Manufacturing Director    Yes          10 0.0690
##  6 Research & Development Research Director         Yes           2 0.025 
##  7 Research & Development Research Scientist        Yes          47 0.161 
##  8 Sales                  Manager                   Yes           2 0.0541
##  9 Sales                  Sales Executive           Yes          57 0.175 
## 10 Sales                  Sales Representative      Yes          33 0.398

#' Calculate the cost of employee attrition
#'
#' The cost per employee is the direct costs, plus the productivity lost while
#' the position is open and the replacement is onboarding, minus the salary
#' saved while the position is open. That figure is multiplied by `n`.
#'
#' Every argument is combined element-wise, so vectors (for example one value
#' per job role) can be supplied for any argument and yield one cost per
#' element.
#'
#' @param n Number of employees leaving.
#' @param salary Annual salary; converted to a daily rate using
#'   `workdays_per_year`.
#' @param separation_cost,vacancy_cost,acquisition_cost,placement_cost Direct
#'   costs per departing employee.
#' @param net_revenue_per_employee Annual net revenue per employee; converted
#'   to a daily rate using `workdays_per_year`.
#' @param workdays_per_year Working days in a year.
#' @param workdays_position_open Working days the position stays unfilled.
#' @param workdays_onboarding Working days the replacement spends onboarding.
#' @param onboarding_efficiency Multiplier applied to `workdays_onboarding` to
#'   obtain the productive days counted as lost during onboarding.
#' @return Total attrition cost, `n` times the cost per employee.
calculate_attrition_cost <- function(

  # Employee
  n                    = 1,
  salary               = 80000,

  # Direct Costs
  separation_cost      = 500,
  vacancy_cost         = 10000,
  acquisition_cost     = 4900,
  placement_cost       = 3500,

  # Productivity Costs
  net_revenue_per_employee = 250000,
  workdays_per_year        = 240,
  workdays_position_open   = 40,
  workdays_onboarding      = 60,
  onboarding_efficiency    = 0.50

) {

  # Direct costs. Added with `+` rather than sum(): sum() would collapse
  # vector arguments into a single grand total instead of one value per element.
  direct_cost <- separation_cost + vacancy_cost + acquisition_cost +
    placement_cost

  # Lost productivity: daily net revenue times the productive days lost
  daily_net_revenue <- net_revenue_per_employee / workdays_per_year
  productive_days_lost <- workdays_position_open +
    workdays_onboarding * onboarding_efficiency
  productivity_cost <- daily_net_revenue * productive_days_lost

  # Salary not paid while the position is open (a cost reduction)
  salary_benefit_reduction <- salary / workdays_per_year * workdays_position_open

  # Estimated turnover cost per employee
  cost_per_employee <- direct_cost + productivity_cost - salary_benefit_reduction

  # Total cost of employee turnover
  n * cost_per_employee
}

# Example: cost of a single departure at an 80,000 salary
calculate_attrition_cost(n = 1,salary = 80000)

## [1] 78483.33

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Summary statistics for each column of the `cars` data set
summary(cars)

##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.