# Bootstrap packages required before the rest of the toolchain.
pkgs00 <- c("devtools", "RCurl")
# Install only the packages that are absent from the library, so re-running
# the script does not download and rebuild what is already installed.
pkgs00_missing <- setdiff(pkgs00, rownames(utils::installed.packages()))
if (length(pkgs00_missing) > 0) {
  install.packages(pkgs00_missing)
}
## Installing packages into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
# Packages used by the analysis.
pkg01 <- c(
  "tidyverse", # Data science
  "tidyquant", # Financial time series
  "lime",      # Explaining black-box models
  "glue",      # Pasting text
  "fs",        # File system
  "cowplot",   # Handle multiple ggplots
  "readxl",    # Read Excel files
  "writexl"    # Write Excel files
)
# Install only the packages that are absent from the library, so re-running
# the script does not download and rebuild what is already installed.
pkg01_missing <- setdiff(pkg01, rownames(utils::installed.packages()))
if (length(pkg01_missing) > 0) {
  install.packages(pkg01_missing)
}
## Installing packages into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
# pacman (used further down for p_load()) and h2o are each installed at most
# once, and only when absent from the library. A single install.packages()
# call covers both, so h2o is never downloaded twice in the same run.
pkgs_h2o <- c("pacman", "h2o")
pkgs_h2o_missing <- setdiff(pkgs_h2o, rownames(utils::installed.packages()))
if (length(pkgs_h2o_missing) > 0) {
  install.packages(pkgs_h2o_missing)
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
library(h2o)
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
# H2O session set-up
# Silence H2O progress bars so they do not clutter the output.
h2o::h2o.no_progress()
# Start H2O, or connect to a cluster that is already running.
h2o::h2o.init()
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         15 minutes 30 seconds 
##     H2O cluster timezone:       UTC 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.40.0.1 
##     H2O cluster version age:    1 month and 7 days 
##     H2O cluster name:           H2O_started_from_R_r1737463_rvx944 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   0.19 GB 
##     H2O cluster total cores:    1 
##     H2O cluster allowed cores:  1 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.2.3 (2023-03-15)
library(readxl)
library(pacman)
p_load(tidyverse, tidyquant, lime, glue, cowplot, ggplot2,fs,readxl, writexl) 
# Ensure a "data" directory exists under the working directory.
fs::dir_create(path = "data")
# Load the HR attrition dataset; the CSV is looked up in the working directory.
data <- readr::read_csv(file = "WA_Fn-UseC_-HR-Employee-Attrition.csv")
## Rows: 1470 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (9): Attrition, BusinessTravel, Department, EducationField, Gender, Job...
## dbl (26): Age, DailyRate, DistanceFromHome, Education, EmployeeCount, Employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep the employee identifier plus the fields used by the attrition questions.
dat1 <- dplyr::select(
  data,
  EmployeeNumber,
  Department,
  JobRole,
  PerformanceRating,
  Attrition
)
# Preview the first rows of the reduced table.
head(dat1)
## # A tibble: 6 × 5
##   EmployeeNumber Department             JobRole               Performa…¹ Attri…²
##            <dbl> <chr>                  <chr>                      <dbl> <chr>  
## 1              1 Sales                  Sales Executive                3 Yes    
## 2              2 Research & Development Research Scientist             4 No     
## 3              4 Research & Development Laboratory Technician          3 Yes    
## 4              5 Research & Development Research Scientist             3 No     
## 5              7 Research & Development Laboratory Technician          3 No     
## 6              8 Research & Development Laboratory Technician          3 No     
## # … with abbreviated variable names ¹​PerformanceRating, ²​Attrition

## Question 2.1

## Compute the number of attrition and percentage of attrition by Department:

# One row per Department. `count` is the number of employees whose Attrition
# is "Yes"; `percentage` is that count divided by the department head-count
# (a proportion between 0 and 1, not multiplied by 100).
# Counting inside a single summarise() keeps a department with no attrition
# as a row with count 0 instead of dropping it, and `.groups = "drop"`
# returns an ungrouped tibble without the regrouping message.
dat2 <- dat1 %>%
  group_by(Department) %>%
  summarise(
    count = sum(Attrition == "Yes", na.rm = TRUE),
    percentage = count / n(),
    .groups = "drop"
  ) %>%
  # Re-add the Attrition label so the columns stay
  # Department, Attrition, count, percentage.
  mutate(Attrition = "Yes", .before = count)
dat2
## # A tibble: 3 × 4
##   Department             Attrition count percentage
##   <chr>                  <chr>     <int>      <dbl>
## 1 Human Resources        Yes          12      0.190
## 2 Research & Development Yes         133      0.138
## 3 Sales                  Yes          92      0.206

## Question 2.2

## Compute the number of attrition and percentage of attrition by JobRole in each Department:

# One row per Department / JobRole pair. `count` is the number of employees
# whose Attrition is "Yes"; `percentage` is that count divided by the number
# of employees in the pair (a proportion between 0 and 1, not multiplied by
# 100).
# Counting inside a single summarise() keeps a job role with no attrition as a
# row with count 0 and percentage 0 instead of dropping it from the table, and
# `.groups = "drop"` returns an ungrouped tibble without the regrouping
# message.
dat3 <- dat1 %>%
  group_by(Department, JobRole) %>%
  summarise(
    count = sum(Attrition == "Yes", na.rm = TRUE),
    percentage = count / n(),
    .groups = "drop"
  ) %>%
  # Re-add the Attrition label so the columns stay
  # Department, JobRole, Attrition, count, percentage.
  mutate(Attrition = "Yes", .before = count)
dat3
## # A tibble: 10 × 5
##    Department             JobRole                   Attrition count percentage
##    <chr>                  <chr>                     <chr>     <int>      <dbl>
##  1 Human Resources        Human Resources           Yes          12     0.231 
##  2 Research & Development Healthcare Representative Yes           9     0.0687
##  3 Research & Development Laboratory Technician     Yes          62     0.239 
##  4 Research & Development Manager                   Yes           3     0.0556
##  5 Research & Development Manufacturing Director    Yes          10     0.0690
##  6 Research & Development Research Director         Yes           2     0.025 
##  7 Research & Development Research Scientist        Yes          47     0.161 
##  8 Sales                  Manager                   Yes           2     0.0541
##  9 Sales                  Sales Executive           Yes          57     0.175 
## 10 Sales                  Sales Representative      Yes          33     0.398