# Helper packages installed ahead of the analysis packages.
pkgs00 <- c("devtools", "RCurl")
# Install only the packages that are not already available, so re-running the
# script does not download and reinstall everything each time.
pkgs00_missing <- pkgs00[
  !vapply(pkgs00, requireNamespace, logical(1), quietly = TRUE)
]
if (length(pkgs00_missing) > 0) {
  install.packages(pkgs00_missing)
}
## Installing packages into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
# Analysis packages used by the rest of the script.
pkg01 <- c(
  "tidyverse", # For data science
  "tidyquant", # Financial time series
  "lime",      # Explaining black-box models
  "glue",      # Pasting text
  "fs",        # File system
  "cowplot",   # Handle multiple ggplot
  "readxl",    # Read excel file
  "writexl"    # Write excel file
)
# Install only the packages that are not already available, so re-running the
# script does not download and reinstall everything each time.
pkg01_missing <- pkg01[
  !vapply(pkg01, requireNamespace, logical(1), quietly = TRUE)
]
if (length(pkg01_missing) > 0) {
  install.packages(pkg01_missing)
}
## Installing packages into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
# pacman supplies p_install()/p_load(); install it only when it is missing.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
# Install h2o exactly once. The script previously installed it through pacman
# and then again with install.packages(), downloading the package twice.
if (!requireNamespace("h2o", quietly = TRUE)) {
  pacman::p_install(h2o)
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
##
## h2o installed
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
# H2O session set-up ----
# Switch off H2O progress bars, then launch (or connect to) the H2O cluster.
h2o::h2o.no_progress()
h2o::h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 15 minutes 30 seconds
## H2O cluster timezone: UTC
## H2O data parsing timezone: UTC
## H2O cluster version: 3.40.0.1
## H2O cluster version age: 1 month and 7 days
## H2O cluster name: H2O_started_from_R_r1737463_rvx944
## H2O cluster total nodes: 1
## H2O cluster total memory: 0.19 GB
## H2O cluster total cores: 1
## H2O cluster allowed cores: 1
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.2.3 (2023-03-15)
library(readxl)
library(pacman)
# Attach the analysis packages through pacman, make sure a "data" directory
# exists, and read the employee-attrition CSV into `data`.
pacman::p_load(
  tidyverse, tidyquant, lime, glue, cowplot, ggplot2, fs, readxl, writexl
)
dir_create("data")
# NOTE(review): the CSV is read from the working directory, not from "data/";
# confirm that this is the intended location.
data <- readr::read_csv(file = "WA_Fn-UseC_-HR-Employee-Attrition.csv")
## Rows: 1470 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): Attrition, BusinessTravel, Department, EducationField, Gender, Job...
## dbl (26): Age, DailyRate, DistanceFromHome, Education, EmployeeCount, Employ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only the employee id, department, job role, performance rating and
# attrition flag, then preview the first rows.
dat1 <- select(
  data,
  EmployeeNumber,
  Department,
  JobRole,
  PerformanceRating,
  Attrition
)
head(dat1)
## # A tibble: 6 × 5
## EmployeeNumber Department JobRole Performa…¹ Attri…²
## <dbl> <chr> <chr> <dbl> <chr>
## 1 1 Sales Sales Executive 3 Yes
## 2 2 Research & Development Research Scientist 4 No
## 3 4 Research & Development Laboratory Technician 3 Yes
## 4 5 Research & Development Research Scientist 3 No
## 5 7 Research & Development Laboratory Technician 3 No
## 6 8 Research & Development Laboratory Technician 3 No
## # … with abbreviated variable names ¹PerformanceRating, ²Attrition
# Question 2.1
# Number of leavers and attrition rate by Department.
#   count      - employees in the department with Attrition == "Yes"
#   percentage - count divided by the department head count (a proportion)
# Aggregating with sum()/n() instead of counting per Attrition value and then
# filtering keeps a department in the result even when it has no leavers
# (count 0, percentage 0). `.groups = "drop"` returns an ungrouped tibble
# without the summarise() grouping message.
dat2 <- dat1 %>%
  group_by(Department) %>%
  summarise(
    count = sum(Attrition == "Yes", na.rm = TRUE),
    percentage = count / n(),
    .groups = "drop"
  ) %>%
  # Keep the original column layout: Department, Attrition, count, percentage.
  mutate(Attrition = "Yes", .after = Department)
dat2
## # A tibble: 3 × 4
## Department Attrition count percentage
## <chr> <chr> <int> <dbl>
## 1 Human Resources Yes 12 0.190
## 2 Research & Development Yes 133 0.138
## 3 Sales Yes 92 0.206
# Question 2.2
# Number of leavers and attrition rate by JobRole within each Department.
#   count      - employees in the Department/JobRole with Attrition == "Yes"
#   percentage - count divided by the Department/JobRole head count
# Aggregating with sum()/n() instead of counting per Attrition value and then
# filtering keeps Department/JobRole pairs that have no leavers (count 0,
# percentage 0); the filter-based version silently dropped them.
# `.groups = "drop"` returns an ungrouped tibble without the summarise()
# grouping message.
dat3 <- dat1 %>%
  group_by(Department, JobRole) %>%
  summarise(
    count = sum(Attrition == "Yes", na.rm = TRUE),
    percentage = count / n(),
    .groups = "drop"
  ) %>%
  # Keep the original column layout:
  # Department, JobRole, Attrition, count, percentage.
  mutate(Attrition = "Yes", .after = JobRole)
# NOTE: the recorded table below lists only pairs with at least one leaver;
# re-running this code also prints the zero-attrition pairs.
dat3
## # A tibble: 10 × 5
## Department JobRole Attrition count percentage
## <chr> <chr> <chr> <int> <dbl>
## 1 Human Resources Human Resources Yes 12 0.231
## 2 Research & Development Healthcare Representative Yes 9 0.0687
## 3 Research & Development Laboratory Technician Yes 62 0.239
## 4 Research & Development Manager Yes 3 0.0556
## 5 Research & Development Manufacturing Director Yes 10 0.0690
## 6 Research & Development Research Director Yes 2 0.025
## 7 Research & Development Research Scientist Yes 47 0.161
## 8 Sales Manager Yes 2 0.0541
## 9 Sales Sales Executive Yes 57 0.175
## 10 Sales Sales Representative Yes 33 0.398