#pkgs00 <- c("devtools", "RCurl")
#install.packages(pkgs00)
#pkg01 <- c(
#"tidyverse", #For data science
#"tidyquant", #Financial time series
#"lime", #Explaining black-box models
#"glue", #Pasting text
#"cowplot", #Handle multiple ggplots
#"readxl", #Read excel file
#"writexl" #Write excel file
#)
#install.packages(pkg01)
#install.packages("pacman")
library(readxl)
library(pacman)
p_load(tidyverse, tidyquant, lime, glue, cowplot, readxl, writexl)
# Create the "data" directory (path is relative to the working directory)
# install.packages("fs")
library(fs)
fs::dir_create(path = "data")
# Import the training data from IBMdataset.csv.
# The file is a CSV, so base read.csv() is used; readxl is not required for
# this step (the library() call is kept so the package stays attached).
#install.packages("readxl")
library(readxl)
path_train <- "~/Application_ML_2023/data/IBMdataset.csv"
# Fail early with a clear message instead of an opaque connection error
if (!file.exists(path_train)) {
  stop("Training data not found: ", path_train, call. = FALSE)
}
# read.csv() returns a base data.frame, not a tibble
train_raw_tbl <- read.csv(path_train)
class(train_raw_tbl)
## [1] "data.frame"
library("magrittr")
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library("dplyr")
# Keep only the employee id, department, job role, performance rating and
# attrition columns for the attrition breakdowns below
dept_job_role_tbl <- dplyr::select(
  train_raw_tbl,
  EmployeeNumber, Department, JobRole, PerformanceRating, Attrition
)
# Overall attrition: row count and share of all rows per Attrition value
dept_job_role_tbl %>%
  group_by(Attrition) %>%
  summarise(n = n(), .groups = "drop") %>%
  mutate(pct = n / sum(n))
## # A tibble: 2 × 3
## Attrition n pct
## <chr> <int> <dbl>
## 1 No 1233 0.839
## 2 Yes 237 0.161
# Attrition by department: row count per (Department, Attrition) pair and the
# share of each Attrition value within its department.
dept_job_role_tbl %>%
  group_by(Department, Attrition) %>%
  # "drop_last" keeps the Department grouping, so no ungroup()/group_by()
  # round trip is needed, and it silences the `.groups` message
  summarise(n = n(), .groups = "drop_last") %>%
  # pct is computed within each Department
  mutate(pct = n / sum(n)) %>%
  # return a plain (ungrouped) tibble
  ungroup()
## # A tibble: 6 × 4
## Department Attrition n pct
## <chr> <chr> <int> <dbl>
## 1 Human Resources No 51 0.810
## 2 Human Resources Yes 12 0.190
## 3 Research & Development No 828 0.862
## 4 Research & Development Yes 133 0.138
## 5 Sales No 354 0.794
## 6 Sales Yes 92 0.206
# Attrition by job role: row count per (Department, JobRole, Attrition)
# combination and the share of each Attrition value within its
# Department/JobRole pair.
dept_job_role_tbl %>%
  group_by(Department, JobRole, Attrition) %>%
  # "drop_last" keeps the Department + JobRole grouping, so no
  # ungroup()/group_by() round trip is needed, and it silences the
  # `.groups` message
  summarise(n = n(), .groups = "drop_last") %>%
  # pct is computed within each Department/JobRole pair
  mutate(pct = n / sum(n)) %>%
  # return a plain (ungrouped) tibble
  ungroup() %>%
  # show all 21 rows instead of the default truncated print
  print(n = 21)
## # A tibble: 21 × 5
## Department JobRole Attrition n pct
## <chr> <chr> <chr> <int> <dbl>
## 1 Human Resources Human Resources No 40 0.769
## 2 Human Resources Human Resources Yes 12 0.231
## 3 Human Resources Manager No 11 1
## 4 Research & Development Healthcare Representative No 122 0.931
## 5 Research & Development Healthcare Representative Yes 9 0.0687
## 6 Research & Development Laboratory Technician No 197 0.761
## 7 Research & Development Laboratory Technician Yes 62 0.239
## 8 Research & Development Manager No 51 0.944
## 9 Research & Development Manager Yes 3 0.0556
## 10 Research & Development Manufacturing Director No 135 0.931
## 11 Research & Development Manufacturing Director Yes 10 0.0690
## 12 Research & Development Research Director No 78 0.975
## 13 Research & Development Research Director Yes 2 0.025
## 14 Research & Development Research Scientist No 245 0.839
## 15 Research & Development Research Scientist Yes 47 0.161
## 16 Sales Manager No 35 0.946
## 17 Sales Manager Yes 2 0.0541
## 18 Sales Sales Executive No 269 0.825
## 19 Sales Sales Executive Yes 57 0.175
## 20 Sales Sales Representative No 50 0.602
## 21 Sales Sales Representative Yes 33 0.398
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see <http://rmarkdown.rstudio.com>.
When you click the **Knit** button, a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Per-column summary statistics for the cars example data set
summary(datasets::cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.