All models are wrong, but some are useful. - George Box
library(tidyverse)
library(odbc)
library(DBI)
library(RSQLite)
library(tvthemes)
library(ggthemes)
library(scales)
dat_new <- read_csv("./input/WA_Fn-UseC_-HR-Employee-Attrition.csv")
dat_new <- dat_new %>%
mutate_if(is.character, as_factor) %>%
mutate(
EnvironmentSatisfaction = factor(EnvironmentSatisfaction, ordered = TRUE),
StockOptionLevel = factor(StockOptionLevel, ordered = TRUE),
JobLevel = factor(JobLevel, ordered = TRUE),
JobInvolvement = factor(JobInvolvement, ordered = TRUE)
) %>%
select(EmployeeNumber, Attrition, everything())
my_skim <- skimr::skim_with(numeric = skimr::sfl(p25 = NULL, p50 = NULL, p75 = NULL, hist = NULL))
my_skim(dat_new)| Name | dat_new |
| Number of rows | 1470 |
| Number of columns | 35 |
| _______________________ | |
| Column type frequency: | |
| factor | 13 |
| numeric | 22 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| Attrition | 0 | 1 | FALSE | 2 | No: 1233, Yes: 237 |
| BusinessTravel | 0 | 1 | FALSE | 3 | Tra: 1043, Tra: 277, Non: 150 |
| Department | 0 | 1 | FALSE | 3 | Res: 961, Sal: 446, Hum: 63 |
| EducationField | 0 | 1 | FALSE | 6 | Lif: 606, Med: 464, Mar: 159, Tec: 132 |
| EnvironmentSatisfaction | 0 | 1 | TRUE | 4 | 3: 453, 4: 446, 2: 287, 1: 284 |
| Gender | 0 | 1 | FALSE | 2 | Mal: 882, Fem: 588 |
| JobInvolvement | 0 | 1 | TRUE | 4 | 3: 868, 2: 375, 4: 144, 1: 83 |
| JobLevel | 0 | 1 | TRUE | 5 | 1: 543, 2: 534, 3: 218, 4: 106 |
| JobRole | 0 | 1 | FALSE | 9 | Sal: 326, Res: 292, Lab: 259, Man: 145 |
| MaritalStatus | 0 | 1 | FALSE | 3 | Mar: 673, Sin: 470, Div: 327 |
| Over18 | 0 | 1 | FALSE | 1 | Y: 1470 |
| OverTime | 0 | 1 | FALSE | 2 | No: 1054, Yes: 416 |
| StockOptionLevel | 0 | 1 | TRUE | 4 | 0: 631, 1: 596, 2: 158, 3: 85 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p100 |
|---|---|---|---|---|---|---|
| EmployeeNumber | 0 | 1 | 1024.87 | 602.02 | 1 | 2068 |
| Age | 0 | 1 | 36.92 | 9.14 | 18 | 60 |
| DailyRate | 0 | 1 | 802.49 | 403.51 | 102 | 1499 |
| DistanceFromHome | 0 | 1 | 9.19 | 8.11 | 1 | 29 |
| Education | 0 | 1 | 2.91 | 1.02 | 1 | 5 |
| EmployeeCount | 0 | 1 | 1.00 | 0.00 | 1 | 1 |
| HourlyRate | 0 | 1 | 65.89 | 20.33 | 30 | 100 |
| JobSatisfaction | 0 | 1 | 2.73 | 1.10 | 1 | 4 |
| MonthlyIncome | 0 | 1 | 6502.93 | 4707.96 | 1009 | 19999 |
| MonthlyRate | 0 | 1 | 14313.10 | 7117.79 | 2094 | 26999 |
| NumCompaniesWorked | 0 | 1 | 2.69 | 2.50 | 0 | 9 |
| PercentSalaryHike | 0 | 1 | 15.21 | 3.66 | 11 | 25 |
| PerformanceRating | 0 | 1 | 3.15 | 0.36 | 3 | 4 |
| RelationshipSatisfaction | 0 | 1 | 2.71 | 1.08 | 1 | 4 |
| StandardHours | 0 | 1 | 80.00 | 0.00 | 80 | 80 |
| TotalWorkingYears | 0 | 1 | 11.28 | 7.78 | 0 | 40 |
| TrainingTimesLastYear | 0 | 1 | 2.80 | 1.29 | 0 | 6 |
| WorkLifeBalance | 0 | 1 | 2.76 | 0.71 | 1 | 4 |
| YearsAtCompany | 0 | 1 | 7.01 | 6.13 | 0 | 40 |
| YearsInCurrentRole | 0 | 1 | 4.23 | 3.62 | 0 | 18 |
| YearsSinceLastPromotion | 0 | 1 | 2.19 | 3.22 | 0 | 15 |
| YearsWithCurrManager | 0 | 1 | 4.12 | 3.57 | 0 | 17 |
con<-dbConnect(RSQLite::SQLite(), ":memory:")
copy_to(con,dat_new)LIMIT n does the trickdbGetQuery(con,'SELECT *
FROM dat_new
LIMIT 5')dbGetQuery(con,'SELECT COUNT(*) AS num_rows
FROM dat_new')dbGetQuery(con,'SELECT COUNT(*)-COUNT(EmployeeNumber) AS missing
FROM dat_new')dbGetQuery(con,"SELECT Attrition,
MAX(MonthlyIncome) AS max_income ,
MIN(MonthlyIncome) AS min_income ,
AVG(MonthlyIncome) AS avg_income
FROM dat_new
GROUP BY Attrition;")dat_new |>
ggplot(aes(x=Attrition,y=MonthlyIncome ,fill=Attrition))+
geom_boxplot(outlier.color = "blue")+
tvthemes::scale_fill_avatar()t.test(MonthlyIncome~Attrition,data=dat_new,alternative="two.sided")##
## Welch Two Sample t-test
##
## data: MonthlyIncome by Attrition
## t = -7.4826, df = 412.74, p-value = 4.434e-13
## alternative hypothesis: true difference in means between group Yes and group No is not equal to 0
## 95 percent confidence interval:
## -2583.050 -1508.244
## sample estimates:
## mean in group Yes mean in group No
## 4787.093 6832.740
Comments
t=-7.4826 represents the
calculated difference in sample means in units of standard error and the
negative sign suggest the first mean is smaller than the second0.05 suggesting that the
difference in means is statistically significantmonthly income for each group is statistically
significantvars_numeric<-dat_new |>
select(-EmployeeCount,-EmployeeNumber,-StandardHours) |>
select_if(is.numeric) |> colnames()
km_aov <- vars_numeric %>%
map(~ t.test(rlang::eval_tidy(expr(!!sym(.x) ~Attrition)), data = dat_new))
km_aov %>%
map(~ data.frame(`test statistic` = .x$`statistic`[[1]], p.value = .x$`p.value`[[1]])) %>%
bind_rows() %>%
bind_cols(Attribute = vars_numeric) %>%
select(Attribute, everything()) %>%
mutate(`p value` = round(p.value,3)) %>%
select(-p.value)%>%
flextable::flextable() %>%
flextable::autofit() %>%
flextable::colformat_num(j = 2, digits = 2) %>%
flextable::colformat_num(j = 3, digits = 4)Attribute | test.statistic | p value |
|---|---|---|
Age | -5.8280119 | 0.000 |
DailyRate | -2.1788817 | 0.030 |
DistanceFromHome | 2.8881831 | 0.004 |
Education | -1.2177494 | 0.224 |
HourlyRate | -0.2647686 | 0.791 |
JobSatisfaction | -3.9261129 | 0.000 |
MonthlyIncome | -7.4826216 | 0.000 |
MonthlyRate | 0.5755022 | 0.565 |
NumCompaniesWorked | 1.5746511 | 0.116 |
PercentSalaryHike | -0.5042445 | 0.614 |
PerformanceRating | 0.1099940 | 0.912 |
RelationshipSatisfaction | -1.7019371 | 0.090 |
TotalWorkingYears | -7.0191785 | 0.000 |
TrainingTimesLastYear | -2.3305223 | 0.020 |
WorkLifeBalance | -2.1741928 | 0.030 |
YearsAtCompany | -5.2825961 | 0.000 |
YearsInCurrentRole | -6.8470792 | 0.000 |
YearsSinceLastPromotion | -1.2879266 | 0.199 |
YearsWithCurrManager | -6.6333988 | 0.000 |
comments
0.05 suggest that the
difference in means is statistically significantchancevars_numeric<-dat_new |>
select(-EmployeeCount,-EmployeeNumber,-StandardHours) |>
select_if(is.numeric) |>
colnames()
subset<- dat_new |>
select(vars_numeric,Attrition)
# Bring in external file for visualisations
source('functions/visualisations.R')
# Use plot function
plot <- histoplotter(subset, Attrition,
chart_x_axis_lbl = "attrition status",
chart_y_axis_lbl = 'Measures',
boxplot_color = 'navy',
boxplot_fill = '#89CFF0',
box_fill_transparency = 0.2)
# Add extras to plot
plot +
ggthemes::theme_excel() +
tvthemes::scale_color_attackOnTitan()+
theme(legend.position = 'top') out of interest we might want to look at how
Departmentaffects each of the numeric variables throughanova
options(scipen=999)
vars_numeric<-dat_new |>
select(-EmployeeCount,-EmployeeNumber,-StandardHours) |>
select_if(is.numeric) |> colnames()
km_aov <- vars_numeric %>%
map(~ aov(rlang::eval_tidy(expr(!!sym(.x) ~Department)), data = dat_new))
km_aov %>%
map(anova) %>%
map(~ data.frame(F = .x$`F value`[[1]], p = .x$`Pr(>F)`[[1]])) %>%
bind_rows() %>%
bind_cols(Attribute = vars_numeric) %>%
select(Attribute, everything()) %>%
mutate(`Pr(>F)` = round(p,2)) %>%
select(-p) %>%
flextable::flextable() %>%
flextable::autofit() %>%
flextable::colformat_num(j = 2, digits = 2) %>%
flextable::colformat_num(j = 3, digits = 4)Attribute | F | Pr(>F) |
|---|---|---|
Age | 0.7655032 | 0.47 |
DailyRate | 0.5647346 | 0.57 |
DistanceFromHome | 0.2350266 | 0.79 |
Education | 0.2830675 | 0.75 |
HourlyRate | 0.3553375 | 0.70 |
JobSatisfaction | 0.5021229 | 0.61 |
MonthlyIncome | 3.2017829 | 0.04 |
MonthlyRate | 0.5628353 | 0.57 |
NumCompaniesWorked | 0.9516554 | 0.39 |
PercentSalaryHike | 0.9243264 | 0.40 |
PerformanceRating | 0.7940044 | 0.45 |
RelationshipSatisfaction | 0.9023113 | 0.41 |
TotalWorkingYears | 0.1824744 | 0.83 |
TrainingTimesLastYear | 1.4506366 | 0.23 |
WorkLifeBalance | 4.2131414 | 0.01 |
YearsAtCompany | 0.7620289 | 0.47 |
YearsInCurrentRole | 2.4721299 | 0.08 |
YearsSinceLastPromotion | 1.2231552 | 0.29 |
YearsWithCurrManager | 0.9569419 | 0.38 |
comments
Pr(>F) < 0.05 indicate the difference in
means per each department is not an outcome of chance aloneres<-dbGetQuery(con,'SELECT Attrition ,
COUNT(*) * 100.0/ SUM(COUNT(*)) OVER() as PERC
FROM dat_new
GROUP BY Attrition;')
resres |>
ggplot(aes(x=fct_reorder(Attrition, PERC), PERC)) +
geom_col() +
scale_fill_avatar()+
theme_avatar()+
labs(x="STATUS",
y="count",
title="") +
coord_flip() +
geom_text(aes(label=paste0(round(PERC), "% "), hjust=1), col="white")res<-dbGetQuery(con,'SELECT Attrition,
Department,
COUNT(*) * 100.0/ SUM(COUNT(*)) OVER() as PERC
FROM dat_new
GROUP BY Attrition,Department;')
resres %>%
ggplot(aes(Department, Attrition, fill = PERC)) +
geom_tile(color = "white") +
geom_text(aes(label = number(PERC, big.mark = ","))) +
scale_fill_binned(low = "blue", high = "lightyellow",
label = number_format(big.mark = ",")) +
theme(legend.position = "top",
legend.key.width = unit(15, "mm")) +
labs(x = "Department",
y = "Attrition",
fill = "Frequency")res<-dbGetQuery(con,'SELECT Attrition ,
AVG(DailyRate) as Daily_average,
AVG(MonthlyRate) as Monthly_average,
AVG(HourlyRate) as hourly_average,
AVG(YearsAtCompany) as average_number_of_years,
AVG(RelationshipSatisfaction) as average_satisfaction
FROM dat_new
GROUP BY Attrition;')
resres<-dbGetQuery(con,'SELECT Attrition,Department , AVG(TotalWorkingYears) as average_workhours ,
AVG(DistanceFromHome) as average_distance ,
AVG(YearsAtCompany) as average_years
FROM dat_new
GROUP BY Attrition,Department ;')
resres<-dbGetQuery(con,'SELECT Attrition,JobRole, AVG(MonthlyRate) as average_rate
FROM dat_new
GROUP BY Attrition,JobRole ;')
resdat_new %>%
ggplot(aes(x = DailyRate, fill =Attrition)) +
geom_histogram(color = "white", alpha = 0.75,bins=30) +
theme(legend.position = "top") +
scale_x_continuous(labels = number_format(big.mark = ",")) +
scale_y_continuous(labels = number_format(big.mark = ",")) +
ggthemes::scale_fill_tableau()+
labs(x = "Daily Rate", y = "Frequency",
fill = "Attrition",
title = "Daily Rate Distribution")dat_new %>%
count(Attrition, JobRole) %>%
ggplot(aes(JobRole, Attrition, fill = n)) +
geom_tile(color = "white") +
geom_text(aes(label = number(n, big.mark = ","))) +
scale_fill_binned(low = "blue", high = "lightyellow",
label = number_format(big.mark = ",")) +
theme(legend.position = "top",
legend.key.width = unit(15, "mm")) +
labs(x = "JobRole",
y = "Attrition",
fill = "Frequency")+
theme(axis.text.x = element_text(angle = 30, hjust = 1,color="blue"))Using binary correlation, I’ll include just the variables with a
correlation coefficient of at least 0.10. For our employee attrition
data set, OverTime (Y|N) has the largest correlation,
JobLevel = 1, MonthlyIncome <= 2,695.80,
etc.
library(plotly)
dat_new %>%
select(-EmployeeNumber) %>%
correlationfunnel::binarize(n_bins = 5, thresh_infreq = 0.01) %>%
correlationfunnel::correlate(Attrition__Yes) %>%
correlationfunnel::plot_correlation_funnel(interactive = FALSE) %>%
ggplotly() # Makes prettier, but drops the labelslibrary(rpart)
library(rattle)
library(tidymodels)attrition_df<-dat_new |> select(-EmployeeCount,-EmployeeNumber,-StandardHours)
# Initialize the split
split <- initial_split(attrition_df, prop = 0.8, strata = "Attrition")
# Extract training set
train <- split %>% training()
# Extract testing set
test <- split %>% testing()# Create recipe
feature_selection_recipe <-
recipe(Attrition ~ ., data = train) %>%
step_filter_missing(all_predictors(), threshold = 0.5) %>%
step_scale(all_numeric_predictors()) %>%
step_nzv(all_predictors()) %>%
prep()lr_model <- logistic_reg() %>%
set_engine("glm")attrition_wflow <- workflow() %>%
add_recipe(feature_selection_recipe) %>%
add_model(lr_model)f_meas() to evaluate the model’s performance on the
test data.accuracy() to determine accuracy of the
model# Fit workflow to train data
attrition_fit <-
attrition_wflow %>% fit(data = train)
# Add the test predictions to the test data
attrition_pred_df <-predict(attrition_fit, test) %>%
bind_cols(test %>% select(Attrition))
# Evaluate F score
f_meas(attrition_pred_df,Attrition, .pred_class)accuracy(attrition_pred_df, Attrition, .pred_class)# Create modeld
dt_model <- decision_tree() %>%
set_engine("rpart")%>%
set_mode("classification")
# Add recipe and model to a workflow
attrition_wflow <- workflow() %>%
add_recipe(feature_selection_recipe) %>%
add_model(dt_model)# Fit workflow to train data
attrition_fit <-
attrition_wflow %>% fit(data = train)
# Add the test predictions to the test data
attrition_pred_df <-predict(attrition_fit, test) %>%
bind_cols(test %>% select(Attrition))
# Evaluate F score
f_meas(attrition_pred_df,Attrition, .pred_class)accuracy(attrition_pred_df, Attrition, .pred_class)