All models are wrong, but some are useful. - George Box
library(tidyverse)
library(odbc)
library(DBI)
library(RSQLite)
library(tvthemes)
library(ggthemes)
library(scales)
<- read_csv("./input/WA_Fn-UseC_-HR-Employee-Attrition.csv")
dat_new <- dat_new %>%
dat_new mutate_if(is.character, as_factor) %>%
mutate(
EnvironmentSatisfaction = factor(EnvironmentSatisfaction, ordered = TRUE),
StockOptionLevel = factor(StockOptionLevel, ordered = TRUE),
JobLevel = factor(JobLevel, ordered = TRUE),
JobInvolvement = factor(JobInvolvement, ordered = TRUE)
%>%
) select(EmployeeNumber, Attrition, everything())
<- skimr::skim_with(numeric = skimr::sfl(p25 = NULL, p50 = NULL, p75 = NULL, hist = NULL))
my_skim my_skim(dat_new)
Name | dat_new |
Number of rows | 1470 |
Number of columns | 35 |
_______________________ | |
Column type frequency: | |
factor | 13 |
numeric | 22 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
Attrition | 0 | 1 | FALSE | 2 | No: 1233, Yes: 237 |
BusinessTravel | 0 | 1 | FALSE | 3 | Tra: 1043, Tra: 277, Non: 150 |
Department | 0 | 1 | FALSE | 3 | Res: 961, Sal: 446, Hum: 63 |
EducationField | 0 | 1 | FALSE | 6 | Lif: 606, Med: 464, Mar: 159, Tec: 132 |
EnvironmentSatisfaction | 0 | 1 | TRUE | 4 | 3: 453, 4: 446, 2: 287, 1: 284 |
Gender | 0 | 1 | FALSE | 2 | Mal: 882, Fem: 588 |
JobInvolvement | 0 | 1 | TRUE | 4 | 3: 868, 2: 375, 4: 144, 1: 83 |
JobLevel | 0 | 1 | TRUE | 5 | 1: 543, 2: 534, 3: 218, 4: 106 |
JobRole | 0 | 1 | FALSE | 9 | Sal: 326, Res: 292, Lab: 259, Man: 145 |
MaritalStatus | 0 | 1 | FALSE | 3 | Mar: 673, Sin: 470, Div: 327 |
Over18 | 0 | 1 | FALSE | 1 | Y: 1470 |
OverTime | 0 | 1 | FALSE | 2 | No: 1054, Yes: 416 |
StockOptionLevel | 0 | 1 | TRUE | 4 | 0: 631, 1: 596, 2: 158, 3: 85 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p100 |
---|---|---|---|---|---|---|
EmployeeNumber | 0 | 1 | 1024.87 | 602.02 | 1 | 2068 |
Age | 0 | 1 | 36.92 | 9.14 | 18 | 60 |
DailyRate | 0 | 1 | 802.49 | 403.51 | 102 | 1499 |
DistanceFromHome | 0 | 1 | 9.19 | 8.11 | 1 | 29 |
Education | 0 | 1 | 2.91 | 1.02 | 1 | 5 |
EmployeeCount | 0 | 1 | 1.00 | 0.00 | 1 | 1 |
HourlyRate | 0 | 1 | 65.89 | 20.33 | 30 | 100 |
JobSatisfaction | 0 | 1 | 2.73 | 1.10 | 1 | 4 |
MonthlyIncome | 0 | 1 | 6502.93 | 4707.96 | 1009 | 19999 |
MonthlyRate | 0 | 1 | 14313.10 | 7117.79 | 2094 | 26999 |
NumCompaniesWorked | 0 | 1 | 2.69 | 2.50 | 0 | 9 |
PercentSalaryHike | 0 | 1 | 15.21 | 3.66 | 11 | 25 |
PerformanceRating | 0 | 1 | 3.15 | 0.36 | 3 | 4 |
RelationshipSatisfaction | 0 | 1 | 2.71 | 1.08 | 1 | 4 |
StandardHours | 0 | 1 | 80.00 | 0.00 | 80 | 80 |
TotalWorkingYears | 0 | 1 | 11.28 | 7.78 | 0 | 40 |
TrainingTimesLastYear | 0 | 1 | 2.80 | 1.29 | 0 | 6 |
WorkLifeBalance | 0 | 1 | 2.76 | 0.71 | 1 | 4 |
YearsAtCompany | 0 | 1 | 7.01 | 6.13 | 0 | 40 |
YearsInCurrentRole | 0 | 1 | 4.23 | 3.62 | 0 | 18 |
YearsSinceLastPromotion | 0 | 1 | 2.19 | 3.22 | 0 | 15 |
YearsWithCurrManager | 0 | 1 | 4.12 | 3.57 | 0 | 17 |
<-dbConnect(RSQLite::SQLite(), ":memory:")
concopy_to(con,dat_new)
LIMIT n
does the trickdbGetQuery(con,'SELECT *
FROM dat_new
LIMIT 5')
dbGetQuery(con,'SELECT COUNT(*) AS num_rows
FROM dat_new')
dbGetQuery(con,'SELECT COUNT(*)-COUNT(EmployeeNumber) AS missing
FROM dat_new')
dbGetQuery(con,"SELECT Attrition,
MAX(MonthlyIncome) AS max_income ,
MIN(MonthlyIncome) AS min_income ,
AVG(MonthlyIncome) AS avg_income
FROM dat_new
GROUP BY Attrition;")
|>
dat_new ggplot(aes(x=Attrition,y=MonthlyIncome ,fill=Attrition))+
geom_boxplot(outlier.color = "blue")+
::scale_fill_avatar() tvthemes
t.test(MonthlyIncome~Attrition,data=dat_new,alternative="two.sided")
##
## Welch Two Sample t-test
##
## data: MonthlyIncome by Attrition
## t = -7.4826, df = 412.74, p-value = 4.434e-13
## alternative hypothesis: true difference in means between group Yes and group No is not equal to 0
## 95 percent confidence interval:
## -2583.050 -1508.244
## sample estimates:
## mean in group Yes mean in group No
## 4787.093 6832.740
Comments
t=-7.4826
represents the
calculated difference in sample means in units of standard error and the
negative sign suggest the first mean is smaller than the second0.05
suggesting that the
difference in means is statistically significantmonthly income
for each group is statistically
significant<-dat_new |>
vars_numericselect(-EmployeeCount,-EmployeeNumber,-StandardHours) |>
select_if(is.numeric) |> colnames()
<- vars_numeric %>%
km_aov map(~ t.test(rlang::eval_tidy(expr(!!sym(.x) ~Attrition)), data = dat_new))
%>%
km_aov map(~ data.frame(`test statistic` = .x$`statistic`[[1]], p.value = .x$`p.value`[[1]])) %>%
bind_rows() %>%
bind_cols(Attribute = vars_numeric) %>%
select(Attribute, everything()) %>%
mutate(`p value` = round(p.value,3)) %>%
select(-p.value)%>%
::flextable() %>%
flextable::autofit() %>%
flextable::colformat_num(j = 2, digits = 2) %>%
flextable::colformat_num(j = 3, digits = 4) flextable
Attribute | test.statistic | p value |
---|---|---|
Age | -5.8280119 | 0.000 |
DailyRate | -2.1788817 | 0.030 |
DistanceFromHome | 2.8881831 | 0.004 |
Education | -1.2177494 | 0.224 |
HourlyRate | -0.2647686 | 0.791 |
JobSatisfaction | -3.9261129 | 0.000 |
MonthlyIncome | -7.4826216 | 0.000 |
MonthlyRate | 0.5755022 | 0.565 |
NumCompaniesWorked | 1.5746511 | 0.116 |
PercentSalaryHike | -0.5042445 | 0.614 |
PerformanceRating | 0.1099940 | 0.912 |
RelationshipSatisfaction | -1.7019371 | 0.090 |
TotalWorkingYears | -7.0191785 | 0.000 |
TrainingTimesLastYear | -2.3305223 | 0.020 |
WorkLifeBalance | -2.1741928 | 0.030 |
YearsAtCompany | -5.2825961 | 0.000 |
YearsInCurrentRole | -6.8470792 | 0.000 |
YearsSinceLastPromotion | -1.2879266 | 0.199 |
YearsWithCurrManager | -6.6333988 | 0.000 |
comments
0.05
suggest that the
difference in means is statistically significantchance
<-dat_new |>
vars_numericselect(-EmployeeCount,-EmployeeNumber,-StandardHours) |>
select_if(is.numeric) |>
colnames()
<- dat_new |>
subsetselect(vars_numeric,Attrition)
# Bring in external file for visualisations
source('functions/visualisations.R')
# Use plot function
<- histoplotter(subset, Attrition,
plot chart_x_axis_lbl = "attrition status",
chart_y_axis_lbl = 'Measures',
boxplot_color = 'navy',
boxplot_fill = '#89CFF0',
box_fill_transparency = 0.2)
# Add extras to plot
+
plot ::theme_excel() +
ggthemes::scale_color_attackOnTitan()+
tvthemestheme(legend.position = 'top')
out of interest we might want to look at how
Department
affects each of the numeric variables throughanova
options(scipen=999)
<-dat_new |>
vars_numericselect(-EmployeeCount,-EmployeeNumber,-StandardHours) |>
select_if(is.numeric) |> colnames()
<- vars_numeric %>%
km_aov map(~ aov(rlang::eval_tidy(expr(!!sym(.x) ~Department)), data = dat_new))
%>%
km_aov map(anova) %>%
map(~ data.frame(F = .x$`F value`[[1]], p = .x$`Pr(>F)`[[1]])) %>%
bind_rows() %>%
bind_cols(Attribute = vars_numeric) %>%
select(Attribute, everything()) %>%
mutate(`Pr(>F)` = round(p,2)) %>%
select(-p) %>%
::flextable() %>%
flextable::autofit() %>%
flextable::colformat_num(j = 2, digits = 2) %>%
flextable::colformat_num(j = 3, digits = 4) flextable
Attribute | F | Pr(>F) |
---|---|---|
Age | 0.7655032 | 0.47 |
DailyRate | 0.5647346 | 0.57 |
DistanceFromHome | 0.2350266 | 0.79 |
Education | 0.2830675 | 0.75 |
HourlyRate | 0.3553375 | 0.70 |
JobSatisfaction | 0.5021229 | 0.61 |
MonthlyIncome | 3.2017829 | 0.04 |
MonthlyRate | 0.5628353 | 0.57 |
NumCompaniesWorked | 0.9516554 | 0.39 |
PercentSalaryHike | 0.9243264 | 0.40 |
PerformanceRating | 0.7940044 | 0.45 |
RelationshipSatisfaction | 0.9023113 | 0.41 |
TotalWorkingYears | 0.1824744 | 0.83 |
TrainingTimesLastYear | 1.4506366 | 0.23 |
WorkLifeBalance | 4.2131414 | 0.01 |
YearsAtCompany | 0.7620289 | 0.47 |
YearsInCurrentRole | 2.4721299 | 0.08 |
YearsSinceLastPromotion | 1.2231552 | 0.29 |
YearsWithCurrManager | 0.9569419 | 0.38 |
comments
Pr(>F) < 0.05
indicate the difference in
means per each department is not an outcome of chance alone<-dbGetQuery(con,'SELECT Attrition ,
res COUNT(*) * 100.0/ SUM(COUNT(*)) OVER() as PERC
FROM dat_new
GROUP BY Attrition;')
res
|>
res ggplot(aes(x=fct_reorder(Attrition, PERC), PERC)) +
geom_col() +
scale_fill_avatar()+
theme_avatar()+
labs(x="STATUS",
y="count",
title="") +
coord_flip() +
geom_text(aes(label=paste0(round(PERC), "% "), hjust=1), col="white")
<-dbGetQuery(con,'SELECT Attrition,
res Department,
COUNT(*) * 100.0/ SUM(COUNT(*)) OVER() as PERC
FROM dat_new
GROUP BY Attrition,Department;')
res
%>%
res ggplot(aes(Department, Attrition, fill = PERC)) +
geom_tile(color = "white") +
geom_text(aes(label = number(PERC, big.mark = ","))) +
scale_fill_binned(low = "blue", high = "lightyellow",
label = number_format(big.mark = ",")) +
theme(legend.position = "top",
legend.key.width = unit(15, "mm")) +
labs(x = "Department",
y = "Attrition",
fill = "Frequency")
<-dbGetQuery(con,'SELECT Attrition ,
res AVG(DailyRate) as Daily_average,
AVG(MonthlyRate) as Monthly_average,
AVG(HourlyRate) as hourly_average,
AVG(YearsAtCompany) as average_number_of_years,
AVG(RelationshipSatisfaction) as average_satisfaction
FROM dat_new
GROUP BY Attrition;')
res
<-dbGetQuery(con,'SELECT Attrition,Department , AVG(TotalWorkingYears) as average_workhours ,
res AVG(DistanceFromHome) as average_distance ,
AVG(YearsAtCompany) as average_years
FROM dat_new
GROUP BY Attrition,Department ;')
res
<-dbGetQuery(con,'SELECT Attrition,JobRole, AVG(MonthlyRate) as average_rate
res FROM dat_new
GROUP BY Attrition,JobRole ;')
res
%>%
dat_new ggplot(aes(x = DailyRate, fill =Attrition)) +
geom_histogram(color = "white", alpha = 0.75,bins=30) +
theme(legend.position = "top") +
scale_x_continuous(labels = number_format(big.mark = ",")) +
scale_y_continuous(labels = number_format(big.mark = ",")) +
::scale_fill_tableau()+
ggthemeslabs(x = "Daily Rate", y = "Frequency",
fill = "Attrition",
title = "Daily Rate Distribution")
%>%
dat_new count(Attrition, JobRole) %>%
ggplot(aes(JobRole, Attrition, fill = n)) +
geom_tile(color = "white") +
geom_text(aes(label = number(n, big.mark = ","))) +
scale_fill_binned(low = "blue", high = "lightyellow",
label = number_format(big.mark = ",")) +
theme(legend.position = "top",
legend.key.width = unit(15, "mm")) +
labs(x = "JobRole",
y = "Attrition",
fill = "Frequency")+
theme(axis.text.x = element_text(angle = 30, hjust = 1,color="blue"))
Using binary correlation, I’ll include just the variables with a
correlation coefficient of at least 0.10. For our employee attrition
data set, OverTime
(Y|N) has the largest correlation,
JobLevel = 1
, MonthlyIncome <= 2,695.80
,
etc.
library(plotly)
%>%
dat_new select(-EmployeeNumber) %>%
::binarize(n_bins = 5, thresh_infreq = 0.01) %>%
correlationfunnel::correlate(Attrition__Yes) %>%
correlationfunnel::plot_correlation_funnel(interactive = FALSE) %>%
correlationfunnelggplotly() # Makes prettier, but drops the labels
library(rpart)
library(rattle)
library(tidymodels)
<-dat_new |> select(-EmployeeCount,-EmployeeNumber,-StandardHours)
attrition_df# Initialize the split
<- initial_split(attrition_df, prop = 0.8, strata = "Attrition")
split
# Extract training set
<- split %>% training()
train
# Extract testing set
<- split %>% testing() test
# Create recipe
<-
feature_selection_recipe recipe(Attrition ~ ., data = train) %>%
step_filter_missing(all_predictors(), threshold = 0.5) %>%
step_scale(all_numeric_predictors()) %>%
step_nzv(all_predictors()) %>%
prep()
<- logistic_reg() %>%
lr_model set_engine("glm")
<- workflow() %>%
attrition_wflow add_recipe(feature_selection_recipe) %>%
add_model(lr_model)
f_meas()
to evaluate the model’s performance on the
test data.accuracy()
to determine accuracy of the
model# Fit workflow to train data
<-
attrition_fit %>% fit(data = train)
attrition_wflow
# Add the test predictions to the test data
<-predict(attrition_fit, test) %>%
attrition_pred_df bind_cols(test %>% select(Attrition))
# Evaluate F score
f_meas(attrition_pred_df,Attrition, .pred_class)
accuracy(attrition_pred_df, Attrition, .pred_class)
# Create modeld
<- decision_tree() %>%
dt_model set_engine("rpart")%>%
set_mode("classification")
# Add recipe and model to a workflow
<- workflow() %>%
attrition_wflow add_recipe(feature_selection_recipe) %>%
add_model(dt_model)
# Fit workflow to train data
<-
attrition_fit %>% fit(data = train)
attrition_wflow
# Add the test predictions to the test data
<-predict(attrition_fit, test) %>%
attrition_pred_df bind_cols(test %>% select(Attrition))
# Evaluate F score
f_meas(attrition_pred_df,Attrition, .pred_class)
accuracy(attrition_pred_df, Attrition, .pred_class)