library(tidyverse)
library(odbc)
library(DBI)
library(RSQLite)
library(tvthemes)
library(ggthemes)
library(scales)
<- read_csv("./input/WA_Fn-UseC_-HR-Employee-Attrition.csv")
dat_new <- dat_new %>%
dat_new mutate_if(is.character, as_factor) %>%
mutate(
EnvironmentSatisfaction = factor(EnvironmentSatisfaction, ordered = TRUE),
StockOptionLevel = factor(StockOptionLevel, ordered = TRUE),
JobLevel = factor(JobLevel, ordered = TRUE),
JobInvolvement = factor(JobInvolvement, ordered = TRUE)
%>%
) select(EmployeeNumber, Attrition, everything())
<- skimr::skim_with(numeric = skimr::sfl(p25 = NULL, p50 = NULL, p75 = NULL, hist = NULL))
my_skim my_skim(dat_new)
Name | dat_new |
Number of rows | 1470 |
Number of columns | 35 |
_______________________ | |
Column type frequency: | |
factor | 13 |
numeric | 22 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
Attrition | 0 | 1 | FALSE | 2 | No: 1233, Yes: 237 |
BusinessTravel | 0 | 1 | FALSE | 3 | Tra: 1043, Tra: 277, Non: 150 |
Department | 0 | 1 | FALSE | 3 | Res: 961, Sal: 446, Hum: 63 |
EducationField | 0 | 1 | FALSE | 6 | Lif: 606, Med: 464, Mar: 159, Tec: 132 |
EnvironmentSatisfaction | 0 | 1 | TRUE | 4 | 3: 453, 4: 446, 2: 287, 1: 284 |
Gender | 0 | 1 | FALSE | 2 | Mal: 882, Fem: 588 |
JobInvolvement | 0 | 1 | TRUE | 4 | 3: 868, 2: 375, 4: 144, 1: 83 |
JobLevel | 0 | 1 | TRUE | 5 | 1: 543, 2: 534, 3: 218, 4: 106 |
JobRole | 0 | 1 | FALSE | 9 | Sal: 326, Res: 292, Lab: 259, Man: 145 |
MaritalStatus | 0 | 1 | FALSE | 3 | Mar: 673, Sin: 470, Div: 327 |
Over18 | 0 | 1 | FALSE | 1 | Y: 1470 |
OverTime | 0 | 1 | FALSE | 2 | No: 1054, Yes: 416 |
StockOptionLevel | 0 | 1 | TRUE | 4 | 0: 631, 1: 596, 2: 158, 3: 85 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p100 |
---|---|---|---|---|---|---|
EmployeeNumber | 0 | 1 | 1024.87 | 602.02 | 1 | 2068 |
Age | 0 | 1 | 36.92 | 9.14 | 18 | 60 |
DailyRate | 0 | 1 | 802.49 | 403.51 | 102 | 1499 |
DistanceFromHome | 0 | 1 | 9.19 | 8.11 | 1 | 29 |
Education | 0 | 1 | 2.91 | 1.02 | 1 | 5 |
EmployeeCount | 0 | 1 | 1.00 | 0.00 | 1 | 1 |
HourlyRate | 0 | 1 | 65.89 | 20.33 | 30 | 100 |
JobSatisfaction | 0 | 1 | 2.73 | 1.10 | 1 | 4 |
MonthlyIncome | 0 | 1 | 6502.93 | 4707.96 | 1009 | 19999 |
MonthlyRate | 0 | 1 | 14313.10 | 7117.79 | 2094 | 26999 |
NumCompaniesWorked | 0 | 1 | 2.69 | 2.50 | 0 | 9 |
PercentSalaryHike | 0 | 1 | 15.21 | 3.66 | 11 | 25 |
PerformanceRating | 0 | 1 | 3.15 | 0.36 | 3 | 4 |
RelationshipSatisfaction | 0 | 1 | 2.71 | 1.08 | 1 | 4 |
StandardHours | 0 | 1 | 80.00 | 0.00 | 80 | 80 |
TotalWorkingYears | 0 | 1 | 11.28 | 7.78 | 0 | 40 |
TrainingTimesLastYear | 0 | 1 | 2.80 | 1.29 | 0 | 6 |
WorkLifeBalance | 0 | 1 | 2.76 | 0.71 | 1 | 4 |
YearsAtCompany | 0 | 1 | 7.01 | 6.13 | 0 | 40 |
YearsInCurrentRole | 0 | 1 | 4.23 | 3.62 | 0 | 18 |
YearsSinceLastPromotion | 0 | 1 | 2.19 | 3.22 | 0 | 15 |
YearsWithCurrManager | 0 | 1 | 4.12 | 3.57 | 0 | 17 |
<-dbConnect(RSQLite::SQLite(), ":memory:")
concopy_to(con,dat_new)
LIMIT n
does the trickdbGetQuery(con,'SELECT *
FROM dat_new
LIMIT 5')
dbGetQuery(con,'SELECT COUNT(*) AS num_rows
FROM dat_new')
dbGetQuery(con,'SELECT COUNT(*)-COUNT(EmployeeNumber) AS missing
FROM dat_new')
dbGetQuery(con,"SELECT Attrition,
MAX(MonthlyIncome) AS max_income ,
MIN(MonthlyIncome) AS min_income ,
AVG(MonthlyIncome) AS avg_income
FROM dat_new
GROUP BY Attrition;")
|>
dat_new ggplot(aes(x=Attrition,y=MonthlyIncome ,fill=Attrition))+
geom_boxplot(outlier.color = "blue")
t.test(MonthlyIncome~Attrition,data=dat_new,alternative="two.sided")
##
## Welch Two Sample t-test
##
## data: MonthlyIncome by Attrition
## t = -7.4826, df = 412.74, p-value = 4.434e-13
## alternative hypothesis: true difference in means between group Yes and group No is not equal to 0
## 95 percent confidence interval:
## -2583.050 -1508.244
## sample estimates:
## mean in group Yes mean in group No
## 4787.093 6832.740
Comments
t=-7.4826
represents the
calculated difference in sample means in units of standard error and the
negative sign suggest the first mean is smaller than the second0.05
suggesting that the
difference in means is statistically significantmonthly income
for each group is statistically
significant<-dbGetQuery(con,'SELECT Attrition ,
res COUNT(*) * 100.0/ SUM(COUNT(*)) OVER() as PERC
FROM dat_new
GROUP BY Attrition;')
res
|>
res ggplot(aes(x=fct_reorder(Attrition, PERC), PERC)) +
geom_col() +
scale_fill_avatar()+
theme_avatar()+
labs(x="STATUS",
y="count",
title="") +
coord_flip() +
geom_text(aes(label=paste0(round(PERC), "% "), hjust=1), col="white")
<-dbGetQuery(con,'SELECT Attrition,
res Department,
COUNT(*) * 100.0/ SUM(COUNT(*)) OVER() as PERC
FROM dat_new
GROUP BY Attrition,Department;')
res
%>%
res ggplot(aes(Department, Attrition, fill = PERC)) +
geom_tile(color = "white") +
geom_text(aes(label = number(PERC, big.mark = ","))) +
scale_fill_binned(low = "blue", high = "lightyellow",
label = number_format(big.mark = ",")) +
theme(legend.position = "top",
legend.key.width = unit(15, "mm")) +
labs(x = "Department",
y = "Attrition",
fill = "Frequency")
<-dbGetQuery(con,'SELECT Attrition ,
res AVG(DailyRate) as Daily_average,
AVG(MonthlyRate) as Monthly_average,
AVG(HourlyRate) as hourly_average,
AVG(YearsAtCompany) as average_number_of_years,
AVG(RelationshipSatisfaction) as average_satisfaction
FROM dat_new
GROUP BY Attrition;')
res
<-dbGetQuery(con,'SELECT Attrition,Department , AVG(TotalWorkingYears) as average_workhours ,
res AVG(DistanceFromHome) as average_distance ,
AVG(YearsAtCompany) as average_years
FROM dat_new
GROUP BY Attrition,Department ;')
res
<-dbGetQuery(con,'SELECT Attrition,JobRole, AVG(MonthlyRate) as average_rate
res FROM dat_new
GROUP BY Attrition,JobRole ;')
res
%>%
dat_new ggplot(aes(x = DailyRate, fill =Attrition)) +
geom_histogram(color = "white", alpha = 0.75,bins=30) +
theme(legend.position = "top") +
scale_x_continuous(labels = number_format(big.mark = ",")) +
scale_y_continuous(labels = number_format(big.mark = ",")) +
::scale_fill_tableau()+
ggthemeslabs(x = "Daily Rate", y = "Frequency",
fill = "Attrition",
title = "Daily Rate Distribution")
%>%
dat_new count(Attrition, JobRole) %>%
ggplot(aes(JobRole, Attrition, fill = n)) +
geom_tile(color = "white") +
geom_text(aes(label = number(n, big.mark = ","))) +
scale_fill_binned(low = "blue", high = "lightyellow",
label = number_format(big.mark = ",")) +
theme(legend.position = "top",
legend.key.width = unit(15, "mm")) +
labs(x = "JobRole",
y = "Attrition",
fill = "Frequency")+
theme(axis.text.x = element_text(angle = 30, hjust = 1,color="blue"))
Using binary correlation, I’ll include just the variables with a
correlation coefficient of at least 0.10. For our employee attrition
data set, OverTime
(Y|N) has the largest correlation,
JobLevel = 1
, MonthlyIncome <= 2,695.80
,
etc.
library(plotly)
%>%
dat_new select(-EmployeeNumber) %>%
::binarize(n_bins = 5, thresh_infreq = 0.01) %>%
correlationfunnel::correlate(Attrition__Yes) %>%
correlationfunnel::plot_correlation_funnel(interactive = FALSE) %>%
correlationfunnelggplotly() # Makes prettier, but drops the labels