library(tidyverse)
library(odbc)
library(DBI)
library(RSQLite)
library(tvthemes)
library(ggthemes)
library(scales)
dat_new <- read_csv("./input/WA_Fn-UseC_-HR-Employee-Attrition.csv")
dat_new <- dat_new %>%
mutate_if(is.character, as_factor) %>%
mutate(
EnvironmentSatisfaction = factor(EnvironmentSatisfaction, ordered = TRUE),
StockOptionLevel = factor(StockOptionLevel, ordered = TRUE),
JobLevel = factor(JobLevel, ordered = TRUE),
JobInvolvement = factor(JobInvolvement, ordered = TRUE)
) %>%
select(EmployeeNumber, Attrition, everything())
my_skim <- skimr::skim_with(numeric = skimr::sfl(p25 = NULL, p50 = NULL, p75 = NULL, hist = NULL))
my_skim(dat_new)| Name | dat_new |
| Number of rows | 1470 |
| Number of columns | 35 |
| _______________________ | |
| Column type frequency: | |
| factor | 13 |
| numeric | 22 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| Attrition | 0 | 1 | FALSE | 2 | No: 1233, Yes: 237 |
| BusinessTravel | 0 | 1 | FALSE | 3 | Tra: 1043, Tra: 277, Non: 150 |
| Department | 0 | 1 | FALSE | 3 | Res: 961, Sal: 446, Hum: 63 |
| EducationField | 0 | 1 | FALSE | 6 | Lif: 606, Med: 464, Mar: 159, Tec: 132 |
| EnvironmentSatisfaction | 0 | 1 | TRUE | 4 | 3: 453, 4: 446, 2: 287, 1: 284 |
| Gender | 0 | 1 | FALSE | 2 | Mal: 882, Fem: 588 |
| JobInvolvement | 0 | 1 | TRUE | 4 | 3: 868, 2: 375, 4: 144, 1: 83 |
| JobLevel | 0 | 1 | TRUE | 5 | 1: 543, 2: 534, 3: 218, 4: 106 |
| JobRole | 0 | 1 | FALSE | 9 | Sal: 326, Res: 292, Lab: 259, Man: 145 |
| MaritalStatus | 0 | 1 | FALSE | 3 | Mar: 673, Sin: 470, Div: 327 |
| Over18 | 0 | 1 | FALSE | 1 | Y: 1470 |
| OverTime | 0 | 1 | FALSE | 2 | No: 1054, Yes: 416 |
| StockOptionLevel | 0 | 1 | TRUE | 4 | 0: 631, 1: 596, 2: 158, 3: 85 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p100 |
|---|---|---|---|---|---|---|
| EmployeeNumber | 0 | 1 | 1024.87 | 602.02 | 1 | 2068 |
| Age | 0 | 1 | 36.92 | 9.14 | 18 | 60 |
| DailyRate | 0 | 1 | 802.49 | 403.51 | 102 | 1499 |
| DistanceFromHome | 0 | 1 | 9.19 | 8.11 | 1 | 29 |
| Education | 0 | 1 | 2.91 | 1.02 | 1 | 5 |
| EmployeeCount | 0 | 1 | 1.00 | 0.00 | 1 | 1 |
| HourlyRate | 0 | 1 | 65.89 | 20.33 | 30 | 100 |
| JobSatisfaction | 0 | 1 | 2.73 | 1.10 | 1 | 4 |
| MonthlyIncome | 0 | 1 | 6502.93 | 4707.96 | 1009 | 19999 |
| MonthlyRate | 0 | 1 | 14313.10 | 7117.79 | 2094 | 26999 |
| NumCompaniesWorked | 0 | 1 | 2.69 | 2.50 | 0 | 9 |
| PercentSalaryHike | 0 | 1 | 15.21 | 3.66 | 11 | 25 |
| PerformanceRating | 0 | 1 | 3.15 | 0.36 | 3 | 4 |
| RelationshipSatisfaction | 0 | 1 | 2.71 | 1.08 | 1 | 4 |
| StandardHours | 0 | 1 | 80.00 | 0.00 | 80 | 80 |
| TotalWorkingYears | 0 | 1 | 11.28 | 7.78 | 0 | 40 |
| TrainingTimesLastYear | 0 | 1 | 2.80 | 1.29 | 0 | 6 |
| WorkLifeBalance | 0 | 1 | 2.76 | 0.71 | 1 | 4 |
| YearsAtCompany | 0 | 1 | 7.01 | 6.13 | 0 | 40 |
| YearsInCurrentRole | 0 | 1 | 4.23 | 3.62 | 0 | 18 |
| YearsSinceLastPromotion | 0 | 1 | 2.19 | 3.22 | 0 | 15 |
| YearsWithCurrManager | 0 | 1 | 4.12 | 3.57 | 0 | 17 |
con<-dbConnect(RSQLite::SQLite(), ":memory:")
copy_to(con,dat_new)LIMIT n does the trickdbGetQuery(con,'SELECT *
FROM dat_new
LIMIT 5')dbGetQuery(con,'SELECT COUNT(*) AS num_rows
FROM dat_new')dbGetQuery(con,'SELECT COUNT(*)-COUNT(EmployeeNumber) AS missing
FROM dat_new')dbGetQuery(con,"SELECT Attrition,
MAX(MonthlyIncome) AS max_income ,
MIN(MonthlyIncome) AS min_income ,
AVG(MonthlyIncome) AS avg_income
FROM dat_new
GROUP BY Attrition;")dat_new |>
ggplot(aes(x=Attrition,y=MonthlyIncome ,fill=Attrition))+
geom_boxplot(outlier.color = "blue")t.test(MonthlyIncome~Attrition,data=dat_new,alternative="two.sided")##
## Welch Two Sample t-test
##
## data: MonthlyIncome by Attrition
## t = -7.4826, df = 412.74, p-value = 4.434e-13
## alternative hypothesis: true difference in means between group Yes and group No is not equal to 0
## 95 percent confidence interval:
## -2583.050 -1508.244
## sample estimates:
## mean in group Yes mean in group No
## 4787.093 6832.740
Comments
t=-7.4826 represents the
calculated difference in sample means in units of standard error and the
negative sign suggest the first mean is smaller than the second0.05 suggesting that the
difference in means is statistically significantmonthly income for each group is statistically
significantres<-dbGetQuery(con,'SELECT Attrition ,
COUNT(*) * 100.0/ SUM(COUNT(*)) OVER() as PERC
FROM dat_new
GROUP BY Attrition;')
resres |>
ggplot(aes(x=fct_reorder(Attrition, PERC), PERC)) +
geom_col() +
scale_fill_avatar()+
theme_avatar()+
labs(x="STATUS",
y="count",
title="") +
coord_flip() +
geom_text(aes(label=paste0(round(PERC), "% "), hjust=1), col="white")res<-dbGetQuery(con,'SELECT Attrition,
Department,
COUNT(*) * 100.0/ SUM(COUNT(*)) OVER() as PERC
FROM dat_new
GROUP BY Attrition,Department;')
resres %>%
ggplot(aes(Department, Attrition, fill = PERC)) +
geom_tile(color = "white") +
geom_text(aes(label = number(PERC, big.mark = ","))) +
scale_fill_binned(low = "blue", high = "lightyellow",
label = number_format(big.mark = ",")) +
theme(legend.position = "top",
legend.key.width = unit(15, "mm")) +
labs(x = "Department",
y = "Attrition",
fill = "Frequency")res<-dbGetQuery(con,'SELECT Attrition ,
AVG(DailyRate) as Daily_average,
AVG(MonthlyRate) as Monthly_average,
AVG(HourlyRate) as hourly_average,
AVG(YearsAtCompany) as average_number_of_years,
AVG(RelationshipSatisfaction) as average_satisfaction
FROM dat_new
GROUP BY Attrition;')
resres<-dbGetQuery(con,'SELECT Attrition,Department , AVG(TotalWorkingYears) as average_workhours ,
AVG(DistanceFromHome) as average_distance ,
AVG(YearsAtCompany) as average_years
FROM dat_new
GROUP BY Attrition,Department ;')
resres<-dbGetQuery(con,'SELECT Attrition,JobRole, AVG(MonthlyRate) as average_rate
FROM dat_new
GROUP BY Attrition,JobRole ;')
resdat_new %>%
ggplot(aes(x = DailyRate, fill =Attrition)) +
geom_histogram(color = "white", alpha = 0.75,bins=30) +
theme(legend.position = "top") +
scale_x_continuous(labels = number_format(big.mark = ",")) +
scale_y_continuous(labels = number_format(big.mark = ",")) +
ggthemes::scale_fill_tableau()+
labs(x = "Daily Rate", y = "Frequency",
fill = "Attrition",
title = "Daily Rate Distribution")dat_new %>%
count(Attrition, JobRole) %>%
ggplot(aes(JobRole, Attrition, fill = n)) +
geom_tile(color = "white") +
geom_text(aes(label = number(n, big.mark = ","))) +
scale_fill_binned(low = "blue", high = "lightyellow",
label = number_format(big.mark = ",")) +
theme(legend.position = "top",
legend.key.width = unit(15, "mm")) +
labs(x = "JobRole",
y = "Attrition",
fill = "Frequency")+
theme(axis.text.x = element_text(angle = 30, hjust = 1,color="blue"))Using binary correlation, I’ll include just the variables with a
correlation coefficient of at least 0.10. For our employee attrition
data set, OverTime (Y|N) has the largest correlation,
JobLevel = 1, MonthlyIncome <= 2,695.80,
etc.
library(plotly)
dat_new %>%
select(-EmployeeNumber) %>%
correlationfunnel::binarize(n_bins = 5, thresh_infreq = 0.01) %>%
correlationfunnel::correlate(Attrition__Yes) %>%
correlationfunnel::plot_correlation_funnel(interactive = FALSE) %>%
ggplotly() # Makes prettier, but drops the labels