library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(ggplot2)
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.0.5
library(data.table)
library(car)
## Warning: package 'car' was built under R version 4.0.5
## Loading required package: carData
library(caTools)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.0.5
## corrplot 0.88 loaded
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.0.5
# Load the HR dataset; fread() returns a data.table.
dados_rh <- fread("dados/dataset.csv")

# Columns converted to factor.
colunas_fator <- c(
  "Attrition", "BusinessTravel", "Department", "Education", "EducationField",
  "Employee Source", "EnvironmentSatisfaction", "Gender", "JobInvolvement",
  "JobLevel", "JobRole", "JobSatisfaction", "MaritalStatus", "OverTime",
  "PerformanceRating", "RelationshipSatisfaction", "StockOptionLevel",
  "WorkLifeBalance"
)
for (coluna in colunas_fator) {
  set(dados_rh, j = coluna, value = as.factor(dados_rh[[coluna]]))
}

# Columns coerced to integer.
colunas_inteiras <- c("DistanceFromHome", "MonthlyIncome", "PercentSalaryHike")
for (coluna in colunas_inteiras) {
  set(dados_rh, j = coluna, value = as.integer(dados_rh[[coluna]]))
}

# Copy of the table with unused factor levels dropped.
# NOTE(review): `dados` is not referenced again in the visible part of this script.
dados <- droplevels(dados_rh)
summary(dados_rh)
## Age Attrition BusinessTravel
## Min. :18.00 Current employee :19370 Non-Travel : 2344
## 1st Qu.:30.00 Termination : 87 Travel_Frequently: 4378
## Median :36.00 Voluntary Resignation: 3601 Travel_Rarely :16336
## Mean :37.04
## 3rd Qu.:43.00
## Max. :60.00
##
## Department DistanceFromHome Education
## Human Resources : 1010 Min. : 1.000 1:2659
## Research & Development:15040 1st Qu.: 2.000 2:4436
## Sales : 7008 Median : 7.000 3:8930
## Mean : 9.215 4:6279
## 3rd Qu.:14.000 5: 754
## Max. :29.000
##
## EducationField EnvironmentSatisfaction Gender JobInvolvement
## Human Resources : 442 1:4490 Female: 9205 1: 1287
## Life Sciences :9513 2:4476 Male :13853 2: 5888
## Marketing :2484 3:7091 3:13644
## Medical :7267 4:7001 4: 2239
## Other :1291
## Technical Degree:2061
##
## JobLevel JobRole JobSatisfaction MaritalStatus
## 1:8594 Sales Executive :5067 1:4575 Divorced: 5163
## 2:8448 Research Scientist :4591 2:4371 Married :10543
## 3:3440 Laboratory Technician :4112 3:6938 Single : 7352
## 4:1563 Manufacturing Director :2346 4:7174
## 5:1013 Healthcare Representative:2069
## Manager :1521
## (Other) :3352
## MonthlyIncome NumCompaniesWorked OverTime PercentSalaryHike
## Min. : 1009 Min. :0.000 No :16524 Min. :11.00
## 1st Qu.: 2900 1st Qu.:1.000 Yes: 6534 1st Qu.:12.00
## Median : 4898 Median :2.000 Median :14.00
## Mean : 6416 Mean :2.691 Mean :15.22
## 3rd Qu.: 8120 3rd Qu.:4.000 3rd Qu.:18.00
## Max. :19999 Max. :9.000 Max. :25.00
##
## PerformanceRating RelationshipSatisfaction StockOptionLevel TotalWorkingYears
## 3:19478 1:4331 0:9873 Min. : 0.00
## 4: 3580 2:4762 1:9370 1st Qu.: 6.00
## 3:7164 2:2497 Median :10.00
## 4:6801 3:1318 Mean :11.07
## 3rd Qu.:15.00
## Max. :40.00
##
## TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole
## Min. :0.000 1: 1263 Min. : 0.00 Min. : 0.000
## 1st Qu.:2.000 2: 5374 1st Qu.: 3.00 1st Qu.: 2.000
## Median :3.000 3:14016 Median : 5.00 Median : 3.000
## Mean :2.804 4: 2405 Mean : 6.91 Mean : 4.201
## 3rd Qu.:3.000 3rd Qu.: 9.00 3rd Qu.: 7.000
## Max. :6.000 Max. :40.00 Max. :18.000
##
## YearsSinceLastPromotion YearsWithCurrManager Employee Source
## Min. : 0.000 Min. : 0.000 Company Website:5327
## 1st Qu.: 0.000 1st Qu.: 2.000 Seek :3655
## Median : 1.000 Median : 3.000 Indeed :2471
## Mean : 2.164 Mean : 4.091 Jora :2408
## 3rd Qu.: 3.000 3rd Qu.: 7.000 LinkedIn :2294
## Max. :15.000 Max. :17.000 Recruit.net :2283
## (Other) :4620
## AgeStartedWorking
## Min. : 0.00
## 1st Qu.:20.00
## Median :25.00
## Mean :25.96
## 3rd Qu.:31.00
## Max. :60.00
##
Criamos uma coluna de anos anteriores de experiência para visualizar melhor o perfil de experiência do funcionário.
# Years worked before joining the current company: total career minus tenure here.
dados_rh$PriorYearsOfExperience <- with(dados_rh, TotalWorkingYears - YearsAtCompany)
A estabilidade no emprego (job tenure) é a medida do tempo que um funcionário está empregado por seu empregador atual. A estabilidade no emprego de um funcionário é muito importante e muitas vezes os empregadores consideram a estabilidade no emprego um critério para a contratação de novos funcionários. A permanência no emprego pode ser longa ou curta.
Criamos um novo recurso de estabilidade média para traçar o perfil de permanência média dos funcionários em empresas anteriores.
# Average tenure at previous employers: prior experience divided by the number
# of companies worked for. NumCompaniesWorked can be 0, so this division can
# produce non-finite values (Inf / NaN).
dados_rh$AverageTenure <- dados_rh$PriorYearsOfExperience / dados_rh$NumCompaniesWorked
# The data viewer needs an interactive session; skip it when the script is run
# in batch mode (Rscript, knitting) so the run does not stop here.
if (interactive()) {
  View(dados_rh)
}
A estabilidade média produz valores como Inf (divisão por zero quando NumCompaniesWorked é 0) devido à natureza de sua derivação. Substituímos esses valores por zero.
# Replace every non-finite AverageTenure (Inf, NaN, NA) with 0.
nao_finito <- !is.finite(dados_rh$AverageTenure)
dados_rh$AverageTenure[nao_finito] <- 0
Analisamos e dividimos os dados com base na coluna Attrition, cujo valor Termination indica que o funcionário foi desligado da empresa.
# Everyone except terminated employees; droplevels() removes the now-empty
# "Termination" level from Attrition.
dados_rh_1 <- droplevels(dados_rh[Attrition != "Termination"])
dim(dados_rh_1)
## [1] 22971 32
summary(dados_rh_1)
## Age Attrition BusinessTravel
## Min. :18.00 Current employee :19370 Non-Travel : 2344
## 1st Qu.:30.00 Voluntary Resignation: 3601 Travel_Frequently: 4363
## Median :36.00 Travel_Rarely :16264
## Mean :37.06
## 3rd Qu.:43.00
## Max. :60.00
##
## Department DistanceFromHome Education
## Human Resources : 1010 Min. : 1.000 1:2659
## Research & Development:14977 1st Qu.: 2.000 2:4421
## Sales : 6984 Median : 7.000 3:8890
## Mean : 9.191 4:6247
## 3rd Qu.:14.000 5: 754
## Max. :29.000
##
## EducationField EnvironmentSatisfaction Gender JobInvolvement
## Human Resources : 442 1:4482 Female: 9173 1: 1279
## Life Sciences :9494 2:4460 Male :13798 2: 5849
## Marketing :2484 3:7067 3:13612
## Medical :7215 4:6962 4: 2231
## Other :1291
## Technical Degree:2045
##
## JobLevel JobRole JobSatisfaction MaritalStatus
## 1:8547 Sales Executive :5051 1:4543 Divorced: 5148
## 2:8432 Research Scientist :4576 2:4355 Married :10502
## 3:3424 Laboratory Technician :4088 3:6914 Single : 7321
## 4:1563 Manufacturing Director :2338 4:7159
## 5:1005 Healthcare Representative:2061
## Manager :1513
## (Other) :3344
## MonthlyIncome NumCompaniesWorked OverTime PercentSalaryHike
## Min. : 1009 Min. :0.000 No :16476 Min. :11.00
## 1st Qu.: 2909 1st Qu.:1.000 Yes: 6495 1st Qu.:12.00
## Median : 4898 Median :2.000 Median :14.00
## Mean : 6418 Mean :2.688 Mean :15.22
## 3rd Qu.: 8120 3rd Qu.:4.000 3rd Qu.:18.00
## Max. :19999 Max. :9.000 Max. :25.00
##
## PerformanceRating RelationshipSatisfaction StockOptionLevel TotalWorkingYears
## 3:19407 1:4315 0:9826 Min. : 0.00
## 4: 3564 2:4739 1:9330 1st Qu.: 6.00
## 3:7132 2:2497 Median :10.00
## 4:6785 3:1318 Mean :11.08
## 3rd Qu.:15.00
## Max. :40.00
##
## TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole
## Min. :0.000 1: 1255 Min. : 0.000 Min. : 0.000
## 1st Qu.:2.000 2: 5359 1st Qu.: 3.000 1st Qu.: 2.000
## Median :3.000 3:13960 Median : 5.000 Median : 3.000
## Mean :2.805 4: 2397 Mean : 6.914 Mean : 4.202
## 3rd Qu.:3.000 3rd Qu.: 9.000 3rd Qu.: 7.000
## Max. :6.000 Max. :40.000 Max. :18.000
##
## YearsSinceLastPromotion YearsWithCurrManager Employee Source
## Min. : 0.000 Min. : 0.000 Company Website:5307
## 1st Qu.: 0.000 1st Qu.: 2.000 Seek :3622
## Median : 1.000 Median : 3.000 Indeed :2459
## Mean : 2.167 Mean : 4.096 Jora :2398
## 3rd Qu.: 3.000 3rd Qu.: 7.000 LinkedIn :2294
## Max. :15.000 Max. :17.000 Recruit.net :2273
## (Other) :4618
## AgeStartedWorking PriorYearsOfExperience AverageTenure
## Min. : 0.00 Min. : 0.000 Min. : 0.0000
## 1st Qu.:20.00 1st Qu.: 0.000 1st Qu.: 0.0000
## Median :25.00 Median : 2.000 Median : 0.3333
## Mean :25.98 Mean : 4.165 Mean : 1.7700
## 3rd Qu.:31.00 3rd Qu.: 5.000 3rd Qu.: 1.5000
## Max. :60.00 Max. :40.000 Max. :40.0000
##
# Everyone except voluntary resignations; droplevels() removes the now-empty
# "Voluntary Resignation" level from Attrition.
dados_rh_2 <- droplevels(dados_rh[Attrition != "Voluntary Resignation"])
dim(dados_rh_2)
## [1] 19457 32
summary(dados_rh_2)
## Age Attrition BusinessTravel
## Min. :18.00 Current employee:19370 Non-Travel : 2154
## 1st Qu.:31.00 Termination : 87 Travel_Frequently: 3306
## Median :36.00 Travel_Rarely :13997
## Mean :37.61
## 3rd Qu.:43.00
## Max. :60.00
##
## Department DistanceFromHome Education
## Human Resources : 821 Min. : 1.000 1:2178
## Research & Development:13031 1st Qu.: 2.000 2:3762
## Sales : 5605 Median : 7.000 3:7455
## Mean : 8.969 4:5387
## 3rd Qu.:13.000 5: 675
## Max. :29.000
##
## EducationField EnvironmentSatisfaction Gender JobInvolvement
## Human Resources : 367 1:3636 Female: 7807 1: 982
## Life Sciences :8170 2:3749 Male :11650 2: 4858
## Marketing :2070 3:6067 3:11652
## Medical :6186 4:6005 4: 1965
## Other :1065
## Technical Degree:1599
##
## JobLevel JobRole JobSatisfaction MaritalStatus
## 1:6837 Sales Executive :4245 1:3698 Divorced:4581
## 2:7383 Research Scientist :3860 2:3701 Married :9167
## 3:2911 Laboratory Technician :3365 3:5821 Single :5709
## 4:1386 Manufacturing Director :2066 4:6237
## 5: 940 Healthcare Representative:1780
## Manager :1402
## (Other) :2739
## MonthlyIncome NumCompaniesWorked OverTime PercentSalaryHike
## Min. : 1009 Min. :0.000 No :14485 Min. :11.00
## 1st Qu.: 2996 1st Qu.:1.000 Yes: 4972 1st Qu.:12.00
## Median : 5006 Median :2.000 Median :14.00
## Mean : 6584 Mean :2.647 Mean :15.26
## 3rd Qu.: 8396 3rd Qu.:4.000 3rd Qu.:18.00
## Max. :19999 Max. :9.000 Max. :25.00
##
## PerformanceRating RelationshipSatisfaction StockOptionLevel TotalWorkingYears
## 3:16416 1:3581 0:7769 Min. : 0.00
## 4: 3041 2:4141 1:8361 1st Qu.: 6.00
## 3:5983 2:2220 Median :10.00
## 4:5752 3:1107 Mean :11.37
## 3rd Qu.:15.00
## Max. :40.00
##
## TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole
## Min. :0.000 1: 1017 Min. : 0.000 Min. : 0.000
## 1st Qu.:2.000 2: 4512 1st Qu.: 3.000 1st Qu.: 2.000
## Median :3.000 3:11869 Median : 5.000 Median : 3.000
## Mean :2.828 4: 2059 Mean : 7.079 Mean : 4.342
## 3rd Qu.:3.000 3rd Qu.:10.000 3rd Qu.: 7.000
## Max. :6.000 Max. :40.000 Max. :18.000
##
## YearsSinceLastPromotion YearsWithCurrManager Employee Source
## Min. : 0.000 Min. : 0.000 Company Website:4409
## 1st Qu.: 0.000 1st Qu.: 2.000 Seek :3067
## Median : 1.000 Median : 3.000 Indeed :2151
## Mean : 2.196 Mean : 4.196 LinkedIn :1986
## 3rd Qu.: 3.000 3rd Qu.: 7.000 Jora :1969
## Max. :15.000 Max. :17.000 Recruit.net :1951
## (Other) :3924
## AgeStartedWorking PriorYearsOfExperience AverageTenure
## Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.:20.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median :25.00 Median : 2.00 Median : 0.375
## Mean :26.24 Mean : 4.29 Mean : 1.860
## 3rd Qu.:32.00 3rd Qu.: 6.00 3rd Qu.: 1.667
## Max. :60.00 Max. :40.00 Max. :40.000
##
library(ggplot2)
# Head count per gender.
ggplot(dados_rh, aes(x = Gender)) +
  geom_bar()
Temos mais registros do sexo masculino do que do feminino.
# Age is numeric, so a density curve is used to show its distribution
# (bar charts are used for the categorical variables).
ggplot(dados_rh, aes(x = Age)) +
  geom_density()
A maioria dos funcionários está entre 30 e 40 anos.
# Head count per Attrition status.
ggplot(dados_rh, aes(x = Attrition)) +
  geom_bar()
A grande maioria dos funcionários continua empregada na empresa; uma parte pediu demissão e uma minoria foi demitida.
# Head count per department.
ggplot(dados_rh, aes(x = Department)) +
  geom_bar()
A maioria é de pesquisa e desenvolvimento, vendas tem uma parcela significativa e recursos humanos é a minoria dentro do quadro de funcionários.
# Head count per job role.
ggplot(dados_rh, aes(x = JobRole)) +
  geom_bar()
Temos detalhado o quadro de funções dentro da IBM
# Head count per education level, one panel (column) per education field.
ggplot(dados_rh, aes(x = Education)) +
  geom_bar() +
  facet_grid(cols = vars(EducationField))
Temos a base da educação do quadro de funcionários: a maioria está em ciências da vida ou na área médica.
# One density plot per tenure/experience measure.
p.TotalWorkingYears <- ggplot(dados_rh, aes(x = TotalWorkingYears)) + geom_density()
p.YearsAtCompany <- ggplot(dados_rh, aes(x = YearsAtCompany)) + geom_density()
p.YearsSinceLastPromotion <- ggplot(dados_rh, aes(x = YearsSinceLastPromotion)) + geom_density()
p.YearsWithCurrManager <- ggplot(dados_rh, aes(x = YearsWithCurrManager)) + geom_density()
p.YearsInCurrentRole <- ggplot(dados_rh, aes(x = YearsInCurrentRole)) + geom_density()
p.PriorYearsOfExperience <- ggplot(dados_rh, aes(x = PriorYearsOfExperience)) + geom_density()

# Draw the six plots together on a 2 x 3 grid.
grid.arrange(
  grobs = list(
    p.TotalWorkingYears,
    p.YearsAtCompany,
    p.YearsSinceLastPromotion,
    p.YearsWithCurrManager,
    p.YearsInCurrentRole,
    p.PriorYearsOfExperience
  ),
  nrow = 2,
  ncol = 3
)
Comparando as variáveis, podemos constatar que as densidades são maiores nos primeiros anos e caem conforme o tempo passa; nada de anormal dentro de uma empresa.
O estudo vai ser da seguinte forma: queremos saber o tempo de serviço de cada colaborador. Vamos descobrir a proporção de funcionários com menos de alguns anos de experiência. Escolhemos os valores de 1, 3, 5, 7 e 10 anos.
# Share of employees whose prior experience is below each cutoff (in years).
# Missing values are not counted in the numerator but stay in the denominator.
anos_previos <- dados_rh$PriorYearsOfExperience
sum(anos_previos < 1, na.rm = TRUE) / length(anos_previos)
## [1] 0.3246596
sum(anos_previos < 3, na.rm = TRUE) / length(anos_previos)
## [1] 0.5828346
sum(anos_previos < 5, na.rm = TRUE) / length(anos_previos)
## [1] 0.7085177
sum(anos_previos < 7, na.rm = TRUE) / length(anos_previos)
## [1] 0.7952121
sum(anos_previos < 10, na.rm = TRUE) / length(anos_previos)
## [1] 0.8589644
58% dos funcionários têm menos de 3 anos de experiência de trabalho antes de entrar na IBM
Possíveis problemas: conjuntos de habilidades subdesenvolvidos, base de jovens funcionários, mentalidade de “trabalho” imatura.
# Share of employees younger than 30.
sum(dados_rh$Age < 30, na.rm = TRUE) / nrow(dados_rh)
## [1] 0.2165409
Insight: temos dentro da empresa apenas 22% dos funcionários com idade inferior a 30 anos. Ou seja, a base de funcionários não é tão jovem.
# Head count per education level.
summary(dados_rh$Education)
## 1 2 3 4 5
## 2659 4436 8930 6279 754
# Share of employees at education levels 3 and 4.
sum(dados_rh$Education == "3", na.rm = TRUE) / nrow(dados_rh)
## [1] 0.3872842
sum(dados_rh$Education == "4", na.rm = TRUE) / nrow(dados_rh)
## [1] 0.2723133
Insight Educação
Cerca de 39% dos funcionários são graduados e 27% realizaram o mestrado. A busca pelo ensino superior pode ter levado a uma diminuição da experiência de trabalho.
Boxplot mostrando a distribuição do salário mensal para todos os 4 níveis de satisfação no trabalho de 1-4
# Monthly income distribution per job-satisfaction level (missing levels removed).
ggplot(dados_rh[!is.na(JobSatisfaction)]) +
  geom_boxplot(aes(x = JobSatisfaction, y = MonthlyIncome))
Não há sinais de que um salário mais alto leve a uma maior satisfação no trabalho.
Temos outliers em todos os níveis de satisfação no trabalho.
Resumindo, salário mais alto não é garantia de satisfação no trabalho.
# Correlations between the tenure, experience and income measures.
# use = "complete.obs" discards any observation with a missing value.
with(dados_rh, cor(TotalWorkingYears, YearsAtCompany, use = "complete.obs"))
## [1] 0.624816
with(dados_rh, cor(YearsAtCompany, YearsInCurrentRole, use = "complete.obs"))
## [1] 0.7670497
with(dados_rh, cor(YearsAtCompany, YearsSinceLastPromotion, use = "complete.obs"))
## [1] 0.6236737
with(dados_rh, cor(YearsAtCompany, YearsWithCurrManager, use = "complete.obs"))
## [1] 0.7728072
with(dados_rh, cor(TotalWorkingYears, MonthlyIncome, use = "complete.obs"))
## [1] 0.7582066
with(dados_rh, cor(YearsAtCompany, MonthlyIncome, use = "complete.obs"))
## [1] 0.4981578
Todos os resultados indicam correlação positiva (valores maiores que zero); quanto mais perto de 1, mais forte é a correlação.
Agora vamos colocar em gráficos as duas últimas correlações: TotalWorkingYears e YearsAtCompany, cada uma em relação a MonthlyIncome.
# Monthly income against total working years.
ggplot(dados_rh, aes(x = TotalWorkingYears, y = MonthlyIncome)) +
  geom_point()
Aparentemente há uma tendência de crescimento, nada fora do normal.
# Monthly income against years at the company.
ggplot(dados_rh, aes(x = YearsAtCompany, y = MonthlyIncome)) +
  geom_point()
À medida que uma variável aumenta, a outra também aumenta, o que corresponde à normalidade.
# Monthly income distribution per work-life-balance rating (missing ratings removed).
ggplot(dados_rh[!is.na(WorkLifeBalance)]) +
  geom_boxplot(aes(x = WorkLifeBalance, y = MonthlyIncome))
Os funcionários que avaliaram o equilíbrio entre vida profissional e pessoal como 1 também têm renda média mensal significativamente mais baixa. Baixo equilíbrio entre vida profissional e pessoal e baixo salário? Um problema que o departamento de RH precisa examinar. O número 1 indica baixo equilíbrio entre vida pessoal e profissional; o número 4 indica alto equilíbrio.
O RH tem que observar o grupo número 1 e entender o que pode estar acontecendo com a vida desses funcionários. Uma eventual promoção ou aumento de salário pode ser um fator motivacional.
Ou uma possível demissão.
# Monthly income by gender (rows with missing Gender removed), drawn horizontally
# with the legend hidden and a centred title.
ggplot(dados_rh[!is.na(Gender)], aes(x = Gender, y = MonthlyIncome, fill = Gender)) +
  geom_boxplot() +
  coord_flip() +
  labs(title = "Salário Mensal Entre Gêneros", x = "Gender", y = "Monthly Income") +
  theme(plot.title = element_text(size = 10, hjust = 0.5), legend.position = "none")
Insight: Não há sinais de discriminação de gênero; na verdade, as mulheres ganham um pouco mais, em média, desconsiderando todos os outros fatores.
# Monthly income per job role (rows with missing JobRole removed).
ggplot(dados_rh[!is.na(JobRole)], aes(x = JobRole, y = MonthlyIncome)) +
  geom_boxplot() +
  labs(title = "Salário Mensal Por Função")
# Age at which employees started working, per job role.
ggplot(dados_rh[!is.na(JobRole)], aes(x = JobRole, y = AgeStartedWorking)) +
  geom_boxplot() +
  labs(title = "Idade Que Iniciou na Função")
O salário mensal muda conforme a função.
# Age per job role (rows with missing JobRole removed).
ggplot(dados_rh[!is.na(JobRole)], aes(x = JobRole, y = Age)) +
  geom_boxplot() +
  labs(title = "Idade Por Função")
# Years at the company per job role.
ggplot(dados_rh[!is.na(JobRole)], aes(x = JobRole, y = YearsAtCompany)) +
  geom_boxplot() +
  labs(title = "Tempo de Empresa (em anos)")
Quem ocupa o cargo de gerente fica mais tempo na empresa do que nas outras funções. O corpo de gestores forma os líderes da empresa.
# Education-level mix within each job role, as proportions; na.omit() drops
# every row that has a missing value in any column.
ggplot(na.omit(dados_rh), aes(x = JobRole, fill = Education)) +
  geom_bar(position = "fill") +
  labs(title = "Nível de Educação Por Função", y = "Proportion")
Para o nível 5 (cor rosa), os funcionários com doutorado estão na área de pesquisa e desenvolvimento.
# Attrition share per education level, one panel per department.
ggplot(dados_rh_1, aes(x = Education, fill = Attrition)) +
  geom_bar(position = "fill") +
  facet_grid(cols = vars(Department))
# Attrition share per education level, one panel per job role.
ggplot(dados_rh_1, aes(x = Education, fill = Attrition)) +
  geom_bar(position = "fill") +
  facet_grid(cols = vars(JobRole))
# Attrition share per education field, one panel per job role; x labels are
# rotated so the field names stay readable.
ggplot(dados_rh_1, aes(x = EducationField, fill = Attrition)) +
  geom_bar(position = "fill") +
  facet_grid(cols = vars(JobRole)) +
  theme(axis.text.x = element_text(hjust = 0, angle = -90))
# Multivariate analysis plots for variables normally used after the hiring process.
# Bar charts use position = "fill", so each bar shows the Attrition share
# within that value of the x variable.
ggplot(dados_rh_1, aes(x = Age, fill = Attrition)) + geom_bar(position = "fill")
ggplot(dados_rh_1, aes(x = Department, fill = Attrition)) + geom_bar(position = "fill")
ggplot(dados_rh_1, aes(x = DistanceFromHome, fill = Attrition)) + geom_bar(position = "fill")
ggplot(dados_rh_1, aes(x = `Employee Source`, fill = Attrition)) + geom_bar(position = "fill")
ggplot(dados_rh_1, aes(x = JobRole, fill = Attrition)) + geom_bar(position = "fill")
ggplot(dados_rh_1, aes(x = MaritalStatus, fill = Attrition)) + geom_bar(position = "fill")
ggplot(dados_rh_1, aes(x = AverageTenure, fill = Attrition)) + geom_bar(position = "fill")
ggplot(dados_rh_1, aes(x = Education, fill = Attrition)) + geom_bar(position = "fill")
# Income and salary-hike distributions per Attrition status.
ggplot(dados_rh_1, aes(x = Attrition, y = MonthlyIncome)) + geom_boxplot()
ggplot(dados_rh_1, aes(x = Attrition, y = PercentSalaryHike)) + geom_boxplot()
ggplot(dados_rh_1, aes(x = TrainingTimesLastYear, fill = Attrition)) + geom_bar(position = "fill")
ggplot(dados_rh_1, aes(x = BusinessTravel, fill = Attrition)) + geom_bar(position = "fill")
ggplot(dados_rh_1, aes(x = OverTime, fill = Attrition)) + geom_bar(position = "fill")
ggplot(dados_rh_1, aes(x = StockOptionLevel, fill = Attrition)) + geom_bar(position = "fill")
ggplot(dados_rh_1, aes(x = EnvironmentSatisfaction, fill = Attrition)) + geom_bar(position = "fill")
ggplot(dados_rh_1, aes(x = JobSatisfaction, fill = Attrition)) + geom_bar(position = "fill")
ggplot(dados_rh_1, aes(x = JobInvolvement, fill = Attrition)) + geom_bar(position = "fill")
ggplot(dados_rh_1, aes(x = RelationshipSatisfaction, fill = Attrition)) + geom_bar(position = "fill")
ggplot(dados_rh_1, aes(x = WorkLifeBalance, fill = Attrition)) + geom_bar(position = "fill")
Vamos concentrar nosso trabalho em tentar ajudar o RH a recrutar melhor visando evitar atritos e, consequentemente, demissões.
Criaremos 5 versões do modelo e para cada um vamos explorar as opções e interpretar o resultado.
# Logistic regression with glm().
# Attrition is the response variable, which is why it sits on the left of `~`.
# Any number of candidate predictors can be tried on the right-hand side.
# In the summary table, a coefficient with no significance star is not
# statistically significant at the 0.1 level.
help("glm")
## starting httpd help server ... done
# NOTE(review): dados_rh still holds three Attrition levels; with a factor
# response, binomial glm() treats the first level as failure and every other
# level as success — confirm that is what this baseline should model.
modelo_v1 <- glm(
  formula = Attrition ~ Age + Department + DistanceFromHome + `Employee Source` +
    JobRole + MaritalStatus + AverageTenure + PriorYearsOfExperience + Gender +
    Education + EducationField,
  family = binomial,
  data = dados_rh
)
# Note: the whole dataset is used here, so no held-out data is left for
# prediction; the right approach is to split into training and test sets.
summary(modelo_v1)
##
## Call:
## glm(formula = Attrition ~ Age + Department + DistanceFromHome +
## `Employee Source` + JobRole + MaritalStatus + AverageTenure +
## PriorYearsOfExperience + Gender + Education + EducationField,
## family = binomial, data = dados_rh)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4738 -0.6239 -0.4962 -0.3553 2.7405
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.515415 0.198808 -2.593 0.009527 **
## Age -0.046402 0.002434 -19.062 < 2e-16 ***
## DepartmentResearch & Development -0.402413 0.102837 -3.913 9.11e-05 ***
## DepartmentSales 0.041108 0.106275 0.387 0.698901
## DistanceFromHome 0.022014 0.002497 8.816 < 2e-16 ***
## `Employee Source`Company Website 0.200175 0.074567 2.684 0.007264 **
## `Employee Source`GlassDoor -0.002062 0.089568 -0.023 0.981630
## `Employee Source`Indeed -0.048126 0.088966 -0.541 0.588545
## `Employee Source`Jora 0.202494 0.084534 2.395 0.016602 *
## `Employee Source`LinkedIn -0.086527 0.090292 -0.958 0.337911
## `Employee Source`Recruit.net -0.024145 0.088800 -0.272 0.785699
## `Employee Source`Referral 0.222132 0.147177 1.509 0.131226
## `Employee Source`Seek 0.039192 0.079096 0.495 0.620253
## JobRoleHuman Resources 0.092163 0.125250 0.736 0.461832
## JobRoleLaboratory Technician 0.313456 0.079749 3.931 8.48e-05 ***
## JobRoleManager -0.370055 0.121400 -3.048 0.002302 **
## JobRoleManufacturing Director -0.091942 0.094178 -0.976 0.328937
## JobRoleResearch Director -0.326907 0.125855 -2.597 0.009391 **
## JobRoleResearch Scientist 0.102218 0.078537 1.302 0.193080
## JobRoleSales Executive -0.030434 0.079097 -0.385 0.700414
## JobRoleSales Representative 0.484732 0.095181 5.093 3.53e-07 ***
## MaritalStatusMarried 0.179376 0.053279 3.367 0.000761 ***
## MaritalStatusSingle 0.740422 0.053393 13.867 < 2e-16 ***
## AverageTenure -0.016927 0.009230 -1.834 0.066663 .
## PriorYearsOfExperience 0.018901 0.005353 3.531 0.000414 ***
## GenderMale 0.033768 0.038421 0.879 0.379467
## Education2 0.096221 0.068965 1.395 0.162951
## Education3 0.129656 0.061109 2.122 0.033862 *
## Education4 0.120603 0.066456 1.815 0.069558 .
## Education5 -0.221560 0.134302 -1.650 0.099001 .
## EducationFieldLife Sciences -0.149802 0.143779 -1.042 0.297462
## EducationFieldMarketing -0.122315 0.152984 -0.800 0.423984
## EducationFieldMedical -0.176829 0.145066 -1.219 0.222859
## EducationFieldOther -0.170949 0.161651 -1.058 0.290274
## EducationFieldTechnical Degree 0.183255 0.154276 1.188 0.234898
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 20272 on 23057 degrees of freedom
## Residual deviance: 18904 on 23023 degrees of freedom
## AIC: 18974
##
## Number of Fisher Scoring iterations: 5
# In the summary above, more stars mean a smaller p-value for that coefficient.
help("vif")
# Variance inflation factors of the baseline model's predictors.
car::vif(modelo_v1)
## GVIF Df GVIF^(1/(2*Df))
## Age 1.197853 1 1.094465
## Department 2.027501 2 1.193274
## DistanceFromHome 1.321206 1 1.149437
## `Employee Source` 1.107922 8 1.006426
## JobRole 2.564522 8 1.060628
## MaritalStatus 1.042578 2 1.010479
## AverageTenure 2.478002 1 1.574167
## PriorYearsOfExperience 2.440072 1 1.562073
## Gender 1.019571 1 1.009738
## Education 1.121235 4 1.014407
## EducationField 1.648089 5 1.051231
# The first column of the vif() output (GVIF) measures multicollinearity, not
# variable importance: the larger the value, the more a predictor is explained
# by the other predictors.
# Baseline done; now split the data into training and test sets.
set.seed(2004)
# 70/30 split on Attrition: TRUE marks the rows assigned to training.
index_treino <- sample.split(Y = dados_rh_1$Attrition, SplitRatio = 0.7)
# Select rows with the split vector. The previous `subset(dados_rh_1, train = T)`
# never referenced index_treino, so both subsets were the complete data and the
# model was evaluated on the same rows it was trained on.
dados_rh_1_treino <- subset(dados_rh_1, index_treino == TRUE)
dados_rh_1_teste <- subset(dados_rh_1, index_treino == FALSE)
# NOTE(review): the outputs recorded below in this report (22970 null degrees of
# freedom, confusion matrices totalling 22971 rows) were produced before this
# fix; re-run the script to refresh them.
# Same predictors as modelo_v1, fitted on the training part of dados_rh_1
# (the data without terminated employees).
modelo_v2 <- glm(
  formula = Attrition ~ Age + Department + DistanceFromHome + `Employee Source` +
    JobRole + MaritalStatus + AverageTenure + PriorYearsOfExperience + Gender +
    Education + EducationField,
  family = binomial,
  data = dados_rh_1_treino
)
summary(modelo_v2)
##
## Call:
## glm(formula = Attrition ~ Age + Department + DistanceFromHome +
## `Employee Source` + JobRole + MaritalStatus + AverageTenure +
## PriorYearsOfExperience + Gender + Education + EducationField,
## family = binomial, data = dados_rh_1_treino)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4484 -0.6177 -0.4918 -0.3558 2.7300
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.499751 0.199492 -2.505 0.012241 *
## Age -0.044889 0.002446 -18.348 < 2e-16 ***
## DepartmentResearch & Development -0.427955 0.103053 -4.153 3.28e-05 ***
## DepartmentSales 0.025684 0.106499 0.241 0.809423
## DistanceFromHome 0.020372 0.002522 8.076 6.69e-16 ***
## `Employee Source`Company Website 0.183335 0.074868 2.449 0.014334 *
## `Employee Source`GlassDoor 0.006274 0.089680 0.070 0.944229
## `Employee Source`Indeed -0.080908 0.089734 -0.902 0.367244
## `Employee Source`Jora 0.183678 0.084958 2.162 0.030618 *
## `Employee Source`LinkedIn -0.079145 0.090405 -0.875 0.381325
## `Employee Source`Recruit.net -0.050665 0.089444 -0.566 0.571095
## `Employee Source`Referral 0.230121 0.147168 1.564 0.117897
## `Employee Source`Seek -0.005837 0.079828 -0.073 0.941708
## JobRoleHuman Resources 0.107348 0.125753 0.854 0.393302
## JobRoleLaboratory Technician 0.314968 0.080707 3.903 9.52e-05 ***
## JobRoleManager -0.402633 0.123788 -3.253 0.001144 **
## JobRoleManufacturing Director -0.083426 0.095273 -0.876 0.381221
## JobRoleResearch Director -0.292195 0.126243 -2.315 0.020637 *
## JobRoleResearch Scientist 0.111877 0.079359 1.410 0.158608
## JobRoleSales Executive -0.028140 0.079873 -0.352 0.724611
## JobRoleSales Representative 0.478077 0.096067 4.977 6.47e-07 ***
## MaritalStatusMarried 0.176289 0.053865 3.273 0.001065 **
## MaritalStatusSingle 0.747383 0.053896 13.867 < 2e-16 ***
## AverageTenure -0.021245 0.009467 -2.244 0.024825 *
## PriorYearsOfExperience 0.019787 0.005399 3.665 0.000248 ***
## GenderMale 0.030982 0.038752 0.800 0.424000
## Education2 0.067584 0.069195 0.977 0.328712
## Education3 0.092553 0.061236 1.511 0.130684
## Education4 0.071013 0.066760 1.064 0.287461
## Education5 -0.233758 0.134267 -1.741 0.081685 .
## EducationFieldLife Sciences -0.148858 0.143810 -1.035 0.300620
## EducationFieldMarketing -0.106268 0.152995 -0.695 0.487317
## EducationFieldMedical -0.202212 0.145203 -1.393 0.163736
## EducationFieldOther -0.137807 0.161652 -0.852 0.393940
## EducationFieldTechnical Degree 0.180977 0.154552 1.171 0.241608
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 19951 on 22970 degrees of freedom
## Residual deviance: 18626 on 22936 degrees of freedom
## AIC: 18696
##
## Number of Fisher Scoring iterations: 5
# Variance inflation factors of modelo_v2's predictors.
car::vif(modelo_v2)
## GVIF Df GVIF^(1/(2*Df))
## Age 1.196689 1 1.093933
## Department 2.033631 2 1.194175
## DistanceFromHome 1.321839 1 1.149712
## `Employee Source` 1.109202 8 1.006499
## JobRole 2.550173 8 1.060256
## MaritalStatus 1.043120 2 1.010610
## AverageTenure 2.441327 1 1.562475
## PriorYearsOfExperience 2.412397 1 1.553189
## Gender 1.018652 1 1.009283
## Education 1.123257 4 1.014635
## EducationField 1.650765 5 1.051401
# Predictions: probabilities from modelo_v2 on the test set, turned into class
# labels at the 0.5 cutoff, then cross-tabulated against the actual Attrition.
threshold <- 0.5
previsoes_v2 <- predict(modelo_v2, newdata = dados_rh_1_teste, type = "response")
previsoes_finais_v2 <- ifelse(
  previsoes_v2 <= threshold,
  "Current employee",
  "Voluntary Resignation"
)
table(dados_rh_1_teste$Attrition, previsoes_finais_v2)
## previsoes_finais_v2
## Current employee Voluntary Resignation
## Current employee 19328 42
## Voluntary Resignation 3523 78
Quando a classe real era que o funcionário continuaria empregado, o nosso modelo previu corretamente 19328 casos.
Quando a classe real era pedido de demissão, o nosso modelo previu que o funcionário continuaria empregado em 3523 casos, ou seja, o modelo errou: uma taxa de erro considerável. Poderíamos fazer um balanceamento de classes para ter um modelo mais preciso.
# Version 3: modelo_v2's formula without Education and EducationField.
modelo_v3 <- glm(
  formula = Attrition ~ Age + Department + DistanceFromHome + `Employee Source` +
    JobRole + MaritalStatus + AverageTenure + PriorYearsOfExperience + Gender,
  family = binomial,
  data = dados_rh_1_treino
)
summary(modelo_v3)
##
## Call:
## glm(formula = Attrition ~ Age + Department + DistanceFromHome +
## `Employee Source` + JobRole + MaritalStatus + AverageTenure +
## PriorYearsOfExperience + Gender, family = binomial, data = dados_rh_1_treino)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3428 -0.6201 -0.4941 -0.3619 2.7143
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.594443 0.163302 -3.640 0.000272 ***
## Age -0.044338 0.002361 -18.781 < 2e-16 ***
## DepartmentResearch & Development -0.455831 0.097648 -4.668 3.04e-06 ***
## DepartmentSales 0.006375 0.100798 0.063 0.949567
## DistanceFromHome 0.023945 0.002219 10.792 < 2e-16 ***
## `Employee Source`Company Website 0.185836 0.074684 2.488 0.012835 *
## `Employee Source`GlassDoor 0.004131 0.089469 0.046 0.963174
## `Employee Source`Indeed -0.084488 0.089587 -0.943 0.345638
## `Employee Source`Jora 0.182141 0.084629 2.152 0.031378 *
## `Employee Source`LinkedIn -0.073833 0.090249 -0.818 0.413300
## `Employee Source`Recruit.net -0.058670 0.089241 -0.657 0.510903
## `Employee Source`Referral 0.237922 0.146800 1.621 0.105078
## `Employee Source`Seek -0.006818 0.079571 -0.086 0.931717
## JobRoleHuman Resources 0.099083 0.125594 0.789 0.430163
## JobRoleLaboratory Technician 0.312339 0.080556 3.877 0.000106 ***
## JobRoleManager -0.418085 0.123665 -3.381 0.000723 ***
## JobRoleManufacturing Director -0.079696 0.095061 -0.838 0.401826
## JobRoleResearch Director -0.308958 0.126075 -2.451 0.014263 *
## JobRoleResearch Scientist 0.119993 0.079265 1.514 0.130071
## JobRoleSales Executive -0.023432 0.079774 -0.294 0.768961
## JobRoleSales Representative 0.483836 0.095952 5.042 4.60e-07 ***
## MaritalStatusMarried 0.176480 0.053793 3.281 0.001035 **
## MaritalStatusSingle 0.747665 0.053772 13.904 < 2e-16 ***
## AverageTenure -0.019906 0.009465 -2.103 0.035453 *
## PriorYearsOfExperience 0.019187 0.005400 3.553 0.000381 ***
## GenderMale 0.033764 0.038690 0.873 0.382838
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 19951 on 22970 degrees of freedom
## Residual deviance: 18668 on 22945 degrees of freedom
## AIC: 18720
##
## Number of Fisher Scoring iterations: 5
# Multicollinearity check: (generalized) variance inflation factors for the
# predictors of modelo_v3.
vif(modelo_v3)
## GVIF Df GVIF^(1/(2*Df))
## Age 1.116616 1 1.056701
## Department 1.733091 2 1.147375
## DistanceFromHome 1.008189 1 1.004086
## `Employee Source` 1.080863 8 1.004872
## JobRole 2.519868 8 1.059464
## MaritalStatus 1.037566 2 1.009262
## AverageTenure 2.443412 1 1.563142
## PriorYearsOfExperience 2.418527 1 1.555161
## Gender 1.017671 1 1.008797
# Predictions
# Score the evaluation set with modelo_v3 and turn the fitted probabilities
# into class labels using a fixed cut-off.
threshold <- 0.5
previsoes_v3 <- predict(modelo_v3, type = 'response', newdata = dados_rh_1_teste)
# Keep the labels as a factor with BOTH classes as explicit levels: with a plain
# character vector, table() silently drops a column whenever no probability
# crosses the threshold, which breaks any code that indexes the matrix.
previsoes_finais_v3 <- factor(
  ifelse(previsoes_v3 > threshold, 'Voluntary Resignation', 'Current employee'),
  levels = c('Current employee', 'Voluntary Resignation')
)
# Confusion matrix: rows = observed Attrition, columns = predicted class.
# NOTE(review): the row totals printed below (19370 / 3601) equal the class
# counts of the training data -- confirm dados_rh_1_teste is a real hold-out set.
table(dados_rh_1_teste$Attrition, previsoes_finais_v3)
## previsoes_finais_v3
## Current employee Voluntary Resignation
## Current employee 19328 42
## Voluntary Resignation 3541 60
# Logistic regression v4: same specification as modelo_v3 without Gender,
# whose coefficient was not significant there (p ~ 0.38).
modelo_v4 <- glm(
  formula = Attrition ~ Age + Department + DistanceFromHome + `Employee Source` +
    JobRole + MaritalStatus + AverageTenure + PriorYearsOfExperience,
  data = dados_rh_1_treino,
  family = binomial
)
# Coefficient table, deviances and AIC of the fitted model.
summary(modelo_v4)
##
## Call:
## glm(formula = Attrition ~ Age + Department + DistanceFromHome +
## `Employee Source` + JobRole + MaritalStatus + AverageTenure +
## PriorYearsOfExperience, family = binomial, data = dados_rh_1_treino)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3360 -0.6192 -0.4939 -0.3622 2.7205
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.569968 0.160865 -3.543 0.000395 ***
## Age -0.044408 0.002359 -18.822 < 2e-16 ***
## DepartmentResearch & Development -0.457114 0.097648 -4.681 2.85e-06 ***
## DepartmentSales 0.004776 0.100790 0.047 0.962208
## DistanceFromHome 0.023979 0.002218 10.810 < 2e-16 ***
## `Employee Source`Company Website 0.185968 0.074691 2.490 0.012780 *
## `Employee Source`GlassDoor 0.004217 0.089473 0.047 0.962404
## `Employee Source`Indeed -0.082065 0.089543 -0.916 0.359412
## `Employee Source`Jora 0.182210 0.084632 2.153 0.031321 *
## `Employee Source`LinkedIn -0.073105 0.090254 -0.810 0.417948
## `Employee Source`Recruit.net -0.058149 0.089234 -0.652 0.514631
## `Employee Source`Referral 0.240776 0.146746 1.641 0.100844
## `Employee Source`Seek -0.006816 0.079577 -0.086 0.931742
## JobRoleHuman Resources 0.100479 0.125614 0.800 0.423769
## JobRoleLaboratory Technician 0.315123 0.080478 3.916 9.02e-05 ***
## JobRoleManager -0.419678 0.123673 -3.393 0.000690 ***
## JobRoleManufacturing Director -0.082962 0.094978 -0.873 0.382397
## JobRoleResearch Director -0.310452 0.126056 -2.463 0.013785 *
## JobRoleResearch Scientist 0.120223 0.079252 1.517 0.129277
## JobRoleSales Executive -0.023015 0.079761 -0.289 0.772925
## JobRoleSales Representative 0.482258 0.095927 5.027 4.97e-07 ***
## MaritalStatusMarried 0.175136 0.053769 3.257 0.001125 **
## MaritalStatusSingle 0.745551 0.053714 13.880 < 2e-16 ***
## AverageTenure -0.019985 0.009465 -2.112 0.034727 *
## PriorYearsOfExperience 0.019266 0.005398 3.569 0.000358 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 19951 on 22970 degrees of freedom
## Residual deviance: 18668 on 22946 degrees of freedom
## AIC: 18718
##
## Number of Fisher Scoring iterations: 5
# Multicollinearity check: (generalized) variance inflation factors for the
# predictors of modelo_v4.
vif(modelo_v4)
## GVIF Df GVIF^(1/(2*Df))
## Age 1.115570 1 1.056205
## Department 1.732346 2 1.147252
## DistanceFromHome 1.007827 1 1.003906
## `Employee Source` 1.078323 8 1.004724
## JobRole 2.493476 8 1.058767
## MaritalStatus 1.035363 2 1.008726
## AverageTenure 2.443871 1 1.563288
## PriorYearsOfExperience 2.417511 1 1.554835
# Predictions
# Score the evaluation set with modelo_v4 and turn the fitted probabilities
# into class labels using a fixed cut-off.
threshold <- 0.5
previsoes_v4 <- predict(modelo_v4, type = 'response', newdata = dados_rh_1_teste)
# Keep the labels as a factor with BOTH classes as explicit levels: with a plain
# character vector, table() silently drops a column whenever no probability
# crosses the threshold, which breaks any code that indexes the matrix.
previsoes_finais_v4 <- factor(
  ifelse(previsoes_v4 > threshold, 'Voluntary Resignation', 'Current employee'),
  levels = c('Current employee', 'Voluntary Resignation')
)
# Confusion matrix: rows = observed Attrition, columns = predicted class.
# NOTE(review): the row totals printed below (19370 / 3601) equal the class
# counts of the training data -- confirm dados_rh_1_teste is a real hold-out set.
table(dados_rh_1_teste$Attrition, previsoes_finais_v4)
## previsoes_finais_v4
## Current employee Voluntary Resignation
## Current employee 19326 44
## Voluntary Resignation 3545 56
Os três modelos construídos mostraram a relevância de JobRole, PriorYearsOfExperience e AverageTenure. Apresentar esses valores aos tomadores de decisão.
# Open the rpart help page (interactive reference for the arguments used below).
?rpart
# Classification tree on the predictors of modelo_v4, except `Employee Source`.
# minsplit = 500: a node needs at least 500 observations before a split is tried;
# cp = 0: no minimum complexity improvement is required to keep a split.
modelo_v5 <- rpart(
  formula = Attrition ~ Age + Department + DistanceFromHome + JobRole +
    MaritalStatus + AverageTenure + PriorYearsOfExperience,
  data = dados_rh_1_treino,
  method = "class",
  control = rpart.control(minsplit = 500, cp = 0)
)
# CP table, variable importance and per-node split details.
summary(modelo_v5)
## Call:
## rpart(formula = Attrition ~ Age + Department + DistanceFromHome +
## JobRole + MaritalStatus + AverageTenure + PriorYearsOfExperience,
## data = dados_rh_1_treino, method = "class", control = rpart.control(minsplit = 500,
## cp = 0))
## n= 22971
##
## CP nsplit rel error xerror xstd
## 1 0.006803666 0 1.0000000 1.0000000 0.01530253
## 2 0.000416551 3 0.9780616 0.9800056 0.01517691
## 3 0.000000000 7 0.9763954 0.9816718 0.01518746
##
## Variable importance
## Age MaritalStatus JobRole
## 68 15 9
## DistanceFromHome AverageTenure PriorYearsOfExperience
## 6 2 1
##
## Node number 1: 22971 observations, complexity param=0.006803666
## predicted class=Current employee expected loss=0.1567629 P(node) =1
## class counts: 19370 3601
## probabilities: 0.843 0.157
## left son=2 (14161 obs) right son=3 (8810 obs)
## Primary splits:
## Age < 33.5 to the right, improve=192.45070, (0 missing)
## MaritalStatus splits as LLR, improve= 98.38529, (0 missing)
## DistanceFromHome < 11.5 to the left, improve= 45.04993, (0 missing)
## Department splits as RLR, improve= 44.05590, (0 missing)
## JobRole splits as LRRLLLRRR, improve= 43.19011, (0 missing)
## Surrogate splits:
## JobRole splits as LLLLLLLLR, agree=0.622, adj=0.015, (0 split)
##
## Node number 2: 14161 observations
## predicted class=Current employee expected loss=0.1057129 P(node) =0.6164729
## class counts: 12664 1497
## probabilities: 0.894 0.106
##
## Node number 3: 8810 observations, complexity param=0.006803666
## predicted class=Current employee expected loss=0.2388195 P(node) =0.3835271
## class counts: 6706 2104
## probabilities: 0.761 0.239
## left son=6 (8239 obs) right son=7 (571 obs)
## Primary splits:
## Age < 21.5 to the right, improve=112.91850, (0 missing)
## MaritalStatus splits as LLR, improve=105.19850, (0 missing)
## DistanceFromHome < 11.5 to the left, improve= 31.14560, (0 missing)
## JobRole splits as LRRLLLLLR, improve= 28.54958, (0 missing)
## Department splits as RLL, improve= 13.48102, (0 missing)
##
## Node number 6: 8239 observations, complexity param=0.000416551
## predicted class=Current employee expected loss=0.2177449 P(node) =0.3586696
## class counts: 6445 1794
## probabilities: 0.782 0.218
## left son=12 (5600 obs) right son=13 (2639 obs)
## Primary splits:
## MaritalStatus splits as LLR, improve=67.13122, (0 missing)
## DistanceFromHome < 11.5 to the left, improve=41.60399, (0 missing)
## JobRole splits as LRRLLLRRR, improve=19.17166, (0 missing)
## Age < 26.5 to the right, improve=14.50616, (0 missing)
## PriorYearsOfExperience < 14.5 to the right, improve=12.24881, (0 missing)
##
## Node number 7: 571 observations, complexity param=0.006803666
## predicted class=Voluntary Resignation expected loss=0.4570928 P(node) =0.02485743
## class counts: 261 310
## probabilities: 0.457 0.543
## left son=14 (190 obs) right son=15 (381 obs)
## Primary splits:
## JobRole splits as LRRRRRLRR, improve=8.456258, (0 missing)
## Age < 19.5 to the right, improve=5.644184, (0 missing)
## Department splits as RLR, improve=2.925194, (0 missing)
## DistanceFromHome < 5.5 to the left, improve=2.502770, (0 missing)
## Surrogate splits:
## PriorYearsOfExperience < 15 to the right, agree=0.681, adj=0.042, (0 split)
## DistanceFromHome < 23.5 to the right, agree=0.673, adj=0.016, (0 split)
## MaritalStatus splits as RLR, agree=0.671, adj=0.011, (0 split)
##
## Node number 12: 5600 observations
## predicted class=Current employee expected loss=0.1739286 P(node) =0.2437856
## class counts: 4626 974
## probabilities: 0.826 0.174
##
## Node number 13: 2639 observations, complexity param=0.000416551
## predicted class=Current employee expected loss=0.3107238 P(node) =0.114884
## class counts: 1819 820
## probabilities: 0.689 0.311
## left son=26 (378 obs) right son=27 (2261 obs)
## Primary splits:
## JobRole splits as RLRLLRRRR, improve=28.098750, (0 missing)
## DistanceFromHome < 20.5 to the left, improve=25.339560, (0 missing)
## PriorYearsOfExperience < 5.5 to the right, improve= 8.692388, (0 missing)
## AverageTenure < 2.770833 to the right, improve= 4.328876, (0 missing)
## Age < 27.5 to the left, improve= 3.922510, (0 missing)
## Surrogate splits:
## AverageTenure < 8.25 to the right, agree=0.869, adj=0.087, (0 split)
## PriorYearsOfExperience < 19 to the right, agree=0.865, adj=0.058, (0 split)
##
## Node number 14: 190 observations
## predicted class=Current employee expected loss=0.4210526 P(node) =0.008271299
## class counts: 110 80
## probabilities: 0.579 0.421
##
## Node number 15: 381 observations
## predicted class=Voluntary Resignation expected loss=0.3963255 P(node) =0.01658613
## class counts: 151 230
## probabilities: 0.396 0.604
##
## Node number 26: 378 observations
## predicted class=Current employee expected loss=0.1322751 P(node) =0.01645553
## class counts: 328 50
## probabilities: 0.868 0.132
##
## Node number 27: 2261 observations, complexity param=0.000416551
## predicted class=Current employee expected loss=0.3405573 P(node) =0.09842845
## class counts: 1491 770
## probabilities: 0.659 0.341
## left son=54 (1621 obs) right son=55 (640 obs)
## Primary splits:
## DistanceFromHome < 12.5 to the left, improve=27.233170, (0 missing)
## PriorYearsOfExperience < 1.5 to the left, improve= 8.703747, (0 missing)
## Age < 27.5 to the left, improve= 8.372829, (0 missing)
## AverageTenure < 0.2111111 to the left, improve= 7.042972, (0 missing)
## JobRole splits as L-L--LLLR, improve= 3.730849, (0 missing)
##
## Node number 54: 1621 observations
## predicted class=Current employee expected loss=0.2917952 P(node) =0.07056724
## class counts: 1148 473
## probabilities: 0.708 0.292
##
## Node number 55: 640 observations, complexity param=0.000416551
## predicted class=Current employee expected loss=0.4640625 P(node) =0.02786122
## class counts: 343 297
## probabilities: 0.536 0.464
## left son=110 (172 obs) right son=111 (468 obs)
## Primary splits:
## AverageTenure < 0.6458333 to the right, improve=6.245802, (0 missing)
## JobRole splits as L-L--RLRR, improve=5.369857, (0 missing)
## Department splits as RLR, improve=4.685078, (0 missing)
## DistanceFromHome < 20.5 to the left, improve=4.056358, (0 missing)
## PriorYearsOfExperience < 0.5 to the right, improve=2.844245, (0 missing)
## Surrogate splits:
## PriorYearsOfExperience < 2.5 to the right, agree=0.820, adj=0.331, (0 split)
## Age < 23.5 to the left, agree=0.755, adj=0.087, (0 split)
##
## Node number 110: 172 observations
## predicted class=Current employee expected loss=0.3488372 P(node) =0.007487702
## class counts: 112 60
## probabilities: 0.651 0.349
##
## Node number 111: 468 observations
## predicted class=Voluntary Resignation expected loss=0.4935897 P(node) =0.02037351
## class counts: 231 237
## probabilities: 0.494 0.506
# Decision tree: draw the fitted classification tree modelo_v5.
rpart.plot(modelo_v5)
RESUMO:
Apresentar esse gráfico para os gestores, pois é uma forma fácil de interpretar o modelo. Observamos que a variável idade (Age) tem grande peso.