“RH_ANALITICS”

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(ggplot2)
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.0.5
library(data.table)
library(car)
## Warning: package 'car' was built under R version 4.0.5
## Loading required package: carData
library(caTools)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.0.5
## corrplot 0.88 loaded
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.0.5
dados_rh <- fread('dados/dataset.csv')

Transformando variáveis categóricas para o tipo fator

dados_rh$Attrition                <- as.factor(dados_rh$Attrition)
dados_rh$BusinessTravel           <- as.factor(dados_rh$BusinessTravel)
dados_rh$Department               <- as.factor(dados_rh$Department)
dados_rh$Education                <- as.factor(dados_rh$Education)
dados_rh$EducationField           <- as.factor(dados_rh$EducationField)
dados_rh$'Employee Source'        <- as.factor(dados_rh$'Employee Source')
dados_rh$EnvironmentSatisfaction  <- as.factor(dados_rh$EnvironmentSatisfaction)
dados_rh$Gender                   <- as.factor(dados_rh$Gender)
dados_rh$JobInvolvement           <- as.factor(dados_rh$JobInvolvement)
dados_rh$JobLevel                 <- as.factor(dados_rh$JobLevel)
dados_rh$JobRole                  <- as.factor(dados_rh$JobRole)
dados_rh$JobSatisfaction          <- as.factor(dados_rh$JobSatisfaction)
dados_rh$MaritalStatus            <- as.factor(dados_rh$MaritalStatus)
dados_rh$OverTime                 <- as.factor(dados_rh$OverTime)
dados_rh$PerformanceRating        <- as.factor(dados_rh$PerformanceRating)
dados_rh$RelationshipSatisfaction <- as.factor(dados_rh$RelationshipSatisfaction)
dados_rh$StockOptionLevel         <- as.factor(dados_rh$StockOptionLevel)
dados_rh$WorkLifeBalance          <- as.factor(dados_rh$WorkLifeBalance)

Transformando variáveis numéricas para o tipo inteiro

dados_rh$DistanceFromHome  <- as.integer(dados_rh$DistanceFromHome)
dados_rh$MonthlyIncome     <- as.integer(dados_rh$MonthlyIncome)
dados_rh$PercentSalaryHike <- as.integer(dados_rh$PercentSalaryHike)

Drop dos níveis de fatores com 0 count

dados <- droplevels(dados_rh)
summary(dados_rh)
##       Age                        Attrition               BusinessTravel 
##  Min.   :18.00   Current employee     :19370   Non-Travel       : 2344  
##  1st Qu.:30.00   Termination          :   87   Travel_Frequently: 4378  
##  Median :36.00   Voluntary Resignation: 3601   Travel_Rarely    :16336  
##  Mean   :37.04                                                          
##  3rd Qu.:43.00                                                          
##  Max.   :60.00                                                          
##                                                                         
##                   Department    DistanceFromHome Education
##  Human Resources       : 1010   Min.   : 1.000   1:2659   
##  Research & Development:15040   1st Qu.: 2.000   2:4436   
##  Sales                 : 7008   Median : 7.000   3:8930   
##                                 Mean   : 9.215   4:6279   
##                                 3rd Qu.:14.000   5: 754   
##                                 Max.   :29.000            
##                                                           
##           EducationField EnvironmentSatisfaction    Gender      JobInvolvement
##  Human Resources : 442   1:4490                  Female: 9205   1: 1287       
##  Life Sciences   :9513   2:4476                  Male  :13853   2: 5888       
##  Marketing       :2484   3:7091                                 3:13644       
##  Medical         :7267   4:7001                                 4: 2239       
##  Other           :1291                                                        
##  Technical Degree:2061                                                        
##                                                                               
##  JobLevel                      JobRole     JobSatisfaction  MaritalStatus  
##  1:8594   Sales Executive          :5067   1:4575          Divorced: 5163  
##  2:8448   Research Scientist       :4591   2:4371          Married :10543  
##  3:3440   Laboratory Technician    :4112   3:6938          Single  : 7352  
##  4:1563   Manufacturing Director   :2346   4:7174                          
##  5:1013   Healthcare Representative:2069                                   
##           Manager                  :1521                                   
##           (Other)                  :3352                                   
##  MonthlyIncome   NumCompaniesWorked OverTime    PercentSalaryHike
##  Min.   : 1009   Min.   :0.000      No :16524   Min.   :11.00    
##  1st Qu.: 2900   1st Qu.:1.000      Yes: 6534   1st Qu.:12.00    
##  Median : 4898   Median :2.000                  Median :14.00    
##  Mean   : 6416   Mean   :2.691                  Mean   :15.22    
##  3rd Qu.: 8120   3rd Qu.:4.000                  3rd Qu.:18.00    
##  Max.   :19999   Max.   :9.000                  Max.   :25.00    
##                                                                  
##  PerformanceRating RelationshipSatisfaction StockOptionLevel TotalWorkingYears
##  3:19478           1:4331                   0:9873           Min.   : 0.00    
##  4: 3580           2:4762                   1:9370           1st Qu.: 6.00    
##                    3:7164                   2:2497           Median :10.00    
##                    4:6801                   3:1318           Mean   :11.07    
##                                                              3rd Qu.:15.00    
##                                                              Max.   :40.00    
##                                                                               
##  TrainingTimesLastYear WorkLifeBalance YearsAtCompany  YearsInCurrentRole
##  Min.   :0.000         1: 1263         Min.   : 0.00   Min.   : 0.000    
##  1st Qu.:2.000         2: 5374         1st Qu.: 3.00   1st Qu.: 2.000    
##  Median :3.000         3:14016         Median : 5.00   Median : 3.000    
##  Mean   :2.804         4: 2405         Mean   : 6.91   Mean   : 4.201    
##  3rd Qu.:3.000                         3rd Qu.: 9.00   3rd Qu.: 7.000    
##  Max.   :6.000                         Max.   :40.00   Max.   :18.000    
##                                                                          
##  YearsSinceLastPromotion YearsWithCurrManager        Employee Source
##  Min.   : 0.000          Min.   : 0.000       Company Website:5327  
##  1st Qu.: 0.000          1st Qu.: 2.000       Seek           :3655  
##  Median : 1.000          Median : 3.000       Indeed         :2471  
##  Mean   : 2.164          Mean   : 4.091       Jora           :2408  
##  3rd Qu.: 3.000          3rd Qu.: 7.000       LinkedIn       :2294  
##  Max.   :15.000          Max.   :17.000       Recruit.net    :2283  
##                                               (Other)        :4620  
##  AgeStartedWorking
##  Min.   : 0.00    
##  1st Qu.:20.00    
##  Median :25.00    
##  Mean   :25.96    
##  3rd Qu.:31.00    
##  Max.   :60.00    
## 

Engenharia de Atributos

Criamos uma coluna de anos anteriores de experiência para visualizar melhor o perfil de experiência do funcionário.

dados_rh$PriorYearsOfExperience <- dados_rh$TotalWorkingYears - dados_rh$YearsAtCompany

A estabilidade no emprego (job tenure) é a medida do tempo que um funcionário está empregado por seu empregador atual. A estabilidade no emprego de um funcionário é muito importante e muitas vezes os empregadores consideram a estabilidade no emprego um critério para a contratação de novos funcionários. A permanência no emprego pode ser longa ou curta.

Criamos um novo recurso de estabilidade média para traçar o perfil de permanência média dos funcionários em empresas anteriores.

dados_rh$AverageTenure <- dados_rh$PriorYearsOfExperience / dados_rh$NumCompaniesWorked
View(dados_rh)

A estabilidade média produz valores como Inf devido à natureza de sua derivaçãoSubstituímos para zero.

dados_rh$AverageTenure[!is.finite(dados_rh$AverageTenure)] <- 0

Analisamos e dividimos os dados como base na coluna Termination, que indica se o funcionário foi desligado da empresa.

dados_rh_1 <- dados_rh[dados_rh$Attrition != 'Termination']
dados_rh_1 <- droplevels(dados_rh_1)
dim(dados_rh_1)
## [1] 22971    32
summary(dados_rh_1)
##       Age                        Attrition               BusinessTravel 
##  Min.   :18.00   Current employee     :19370   Non-Travel       : 2344  
##  1st Qu.:30.00   Voluntary Resignation: 3601   Travel_Frequently: 4363  
##  Median :36.00                                 Travel_Rarely    :16264  
##  Mean   :37.06                                                          
##  3rd Qu.:43.00                                                          
##  Max.   :60.00                                                          
##                                                                         
##                   Department    DistanceFromHome Education
##  Human Resources       : 1010   Min.   : 1.000   1:2659   
##  Research & Development:14977   1st Qu.: 2.000   2:4421   
##  Sales                 : 6984   Median : 7.000   3:8890   
##                                 Mean   : 9.191   4:6247   
##                                 3rd Qu.:14.000   5: 754   
##                                 Max.   :29.000            
##                                                           
##           EducationField EnvironmentSatisfaction    Gender      JobInvolvement
##  Human Resources : 442   1:4482                  Female: 9173   1: 1279       
##  Life Sciences   :9494   2:4460                  Male  :13798   2: 5849       
##  Marketing       :2484   3:7067                                 3:13612       
##  Medical         :7215   4:6962                                 4: 2231       
##  Other           :1291                                                        
##  Technical Degree:2045                                                        
##                                                                               
##  JobLevel                      JobRole     JobSatisfaction  MaritalStatus  
##  1:8547   Sales Executive          :5051   1:4543          Divorced: 5148  
##  2:8432   Research Scientist       :4576   2:4355          Married :10502  
##  3:3424   Laboratory Technician    :4088   3:6914          Single  : 7321  
##  4:1563   Manufacturing Director   :2338   4:7159                          
##  5:1005   Healthcare Representative:2061                                   
##           Manager                  :1513                                   
##           (Other)                  :3344                                   
##  MonthlyIncome   NumCompaniesWorked OverTime    PercentSalaryHike
##  Min.   : 1009   Min.   :0.000      No :16476   Min.   :11.00    
##  1st Qu.: 2909   1st Qu.:1.000      Yes: 6495   1st Qu.:12.00    
##  Median : 4898   Median :2.000                  Median :14.00    
##  Mean   : 6418   Mean   :2.688                  Mean   :15.22    
##  3rd Qu.: 8120   3rd Qu.:4.000                  3rd Qu.:18.00    
##  Max.   :19999   Max.   :9.000                  Max.   :25.00    
##                                                                  
##  PerformanceRating RelationshipSatisfaction StockOptionLevel TotalWorkingYears
##  3:19407           1:4315                   0:9826           Min.   : 0.00    
##  4: 3564           2:4739                   1:9330           1st Qu.: 6.00    
##                    3:7132                   2:2497           Median :10.00    
##                    4:6785                   3:1318           Mean   :11.08    
##                                                              3rd Qu.:15.00    
##                                                              Max.   :40.00    
##                                                                               
##  TrainingTimesLastYear WorkLifeBalance YearsAtCompany   YearsInCurrentRole
##  Min.   :0.000         1: 1255         Min.   : 0.000   Min.   : 0.000    
##  1st Qu.:2.000         2: 5359         1st Qu.: 3.000   1st Qu.: 2.000    
##  Median :3.000         3:13960         Median : 5.000   Median : 3.000    
##  Mean   :2.805         4: 2397         Mean   : 6.914   Mean   : 4.202    
##  3rd Qu.:3.000                         3rd Qu.: 9.000   3rd Qu.: 7.000    
##  Max.   :6.000                         Max.   :40.000   Max.   :18.000    
##                                                                           
##  YearsSinceLastPromotion YearsWithCurrManager        Employee Source
##  Min.   : 0.000          Min.   : 0.000       Company Website:5307  
##  1st Qu.: 0.000          1st Qu.: 2.000       Seek           :3622  
##  Median : 1.000          Median : 3.000       Indeed         :2459  
##  Mean   : 2.167          Mean   : 4.096       Jora           :2398  
##  3rd Qu.: 3.000          3rd Qu.: 7.000       LinkedIn       :2294  
##  Max.   :15.000          Max.   :17.000       Recruit.net    :2273  
##                                               (Other)        :4618  
##  AgeStartedWorking PriorYearsOfExperience AverageTenure    
##  Min.   : 0.00     Min.   : 0.000         Min.   : 0.0000  
##  1st Qu.:20.00     1st Qu.: 0.000         1st Qu.: 0.0000  
##  Median :25.00     Median : 2.000         Median : 0.3333  
##  Mean   :25.98     Mean   : 4.165         Mean   : 1.7700  
##  3rd Qu.:31.00     3rd Qu.: 5.000         3rd Qu.: 1.5000  
##  Max.   :60.00     Max.   :40.000         Max.   :40.0000  
## 

filtro demissão voluntaria

dados_rh_2 <- dados_rh[dados_rh$Attrition != 'Voluntary Resignation']
dados_rh_2 <-droplevels(dados_rh_2)
dim(dados_rh_2)
## [1] 19457    32
summary(dados_rh_2)
##       Age                   Attrition               BusinessTravel 
##  Min.   :18.00   Current employee:19370   Non-Travel       : 2154  
##  1st Qu.:31.00   Termination     :   87   Travel_Frequently: 3306  
##  Median :36.00                            Travel_Rarely    :13997  
##  Mean   :37.61                                                     
##  3rd Qu.:43.00                                                     
##  Max.   :60.00                                                     
##                                                                    
##                   Department    DistanceFromHome Education
##  Human Resources       :  821   Min.   : 1.000   1:2178   
##  Research & Development:13031   1st Qu.: 2.000   2:3762   
##  Sales                 : 5605   Median : 7.000   3:7455   
##                                 Mean   : 8.969   4:5387   
##                                 3rd Qu.:13.000   5: 675   
##                                 Max.   :29.000            
##                                                           
##           EducationField EnvironmentSatisfaction    Gender      JobInvolvement
##  Human Resources : 367   1:3636                  Female: 7807   1:  982       
##  Life Sciences   :8170   2:3749                  Male  :11650   2: 4858       
##  Marketing       :2070   3:6067                                 3:11652       
##  Medical         :6186   4:6005                                 4: 1965       
##  Other           :1065                                                        
##  Technical Degree:1599                                                        
##                                                                               
##  JobLevel                      JobRole     JobSatisfaction  MaritalStatus 
##  1:6837   Sales Executive          :4245   1:3698          Divorced:4581  
##  2:7383   Research Scientist       :3860   2:3701          Married :9167  
##  3:2911   Laboratory Technician    :3365   3:5821          Single  :5709  
##  4:1386   Manufacturing Director   :2066   4:6237                         
##  5: 940   Healthcare Representative:1780                                  
##           Manager                  :1402                                  
##           (Other)                  :2739                                  
##  MonthlyIncome   NumCompaniesWorked OverTime    PercentSalaryHike
##  Min.   : 1009   Min.   :0.000      No :14485   Min.   :11.00    
##  1st Qu.: 2996   1st Qu.:1.000      Yes: 4972   1st Qu.:12.00    
##  Median : 5006   Median :2.000                  Median :14.00    
##  Mean   : 6584   Mean   :2.647                  Mean   :15.26    
##  3rd Qu.: 8396   3rd Qu.:4.000                  3rd Qu.:18.00    
##  Max.   :19999   Max.   :9.000                  Max.   :25.00    
##                                                                  
##  PerformanceRating RelationshipSatisfaction StockOptionLevel TotalWorkingYears
##  3:16416           1:3581                   0:7769           Min.   : 0.00    
##  4: 3041           2:4141                   1:8361           1st Qu.: 6.00    
##                    3:5983                   2:2220           Median :10.00    
##                    4:5752                   3:1107           Mean   :11.37    
##                                                              3rd Qu.:15.00    
##                                                              Max.   :40.00    
##                                                                               
##  TrainingTimesLastYear WorkLifeBalance YearsAtCompany   YearsInCurrentRole
##  Min.   :0.000         1: 1017         Min.   : 0.000   Min.   : 0.000    
##  1st Qu.:2.000         2: 4512         1st Qu.: 3.000   1st Qu.: 2.000    
##  Median :3.000         3:11869         Median : 5.000   Median : 3.000    
##  Mean   :2.828         4: 2059         Mean   : 7.079   Mean   : 4.342    
##  3rd Qu.:3.000                         3rd Qu.:10.000   3rd Qu.: 7.000    
##  Max.   :6.000                         Max.   :40.000   Max.   :18.000    
##                                                                           
##  YearsSinceLastPromotion YearsWithCurrManager        Employee Source
##  Min.   : 0.000          Min.   : 0.000       Company Website:4409  
##  1st Qu.: 0.000          1st Qu.: 2.000       Seek           :3067  
##  Median : 1.000          Median : 3.000       Indeed         :2151  
##  Mean   : 2.196          Mean   : 4.196       LinkedIn       :1986  
##  3rd Qu.: 3.000          3rd Qu.: 7.000       Jora           :1969  
##  Max.   :15.000          Max.   :17.000       Recruit.net    :1951  
##                                               (Other)        :3924  
##  AgeStartedWorking PriorYearsOfExperience AverageTenure   
##  Min.   : 0.00     Min.   : 0.00          Min.   : 0.000  
##  1st Qu.:20.00     1st Qu.: 0.00          1st Qu.: 0.000  
##  Median :25.00     Median : 2.00          Median : 0.375  
##  Mean   :26.24     Mean   : 4.29          Mean   : 1.860  
##  3rd Qu.:32.00     3rd Qu.: 6.00          3rd Qu.: 1.667  
##  Max.   :60.00     Max.   :40.00          Max.   :40.000  
## 

ANALISE E EXPLORATORIA EXTRAINDO INSIGHTS

Plots de análise Univariadas

library(ggplot2)
ggplot(dados_rh) + geom_bar(aes(x = Gender))

Temos mais registro do sexo masculino do que feminino

ggplot(dados_rh) + geom_density(aes(x = Age))

#Utilizo o geom_density , Age ´e numerico (vamos criar a distribuição numerica)categorico utilizar é o ideal 

A maioroia dos funcionarios esta em 30 e 40 anos

ggplot(dados_rh) + geom_bar(aes(x =Attrition))

A maioria dos funcionarios continuam empregados a minoria foram demitidos e outros pediram demissão. A grande maioria continua na empresa

ggplot(dados_rh) + geom_bar(aes(x = Department))

A maioria é de pesuisa e desenvolvimento , vendas tem uma parcela siguiificativa r recurso humano é a minoria dentro do quadro de funcionarios.

ggplot(dados_rh) + geom_bar(aes(x  = JobRole))

Temos detalhado o quadro de funções dentro da IBM

ggplot(dados_rh) + geom_bar(aes(x =  Education)) + facet_grid(~ EducationField)

Temos a base da educação do quadro de funcionarios, a maioria estão em ciencia da vida ou na área médica.

Multiplot Grid

p.TotalWorkingYears       <- ggplot(dados_rh) + geom_density(aes(TotalWorkingYears))
p.YearsAtCompany          <- ggplot(dados_rh) + geom_density(aes(YearsAtCompany))
p.YearsSinceLastPromotion <- ggplot(dados_rh) + geom_density(aes(YearsSinceLastPromotion))
p.YearsWithCurrManager    <- ggplot(dados_rh) + geom_density(aes(YearsWithCurrManager))
p.YearsInCurrentRole      <- ggplot(dados_rh) + geom_density(aes(YearsInCurrentRole))
p.PriorYearsOfExperience  <- ggplot(dados_rh) + geom_density(aes(PriorYearsOfExperience))
#Agora vamos chamar o grafico

# Organiza no grid
grid.arrange(p.TotalWorkingYears, 
             p.YearsAtCompany, 
             p.YearsSinceLastPromotion, 
             p.YearsWithCurrManager, 
             p.YearsInCurrentRole, 
             p.PriorYearsOfExperience, 
             nrow = 2, 
             ncol = 3)

Comparação das variaveis, podemos constatar nos primeiros anos são maiores, e caindo conforme o tempo, nada de anormal dentro de uma empresa.

O estudo vai ser da seguinte forma, queremos saber o tempo de serviço de cada colaborador. Vamos descobrir a proporção de funcionarios com menos de alguns anos de experiência. Escolhemos os vamores de 1, 3, 5, 7, 10 anos.

length(which(dados_rh$PriorYearsOfExperience < 1)) / length(dados_rh$PriorYearsOfExperience)  
## [1] 0.3246596
length(which(dados_rh$PriorYearsOfExperience < 3)) / length(dados_rh$PriorYearsOfExperience)   
## [1] 0.5828346
length(which(dados_rh$PriorYearsOfExperience < 5)) / length(dados_rh$PriorYearsOfExperience)   
## [1] 0.7085177
length(which(dados_rh$PriorYearsOfExperience < 7)) / length(dados_rh$PriorYearsOfExperience)   
## [1] 0.7952121
length(which(dados_rh$PriorYearsOfExperience < 10)) / length(dados_rh$PriorYearsOfExperience)  
## [1] 0.8589644

Exemplo de insight:

58% dos funcionários têm menos de 3 anos de experiência de trabalho antes de entrar na IBM

Possíveis problemas: conjuntos de habilidades subdesenvolvidos, base de jovens funcionários, mentalidade de “trabalho” imatura.

Idade

length(which(dados_rh$Age < 30)) / length(dados_rh$Age)
## [1] 0.2165409

Insight, temos dentro da empresa apenas 22% com a idade inferior a 30 anos. Ou seja, a base de funcionarios não é tão jovem.

Educação

summary(dados_rh$Education)
##    1    2    3    4    5 
## 2659 4436 8930 6279  754
#vamos verificar a proporção
length(which(dados_rh$Education == 3)) / length(dados_rh$Education)
## [1] 0.3872842
length(which(dados_rh$Education == 4)) / length(dados_rh$Education)
## [1] 0.2723133

Insight Educação

Cerca de 39% dos funcionários são graduados e 27% realizaram o mestrado. A busca pelo ensino superior pode ter levado a uma diminuição da experiência de trabalho.

Boxplot mostrando a distribuição do salário mensal para todos os 4 níveis de satisfação no trabalho de 1-4

ggplot(data = subset(dados_rh, !is.na(JobSatisfaction)), aes(JobSatisfaction, MonthlyIncome)) + geom_boxplot()

Não a sinais que um slario mais alto leva a uma maior satisfação no trabalho.

Temos o outliers em todos os nivies de trabalho.

Resumindo, salario mais alto não é garantia de satisfação no trabalho.

#vamo susar somente o complete.obs , isso que dizer, vamos descartar qualquer valor ausente
# Correlação
cor(dados_rh$TotalWorkingYears, dados_rh$YearsAtCompany,          use = "complete.obs")#apenaso
## [1] 0.624816
cor(dados_rh$YearsAtCompany,    dados_rh$YearsInCurrentRole,      use = "complete.obs")
## [1] 0.7670497
cor(dados_rh$YearsAtCompany,    dados_rh$YearsSinceLastPromotion, use = "complete.obs")
## [1] 0.6236737
cor(dados_rh$YearsAtCompany,    dados_rh$YearsWithCurrManager,    use = "complete.obs")
## [1] 0.7728072
cor(dados_rh$TotalWorkingYears, dados_rh$MonthlyIncome,           use = "complete.obs")
## [1] 0.7582066
cor(dados_rh$YearsAtCompany,    dados_rh$MonthlyIncome,           use = "complete.obs")  
## [1] 0.4981578

Temos os resultado com a correlação positiva por serem perto de 1

Scarteplot

Agora vamos colocar em um grafico as duas ultimas correlação, TotalWorkingYears e YearsAtCompany

ggplot(dados_rh) + geom_point(aes(TotalWorkingYears, MonthlyIncome))

Aparentemente a uma tendencia uma de crescimento, nada fora do normal

ggplot(dados_rh) + geom_point(aes(YearsAtCompany, MonthlyIncome))

Média que uma variavel aumenta a outra aumenta, corresponde a normalidade.

Equilibrio da vida Pessoal e profissional e renda Mensal

ggplot(data = subset(dados_rh, !is.na(WorkLifeBalance)), aes(WorkLifeBalance, MonthlyIncome)) + 
  geom_boxplot()

Os funcionários que avaliaram o equilíbrio entre vida profissional e pessoal igual a 1 também têm renda média mensal significativamente mais baixa. Baixo equilíbrio entre vida profissional e baixo salário? Um problema que o departamento de RH precisa examinar. Numro 1 tem um baixo equilibrio na vida pessoal e profissional , o numero 4 tem alto equilibrio.

O RH tem que observar o grupo numero um, o que pode esta acontecendo com a vida do grupo 1. uma aventual promoção ou aumento de salario pode ser um fato motivacional.

Ou uma POssivel demissão.

Diferença salarial por Gênero.

ggplot(data = subset(dados_rh, !is.na(Gender)), aes(Gender, MonthlyIncome, fill = Gender)) +
  geom_boxplot() + 
  theme(legend.position = "none", plot.title = element_text(hjust = 0.5, size = 10)) +
  labs(x = "Gender", y = "Monthly Income", title = "Salário Mensal Entre Gêneros") +
  coord_flip()

Insight: Não há sinais de discriminação de gênero; na verdade, as mulheres ganham um pouco mais, em média, desconsiderando todos os outros fatores.

ggplot(data = subset(dados_rh, !is.na(JobRole))) + geom_boxplot(aes(JobRole, MonthlyIncome)) +
  ggtitle("Salário Mensal Por Função")

ggplot(data = subset(dados_rh, !is.na(JobRole))) + geom_boxplot(aes(JobRole, AgeStartedWorking)) +
  ggtitle("Idade Que Iniciou na Função")

O salario mensal munda conforme a função.

ggplot(data = subset(dados_rh, !is.na(JobRole))) + geom_boxplot(aes(JobRole, Age)) +
  ggtitle("Idade Por Função")

ggplot(data = subset(dados_rh, !is.na(JobRole))) + geom_boxplot(aes(JobRole, YearsAtCompany)) +
  ggtitle("Tempo de Empresa (em anos)")

O cargo de gerente fica mais tempo que outras funções. O corpo d egestores formam os lideres da empresa.

ggplot(data = na.omit(dados_rh)) + geom_bar(aes(JobRole, fill = Education), position = "fill") +
  ggtitle("Nível de Educação Por Função") + 
  ylab("Proportion")

P ara o nivel 5 da cor rosa, o numero de doutorados estão na área de pesquisa e desenvolvimento.

Plots de análise multivariada para variáveis normalmente usadas durante o processo de contratação

ggplot(data = dados_rh_1) + 
  geom_bar(aes(x = Education , fill = Attrition), position = 'fill') + 
  facet_grid(.~Department)

ggplot(data = dados_rh_1) + 
  geom_bar(aes(x = Education , fill = Attrition), position = 'fill') + 
  facet_grid(.~JobRole)

ggplot(data = dados_rh_1) + 
  geom_bar(aes(x = EducationField , fill = Attrition), position = 'fill') + 
  facet_grid(.~JobRole) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0))

Plots de análise multivariada para variáveis normalmente usadas após o processo de contratação

# Plots de análise multivariada para variáveis normalmente usadas após o processo de contratação
ggplot(dados_rh_1) + geom_bar(aes(x = Age, fill = Attrition), position = 'fill') 

ggplot(dados_rh_1) + geom_bar(aes(x = Department, fill = Attrition), position = 'fill') 

ggplot(dados_rh_1) + geom_bar(aes(x = DistanceFromHome, fill = Attrition), position = 'fill')

ggplot(dados_rh_1) + geom_bar(aes(x = `Employee Source`, fill = Attrition), position = 'fill')

ggplot(dados_rh_1) + geom_bar(aes(x = JobRole, fill = Attrition), position = 'fill')

ggplot(dados_rh_1) + geom_bar(aes(x = MaritalStatus, fill = Attrition), position = 'fill')

ggplot(dados_rh_1) + geom_bar(aes(x = AverageTenure, fill = Attrition), position = 'fill')

ggplot(dados_rh_1) + geom_bar(aes(x = Education, fill = Attrition), position = 'fill')

Plots de análise multivariada entre algumas variáveis e o status do funcionário

ggplot(dados_rh_1) + geom_boxplot(aes(Attrition, MonthlyIncome))

ggplot(dados_rh_1) + geom_boxplot(aes(Attrition, PercentSalaryHike))

ggplot(dados_rh_1) + geom_bar(aes(TrainingTimesLastYear, fill = Attrition), position = 'fill')

ggplot(dados_rh_1) + geom_bar(aes(BusinessTravel, fill = Attrition), position = 'fill')

ggplot(dados_rh_1) + geom_bar(aes(OverTime, fill = Attrition), position = 'fill')

ggplot(dados_rh_1) + geom_bar(aes(StockOptionLevel, fill = Attrition), position = 'fill')

ggplot(dados_rh_1) + geom_bar(aes(EnvironmentSatisfaction, fill = Attrition), position = 'fill')

ggplot(dados_rh_1) + geom_bar(aes(JobSatisfaction, fill = Attrition), position = 'fill')

ggplot(dados_rh_1) + geom_bar(aes(JobInvolvement, fill = Attrition), position = 'fill')

ggplot(dados_rh_1) + geom_bar(aes(RelationshipSatisfaction, fill = Attrition), position = 'fill')

ggplot(dados_rh_1) + geom_bar(aes(WorkLifeBalance, fill = Attrition), position = 'fill')

Modelagem Preditiva

Vamos concentrar nosso trabalho em tentar ajudar o RH a recrutar melhor visando evitar atritos e, consequentemente, demissões.

Criaremos 5 versões do modelo e para cada um vamos explorar as opções e interpretar o resultado.

Primeira versão do modelo com algumas variáveis

#glm regressão logistica
# Attrition vai ser a variavel target, por isso utilizamos o sinal de ~
#POsso fazer qualquer experimento com a quantidade de variavel infinita par ao meu modelo

#Quando não tem nenhum * mostrar que não tem nenhuma relevancia para a target que é a variavel alvo.



?glm
## starting httpd help server ... done
modelo_v1 <- glm(Attrition ~ Age + Department + DistanceFromHome + `Employee Source` + 
                   JobRole + MaritalStatus + AverageTenure + PriorYearsOfExperience + Gender + 
                   Education + EducationField, 
                 family = binomial, 
                 data = dados_rh)

#obs, aqui usei toda a minha base de dados, com isso não consigo fazer a previsão , o certo é dividir em treino e teste.
summary(modelo_v1)
## 
## Call:
## glm(formula = Attrition ~ Age + Department + DistanceFromHome + 
##     `Employee Source` + JobRole + MaritalStatus + AverageTenure + 
##     PriorYearsOfExperience + Gender + Education + EducationField, 
##     family = binomial, data = dados_rh)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4738  -0.6239  -0.4962  -0.3553   2.7405  
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                      -0.515415   0.198808  -2.593 0.009527 ** 
## Age                              -0.046402   0.002434 -19.062  < 2e-16 ***
## DepartmentResearch & Development -0.402413   0.102837  -3.913 9.11e-05 ***
## DepartmentSales                   0.041108   0.106275   0.387 0.698901    
## DistanceFromHome                  0.022014   0.002497   8.816  < 2e-16 ***
## `Employee Source`Company Website  0.200175   0.074567   2.684 0.007264 ** 
## `Employee Source`GlassDoor       -0.002062   0.089568  -0.023 0.981630    
## `Employee Source`Indeed          -0.048126   0.088966  -0.541 0.588545    
## `Employee Source`Jora             0.202494   0.084534   2.395 0.016602 *  
## `Employee Source`LinkedIn        -0.086527   0.090292  -0.958 0.337911    
## `Employee Source`Recruit.net     -0.024145   0.088800  -0.272 0.785699    
## `Employee Source`Referral         0.222132   0.147177   1.509 0.131226    
## `Employee Source`Seek             0.039192   0.079096   0.495 0.620253    
## JobRoleHuman Resources            0.092163   0.125250   0.736 0.461832    
## JobRoleLaboratory Technician      0.313456   0.079749   3.931 8.48e-05 ***
## JobRoleManager                   -0.370055   0.121400  -3.048 0.002302 ** 
## JobRoleManufacturing Director    -0.091942   0.094178  -0.976 0.328937    
## JobRoleResearch Director         -0.326907   0.125855  -2.597 0.009391 ** 
## JobRoleResearch Scientist         0.102218   0.078537   1.302 0.193080    
## JobRoleSales Executive           -0.030434   0.079097  -0.385 0.700414    
## JobRoleSales Representative       0.484732   0.095181   5.093 3.53e-07 ***
## MaritalStatusMarried              0.179376   0.053279   3.367 0.000761 ***
## MaritalStatusSingle               0.740422   0.053393  13.867  < 2e-16 ***
## AverageTenure                    -0.016927   0.009230  -1.834 0.066663 .  
## PriorYearsOfExperience            0.018901   0.005353   3.531 0.000414 ***
## GenderMale                        0.033768   0.038421   0.879 0.379467    
## Education2                        0.096221   0.068965   1.395 0.162951    
## Education3                        0.129656   0.061109   2.122 0.033862 *  
## Education4                        0.120603   0.066456   1.815 0.069558 .  
## Education5                       -0.221560   0.134302  -1.650 0.099001 .  
## EducationFieldLife Sciences      -0.149802   0.143779  -1.042 0.297462    
## EducationFieldMarketing          -0.122315   0.152984  -0.800 0.423984    
## EducationFieldMedical            -0.176829   0.145066  -1.219 0.222859    
## EducationFieldOther              -0.170949   0.161651  -1.058 0.290274    
## EducationFieldTechnical Degree    0.183255   0.154276   1.188 0.234898    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 20272  on 23057  degrees of freedom
## Residual deviance: 18904  on 23023  degrees of freedom
## AIC: 18974
## 
## Number of Fisher Scoring iterations: 5
# A quantidade de asteriscos *** mostra a relevancia que tem para a variavel alvo.
?vif
vif(modelo_v1)
##                            GVIF Df GVIF^(1/(2*Df))
## Age                    1.197853  1        1.094465
## Department             2.027501  2        1.193274
## DistanceFromHome       1.321206  1        1.149437
## `Employee Source`      1.107922  8        1.006426
## JobRole                2.564522  8        1.060628
## MaritalStatus          1.042578  2        1.010479
## AverageTenure          2.478002  1        1.574167
## PriorYearsOfExperience 2.440072  1        1.562073
## Gender                 1.019571  1        1.009738
## Education              1.121235  4        1.014407
## EducationField         1.648089  5        1.051231
#Variaveis com maior importancia, concentar na primeira coluna, quando maior o numero maior vai ser a sua importancia.


# modelo base, falta dividir a base em treino e teste para o experimento
#

Dividir em Treino e Teste

Vou tirar do dataset pessoas que forma demitidas

set.seed(2004)

index_treino <- sample.split(Y = dados_rh_1$Attrition, SplitRatio = 0.7)#divisor 70% para treino dados verdadeiros
dados_rh_1_treino <- subset(dados_rh_1, train = T)
dados_rh_1_teste <- subset(dados_rh_1, train =  F)

Segunda Versão do Modelo

modelo_v2 <- glm(Attrition ~ Age + Department + DistanceFromHome + `Employee Source` + 
                   JobRole + MaritalStatus + AverageTenure + PriorYearsOfExperience + Gender + 
                   Education + EducationField, 
                 family = binomial, 
                 data = dados_rh_1_treino)#Vou utilizar a base de dados criado na versão V1, dados esse criando sem as pessoas demitidas. 
summary(modelo_v2)
## 
## Call:
## glm(formula = Attrition ~ Age + Department + DistanceFromHome + 
##     `Employee Source` + JobRole + MaritalStatus + AverageTenure + 
##     PriorYearsOfExperience + Gender + Education + EducationField, 
##     family = binomial, data = dados_rh_1_treino)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4484  -0.6177  -0.4918  -0.3558   2.7300  
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                      -0.499751   0.199492  -2.505 0.012241 *  
## Age                              -0.044889   0.002446 -18.348  < 2e-16 ***
## DepartmentResearch & Development -0.427955   0.103053  -4.153 3.28e-05 ***
## DepartmentSales                   0.025684   0.106499   0.241 0.809423    
## DistanceFromHome                  0.020372   0.002522   8.076 6.69e-16 ***
## `Employee Source`Company Website  0.183335   0.074868   2.449 0.014334 *  
## `Employee Source`GlassDoor        0.006274   0.089680   0.070 0.944229    
## `Employee Source`Indeed          -0.080908   0.089734  -0.902 0.367244    
## `Employee Source`Jora             0.183678   0.084958   2.162 0.030618 *  
## `Employee Source`LinkedIn        -0.079145   0.090405  -0.875 0.381325    
## `Employee Source`Recruit.net     -0.050665   0.089444  -0.566 0.571095    
## `Employee Source`Referral         0.230121   0.147168   1.564 0.117897    
## `Employee Source`Seek            -0.005837   0.079828  -0.073 0.941708    
## JobRoleHuman Resources            0.107348   0.125753   0.854 0.393302    
## JobRoleLaboratory Technician      0.314968   0.080707   3.903 9.52e-05 ***
## JobRoleManager                   -0.402633   0.123788  -3.253 0.001144 ** 
## JobRoleManufacturing Director    -0.083426   0.095273  -0.876 0.381221    
## JobRoleResearch Director         -0.292195   0.126243  -2.315 0.020637 *  
## JobRoleResearch Scientist         0.111877   0.079359   1.410 0.158608    
## JobRoleSales Executive           -0.028140   0.079873  -0.352 0.724611    
## JobRoleSales Representative       0.478077   0.096067   4.977 6.47e-07 ***
## MaritalStatusMarried              0.176289   0.053865   3.273 0.001065 ** 
## MaritalStatusSingle               0.747383   0.053896  13.867  < 2e-16 ***
## AverageTenure                    -0.021245   0.009467  -2.244 0.024825 *  
## PriorYearsOfExperience            0.019787   0.005399   3.665 0.000248 ***
## GenderMale                        0.030982   0.038752   0.800 0.424000    
## Education2                        0.067584   0.069195   0.977 0.328712    
## Education3                        0.092553   0.061236   1.511 0.130684    
## Education4                        0.071013   0.066760   1.064 0.287461    
## Education5                       -0.233758   0.134267  -1.741 0.081685 .  
## EducationFieldLife Sciences      -0.148858   0.143810  -1.035 0.300620    
## EducationFieldMarketing          -0.106268   0.152995  -0.695 0.487317    
## EducationFieldMedical            -0.202212   0.145203  -1.393 0.163736    
## EducationFieldOther              -0.137807   0.161652  -0.852 0.393940    
## EducationFieldTechnical Degree    0.180977   0.154552   1.171 0.241608    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 19951  on 22970  degrees of freedom
## Residual deviance: 18626  on 22936  degrees of freedom
## AIC: 18696
## 
## Number of Fisher Scoring iterations: 5
vif(modelo_v2)
##                            GVIF Df GVIF^(1/(2*Df))
## Age                    1.196689  1        1.093933
## Department             2.033631  2        1.194175
## DistanceFromHome       1.321839  1        1.149712
## `Employee Source`      1.109202  8        1.006499
## JobRole                2.550173  8        1.060256
## MaritalStatus          1.043120  2        1.010610
## AverageTenure          2.441327  1        1.562475
## PriorYearsOfExperience 2.412397  1        1.553189
## Gender                 1.018652  1        1.009283
## Education              1.123257  4        1.014635
## EducationField         1.650765  5        1.051401
# Previsões
threshold <- 0.5
previsoes_v2 <- predict(modelo_v2, type = 'response', newdata = dados_rh_1_teste)
previsoes_finais_v2 <- ifelse(previsoes_v2 > threshold, 'Voluntary Resignation', 'Current employee')
table(dados_rh_1_teste$Attrition, previsoes_finais_v2)
##                        previsoes_finais_v2
##                         Current employee Voluntary Resignation
##   Current employee                 19328                    42
##   Voluntary Resignation             3523                    78

quando a claaase era que o funcionario continuaria como empregado o nosso modelo previu 19328

Quando a classe real era pedido de demissão o nosso previu que continuaria como empregado no total de 3523, ou seja o nosso modelo errou. uma taxa de erro consideravel. POderiamos fazer uma balanceamento de classes para ter um modelo mais preciso.

Terceira versão do modelo de treino e sem variaveis de educação

modelo_v3 <- glm(Attrition ~ Age + Department + DistanceFromHome + `Employee Source` + 
                   JobRole + MaritalStatus + AverageTenure + PriorYearsOfExperience + Gender, 
                 family = binomial, 
                 data = dados_rh_1_treino)
summary(modelo_v3)
## 
## Call:
## glm(formula = Attrition ~ Age + Department + DistanceFromHome + 
##     `Employee Source` + JobRole + MaritalStatus + AverageTenure + 
##     PriorYearsOfExperience + Gender, family = binomial, data = dados_rh_1_treino)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3428  -0.6201  -0.4941  -0.3619   2.7143  
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                      -0.594443   0.163302  -3.640 0.000272 ***
## Age                              -0.044338   0.002361 -18.781  < 2e-16 ***
## DepartmentResearch & Development -0.455831   0.097648  -4.668 3.04e-06 ***
## DepartmentSales                   0.006375   0.100798   0.063 0.949567    
## DistanceFromHome                  0.023945   0.002219  10.792  < 2e-16 ***
## `Employee Source`Company Website  0.185836   0.074684   2.488 0.012835 *  
## `Employee Source`GlassDoor        0.004131   0.089469   0.046 0.963174    
## `Employee Source`Indeed          -0.084488   0.089587  -0.943 0.345638    
## `Employee Source`Jora             0.182141   0.084629   2.152 0.031378 *  
## `Employee Source`LinkedIn        -0.073833   0.090249  -0.818 0.413300    
## `Employee Source`Recruit.net     -0.058670   0.089241  -0.657 0.510903    
## `Employee Source`Referral         0.237922   0.146800   1.621 0.105078    
## `Employee Source`Seek            -0.006818   0.079571  -0.086 0.931717    
## JobRoleHuman Resources            0.099083   0.125594   0.789 0.430163    
## JobRoleLaboratory Technician      0.312339   0.080556   3.877 0.000106 ***
## JobRoleManager                   -0.418085   0.123665  -3.381 0.000723 ***
## JobRoleManufacturing Director    -0.079696   0.095061  -0.838 0.401826    
## JobRoleResearch Director         -0.308958   0.126075  -2.451 0.014263 *  
## JobRoleResearch Scientist         0.119993   0.079265   1.514 0.130071    
## JobRoleSales Executive           -0.023432   0.079774  -0.294 0.768961    
## JobRoleSales Representative       0.483836   0.095952   5.042 4.60e-07 ***
## MaritalStatusMarried              0.176480   0.053793   3.281 0.001035 ** 
## MaritalStatusSingle               0.747665   0.053772  13.904  < 2e-16 ***
## AverageTenure                    -0.019906   0.009465  -2.103 0.035453 *  
## PriorYearsOfExperience            0.019187   0.005400   3.553 0.000381 ***
## GenderMale                        0.033764   0.038690   0.873 0.382838    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 19951  on 22970  degrees of freedom
## Residual deviance: 18668  on 22945  degrees of freedom
## AIC: 18720
## 
## Number of Fisher Scoring iterations: 5
vif(modelo_v3)
##                            GVIF Df GVIF^(1/(2*Df))
## Age                    1.116616  1        1.056701
## Department             1.733091  2        1.147375
## DistanceFromHome       1.008189  1        1.004086
## `Employee Source`      1.080863  8        1.004872
## JobRole                2.519868  8        1.059464
## MaritalStatus          1.037566  2        1.009262
## AverageTenure          2.443412  1        1.563142
## PriorYearsOfExperience 2.418527  1        1.555161
## Gender                 1.017671  1        1.008797
# Previsões
threshold <- 0.5
previsoes_v3 <- predict(modelo_v3, type = 'response', newdata = dados_rh_1_teste)
previsoes_finais_v3 <- ifelse(previsoes_v3 > threshold, 'Voluntary Resignation', 'Current employee')
table(dados_rh_1_teste$Attrition, previsoes_finais_v3)
##                        previsoes_finais_v3
##                         Current employee Voluntary Resignation
##   Current employee                 19328                    42
##   Voluntary Resignation             3541                    60

Quarta versão do modelo com dados de treino e sem variáveis de educação e genero

modelo_v4 <- glm(Attrition ~ Age + Department + DistanceFromHome + `Employee Source` + 
                   JobRole + MaritalStatus + AverageTenure + PriorYearsOfExperience, 
                 family = binomial, 
                 data = dados_rh_1_treino)
summary(modelo_v4)
## 
## Call:
## glm(formula = Attrition ~ Age + Department + DistanceFromHome + 
##     `Employee Source` + JobRole + MaritalStatus + AverageTenure + 
##     PriorYearsOfExperience, family = binomial, data = dados_rh_1_treino)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3360  -0.6192  -0.4939  -0.3622   2.7205  
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                      -0.569968   0.160865  -3.543 0.000395 ***
## Age                              -0.044408   0.002359 -18.822  < 2e-16 ***
## DepartmentResearch & Development -0.457114   0.097648  -4.681 2.85e-06 ***
## DepartmentSales                   0.004776   0.100790   0.047 0.962208    
## DistanceFromHome                  0.023979   0.002218  10.810  < 2e-16 ***
## `Employee Source`Company Website  0.185968   0.074691   2.490 0.012780 *  
## `Employee Source`GlassDoor        0.004217   0.089473   0.047 0.962404    
## `Employee Source`Indeed          -0.082065   0.089543  -0.916 0.359412    
## `Employee Source`Jora             0.182210   0.084632   2.153 0.031321 *  
## `Employee Source`LinkedIn        -0.073105   0.090254  -0.810 0.417948    
## `Employee Source`Recruit.net     -0.058149   0.089234  -0.652 0.514631    
## `Employee Source`Referral         0.240776   0.146746   1.641 0.100844    
## `Employee Source`Seek            -0.006816   0.079577  -0.086 0.931742    
## JobRoleHuman Resources            0.100479   0.125614   0.800 0.423769    
## JobRoleLaboratory Technician      0.315123   0.080478   3.916 9.02e-05 ***
## JobRoleManager                   -0.419678   0.123673  -3.393 0.000690 ***
## JobRoleManufacturing Director    -0.082962   0.094978  -0.873 0.382397    
## JobRoleResearch Director         -0.310452   0.126056  -2.463 0.013785 *  
## JobRoleResearch Scientist         0.120223   0.079252   1.517 0.129277    
## JobRoleSales Executive           -0.023015   0.079761  -0.289 0.772925    
## JobRoleSales Representative       0.482258   0.095927   5.027 4.97e-07 ***
## MaritalStatusMarried              0.175136   0.053769   3.257 0.001125 ** 
## MaritalStatusSingle               0.745551   0.053714  13.880  < 2e-16 ***
## AverageTenure                    -0.019985   0.009465  -2.112 0.034727 *  
## PriorYearsOfExperience            0.019266   0.005398   3.569 0.000358 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 19951  on 22970  degrees of freedom
## Residual deviance: 18668  on 22946  degrees of freedom
## AIC: 18718
## 
## Number of Fisher Scoring iterations: 5
vif(modelo_v4)
##                            GVIF Df GVIF^(1/(2*Df))
## Age                    1.115570  1        1.056205
## Department             1.732346  2        1.147252
## DistanceFromHome       1.007827  1        1.003906
## `Employee Source`      1.078323  8        1.004724
## JobRole                2.493476  8        1.058767
## MaritalStatus          1.035363  2        1.008726
## AverageTenure          2.443871  1        1.563288
## PriorYearsOfExperience 2.417511  1        1.554835
# Previsões
threshold <- 0.5
previsoes_v4 <- predict(modelo_v4, type = 'response', newdata = dados_rh_1_teste)
previsoes_finais_v4 <- ifelse(previsoes_v4 > threshold, 'Voluntary Resignation', 'Current employee')
table(dados_rh_1_teste$Attrition, previsoes_finais_v4)
##                        previsoes_finais_v4
##                         Current employee Voluntary Resignation
##   Current employee                 19326                    44
##   Voluntary Resignation             3545                    56

Os três modelos feito mostrou a relevancia dos JobRole ,PriorYearsOfExperience e AverageTenure. apresentar esses valores aos tomadores de decisão.

Quinta versão do modelo com dados de treino e sem variáveis de educação, genero e outro algoritmo

?rpart
modelo_v5 <- rpart(Attrition ~ Age + Department + DistanceFromHome + JobRole + MaritalStatus + 
                     AverageTenure + PriorYearsOfExperience, 
                   method = "class", 
                   control = rpart.control(minsplit = 500, cp = 0),
                   data = dados_rh_1_treino)
summary(modelo_v5)
## Call:
## rpart(formula = Attrition ~ Age + Department + DistanceFromHome + 
##     JobRole + MaritalStatus + AverageTenure + PriorYearsOfExperience, 
##     data = dados_rh_1_treino, method = "class", control = rpart.control(minsplit = 500, 
##         cp = 0))
##   n= 22971 
## 
##            CP nsplit rel error    xerror       xstd
## 1 0.006803666      0 1.0000000 1.0000000 0.01530253
## 2 0.000416551      3 0.9780616 0.9800056 0.01517691
## 3 0.000000000      7 0.9763954 0.9816718 0.01518746
## 
## Variable importance
##                    Age          MaritalStatus                JobRole 
##                     68                     15                      9 
##       DistanceFromHome          AverageTenure PriorYearsOfExperience 
##                      6                      2                      1 
## 
## Node number 1: 22971 observations,    complexity param=0.006803666
##   predicted class=Current employee       expected loss=0.1567629  P(node) =1
##     class counts: 19370  3601
##    probabilities: 0.843 0.157 
##   left son=2 (14161 obs) right son=3 (8810 obs)
##   Primary splits:
##       Age              < 33.5      to the right, improve=192.45070, (0 missing)
##       MaritalStatus    splits as  LLR, improve= 98.38529, (0 missing)
##       DistanceFromHome < 11.5      to the left,  improve= 45.04993, (0 missing)
##       Department       splits as  RLR, improve= 44.05590, (0 missing)
##       JobRole          splits as  LRRLLLRRR, improve= 43.19011, (0 missing)
##   Surrogate splits:
##       JobRole splits as  LLLLLLLLR, agree=0.622, adj=0.015, (0 split)
## 
## Node number 2: 14161 observations
##   predicted class=Current employee       expected loss=0.1057129  P(node) =0.6164729
##     class counts: 12664  1497
##    probabilities: 0.894 0.106 
## 
## Node number 3: 8810 observations,    complexity param=0.006803666
##   predicted class=Current employee       expected loss=0.2388195  P(node) =0.3835271
##     class counts:  6706  2104
##    probabilities: 0.761 0.239 
##   left son=6 (8239 obs) right son=7 (571 obs)
##   Primary splits:
##       Age              < 21.5      to the right, improve=112.91850, (0 missing)
##       MaritalStatus    splits as  LLR, improve=105.19850, (0 missing)
##       DistanceFromHome < 11.5      to the left,  improve= 31.14560, (0 missing)
##       JobRole          splits as  LRRLLLLLR, improve= 28.54958, (0 missing)
##       Department       splits as  RLL, improve= 13.48102, (0 missing)
## 
## Node number 6: 8239 observations,    complexity param=0.000416551
##   predicted class=Current employee       expected loss=0.2177449  P(node) =0.3586696
##     class counts:  6445  1794
##    probabilities: 0.782 0.218 
##   left son=12 (5600 obs) right son=13 (2639 obs)
##   Primary splits:
##       MaritalStatus          splits as  LLR, improve=67.13122, (0 missing)
##       DistanceFromHome       < 11.5      to the left,  improve=41.60399, (0 missing)
##       JobRole                splits as  LRRLLLRRR, improve=19.17166, (0 missing)
##       Age                    < 26.5      to the right, improve=14.50616, (0 missing)
##       PriorYearsOfExperience < 14.5      to the right, improve=12.24881, (0 missing)
## 
## Node number 7: 571 observations,    complexity param=0.006803666
##   predicted class=Voluntary Resignation  expected loss=0.4570928  P(node) =0.02485743
##     class counts:   261   310
##    probabilities: 0.457 0.543 
##   left son=14 (190 obs) right son=15 (381 obs)
##   Primary splits:
##       JobRole          splits as  LRRRRRLRR, improve=8.456258, (0 missing)
##       Age              < 19.5      to the right, improve=5.644184, (0 missing)
##       Department       splits as  RLR, improve=2.925194, (0 missing)
##       DistanceFromHome < 5.5       to the left,  improve=2.502770, (0 missing)
##   Surrogate splits:
##       PriorYearsOfExperience < 15        to the right, agree=0.681, adj=0.042, (0 split)
##       DistanceFromHome       < 23.5      to the right, agree=0.673, adj=0.016, (0 split)
##       MaritalStatus          splits as  RLR, agree=0.671, adj=0.011, (0 split)
## 
## Node number 12: 5600 observations
##   predicted class=Current employee       expected loss=0.1739286  P(node) =0.2437856
##     class counts:  4626   974
##    probabilities: 0.826 0.174 
## 
## Node number 13: 2639 observations,    complexity param=0.000416551
##   predicted class=Current employee       expected loss=0.3107238  P(node) =0.114884
##     class counts:  1819   820
##    probabilities: 0.689 0.311 
##   left son=26 (378 obs) right son=27 (2261 obs)
##   Primary splits:
##       JobRole                splits as  RLRLLRRRR, improve=28.098750, (0 missing)
##       DistanceFromHome       < 20.5      to the left,  improve=25.339560, (0 missing)
##       PriorYearsOfExperience < 5.5       to the right, improve= 8.692388, (0 missing)
##       AverageTenure          < 2.770833  to the right, improve= 4.328876, (0 missing)
##       Age                    < 27.5      to the left,  improve= 3.922510, (0 missing)
##   Surrogate splits:
##       AverageTenure          < 8.25      to the right, agree=0.869, adj=0.087, (0 split)
##       PriorYearsOfExperience < 19        to the right, agree=0.865, adj=0.058, (0 split)
## 
## Node number 14: 190 observations
##   predicted class=Current employee       expected loss=0.4210526  P(node) =0.008271299
##     class counts:   110    80
##    probabilities: 0.579 0.421 
## 
## Node number 15: 381 observations
##   predicted class=Voluntary Resignation  expected loss=0.3963255  P(node) =0.01658613
##     class counts:   151   230
##    probabilities: 0.396 0.604 
## 
## Node number 26: 378 observations
##   predicted class=Current employee       expected loss=0.1322751  P(node) =0.01645553
##     class counts:   328    50
##    probabilities: 0.868 0.132 
## 
## Node number 27: 2261 observations,    complexity param=0.000416551
##   predicted class=Current employee       expected loss=0.3405573  P(node) =0.09842845
##     class counts:  1491   770
##    probabilities: 0.659 0.341 
##   left son=54 (1621 obs) right son=55 (640 obs)
##   Primary splits:
##       DistanceFromHome       < 12.5      to the left,  improve=27.233170, (0 missing)
##       PriorYearsOfExperience < 1.5       to the left,  improve= 8.703747, (0 missing)
##       Age                    < 27.5      to the left,  improve= 8.372829, (0 missing)
##       AverageTenure          < 0.2111111 to the left,  improve= 7.042972, (0 missing)
##       JobRole                splits as  L-L--LLLR, improve= 3.730849, (0 missing)
## 
## Node number 54: 1621 observations
##   predicted class=Current employee       expected loss=0.2917952  P(node) =0.07056724
##     class counts:  1148   473
##    probabilities: 0.708 0.292 
## 
## Node number 55: 640 observations,    complexity param=0.000416551
##   predicted class=Current employee       expected loss=0.4640625  P(node) =0.02786122
##     class counts:   343   297
##    probabilities: 0.536 0.464 
##   left son=110 (172 obs) right son=111 (468 obs)
##   Primary splits:
##       AverageTenure          < 0.6458333 to the right, improve=6.245802, (0 missing)
##       JobRole                splits as  L-L--RLRR, improve=5.369857, (0 missing)
##       Department             splits as  RLR, improve=4.685078, (0 missing)
##       DistanceFromHome       < 20.5      to the left,  improve=4.056358, (0 missing)
##       PriorYearsOfExperience < 0.5       to the right, improve=2.844245, (0 missing)
##   Surrogate splits:
##       PriorYearsOfExperience < 2.5       to the right, agree=0.820, adj=0.331, (0 split)
##       Age                    < 23.5      to the left,  agree=0.755, adj=0.087, (0 split)
## 
## Node number 110: 172 observations
##   predicted class=Current employee       expected loss=0.3488372  P(node) =0.007487702
##     class counts:   112    60
##    probabilities: 0.651 0.349 
## 
## Node number 111: 468 observations
##   predicted class=Voluntary Resignation  expected loss=0.4935897  P(node) =0.02037351
##     class counts:   231   237
##    probabilities: 0.494 0.506
#Arvore de decisão
rpart.plot(modelo_v5)

RESUMO:

Apresentar esse grafico par aos gestores, é uma forma facil de interpretar. Observamos que a classe idade tem grande peso.