#Contexto La clave del Exito en cualquier organizacion es atraer y retener a los mejores talentos.
Education 1 ‘Below College’ 2 ‘College’ 3 ‘Bachelor’ 4 ‘Master’ 5 ‘Doctor’
EnvironmentSatisfaction 1 ‘Low’ 2 ‘Medium’ 3 ‘High’ 4 ‘Very High’
JobInvolvement 1 ‘Low’ 2 ‘Medium’ 3 ‘High’ 4 ‘Very High’
JobSatisfaction 1 ‘Low’ 2 ‘Medium’ 3 ‘High’ 4 ‘Very High’
PerformanceRating 1 ‘Low’ 2 ‘Good’ 3 ‘Excellent’ 4 ‘Outstanding’
RelationshipSatisfaction 1 ‘Low’ 2 ‘Medium’ 3 ‘High’ 4 ‘Very High’
WorkLifeBalance 1 ‘Bad’ 2 ‘Good’ 3 ‘Better’ 4 ‘Best’
#Definir directorio
setwd("G:/TRABAJO/DOCENCIA/KONRAND LORENZ/HR_datasets")
#Cargar las bases de datos de interes
emp_att=read.csv2("HR-Employee-Attrition.csv", header = TRUE, sep = ";", dec=".")
head(emp_att,5)
## ï..CC Age BusinessTravel DailyRate Department DistanceFromHome
## 1 7001 41 Travel_Rarely 1102 Sales 10
## 2 7002 49 Travel_Frequently 279 Research_Development 11
## 3 7003 26 Travel_Rarely 1050 Research_Development 5
## 4 7004 33 Travel_Frequently 1392 Research_Development 10
## 5 7005 27 Travel_Rarely 591 Research_Development 9
## Education EducationField EnvironmentSatisfaction Gender HourlyRate
## 1 2 Life_Sciences 2 Female 48
## 2 1 Life_Sciences 3 Male 68
## 3 2 Other 4 Male 96
## 4 4 Life_Sciences 4 Female 33
## 5 1 Medical 1 Male 35
## JobInvolvement JobLevel JobRole JobSatisfaction MaritalStatus
## 1 3 2 Sales_Executive 4 Single
## 2 2 2 Research_Scientist 2 Married
## 3 2 1 Laboratory_Technician 3 Single
## 4 3 1 Research_Scientist 3 Married
## 5 3 1 Laboratory_Technician 2 Married
## MonthlyIncome MonthlyRate NumCompaniesWorked OverTime PercentSalaryHike
## 1 5993 19479 8 1 11
## 2 5130 24907 1 0 23
## 3 2090 2396 6 1 15
## 4 2909 23159 1 1 11
## 5 3468 16632 9 0 12
## PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel
## 1 3 1 80 0
## 2 4 4 80 1
## 3 3 2 80 0
## 4 3 3 80 0
## 5 3 4 80 1
## TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## 1 8 0 1 6
## 2 10 3 3 10
## 3 7 3 3 0
## 4 8 3 3 8
## 5 6 3 3 2
## YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager Attrition
## 1 4 0 5 1
## 2 7 1 7 0
## 3 0 0 0 1
## 4 7 3 0 0
## 5 2 2 2 0
#Cargar librerias
library(FactoMineR)
## Warning: package 'FactoMineR' was built under R version 4.1.2
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.1.2
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(FactoInvestigate)
## Warning: package 'FactoInvestigate' was built under R version 4.1.2
library(ggdendro)
## Warning: package 'ggdendro' was built under R version 4.1.2
library(ggplot2)
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
library(fastDummies)
library(readxl)
#verificar valores nulos
sapply(emp_att, function(x) sum(is.na(x)))
## ï..CC Age BusinessTravel
## 0 0 0
## DailyRate Department DistanceFromHome
## 0 0 0
## Education EducationField EnvironmentSatisfaction
## 0 0 0
## Gender HourlyRate JobInvolvement
## 0 0 0
## JobLevel JobRole JobSatisfaction
## 0 0 0
## MaritalStatus MonthlyIncome MonthlyRate
## 0 0 0
## NumCompaniesWorked OverTime PercentSalaryHike
## 0 0 0
## PerformanceRating RelationshipSatisfaction StandardHours
## 0 0 0
## StockOptionLevel TotalWorkingYears TrainingTimesLastYear
## 0 0 0
## WorkLifeBalance YearsAtCompany YearsInCurrentRole
## 0 0 0
## YearsSinceLastPromotion YearsWithCurrManager Attrition
## 0 0 0
#Exploraciones iniciales sobre variables continuas
par(mfrow=c(3,3))
hist(emp_att$Age, main='Edad')
hist(emp_att$DistanceFromHome, main='Distancia hasta la casa')
hist(emp_att$MonthlyIncome, main='Ingreso Mensual')
hist(emp_att$YearsAtCompany ,main='Anos en la compania')
hist(emp_att$TotalWorkingYears ,main='Anos Trabajados')
hist(emp_att$YearsInCurrentRole ,main='Anos en el actual rol')
summary(emp_att$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 27.00 30.50 33.79 39.00 60.00
summary(emp_att$TotalWorkingYears)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 6.00 10.00 11.28 15.00 40.00
par(mfrow=c(1,2))
boxplot(emp_att$Age, main='Edad')
boxplot(emp_att$TotalWorkingYears ,main='Anos Trabajados')
#realizamos conteos de variables cualitativas (categoricas)
q1=sqldf('SELECT BusinessTravel, COUNT(BusinessTravel) as CONTEO
FROM emp_att
GROUP BY BusinessTravel
')
q1
## BusinessTravel CONTEO
## 1 Non_Travel 150
## 2 Travel_Frequently 277
## 3 Travel_Rarely 1043
q2=sqldf('SELECT JobInvolvement, COUNT(JobInvolvement) as CONTEO
FROM emp_att
GROUP BY JobInvolvement
')
q2
## JobInvolvement CONTEO
## 1 1 83
## 2 2 375
## 3 3 868
## 4 4 144
#JobInvolvement
#1 'Low' 2 'Medium' 3 'High' 4 'Very High'
Se debe convertir las variables Cualitativas en dummies.
emp_att = dummy_cols(emp_att, select_columns = c("BusinessTravel","Department","EducationField","Gender","JobRole","MaritalStatus"))
colnames(emp_att)
## [1] "ï..CC" "Age"
## [3] "BusinessTravel" "DailyRate"
## [5] "Department" "DistanceFromHome"
## [7] "Education" "EducationField"
## [9] "EnvironmentSatisfaction" "Gender"
## [11] "HourlyRate" "JobInvolvement"
## [13] "JobLevel" "JobRole"
## [15] "JobSatisfaction" "MaritalStatus"
## [17] "MonthlyIncome" "MonthlyRate"
## [19] "NumCompaniesWorked" "OverTime"
## [21] "PercentSalaryHike" "PerformanceRating"
## [23] "RelationshipSatisfaction" "StandardHours"
## [25] "StockOptionLevel" "TotalWorkingYears"
## [27] "TrainingTimesLastYear" "WorkLifeBalance"
## [29] "YearsAtCompany" "YearsInCurrentRole"
## [31] "YearsSinceLastPromotion" "YearsWithCurrManager"
## [33] "Attrition" "BusinessTravel_Non_Travel"
## [35] "BusinessTravel_Travel_Frequently" "BusinessTravel_Travel_Rarely"
## [37] "Department_Human_Resources" "Department_Research_Development"
## [39] "Department_Sales" "EducationField_Human_Resources"
## [41] "EducationField_Life_Sciences" "EducationField_Marketing"
## [43] "EducationField_Medical" "EducationField_Other"
## [45] "EducationField_Technical_Degree" "Gender_Female"
## [47] "Gender_Male" "JobRole_Healthcare_Representative"
## [49] "JobRole_Human_Resources" "JobRole_Laboratory_Technician"
## [51] "JobRole_Manager" "JobRole_Manufacturing_Director"
## [53] "JobRole_Research_Director" "JobRole_Research_Scientist"
## [55] "JobRole_Sales_Executive" "JobRole_Sales_Representative"
## [57] "MaritalStatus_Divorced" "MaritalStatus_Married"
## [59] "MaritalStatus_Single"
head(emp_att)
## ï..CC Age BusinessTravel DailyRate Department DistanceFromHome
## 1 7001 41 Travel_Rarely 1102 Sales 10
## 2 7002 49 Travel_Frequently 279 Research_Development 11
## 3 7003 26 Travel_Rarely 1050 Research_Development 5
## 4 7004 33 Travel_Frequently 1392 Research_Development 10
## 5 7005 27 Travel_Rarely 591 Research_Development 9
## 6 7006 28 Travel_Frequently 1003 Research_Development 5
## Education EducationField EnvironmentSatisfaction Gender HourlyRate
## 1 2 Life_Sciences 2 Female 48
## 2 1 Life_Sciences 3 Male 68
## 3 2 Other 4 Male 96
## 4 4 Life_Sciences 4 Female 33
## 5 1 Medical 1 Male 35
## 6 2 Life_Sciences 4 Male 144
## JobInvolvement JobLevel JobRole JobSatisfaction MaritalStatus
## 1 3 2 Sales_Executive 4 Single
## 2 2 2 Research_Scientist 2 Married
## 3 2 1 Laboratory_Technician 3 Single
## 4 3 1 Research_Scientist 3 Married
## 5 3 1 Laboratory_Technician 2 Married
## 6 3 1 Laboratory_Technician 4 Single
## MonthlyIncome MonthlyRate NumCompaniesWorked OverTime PercentSalaryHike
## 1 5993 19479 8 1 11
## 2 5130 24907 1 0 23
## 3 2090 2396 6 1 15
## 4 2909 23159 1 1 11
## 5 3468 16632 9 0 12
## 6 3068 11864 0 0 13
## PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel
## 1 3 1 80 0
## 2 4 4 80 1
## 3 3 2 80 0
## 4 3 3 80 0
## 5 3 4 80 1
## 6 3 3 80 0
## TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## 1 8 0 1 6
## 2 10 3 3 10
## 3 7 3 3 0
## 4 8 3 3 8
## 5 6 3 3 2
## 6 8 2 2 7
## YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager Attrition
## 1 4 0 5 1
## 2 7 1 7 0
## 3 0 0 0 1
## 4 7 3 0 0
## 5 2 2 2 0
## 6 7 3 6 0
## BusinessTravel_Non_Travel BusinessTravel_Travel_Frequently
## 1 0 0
## 2 0 1
## 3 0 0
## 4 0 1
## 5 0 0
## 6 0 1
## BusinessTravel_Travel_Rarely Department_Human_Resources
## 1 1 0
## 2 0 0
## 3 1 0
## 4 0 0
## 5 1 0
## 6 0 0
## Department_Research_Development Department_Sales
## 1 0 1
## 2 1 0
## 3 1 0
## 4 1 0
## 5 1 0
## 6 1 0
## EducationField_Human_Resources EducationField_Life_Sciences
## 1 0 1
## 2 0 1
## 3 0 0
## 4 0 1
## 5 0 0
## 6 0 1
## EducationField_Marketing EducationField_Medical EducationField_Other
## 1 0 0 0
## 2 0 0 0
## 3 0 0 1
## 4 0 0 0
## 5 0 1 0
## 6 0 0 0
## EducationField_Technical_Degree Gender_Female Gender_Male
## 1 0 1 0
## 2 0 0 1
## 3 0 0 1
## 4 0 1 0
## 5 0 0 1
## 6 0 0 1
## JobRole_Healthcare_Representative JobRole_Human_Resources
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## JobRole_Laboratory_Technician JobRole_Manager JobRole_Manufacturing_Director
## 1 0 0 0
## 2 0 0 0
## 3 1 0 0
## 4 0 0 0
## 5 1 0 0
## 6 1 0 0
## JobRole_Research_Director JobRole_Research_Scientist JobRole_Sales_Executive
## 1 0 0 1
## 2 0 1 0
## 3 0 0 0
## 4 0 1 0
## 5 0 0 0
## 6 0 0 0
## JobRole_Sales_Representative MaritalStatus_Divorced MaritalStatus_Married
## 1 0 0 0
## 2 0 0 1
## 3 0 0 0
## 4 0 0 1
## 5 0 0 1
## 6 0 0 0
## MaritalStatus_Single
## 1 1
## 2 0
## 3 1
## 4 0
## 5 0
## 6 1
Consolidamos la data para el analisis de clasificacion
#Seleccionamos todas las variables continuas, menos attrition (la usaremos luego para clasificar)
data_hr=sqldf('SELECT Age,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,
HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,
MonthlyRate,NumCompaniesWorked,OverTime,PercentSalaryHike,
PerformanceRating,StockOptionLevel,TotalWorkingYears,
TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,
YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,
BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,
Department_Human_Resources,Department_Research_Development,
EducationField_Human_Resources,EducationField_Life_Sciences,
EducationField_Marketing,EducationField_Medical,EducationField_Other,
Gender_Female,JobRole_Healthcare_Representative,JobRole_Human_Resources,
JobRole_Laboratory_Technician,JobRole_Manager,JobRole_Manufacturing_Director,
JobRole_Research_Director,JobRole_Research_Scientist,JobRole_Sales_Representative,
MaritalStatus_Married,MaritalStatus_Single
FROM emp_att
')
head(data_hr,3)
## Age DailyRate DistanceFromHome Education EnvironmentSatisfaction HourlyRate
## 1 41 1102 10 2 2 48
## 2 49 279 11 1 3 68
## 3 26 1050 5 2 4 96
## JobInvolvement JobLevel JobSatisfaction MonthlyIncome MonthlyRate
## 1 3 2 4 5993 19479
## 2 2 2 2 5130 24907
## 3 2 1 3 2090 2396
## NumCompaniesWorked OverTime PercentSalaryHike PerformanceRating
## 1 8 1 11 3
## 2 1 0 23 4
## 3 6 1 15 3
## StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance
## 1 0 8 0 1
## 2 1 10 3 3
## 3 0 7 3 3
## YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion
## 1 6 4 0
## 2 10 7 1
## 3 0 0 0
## YearsWithCurrManager BusinessTravel_Travel_Frequently
## 1 5 0
## 2 7 1
## 3 0 0
## BusinessTravel_Travel_Rarely Department_Human_Resources
## 1 1 0
## 2 0 0
## 3 1 0
## Department_Research_Development EducationField_Human_Resources
## 1 0 0
## 2 1 0
## 3 1 0
## EducationField_Life_Sciences EducationField_Marketing EducationField_Medical
## 1 1 0 0
## 2 1 0 0
## 3 0 0 0
## EducationField_Other Gender_Female JobRole_Healthcare_Representative
## 1 0 1 0
## 2 0 0 0
## 3 1 0 0
## JobRole_Human_Resources JobRole_Laboratory_Technician JobRole_Manager
## 1 0 0 0
## 2 0 0 0
## 3 0 1 0
## JobRole_Manufacturing_Director JobRole_Research_Director
## 1 0 0
## 2 0 0
## 3 0 0
## JobRole_Research_Scientist JobRole_Sales_Representative MaritalStatus_Married
## 1 0 0 0
## 2 1 0 1
## 3 0 0 0
## MaritalStatus_Single
## 1 1
## 2 0
## 3 1
summary(data_hr)
## Age DailyRate DistanceFromHome Education
## Min. :18.00 Min. : 102.0 Min. : 1.000 Min. :1.000
## 1st Qu.:27.00 1st Qu.: 697.0 1st Qu.: 5.000 1st Qu.:2.000
## Median :30.50 Median : 974.5 Median : 7.000 Median :3.000
## Mean :33.79 Mean : 900.0 Mean : 7.274 Mean :2.913
## 3rd Qu.:39.00 3rd Qu.:1119.8 3rd Qu.: 9.000 3rd Qu.:4.000
## Max. :60.00 Max. :1496.0 Max. :29.000 Max. :5.000
## EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel
## Min. :1.000 Min. : -5.00 Min. :1.00 Min. :1.000
## 1st Qu.:2.000 1st Qu.: 47.00 1st Qu.:2.00 1st Qu.:1.000
## Median :3.000 Median : 69.00 Median :3.00 Median :2.000
## Mean :2.722 Mean : 77.05 Mean :2.73 Mean :2.064
## 3rd Qu.:4.000 3rd Qu.:106.00 3rd Qu.:3.00 3rd Qu.:3.000
## Max. :4.000 Max. :184.00 Max. :4.00 Max. :5.000
## JobSatisfaction MonthlyIncome MonthlyRate NumCompaniesWorked
## Min. :1.000 Min. : 1009 Min. : 2094 Min. :0.000
## 1st Qu.:2.000 1st Qu.: 2911 1st Qu.: 8047 1st Qu.:1.000
## Median :3.000 Median : 4919 Median :14236 Median :2.000
## Mean :2.729 Mean : 6503 Mean :14313 Mean :2.693
## 3rd Qu.:4.000 3rd Qu.: 8379 3rd Qu.:20462 3rd Qu.:4.000
## Max. :4.000 Max. :19999 Max. :26999 Max. :9.000
## OverTime PercentSalaryHike PerformanceRating StockOptionLevel
## Min. :0.000 Min. :11.00 Min. :3.000 Min. :0.0000
## 1st Qu.:0.000 1st Qu.:12.00 1st Qu.:3.000 1st Qu.:0.0000
## Median :0.000 Median :14.00 Median :3.000 Median :1.0000
## Mean :0.283 Mean :15.21 Mean :3.154 Mean :0.7939
## 3rd Qu.:1.000 3rd Qu.:18.00 3rd Qu.:3.000 3rd Qu.:1.0000
## Max. :1.000 Max. :25.00 Max. :4.000 Max. :3.0000
## TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## Min. : 0.00 Min. :0.000 Min. :1.000 Min. : 0.000
## 1st Qu.: 6.00 1st Qu.:2.000 1st Qu.:2.000 1st Qu.: 3.000
## Median :10.00 Median :3.000 Median :3.000 Median : 5.000
## Mean :11.28 Mean :2.799 Mean :2.761 Mean : 7.008
## 3rd Qu.:15.00 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.: 9.000
## Max. :40.00 Max. :6.000 Max. :4.000 Max. :40.000
## YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 2.000 1st Qu.: 0.000 1st Qu.: 2.000
## Median : 3.000 Median : 1.000 Median : 3.000
## Mean : 4.229 Mean : 2.188 Mean : 4.123
## 3rd Qu.: 7.000 3rd Qu.: 3.000 3rd Qu.: 7.000
## Max. :18.000 Max. :15.000 Max. :17.000
## BusinessTravel_Travel_Frequently BusinessTravel_Travel_Rarely
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :1.0000
## Mean :0.1884 Mean :0.7095
## 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000
## Department_Human_Resources Department_Research_Development
## Min. :0.00000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.00000 Median :1.0000
## Mean :0.04286 Mean :0.6537
## 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.0000
## EducationField_Human_Resources EducationField_Life_Sciences
## Min. :0.00000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000
## Mean :0.01837 Mean :0.4122
## 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.0000
## EducationField_Marketing EducationField_Medical EducationField_Other
## Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.0000 Median :0.00000
## Mean :0.1082 Mean :0.3156 Mean :0.05578
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.00000
## Gender_Female JobRole_Healthcare_Representative JobRole_Human_Resources
## Min. :0.0 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.0 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.0 Median :0.00000 Median :0.00000
## Mean :0.4 Mean :0.08912 Mean :0.03537
## 3rd Qu.:1.0 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.0 Max. :1.00000 Max. :1.00000
## JobRole_Laboratory_Technician JobRole_Manager JobRole_Manufacturing_Director
## Min. :0.0000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.0000 Median :0.00000 Median :0.00000
## Mean :0.1762 Mean :0.06939 Mean :0.09864
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.00000 Max. :1.00000
## JobRole_Research_Director JobRole_Research_Scientist
## Min. :0.00000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000
## Mean :0.05442 Mean :0.1986
## 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.0000
## JobRole_Sales_Representative MaritalStatus_Married MaritalStatus_Single
## Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000 Median :0.0000
## Mean :0.05646 Mean :0.4578 Mean :0.3197
## 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.0000 Max. :1.0000
#Posibles Agrupaciones
distMatrix = as.matrix(dist(data_hr))
heatmap(distMatrix)
#Aplicamos PCA
pca.data_hr <- prcomp(data_hr,scale=TRUE)
summary(pca.data_hr)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.3264 1.65469 1.51599 1.43853 1.37953 1.34907 1.33021
## Proportion of Variance 0.1259 0.06367 0.05345 0.04812 0.04426 0.04233 0.04115
## Cumulative Proportion 0.1259 0.18954 0.24299 0.29111 0.33537 0.37770 0.41885
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 1.30128 1.26286 1.14405 1.10643 1.09445 1.07601 1.06268
## Proportion of Variance 0.03938 0.03709 0.03044 0.02847 0.02786 0.02693 0.02626
## Cumulative Proportion 0.45823 0.49531 0.52575 0.55422 0.58208 0.60900 0.63527
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 1.04894 1.01966 1.00288 0.99856 0.98456 0.97833 0.95793
## Proportion of Variance 0.02559 0.02418 0.02339 0.02319 0.02254 0.02226 0.02134
## Cumulative Proportion 0.66085 0.68503 0.70842 0.73161 0.75416 0.77641 0.79776
## PC22 PC23 PC24 PC25 PC26 PC27 PC28
## Standard deviation 0.93791 0.9110 0.90507 0.87834 0.86035 0.85092 0.74455
## Proportion of Variance 0.02046 0.0193 0.01905 0.01794 0.01721 0.01684 0.01289
## Cumulative Proportion 0.81821 0.8375 0.85656 0.87451 0.89172 0.90856 0.92145
## PC29 PC30 PC31 PC32 PC33 PC34 PC35
## Standard deviation 0.71390 0.68924 0.61988 0.56632 0.52346 0.5078 0.47467
## Proportion of Variance 0.01185 0.01105 0.00894 0.00746 0.00637 0.0060 0.00524
## Cumulative Proportion 0.93330 0.94435 0.95329 0.96074 0.96712 0.9731 0.97835
## PC36 PC37 PC38 PC39 PC40 PC41 PC42
## Standard deviation 0.4635 0.44607 0.38566 0.35510 0.3346 0.28047 0.18629
## Proportion of Variance 0.0050 0.00463 0.00346 0.00293 0.0026 0.00183 0.00081
## Cumulative Proportion 0.9833 0.98798 0.99144 0.99437 0.9970 0.99880 0.99961
## PC43
## Standard deviation 0.12985
## Proportion of Variance 0.00039
## Cumulative Proportion 1.00000
res= PCA(X = data_hr, scale.unit = TRUE, ncp = 5, graph = FALSE)
print(res)
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 1470 individuals, described by 43 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
#Varianza explicada por componente
fviz_screeplot(res, addlabels = TRUE, ylim = c(0, 100))
# Color by the contributions x individuos
fviz_pca_ind(res, col.ind="contrib") +
scale_color_gradient2(low="white", mid="blue",
high="red", midpoint=0.5)
# Color by the contributions x variables
fviz_pca_var(res, col.var="contrib")+
scale_color_gradient2(low="white", mid="blue",
high="red", midpoint=20)
#Combinando
fviz_pca_biplot(res)+
xlim(-3, 3) + ylim (-3, 3)
## Warning: Removed 286 rows containing missing values (geom_point).
## Warning: Removed 286 rows containing missing values (geom_text).
## Warning: Removed 13 rows containing missing values (geom_text).
## Warning: Removed 13 rows containing missing values (geom_segment).
#Aplicamos metodo jerarquico
dendrogram <- hclust(dist(data_hr, method = 'euclidean'), method = 'ward.D')
ggdendrogram(dendrogram, rotate = FALSE, labels = TRUE, theme_dendro = TRUE) +
labs(title = "Dendrograma")
#Obteniendo los k-means
#Calculando los k-means ideales, metodo del codo
set.seed(1234)
wcss <- vector()
for(i in 1:20){
wcss[i] <- sum(kmeans(data_hr, i)$withinss)
}
ggplot() + geom_point(aes(x = 1:20, y = wcss), color = 'blue') +
geom_line(aes(x = 1:20, y = wcss), color = 'blue') +
ggtitle("Metodo del Codo") +
xlab('Cantidad de Centroides k') +
ylab('WCSS')
# Iteracion k-means with k = 2
set.seed(740)
km.res <- kmeans(data_hr, 2, nstart = 25)
#Calcular las medias de cada grupo
aggregate(data_hr, by=list(cluster=km.res$cluster), mean)
## cluster Age DailyRate DistanceFromHome Education EnvironmentSatisfaction
## 1 1 30.42493 1002.6836 6.214477 2.942359 2.675603
## 2 2 37.26519 794.1533 8.366022 2.882597 2.769337
## HourlyRate JobInvolvement JobLevel JobSatisfaction MonthlyIncome MonthlyRate
## 1 104.21314 2.735925 2.012064 2.723861 6282.029 8217.247
## 2 49.06215 2.723757 2.117403 2.733425 6730.546 20594.193
## NumCompaniesWorked OverTime PercentSalaryHike PerformanceRating
## 1 2.691689 0.2707775 15.33646 3.167560
## 2 2.694751 0.2955801 15.07873 3.139503
## StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance
## 1 0.8203753 11.01340 2.764075 2.769437
## 2 0.7665746 11.55387 2.835635 2.752762
## YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion
## 1 7.040214 4.313673 2.151475
## 2 6.975138 4.142265 2.225138
## YearsWithCurrManager BusinessTravel_Travel_Frequently
## 1 4.254692 0.1916890
## 2 3.987569 0.1850829
## BusinessTravel_Travel_Rarely Department_Human_Resources
## 1 0.7104558 0.04691689
## 2 0.7085635 0.03867403
## Department_Research_Development EducationField_Human_Resources
## 1 0.6528150 0.01742627
## 2 0.6546961 0.01933702
## EducationField_Life_Sciences EducationField_Marketing EducationField_Medical
## 1 0.3954424 0.1112601 0.3203753
## 2 0.4295580 0.1049724 0.3107735
## EducationField_Other Gender_Female JobRole_Healthcare_Representative
## 1 0.06568365 0.3806971 0.08847185
## 2 0.04558011 0.4198895 0.08977901
## JobRole_Human_Resources JobRole_Laboratory_Technician JobRole_Manager
## 1 0.04021448 0.1863271 0.05898123
## 2 0.03038674 0.1657459 0.08011050
## JobRole_Manufacturing_Director JobRole_Research_Director
## 1 0.09651475 0.04557641
## 2 0.10082873 0.06353591
## JobRole_Research_Scientist JobRole_Sales_Representative MaritalStatus_Married
## 1 0.2064343 0.05495979 0.4758713
## 2 0.1906077 0.05801105 0.4392265
## MaritalStatus_Single
## 1 0.3016086
## 2 0.3383978
#graficar agrupaciones
fviz_cluster(object = km.res, data = data_hr, show.clust.cent = TRUE,
ellipse.type = "norm", star.plot = TRUE, repel = TRUE,
pointsize=0.5,outlier.color="darkred") +
labs(title = "Resultados clustering K-means") +
theme_bw() + theme(legend.position = "none")
## Warning: ggrepel: 1438 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
#agregar el cluster a la data original
data_final= cbind(data_hr, cluster = km.res$cluster,attrition=emp_att$Attrition)
head(data_final,10)
## Age DailyRate DistanceFromHome Education EnvironmentSatisfaction HourlyRate
## 1 41 1102 10 2 2 48
## 2 49 279 11 1 3 68
## 3 26 1050 5 2 4 96
## 4 33 1392 10 4 4 33
## 5 27 591 9 1 1 35
## 6 28 1003 5 2 4 144
## 7 29 1086 5 3 3 147
## 8 26 1058 5 1 4 128
## 9 27 944 7 3 4 75
## 10 36 1299 9 3 3 45
## JobInvolvement JobLevel JobSatisfaction MonthlyIncome MonthlyRate
## 1 3 2 4 5993 19479
## 2 2 2 2 5130 24907
## 3 2 1 3 2090 2396
## 4 3 1 3 2909 23159
## 5 3 1 2 3468 16632
## 6 3 1 4 3068 11864
## 7 4 1 1 2670 9964
## 8 3 1 3 2693 13335
## 9 2 3 3 9526 8787
## 10 3 2 3 5237 16577
## NumCompaniesWorked OverTime PercentSalaryHike PerformanceRating
## 1 8 1 11 3
## 2 1 0 23 4
## 3 6 1 15 3
## 4 1 1 11 3
## 5 9 0 12 3
## 6 0 0 13 3
## 7 4 1 20 4
## 8 1 0 22 4
## 9 0 0 21 4
## 10 6 0 13 3
## StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance
## 1 0 8 0 1
## 2 1 10 3 3
## 3 0 7 3 3
## 4 0 8 3 3
## 5 1 6 3 3
## 6 0 8 2 2
## 7 3 12 3 2
## 8 1 1 2 3
## 9 0 10 2 3
## 10 2 17 3 2
## YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion
## 1 6 4 0
## 2 10 7 1
## 3 0 0 0
## 4 8 7 3
## 5 2 2 2
## 6 7 7 3
## 7 1 0 0
## 8 1 0 0
## 9 9 7 1
## 10 7 7 7
## YearsWithCurrManager BusinessTravel_Travel_Frequently
## 1 5 0
## 2 7 1
## 3 0 0
## 4 0 1
## 5 2 0
## 6 6 1
## 7 0 0
## 8 0 0
## 9 8 1
## 10 7 0
## BusinessTravel_Travel_Rarely Department_Human_Resources
## 1 1 0
## 2 0 0
## 3 1 0
## 4 0 0
## 5 1 0
## 6 0 0
## 7 1 0
## 8 1 0
## 9 0 0
## 10 1 0
## Department_Research_Development EducationField_Human_Resources
## 1 0 0
## 2 1 0
## 3 1 0
## 4 1 0
## 5 1 0
## 6 1 0
## 7 1 0
## 8 1 0
## 9 1 0
## 10 1 0
## EducationField_Life_Sciences EducationField_Marketing EducationField_Medical
## 1 1 0 0
## 2 1 0 0
## 3 0 0 0
## 4 1 0 0
## 5 0 0 1
## 6 1 0 0
## 7 0 0 1
## 8 1 0 0
## 9 1 0 0
## 10 0 0 1
## EducationField_Other Gender_Female JobRole_Healthcare_Representative
## 1 0 1 0
## 2 0 0 0
## 3 1 0 0
## 4 0 1 0
## 5 0 0 0
## 6 0 0 0
## 7 0 1 0
## 8 0 0 0
## 9 0 0 0
## 10 0 0 1
## JobRole_Human_Resources JobRole_Laboratory_Technician JobRole_Manager
## 1 0 0 0
## 2 0 0 0
## 3 0 1 0
## 4 0 0 0
## 5 0 1 0
## 6 0 1 0
## 7 0 1 0
## 8 0 1 0
## 9 0 0 0
## 10 0 0 0
## JobRole_Manufacturing_Director JobRole_Research_Director
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## 7 0 0
## 8 0 0
## 9 1 0
## 10 0 0
## JobRole_Research_Scientist JobRole_Sales_Representative
## 1 0 0
## 2 1 0
## 3 0 0
## 4 1 0
## 5 0 0
## 6 0 0
## 7 0 0
## 8 0 0
## 9 0 0
## 10 0 0
## MaritalStatus_Married MaritalStatus_Single cluster attrition
## 1 0 1 2 1
## 2 1 0 2 0
## 3 0 1 1 1
## 4 1 0 2 0
## 5 1 0 2 0
## 6 0 1 1 0
## 7 1 0 1 0
## 8 0 0 1 0
## 9 0 1 1 0
## 10 1 0 2 0
#Bonus track
write.csv(data_final,"G:/TRABAJO/DOCENCIA/KONRAND LORENZ/HR_datasets/data_hrx2.csv")