library(readxl)
library(plyr)
library(knitr)
library(e1071)
library(naivebayes)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(car)
## Loading required package: carData
library(rpart)
library(rpart.plot)
Tabla<- read.csv("Classification.csv")
head(Tabla)
## ï..Age Attrition BusinessTravel DailyRate Department
## 1 41 Yes Travel_Rarely 1102 Sales
## 2 49 No Travel_Frequently 279 Research & Development
## 3 37 Yes Travel_Rarely 1373 Research & Development
## 4 33 No Travel_Frequently 1392 Research & Development
## 5 27 No Travel_Rarely 591 Research & Development
## 6 32 No Travel_Frequently 1005 Research & Development
## DistanceFromHome Education EducationField EmployeeCount EmployeeNumber
## 1 1 2 Life Sciences 1 1
## 2 8 1 Life Sciences 1 2
## 3 2 2 Other 1 4
## 4 3 4 Life Sciences 1 5
## 5 2 1 Medical 1 7
## 6 2 2 Life Sciences 1 8
## EnvironmentSatisfaction Gender HourlyRate JobInvolvement JobLevel
## 1 2 Female 94 3 2
## 2 3 Male 61 2 2
## 3 4 Male 92 2 1
## 4 4 Female 56 3 1
## 5 1 Male 40 3 1
## 6 4 Male 79 3 1
## JobRole JobSatisfaction MaritalStatus MonthlyIncome
## 1 Sales Executive 4 Single 5993
## 2 Research Scientist 2 Married 5130
## 3 Laboratory Technician 3 Single 2090
## 4 Research Scientist 3 Married 2909
## 5 Laboratory Technician 2 Married 3468
## 6 Laboratory Technician 4 Single 3068
## MonthlyRate NumCompaniesWorked OverTime PercentSalaryHike
## 1 19479 8 Yes 11
## 2 24907 1 No 23
## 3 2396 6 Yes 15
## 4 23159 1 Yes 11
## 5 16632 9 No 12
## 6 11864 0 No 13
## PerformanceRating RelationshipSatisfaction StockOptionLevel
## 1 3 1 0
## 2 4 4 1
## 3 3 2 0
## 4 3 3 0
## 5 3 4 1
## 6 3 3 0
## TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## 1 8 0 1 6
## 2 10 3 3 10
## 3 7 3 3 0
## 4 8 3 3 8
## 5 6 3 3 2
## 6 8 2 2 7
## YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## 1 4 0 5
## 2 7 1 7
## 3 0 0 0
## 4 7 3 0
## 5 2 2 2
## 6 7 3 6
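# Note: the "ï.." prefix on the Age column is a byte-order-mark (BOM) artifact of read.csv with
# the default encoding. A minimal fix (sketch, assuming the file is UTF-8 with a BOM) is shown
# below; it is left commented out because the rest of this script refers to the ï..Age name.
# Tabla <- read.csv("Classification.csv", fileEncoding = "UTF-8-BOM")  # column would then be named "Age"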
class(Tabla$ï..Age)
## [1] "integer"
summary(Tabla$ï..Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 30.00 36.00 36.92 43.00 60.00
class(Tabla$Attrition)
## [1] "factor"
summary(Tabla$Attrition)
## No Yes
## 1233 237
class(Tabla$BusinessTravel)
## [1] "factor"
summary(Tabla$BusinessTravel)
## Non-Travel Travel_Frequently Travel_Rarely
## 150 277 1043
class(Tabla$DailyRate)
## [1] "integer"
summary(Tabla$DailyRate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 102.0 465.0 802.0 802.5 1157.0 1499.0
class(Tabla$Department)
## [1] "factor"
summary(Tabla$Department)
## Human Resources Research & Development Sales
## 63 961 446
class(Tabla$DistanceFromHome)
## [1] "integer"
summary(Tabla$DistanceFromHome)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 7.000 9.193 14.000 29.000
class(Tabla$Education)
## [1] "integer"
summary(Tabla$Education)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 2.913 4.000 5.000
class(Tabla$EducationField)
## [1] "factor"
summary(Tabla$EducationField)
## Human Resources Life Sciences Marketing Medical
## 27 606 159 464
## Other Technical Degree
## 82 132
class(Tabla$EmployeeCount)
## [1] "integer"
summary(Tabla$EmployeeCount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 1 1 1 1 1
class(Tabla$EmployeeNumber)
## [1] "integer"
summary(Tabla$EmployeeNumber)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 491.2 1020.5 1024.9 1555.8 2068.0
class(Tabla$EnvironmentSatisfaction)
## [1] "integer"
summary(Tabla$EnvironmentSatisfaction)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 2.722 4.000 4.000
class(Tabla$Gender)
## [1] "factor"
summary(Tabla$Gender)
## Female Male
## 588 882
class(Tabla$HourlyRate)
## [1] "integer"
summary(Tabla$HourlyRate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 30.00 48.00 66.00 65.89 83.75 100.00
class(Tabla$JobInvolvement)
## [1] "integer"
summary(Tabla$JobInvolvement)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 2.00 3.00 2.73 3.00 4.00
class(Tabla$JobLevel)
## [1] "integer"
summary(Tabla$JobLevel)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 2.064 3.000 5.000
class(Tabla$JobRole)
## [1] "factor"
summary(Tabla$JobRole)
## Healthcare Representative Human Resources
## 131 52
## Laboratory Technician Manager
## 259 102
## Manufacturing Director Research Director
## 145 80
## Research Scientist Sales Executive
## 292 326
## Sales Representative
## 83
class(Tabla$JobSatisfaction)
## [1] "integer"
summary(Tabla$JobSatisfaction)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 2.729 4.000 4.000
class(Tabla$MaritalStatus)
## [1] "factor"
summary(Tabla$MaritalStatus)
## Divorced Married Single
## 327 673 470
class(Tabla$MonthlyIncome)
## [1] "integer"
summary(Tabla$MonthlyIncome)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1009 2911 4919 6503 8379 19999
class(Tabla$MonthlyRate)
## [1] "integer"
summary(Tabla$MonthlyRate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2094 8047 14236 14313 20462 26999
class(Tabla$NumCompaniesWorked)
## [1] "integer"
summary(Tabla$NumCompaniesWorked)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 2.000 2.693 4.000 9.000
class(Tabla$OverTime)
## [1] "factor"
summary(Tabla$OverTime)
## No Yes
## 1054 416
class(Tabla$PercentSalaryHike)
## [1] "integer"
summary(Tabla$PercentSalaryHike)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 11.00 12.00 14.00 15.21 18.00 25.00
class(Tabla$PerformanceRating)
## [1] "integer"
summary(Tabla$PerformanceRating)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 3.000 3.000 3.154 3.000 4.000
class(Tabla$RelationshipSatisfaction)
## [1] "integer"
summary(Tabla$RelationshipSatisfaction)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 2.712 4.000 4.000
class(Tabla$StockOptionLevel)
## [1] "integer"
summary(Tabla$StockOptionLevel)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 1.0000 0.7939 1.0000 3.0000
class(Tabla$TotalWorkingYears)
## [1] "integer"
summary(Tabla$TotalWorkingYears)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 6.00 10.00 11.28 15.00 40.00
class(Tabla$TrainingTimesLastYear)
## [1] "integer"
summary(Tabla$TrainingTimesLastYear)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 3.000 2.799 3.000 6.000
class(Tabla$WorkLifeBalance)
## [1] "integer"
summary(Tabla$WorkLifeBalance)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 2.761 3.000 4.000
class(Tabla$YearsAtCompany)
## [1] "integer"
summary(Tabla$YearsAtCompany)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 3.000 5.000 7.008 9.000 40.000
class(Tabla$YearsInCurrentRole)
## [1] "integer"
summary(Tabla$YearsInCurrentRole)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 3.000 4.229 7.000 18.000
class(Tabla$YearsSinceLastPromotion)
## [1] "integer"
summary(Tabla$YearsSinceLastPromotion)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 2.188 3.000 15.000
class(Tabla$YearsWithCurrManager)
## [1] "integer"
summary(Tabla$YearsWithCurrManager)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 3.000 4.123 7.000 17.000
# The EmployeeCount column has the same value for every row, so it can be considered uninformative and removed to reduce the dimensionality of the dataset; the value also does not match what the column name suggests, so it may be a corrupted variable.
summary(Tabla$EmployeeCount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 1 1 1 1 1
hist(Tabla$EmployeeCount)
boxplot(Tabla$EmployeeCount, horizontal = T)
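# As a sanity check, constant (zero-variance) columns can also be detected programmatically;
# a minimal sketch ("constantes" is just an illustrative name):
constantes <- names(Tabla)[sapply(Tabla, function(x) length(unique(x)) == 1)]
constantes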
# The YearsAtCompany column shows outliers between roughly 20 and 40 years, as the following plots show.
summary(Tabla$YearsAtCompany)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 3.000 5.000 7.008 9.000 40.000
hist(Tabla$YearsAtCompany)
boxplot(Tabla$YearsAtCompany, horizontal = T)
# The YearsInCurrentRole column shows outliers above about 15 years, as the following plots show.
summary(Tabla$YearsInCurrentRole)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 3.000 4.229 7.000 18.000
hist(Tabla$YearsInCurrentRole)
boxplot(Tabla$YearsInCurrentRole, horizontal = T)
# The YearsSinceLastPromotion column has a heavily skewed distribution, with outliers starting at roughly 7 years.
summary(Tabla$YearsSinceLastPromotion)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 2.188 3.000 15.000
hist(Tabla$YearsSinceLastPromotion)
boxplot(Tabla$YearsSinceLastPromotion, horizontal = T)
# The YearsWithCurrManager column shows outliers above about 15 years, as the following plots show.
summary(Tabla$YearsWithCurrManager)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 3.000 4.123 7.000 17.000
hist(Tabla$YearsWithCurrManager)
boxplot(Tabla$YearsWithCurrManager, horizontal = T)
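# The outlier cutoffs described above correspond to the usual 1.5*IQR boxplot rule; a minimal
# sketch to compute the upper whisker threshold explicitly ("umbral" is an illustrative helper):
umbral <- function(x) quantile(x, 0.75) + 1.5 * IQR(x)
umbral(Tabla$YearsAtCompany)
umbral(Tabla$YearsWithCurrManager)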
set.seed(600)
# Create a working copy of the table for the modifications below
tabla2<-Tabla
# Drop the EmployeeCount column
tabla2$EmployeeCount<- NULL
# Remove the observations where YearsAtCompany is greater than 18
tabla3<- tabla2[tabla2$YearsAtCompany<= 18,]
hist(tabla3$YearsAtCompany)
boxplot(tabla3$YearsAtCompany, horizontal = T)
# Remove the observations where YearsInCurrentRole is greater than 14
tabla4<- tabla3[tabla3$YearsInCurrentRole<= 14,]
hist(tabla4$YearsInCurrentRole)
boxplot(tabla4$YearsInCurrentRole, horizontal = T)
# To balance the dataset, remove about 95% of the observations where YearsSinceLastPromotion < 2
sobra<- tabla4[tabla4$YearsSinceLastPromotion== 0,]
ind <- sample(2,nrow(sobra), replace = TRUE, prob = c(0.95, 0.05) )
sobra<- sobra[ind== 2,]
tabla5<- tabla4[tabla4$YearsSinceLastPromotion> 1,]
sobra2<- tabla4[tabla4$YearsSinceLastPromotion== 1,]
ind <- sample(2,nrow(sobra2), replace = TRUE, prob = c(0.95, 0.05) )
sobra2<- sobra2[ind== 2,]
tabla5<- rbind(tabla5, sobra)
tabla5<- rbind(tabla5, sobra2)
hist(tabla5$YearsSinceLastPromotion)
boxplot(tabla5$YearsSinceLastPromotion, horizontal = T)
# To balance the dataset, remove about 70% of the observations where YearsWithCurrManager < 2
sobra<- tabla5[tabla5$YearsWithCurrManager== 0,]
ind <- sample(2,nrow(sobra), replace = TRUE, prob = c(0.7, 0.3) )
sobra<- sobra[ind== 2,]
tabla6<- tabla5[tabla5$YearsWithCurrManager> 1,]
sobra2<- tabla5[tabla5$YearsWithCurrManager== 1,]
ind <- sample(2,nrow(sobra2), replace = TRUE, prob = c(0.7, 0.3) )
sobra2<- sobra2[ind== 2,]
tabla6<- rbind(tabla6, sobra)
tabla6<- rbind(tabla6, sobra2)
hist(tabla6$YearsWithCurrManager)
boxplot(tabla6$YearsWithCurrManager, horizontal = T)
# Convert the dependent variable to a factor
tabla6$JobSatisfaction<- as.factor(tabla6$JobSatisfaction)
tabla6$MaritalStatus<- NULL
summary(tabla6$JobSatisfaction)
## 1 2 3 4
## 82 87 127 153
# Create training and test sets from the balanced dataset (tabla6)
ind <- sample(2,nrow(tabla6), replace = TRUE, prob = c(0.8, 0.2) )
entrenamiento<- tabla6[ind==1,]
test<- tabla6[ind==2,]
# Create training and test sets from the unbalanced dataset (tabla2)
tabla2$JobSatisfaction<- as.factor(tabla2$JobSatisfaction)
tabla2$MaritalStatus<- NULL
ind <- sample(2,nrow(tabla2), replace = TRUE, prob = c(0.8, 0.2) )
entrenamiento2<- tabla2[ind==1,]
test2<- tabla2[ind==2,]
# Create the over-balanced training set (60 observations per class)
e1<- entrenamiento[entrenamiento$JobSatisfaction==1,]
e2<- entrenamiento[entrenamiento$JobSatisfaction==2,]
e3<- entrenamiento[entrenamiento$JobSatisfaction==3,]
e4<- entrenamiento[entrenamiento$JobSatisfaction==4,]
entrenamiento3<- rbind(e1[1:60, ], e2[1:60, ])
entrenamiento3<- rbind(entrenamiento3, e3[1:60, ])
entrenamiento3<- rbind(entrenamiento3, e4[1:60, ])
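# Alternative (sketch): draw the 60 rows per class at random instead of taking the first 60, to
# avoid any ordering bias; assumes each class has at least 60 training rows and uses the
# illustrative name entrenamiento3_alt so the results reported below are unchanged
entrenamiento3_alt <- do.call(rbind, lapply(list(e1, e2, e3, e4),
                                            function(d) d[sample(nrow(d), 60), ]))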
# Balanced model
plot(entrenamiento$JobSatisfaction)
modelo1 <- naive_bayes(JobSatisfaction ~ ., data = entrenamiento)
# Unbalanced model
plot(entrenamiento2$JobSatisfaction)
modelo2 <- naive_bayes(JobSatisfaction ~ ., data = entrenamiento2)
# Over-balanced model
plot(entrenamiento3$JobSatisfaction)
modelo3 <- naive_bayes(JobSatisfaction ~ ., data = entrenamiento3)
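# Besides hard class labels, the fitted models can return posterior class probabilities; a
# minimal sketch using modelo1 and the test set defined above (naivebayes supports type = "prob"):
head(predict(modelo1, test, type = "prob"))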
# Prediction with the balanced model
pred<- predict(modelo1 , test)
tab <- table(test$JobSatisfaction, pred, dnn = c("Actual", "Predicha"))
confusionMatrix(tab)
## Confusion Matrix and Statistics
##
## Predicha
## Actual 1 2 3 4
## 1 3 4 1 6
## 2 1 1 3 10
## 3 5 6 2 19
## 4 3 10 7 17
##
## Overall Statistics
##
## Accuracy : 0.2347
## 95% CI : (0.155, 0.3311)
## No Information Rate : 0.5306
## P-Value [Acc > NIR] : 1.00000
##
## Kappa : -0.0839
##
## Mcnemar's Test P-Value : 0.06185
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 0.25000 0.04762 0.15385 0.3269
## Specificity 0.87209 0.81818 0.64706 0.5652
## Pos Pred Value 0.21429 0.06667 0.06250 0.4595
## Neg Pred Value 0.89286 0.75904 0.83333 0.4262
## Prevalence 0.12245 0.21429 0.13265 0.5306
## Detection Rate 0.03061 0.01020 0.02041 0.1735
## Detection Prevalence 0.14286 0.15306 0.32653 0.3776
## Balanced Accuracy 0.56105 0.43290 0.40045 0.4461
# Prediction with the unbalanced model
pred2<- predict(modelo2 , test2)
tab2 <- table(test2$JobSatisfaction, pred2, dnn = c("Actual", "Predicha"))
confusionMatrix(tab2)
## Confusion Matrix and Statistics
##
## Predicha
## Actual 1 2 3 4
## 1 3 9 22 23
## 2 2 8 23 21
## 3 8 9 33 37
## 4 7 12 37 41
##
## Overall Statistics
##
## Accuracy : 0.2881
## 95% CI : (0.2371, 0.3435)
## No Information Rate : 0.4136
## P-Value [Acc > NIR] : 1
##
## Kappa : 7e-04
##
## Mcnemar's Test P-Value : 8.994e-05
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 0.15000 0.21053 0.2870 0.3361
## Specificity 0.80364 0.82101 0.7000 0.6763
## Pos Pred Value 0.05263 0.14815 0.3793 0.4227
## Neg Pred Value 0.92857 0.87552 0.6058 0.5909
## Prevalence 0.06780 0.12881 0.3898 0.4136
## Detection Rate 0.01017 0.02712 0.1119 0.1390
## Detection Prevalence 0.19322 0.18305 0.2949 0.3288
## Balanced Accuracy 0.47682 0.51577 0.4935 0.5062
# Prediction with the over-balanced model
pred3<- predict(modelo3 , test2)
tab3 <- table(test2$JobSatisfaction, pred3, dnn = c("Actual", "Predicha"))
confusionMatrix(tab3)
## Confusion Matrix and Statistics
##
## Predicha
## Actual 1 2 3 4
## 1 18 13 19 7
## 2 10 19 17 8
## 3 14 25 34 14
## 4 25 22 31 19
##
## Overall Statistics
##
## Accuracy : 0.3051
## 95% CI : (0.253, 0.3611)
## No Information Rate : 0.3424
## P-Value [Acc > NIR] : 0.9219213
##
## Kappa : 0.0767
##
## Mcnemar's Test P-Value : 0.0002475
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 0.26866 0.24051 0.3366 0.39583
## Specificity 0.82895 0.83796 0.7268 0.68421
## Pos Pred Value 0.31579 0.35185 0.3908 0.19588
## Neg Pred Value 0.79412 0.75104 0.6779 0.85354
## Prevalence 0.22712 0.26780 0.3424 0.16271
## Detection Rate 0.06102 0.06441 0.1153 0.06441
## Detection Prevalence 0.19322 0.18305 0.2949 0.32881
## Balanced Accuracy 0.54880 0.53923 0.5317 0.54002
# Build the classification tree for the balanced model
arbol<- rpart(JobSatisfaction ~ ., data = entrenamiento, method = "class")
# Display the tree in two different ways
print(arbol)
## n= 351
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 351 235 4 (0.19373219 0.20512821 0.27065527 0.33048433)
## 2) DistanceFromHome>=2.5 246 169 3 (0.22357724 0.19918699 0.31300813 0.26422764)
## 4) DailyRate>=272 219 149 3 (0.24657534 0.16894977 0.31963470 0.26484018)
## 8) JobRole=Healthcare Representative,Laboratory Technician,Manufacturing Director,Research Scientist,Sales Executive 173 122 4 (0.27745665 0.13872832 0.28901734 0.29479769)
## 16) TotalWorkingYears>=5.5 147 99 4 (0.30612245 0.12244898 0.24489796 0.32653061)
## 32) HourlyRate>=64.5 75 45 1 (0.40000000 0.12000000 0.25333333 0.22666667)
## 64) MonthlyRate< 4751 8 1 1 (0.87500000 0.00000000 0.12500000 0.00000000) *
## 65) MonthlyRate>=4751 67 44 1 (0.34328358 0.13432836 0.26865672 0.25373134)
## 130) TrainingTimesLastYear< 3.5 56 34 1 (0.39285714 0.16071429 0.26785714 0.17857143)
## 260) YearsAtCompany< 7.5 17 5 1 (0.70588235 0.05882353 0.23529412 0.00000000) *
## 261) YearsAtCompany>=7.5 39 28 3 (0.25641026 0.20512821 0.28205128 0.25641026)
## 522) BusinessTravel=Travel_Rarely 29 19 1 (0.34482759 0.27586207 0.17241379 0.20689655)
## 1044) DailyRate>=818.5 16 8 1 (0.50000000 0.37500000 0.06250000 0.06250000) *
## 1045) DailyRate< 818.5 13 8 4 (0.15384615 0.15384615 0.30769231 0.38461538) *
## 523) BusinessTravel=Non-Travel,Travel_Frequently 10 4 3 (0.00000000 0.00000000 0.60000000 0.40000000) *
## 131) TrainingTimesLastYear>=3.5 11 4 4 (0.09090909 0.00000000 0.27272727 0.63636364) *
## 33) HourlyRate< 64.5 72 41 4 (0.20833333 0.12500000 0.23611111 0.43055556)
## 66) JobRole=Healthcare Representative,Manufacturing Director,Research Scientist 32 21 3 (0.28125000 0.12500000 0.34375000 0.25000000)
## 132) MonthlyRate< 14392.5 23 12 3 (0.30434783 0.08695652 0.47826087 0.13043478) *
## 133) MonthlyRate>=14392.5 9 4 4 (0.22222222 0.22222222 0.00000000 0.55555556) *
## 67) JobRole=Laboratory Technician,Sales Executive 40 17 4 (0.15000000 0.12500000 0.15000000 0.57500000) *
## 17) TotalWorkingYears< 5.5 26 12 3 (0.11538462 0.23076923 0.53846154 0.11538462) *
## 9) JobRole=Human Resources,Manager,Research Director,Sales Representative 46 26 3 (0.13043478 0.28260870 0.43478261 0.15217391)
## 18) EducationField=Life Sciences,Marketing,Other 26 14 2 (0.07692308 0.46153846 0.26923077 0.19230769)
## 36) DailyRate< 1166.5 17 8 2 (0.11764706 0.52941176 0.05882353 0.29411765) *
## 37) DailyRate>=1166.5 9 3 3 (0.00000000 0.33333333 0.66666667 0.00000000) *
## 19) EducationField=Medical,Technical Degree 20 7 3 (0.20000000 0.05000000 0.65000000 0.10000000) *
## 5) DailyRate< 272 27 15 2 (0.03703704 0.44444444 0.25925926 0.25925926) *
## 3) DistanceFromHome< 2.5 105 54 4 (0.12380952 0.21904762 0.17142857 0.48571429)
## 6) JobRole=Manager,Manufacturing Director,Research Director,Sales Executive,Sales Representative 54 37 2 (0.16666667 0.31481481 0.20370370 0.31481481)
## 12) PercentSalaryHike< 12.5 15 5 2 (0.06666667 0.66666667 0.20000000 0.06666667) *
## 13) PercentSalaryHike>=12.5 39 23 4 (0.20512821 0.17948718 0.20512821 0.41025641) *
## 7) JobRole=Healthcare Representative,Human Resources,Laboratory Technician,Research Scientist 51 17 4 (0.07843137 0.11764706 0.13725490 0.66666667) *
rpart.plot(arbol, extra = 4)
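# rpart also stores an importance score per predictor; a minimal sketch to inspect it for this tree:
arbol$variable.importance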
# Show information about the fitted model
printcp(arbol)
##
## Classification tree:
## rpart(formula = JobSatisfaction ~ ., data = entrenamiento, method = "class")
##
## Variables actually used in tree construction:
## [1] BusinessTravel DailyRate DistanceFromHome
## [4] EducationField HourlyRate JobRole
## [7] MonthlyRate PercentSalaryHike TotalWorkingYears
## [10] TrainingTimesLastYear YearsAtCompany
##
## Root node error: 235/351 = 0.66952
##
## n= 351
##
## CP nsplit rel error xerror xstd
## 1 0.051064 0 1.00000 1.00000 0.037501
## 2 0.031915 1 0.94894 1.05106 0.036404
## 3 0.021277 5 0.82128 1.06809 0.035984
## 4 0.019149 6 0.80000 1.04255 0.036603
## 5 0.017021 8 0.76170 1.03404 0.036795
## 6 0.012766 10 0.72766 1.02979 0.036889
## 7 0.010000 16 0.65106 0.99149 0.037661
plotcp(arbol)
# The cross-validated error (xerror) is lowest at cp = 0.010000, so the tree will be pruned at this value
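# Programmatic alternative (sketch): pick the cp with the lowest cross-validated error from the
# CP table; the same idea applies to arbol2 and arbol3 below ("cp_opt" is an illustrative name)
cp_opt <- arbol$cptable[which.min(arbol$cptable[, "xerror"]), "CP"]
cp_opt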
# Build the classification tree for the unbalanced model
arbol2<- rpart(JobSatisfaction ~ ., data = entrenamiento2, method = "class")
# Display the tree in two different ways
print(arbol2)
## n= 1175
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 1175 813 4 (0.1974468 0.1923404 0.3021277 0.3080851)
## 2) Attrition=Yes 184 127 3 (0.2826087 0.2173913 0.3097826 0.1902174) *
## 3) Attrition=No 991 664 4 (0.1816347 0.1876892 0.3007064 0.3299697)
## 6) NumCompaniesWorked>=4.5 200 144 3 (0.2100000 0.2650000 0.2800000 0.2450000)
## 12) JobRole=Human Resources,Laboratory Technician 34 15 2 (0.1176471 0.5588235 0.2058824 0.1176471) *
## 13) JobRole=Healthcare Representative,Manager,Manufacturing Director,Research Director,Research Scientist,Sales Executive,Sales Representative 166 117 3 (0.2289157 0.2048193 0.2951807 0.2710843) *
## 7) NumCompaniesWorked< 4.5 791 513 4 (0.1744627 0.1681416 0.3059418 0.3514539)
## 14) MonthlyRate>=3020.5 761 492 4 (0.1773982 0.1747700 0.2943495 0.3534823)
## 28) MonthlyRate>=24566.5 58 32 3 (0.2068966 0.1724138 0.4482759 0.1724138) *
## 29) MonthlyRate< 24566.5 703 444 4 (0.1749644 0.1749644 0.2816501 0.3684211) *
## 15) MonthlyRate< 3020.5 30 12 3 (0.1000000 0.0000000 0.6000000 0.3000000) *
rpart.plot(arbol2, extra = 4)
# Show information about the fitted model
printcp(arbol2)
##
## Classification tree:
## rpart(formula = JobSatisfaction ~ ., data = entrenamiento2, method = "class")
##
## Variables actually used in tree construction:
## [1] Attrition JobRole MonthlyRate
## [4] NumCompaniesWorked
##
## Root node error: 813/1175 = 0.69191
##
## n= 1175
##
## CP nsplit rel error xerror xstd
## 1 0.02706 0 1.00000 1.01230 0.019314
## 2 0.01353 1 0.97294 0.98770 0.019612
## 3 0.01000 5 0.91882 0.99016 0.019583
plotcp(arbol2)
# The cross-validated error (xerror) is lowest at cp = 0.01353, so the tree will be pruned at this value
# Build the classification tree for the over-balanced model
arbol3<- rpart(JobSatisfaction ~ ., data = entrenamiento3, method = "class")
# Display the tree in two different ways
print(arbol3)
## n= 240
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 240 180 1 (0.25000000 0.25000000 0.25000000 0.25000000)
## 2) EmployeeNumber>=1390 37 18 1 (0.51351351 0.45945946 0.02702703 0.00000000)
## 4) StockOptionLevel>=0.5 25 8 1 (0.68000000 0.32000000 0.00000000 0.00000000) *
## 5) StockOptionLevel< 0.5 12 3 2 (0.16666667 0.75000000 0.08333333 0.00000000) *
## 3) EmployeeNumber< 1390 203 143 4 (0.20197044 0.21182266 0.29064039 0.29556650)
## 6) EmployeeNumber>=1261.5 17 8 3 (0.17647059 0.29411765 0.52941176 0.00000000) *
## 7) EmployeeNumber< 1261.5 186 126 4 (0.20430108 0.20430108 0.26881720 0.32258065)
## 14) DistanceFromHome>=2.5 130 89 3 (0.21538462 0.20769231 0.31538462 0.26153846)
## 28) EmployeeNumber< 206 24 11 3 (0.25000000 0.08333333 0.54166667 0.12500000) *
## 29) EmployeeNumber>=206 106 75 4 (0.20754717 0.23584906 0.26415094 0.29245283)
## 58) EmployeeNumber>=927 36 21 3 (0.25000000 0.11111111 0.41666667 0.22222222)
## 116) ï..Age< 44.5 28 19 3 (0.28571429 0.10714286 0.32142857 0.28571429)
## 232) Education>=3.5 9 4 3 (0.33333333 0.11111111 0.55555556 0.00000000) *
## 233) Education< 3.5 19 11 4 (0.26315789 0.10526316 0.21052632 0.42105263) *
## 117) ï..Age>=44.5 8 2 3 (0.12500000 0.12500000 0.75000000 0.00000000) *
## 59) EmployeeNumber< 927 70 47 4 (0.18571429 0.30000000 0.18571429 0.32857143)
## 118) JobRole=Human Resources,Manager,Sales Representative 8 2 2 (0.00000000 0.75000000 0.25000000 0.00000000) *
## 119) JobRole=Healthcare Representative,Laboratory Technician,Manufacturing Director,Research Director,Research Scientist,Sales Executive 62 39 4 (0.20967742 0.24193548 0.17741935 0.37096774)
## 238) PercentSalaryHike< 13.5 25 16 1 (0.36000000 0.24000000 0.24000000 0.16000000)
## 476) RelationshipSatisfaction>=3.5 7 2 2 (0.14285714 0.71428571 0.14285714 0.00000000) *
## 477) RelationshipSatisfaction< 3.5 18 10 1 (0.44444444 0.05555556 0.27777778 0.22222222) *
## 239) PercentSalaryHike>=13.5 37 18 4 (0.10810811 0.24324324 0.13513514 0.51351351) *
## 15) DistanceFromHome< 2.5 56 30 4 (0.17857143 0.19642857 0.16071429 0.46428571)
## 30) PercentSalaryHike< 12.5 13 7 2 (0.07692308 0.46153846 0.30769231 0.15384615) *
## 31) PercentSalaryHike>=12.5 43 19 4 (0.20930233 0.11627907 0.11627907 0.55813953)
## 62) WorkLifeBalance>=2.5 30 18 4 (0.26666667 0.16666667 0.16666667 0.40000000)
## 124) MonthlyIncome>=5272.5 14 7 1 (0.50000000 0.07142857 0.21428571 0.21428571) *
## 125) MonthlyIncome< 5272.5 16 7 4 (0.06250000 0.25000000 0.12500000 0.56250000) *
## 63) WorkLifeBalance< 2.5 13 1 4 (0.07692308 0.00000000 0.00000000 0.92307692) *
rpart.plot(arbol3, extra = 4)
# Show information about the fitted model
printcp(arbol3)
##
## Classification tree:
## rpart(formula = JobSatisfaction ~ ., data = entrenamiento3, method = "class")
##
## Variables actually used in tree construction:
## [1] DistanceFromHome Education
## [3] EmployeeNumber ï..Age
## [5] JobRole MonthlyIncome
## [7] PercentSalaryHike RelationshipSatisfaction
## [9] StockOptionLevel WorkLifeBalance
##
## Root node error: 180/240 = 0.75
##
## n= 240
##
## CP nsplit rel error xerror xstd
## 1 0.105556 0 1.00000 1.11667 0.031751
## 2 0.050000 1 0.89444 0.95556 0.038783
## 3 0.038889 2 0.84444 0.93889 0.039282
## 4 0.027778 4 0.76667 0.94444 0.039120
## 5 0.022222 8 0.65000 0.93889 0.039282
## 6 0.011111 10 0.60556 0.93889 0.039282
## 7 0.010000 14 0.56111 0.96667 0.038430
plotcp(arbol3)
# The cross-validated error (xerror) does not decrease further beyond cp = 0.038889, so the tree will be pruned at this value
# Balanced model
ArbolPoda<- prune(arbol, cp = 0.010000)
printcp(ArbolPoda)
##
## Classification tree:
## rpart(formula = JobSatisfaction ~ ., data = entrenamiento, method = "class")
##
## Variables actually used in tree construction:
## [1] BusinessTravel DailyRate DistanceFromHome
## [4] EducationField HourlyRate JobRole
## [7] MonthlyRate PercentSalaryHike TotalWorkingYears
## [10] TrainingTimesLastYear YearsAtCompany
##
## Root node error: 235/351 = 0.66952
##
## n= 351
##
## CP nsplit rel error xerror xstd
## 1 0.051064 0 1.00000 1.00000 0.037501
## 2 0.031915 1 0.94894 1.05106 0.036404
## 3 0.021277 5 0.82128 1.06809 0.035984
## 4 0.019149 6 0.80000 1.04255 0.036603
## 5 0.017021 8 0.76170 1.03404 0.036795
## 6 0.012766 10 0.72766 1.02979 0.036889
## 7 0.010000 16 0.65106 0.99149 0.037661
# Unbalanced model
ArbolPoda2<- prune(arbol2, cp = 0.01353)
printcp(ArbolPoda2)
##
## Classification tree:
## rpart(formula = JobSatisfaction ~ ., data = entrenamiento2, method = "class")
##
## Variables actually used in tree construction:
## [1] Attrition JobRole MonthlyRate
## [4] NumCompaniesWorked
##
## Root node error: 813/1175 = 0.69191
##
## n= 1175
##
## CP nsplit rel error xerror xstd
## 1 0.02706 0 1.00000 1.01230 0.019314
## 2 0.01353 1 0.97294 0.98770 0.019612
## 3 0.01000 5 0.91882 0.99016 0.019583
# Over-balanced model
ArbolPoda3<- prune(arbol3, cp = 0.038889)
printcp(ArbolPoda3)
##
## Classification tree:
## rpart(formula = JobSatisfaction ~ ., data = entrenamiento3, method = "class")
##
## Variables actually used in tree construction:
## [1] EmployeeNumber
##
## Root node error: 180/240 = 0.75
##
## n= 240
##
## CP nsplit rel error xerror xstd
## 1 0.105556 0 1.00000 1.11667 0.031751
## 2 0.050000 1 0.89444 0.95556 0.038783
## 3 0.038889 2 0.84444 0.93889 0.039282
# Evaluate the balanced model
predArb<- predict(ArbolPoda, newdata = test, type = "class")
predArb
## 28 56 61 93 95 111 113 117 123 138 140 146 162 174 204
## 4 2 4 4 1 4 2 3 1 4 3 2 3 2 1
## 227 270 277 286 304 307 314 318 338 363 378 399 400 463 485
## 2 4 3 4 2 1 2 4 3 3 4 3 4 2 4
## 509 510 537 548 550 556 563 579 583 601 631 637 647 676 687
## 1 3 1 4 1 2 4 2 4 1 4 2 1 2 2
## 740 746 749 766 774 819 832 834 849 913 930 939 944 998 1096
## 4 2 2 3 4 2 3 2 3 3 4 3 4 2 4
## 1099 1100 1104 1131 1158 1165 1179 1185 1213 1221 1278 1280 1282 1286 1323
## 1 4 4 4 2 1 4 3 4 4 4 4 4 1 4
## 1353 1387 1397 1399 1415 1417 1419 1423 1445 1448 37 213 276 479 484
## 4 4 1 4 1 4 3 3 1 4 2 1 4 3 4
## 1218 580 1438 1447 442 297 129 132
## 3 4 2 4 4 2 4 1
## Levels: 1 2 3 4
tabArb <- table(test$JobSatisfaction, predArb, dnn = c("Actual", "Predicha"))
confusionMatrix(tabArb)
## Confusion Matrix and Statistics
##
## Predicha
## Actual 1 2 3 4
## 1 2 5 1 6
## 2 2 4 2 7
## 3 7 6 9 10
## 4 6 7 6 18
##
## Overall Statistics
##
## Accuracy : 0.3367
## 95% CI : (0.2444, 0.4393)
## No Information Rate : 0.4184
## P-Value [Acc > NIR] : 0.9604
##
## Kappa : 0.0825
##
## Mcnemar's Test P-Value : 0.1860
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 0.11765 0.18182 0.50000 0.4390
## Specificity 0.85185 0.85526 0.71250 0.6667
## Pos Pred Value 0.14286 0.26667 0.28125 0.4865
## Neg Pred Value 0.82143 0.78313 0.86364 0.6230
## Prevalence 0.17347 0.22449 0.18367 0.4184
## Detection Rate 0.02041 0.04082 0.09184 0.1837
## Detection Prevalence 0.14286 0.15306 0.32653 0.3776
## Balanced Accuracy 0.48475 0.51854 0.60625 0.5528
# Evaluate the unbalanced model
predArb2<- predict(ArbolPoda2, newdata = test2, type = "class")
predArb2
## 1 2 9 11 14 22 24 25 46 48 49 57 59 62 65
## 3 3 4 4 4 3 4 3 3 4 4 4 4 4 4
## 67 69 71 73 79 82 88 101 104 106 108 114 119 123 127
## 4 4 3 4 3 3 4 3 4 3 3 4 2 3 3
## 130 135 138 140 142 146 147 157 172 176 184 204 206 213 217
## 4 4 4 4 3 3 4 4 3 4 4 2 3 4 3
## 222 226 230 233 234 235 236 238 240 241 256 259 260 261 265
## 3 4 3 2 4 3 3 4 3 2 4 4 3 3 3
## 268 271 274 276 279 295 312 314 327 328 336 341 345 346 368
## 4 4 4 4 3 4 4 3 4 3 4 3 4 4 3
## 369 377 381 392 393 396 408 410 414 424 426 427 429 431 440
## 3 4 4 2 4 2 4 4 4 3 4 4 3 4 3
## 447 450 451 458 467 470 480 485 488 491 499 502 504 505 526
## 3 4 4 3 3 3 3 4 4 4 4 4 4 3 3
## 535 537 541 553 561 562 574 577 582 585 586 600 602 606 612
## 4 3 3 3 4 4 3 4 4 4 3 3 2 4 4
## 616 620 628 629 636 642 654 660 661 664 674 678 681 685 692
## 4 4 4 4 4 4 4 4 3 3 4 4 4 4 3
## 694 701 708 725 734 739 740 748 749 767 771 777 782 788 795
## 3 3 4 4 4 4 4 3 3 4 3 3 4 4 4
## 800 802 803 809 810 819 820 823 824 827 828 835 843 844 862
## 4 3 4 3 4 4 4 4 4 4 4 4 3 4 3
## 869 874 876 877 878 882 883 891 894 897 901 903 905 912 916
## 4 4 4 4 4 3 4 3 4 4 4 4 3 3 3
## 917 918 928 931 935 952 957 958 960 963 972 973 980 984 992
## 4 4 3 4 4 4 3 3 4 4 4 4 4 4 4
## 1004 1007 1008 1018 1026 1029 1031 1041 1044 1060 1061 1063 1065 1072 1073
## 4 3 3 4 4 4 4 3 4 4 3 3 4 4 4
## 1074 1075 1080 1082 1083 1089 1099 1100 1103 1110 1111 1112 1122 1127 1129
## 4 4 4 4 3 4 4 4 4 4 3 3 3 4 4
## 1133 1151 1158 1161 1168 1176 1177 1178 1186 1191 1196 1197 1212 1215 1216
## 4 3 4 4 3 4 4 3 4 3 4 4 4 4 4
## 1219 1220 1227 1236 1240 1242 1250 1252 1256 1261 1268 1271 1285 1296 1317
## 4 2 3 4 4 4 3 4 3 4 4 3 3 3 4
## 1335 1339 1340 1342 1344 1347 1349 1352 1355 1357 1362 1369 1370 1372 1376
## 4 3 3 4 4 4 4 3 3 3 4 3 3 4 3
## 1378 1381 1385 1387 1392 1394 1395 1399 1407 1412 1416 1419 1423 1426 1427
## 4 4 4 4 4 4 3 4 3 4 4 4 2 4 4
## 1429 1431 1439 1443 1445 1449 1452 1464 1465 1467
## 4 4 3 3 3 4 4 4 4 4
## Levels: 1 2 3 4
tabArb2 <- table(test2$JobSatisfaction, predArb2, dnn = c("Actual", "Predicha"))
confusionMatrix(tabArb2)
## Confusion Matrix and Statistics
##
## Predicha
## Actual 1 2 3 4
## 1 0 0 22 35
## 2 0 0 18 36
## 3 0 6 28 53
## 4 0 3 33 61
##
## Overall Statistics
##
## Accuracy : 0.3017
## 95% CI : (0.2498, 0.3576)
## No Information Rate : 0.6271
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.0161
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity NA 0.00000 0.27723 0.3297
## Specificity 0.8068 0.81119 0.69588 0.6727
## Pos Pred Value NA 0.00000 0.32184 0.6289
## Neg Pred Value NA 0.96266 0.64904 0.3737
## Prevalence 0.0000 0.03051 0.34237 0.6271
## Detection Rate 0.0000 0.00000 0.09492 0.2068
## Detection Prevalence 0.1932 0.18305 0.29492 0.3288
## Balanced Accuracy NA 0.40559 0.48655 0.5012
# Evaluate the over-balanced model
predArb3<- predict(ArbolPoda3, newdata = test2, type = "class")
predArb3
## 1 2 9 11 14 22 24 25 46 48 49 57 59 62 65
## 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## 67 69 71 73 79 82 88 101 104 106 108 114 119 123 127
## 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## 130 135 138 140 142 146 147 157 172 176 184 204 206 213 217
## 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## 222 226 230 233 234 235 236 238 240 241 256 259 260 261 265
## 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## 268 271 274 276 279 295 312 314 327 328 336 341 345 346 368
## 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## 369 377 381 392 393 396 408 410 414 424 426 427 429 431 440
## 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## 447 450 451 458 467 470 480 485 488 491 499 502 504 505 526
## 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## 535 537 541 553 561 562 574 577 582 585 586 600 602 606 612
## 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## 616 620 628 629 636 642 654 660 661 664 674 678 681 685 692
## 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## 694 701 708 725 734 739 740 748 749 767 771 777 782 788 795
## 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## 800 802 803 809 810 819 820 823 824 827 828 835 843 844 862
## 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## 869 874 876 877 878 882 883 891 894 897 901 903 905 912 916
## 4 4 4 4 4 4 4 4 4 4 4 4 3 3 3
## 917 918 928 931 935 952 957 958 960 963 972 973 980 984 992
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1
## 1004 1007 1008 1018 1026 1029 1031 1041 1044 1060 1061 1063 1065 1072 1073
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1074 1075 1080 1082 1083 1089 1099 1100 1103 1110 1111 1112 1122 1127 1129
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1133 1151 1158 1161 1168 1176 1177 1178 1186 1191 1196 1197 1212 1215 1216
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1219 1220 1227 1236 1240 1242 1250 1252 1256 1261 1268 1271 1285 1296 1317
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1335 1339 1340 1342 1344 1347 1349 1352 1355 1357 1362 1369 1370 1372 1376
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1378 1381 1385 1387 1392 1394 1395 1399 1407 1412 1416 1419 1423 1426 1427
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1429 1431 1439 1443 1445 1449 1452 1464 1465 1467
## 1 1 1 1 1 1 1 1 1 1
## Levels: 1 2 3 4
tabArb3 <- table(test2$JobSatisfaction, predArb3, dnn = c("Actual", "Predicha"))
confusionMatrix(tabArb3)
## Confusion Matrix and Statistics
##
## Predicha
## Actual 1 2 3 4
## 1 22 0 1 34
## 2 21 0 7 26
## 3 30 0 4 53
## 4 28 0 5 64
##
## Overall Statistics
##
## Accuracy : 0.3051
## 95% CI : (0.253, 0.3611)
## No Information Rate : 0.6
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0343
##
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 0.21782 NA 0.23529 0.3616
## Specificity 0.81959 0.8169 0.70144 0.7203
## Pos Pred Value 0.38596 NA 0.04598 0.6598
## Neg Pred Value 0.66807 NA 0.93750 0.4293
## Prevalence 0.34237 0.0000 0.05763 0.6000
## Detection Rate 0.07458 0.0000 0.01356 0.2169
## Detection Prevalence 0.19322 0.1831 0.29492 0.3288
## Balanced Accuracy 0.51870 NA 0.46837 0.5410
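# Before drawing conclusions, the overall accuracies of the six evaluations can be collected in
# one place; a minimal sketch reusing the contingency tables built above (list names are illustrative)
sapply(list(NB_bal = tab, NB_sin = tab2, NB_sobre = tab3,
            Arbol_bal = tabArb, Arbol_sin = tabArb2, Arbol_sobre = tabArb3),
       function(t) confusionMatrix(t)$overall["Accuracy"])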
According to the tests performed, the Bayesian classifier and the classification tree reach similar accuracy in their predictions, which suggests that both methods were implemented correctly. However, the predictive accuracy of these models is very low and does not change significantly under the balancing adjustments applied to the training data. From this we deduce that the chosen dependent variable is largely independent of the other variables in the dataset, and that independence prevents building a successful classifier, since there are no rules linking the dependent variable to the rest of the variables.

In terms of interpretability, a classification tree is much easier to understand than a Bayesian classifier once the tree is plotted, because the possible decision paths are easy to follow. On the other hand, the confidence provided by the Bayesian classifier can be higher in certain settings, since it rests on a solid statistical foundation that supports the reliability of its classifications. Regarding implementation, both models are very practical, although the classification tree takes somewhat more work because of the pruning step. Finally, we conclude that these models are powerful tools for many areas of engineering, but great care must be taken in how they are applied: as shown here, choosing the dependent variable poorly prevents them from performing well.