library(readxl)
library(plyr)
library(knitr)
library(e1071)
library(naivebayes)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(car)
## Loading required package: carData
library(rpart)
library(rpart.plot)
Tabla<- read.csv("Classification.csv")
head(Tabla)
## ï..Age Attrition BusinessTravel DailyRate Department
## 1 41 Yes Travel_Rarely 1102 Sales
## 2 49 No Travel_Frequently 279 Research & Development
## 3 37 Yes Travel_Rarely 1373 Research & Development
## 4 33 No Travel_Frequently 1392 Research & Development
## 5 27 No Travel_Rarely 591 Research & Development
## 6 32 No Travel_Frequently 1005 Research & Development
## DistanceFromHome Education EducationField EmployeeCount EmployeeNumber
## 1 1 2 Life Sciences 1 1
## 2 8 1 Life Sciences 1 2
## 3 2 2 Other 1 4
## 4 3 4 Life Sciences 1 5
## 5 2 1 Medical 1 7
## 6 2 2 Life Sciences 1 8
## EnvironmentSatisfaction Gender HourlyRate JobInvolvement JobLevel
## 1 2 Female 94 3 2
## 2 3 Male 61 2 2
## 3 4 Male 92 2 1
## 4 4 Female 56 3 1
## 5 1 Male 40 3 1
## 6 4 Male 79 3 1
## JobRole JobSatisfaction MaritalStatus MonthlyIncome
## 1 Sales Executive 4 Single 5993
## 2 Research Scientist 2 Married 5130
## 3 Laboratory Technician 3 Single 2090
## 4 Research Scientist 3 Married 2909
## 5 Laboratory Technician 2 Married 3468
## 6 Laboratory Technician 4 Single 3068
## MonthlyRate NumCompaniesWorked OverTime PercentSalaryHike
## 1 19479 8 Yes 11
## 2 24907 1 No 23
## 3 2396 6 Yes 15
## 4 23159 1 Yes 11
## 5 16632 9 No 12
## 6 11864 0 No 13
## PerformanceRating RelationshipSatisfaction StockOptionLevel
## 1 3 1 0
## 2 4 4 1
## 3 3 2 0
## 4 3 3 0
## 5 3 4 1
## 6 3 3 0
## TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## 1 8 0 1 6
## 2 10 3 3 10
## 3 7 3 3 0
## 4 8 3 3 8
## 5 6 3 3 2
## 6 8 2 2 7
## YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## 1 4 0 5
## 2 7 1 7
## 3 0 0 0
## 4 7 3 0
## 5 2 2 2
## 6 7 3 6
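# Note: the "ï.." prefix on the Age column is a byte-order-mark (BOM) artifact of read.csv with
# the default encoding. A minimal fix (sketch, assuming the file is UTF-8 with a BOM) is shown
# below; it is left commented out because the rest of this script refers to the ï..Age name.
# Tabla <- read.csv("Classification.csv", fileEncoding = "UTF-8-BOM")  # column would then be named "Age"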
class(Tabla$ï..Age)
## [1] "integer"
summary(Tabla$ï..Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 30.00 36.00 36.92 43.00 60.00
class(Tabla$Attrition)
## [1] "factor"
summary(Tabla$Attrition)
## No Yes
## 1233 237
class(Tabla$BusinessTravel)
## [1] "factor"
summary(Tabla$BusinessTravel)
## Non-Travel Travel_Frequently Travel_Rarely
## 150 277 1043
class(Tabla$DailyRate)
## [1] "integer"
summary(Tabla$DailyRate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 102.0 465.0 802.0 802.5 1157.0 1499.0
class(Tabla$Department)
## [1] "factor"
summary(Tabla$Department)
## Human Resources Research & Development Sales
## 63 961 446
class(Tabla$DistanceFromHome)
## [1] "integer"
summary(Tabla$DistanceFromHome)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 7.000 9.193 14.000 29.000
class(Tabla$Education)
## [1] "integer"
summary(Tabla$Education)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 2.913 4.000 5.000
class(Tabla$EducationField)
## [1] "factor"
summary(Tabla$EducationField)
## Human Resources Life Sciences Marketing Medical
## 27 606 159 464
## Other Technical Degree
## 82 132
class(Tabla$EmployeeCount)
## [1] "integer"
summary(Tabla$EmployeeCount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 1 1 1 1 1
class(Tabla$EmployeeNumber)
## [1] "integer"
summary(Tabla$EmployeeNumber)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 491.2 1020.5 1024.9 1555.8 2068.0
class(Tabla$EnvironmentSatisfaction)
## [1] "integer"
summary(Tabla$EnvironmentSatisfaction)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 2.722 4.000 4.000
class(Tabla$Gender)
## [1] "factor"
summary(Tabla$Gender)
## Female Male
## 588 882
class(Tabla$HourlyRate)
## [1] "integer"
summary(Tabla$HourlyRate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 30.00 48.00 66.00 65.89 83.75 100.00
class(Tabla$JobInvolvement)
## [1] "integer"
summary(Tabla$JobInvolvement)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 2.00 3.00 2.73 3.00 4.00
class(Tabla$JobLevel)
## [1] "integer"
summary(Tabla$JobLevel)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 2.064 3.000 5.000
class(Tabla$JobRole)
## [1] "factor"
summary(Tabla$JobRole)
## Healthcare Representative Human Resources
## 131 52
## Laboratory Technician Manager
## 259 102
## Manufacturing Director Research Director
## 145 80
## Research Scientist Sales Executive
## 292 326
## Sales Representative
## 83
class(Tabla$JobSatisfaction)
## [1] "integer"
summary(Tabla$JobSatisfaction)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 2.729 4.000 4.000
class(Tabla$MaritalStatus)
## [1] "factor"
summary(Tabla$MaritalStatus)
## Divorced Married Single
## 327 673 470
class(Tabla$MonthlyIncome)
## [1] "integer"
summary(Tabla$MonthlyIncome)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1009 2911 4919 6503 8379 19999
class(Tabla$MonthlyRate)
## [1] "integer"
summary(Tabla$MonthlyRate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2094 8047 14236 14313 20462 26999
class(Tabla$NumCompaniesWorked)
## [1] "integer"
summary(Tabla$NumCompaniesWorked)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 2.000 2.693 4.000 9.000
class(Tabla$OverTime)
## [1] "factor"
summary(Tabla$OverTime)
## No Yes
## 1054 416
class(Tabla$PercentSalaryHike)
## [1] "integer"
summary(Tabla$PercentSalaryHike)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 11.00 12.00 14.00 15.21 18.00 25.00
class(Tabla$PerformanceRating)
## [1] "integer"
summary(Tabla$PerformanceRating)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 3.000 3.000 3.154 3.000 4.000
class(Tabla$RelationshipSatisfaction)
## [1] "integer"
summary(Tabla$RelationshipSatisfaction)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 2.712 4.000 4.000
class(Tabla$StockOptionLevel)
## [1] "integer"
summary(Tabla$StockOptionLevel)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 1.0000 0.7939 1.0000 3.0000
class(Tabla$TotalWorkingYears)
## [1] "integer"
summary(Tabla$TotalWorkingYears)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 6.00 10.00 11.28 15.00 40.00
class(Tabla$TrainingTimesLastYear)
## [1] "integer"
summary(Tabla$TrainingTimesLastYear)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 3.000 2.799 3.000 6.000
class(Tabla$WorkLifeBalance)
## [1] "integer"
summary(Tabla$WorkLifeBalance)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 2.761 3.000 4.000
class(Tabla$YearsAtCompany)
## [1] "integer"
summary(Tabla$YearsAtCompany)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 3.000 5.000 7.008 9.000 40.000
class(Tabla$YearsInCurrentRole)
## [1] "integer"
summary(Tabla$YearsInCurrentRole)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 3.000 4.229 7.000 18.000
class(Tabla$YearsSinceLastPromotion)
## [1] "integer"
summary(Tabla$YearsSinceLastPromotion)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 2.188 3.000 15.000
class(Tabla$YearsWithCurrManager)
## [1] "integer"
summary(Tabla$YearsWithCurrManager)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 3.000 4.123 7.000 17.000
# The EmployeeCount column has the same value for every row, so it can be considered uninformative and removed to reduce the dimensionality of the dataset; the value also does not match what the column name suggests, so it may be a corrupted variable.
summary(Tabla$EmployeeCount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 1 1 1 1 1
hist(Tabla$EmployeeCount)
boxplot(Tabla$EmployeeCount, horizontal = T)
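# As a sanity check, constant (zero-variance) columns can also be detected programmatically;
# a minimal sketch ("constantes" is just an illustrative name):
constantes <- names(Tabla)[sapply(Tabla, function(x) length(unique(x)) == 1)]
constantes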
# The YearsAtCompany column shows outliers between roughly 20 and 40 years, as the following plots show.
summary(Tabla$YearsAtCompany)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 3.000 5.000 7.008 9.000 40.000
hist(Tabla$YearsAtCompany)
boxplot(Tabla$YearsAtCompany, horizontal = T)
# The YearsInCurrentRole column shows outliers above about 15 years, as the following plots show.
summary(Tabla$YearsInCurrentRole)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 3.000 4.229 7.000 18.000
hist(Tabla$YearsInCurrentRole)
boxplot(Tabla$YearsInCurrentRole, horizontal = T)
# The YearsSinceLastPromotion column has a heavily skewed distribution, with outliers starting at roughly 7 years.
summary(Tabla$YearsSinceLastPromotion)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 2.188 3.000 15.000
hist(Tabla$YearsSinceLastPromotion)
boxplot(Tabla$YearsSinceLastPromotion, horizontal = T)
# The YearsWithCurrManager column shows outliers above about 15 years, as the following plots show.
summary(Tabla$YearsWithCurrManager)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 3.000 4.123 7.000 17.000
hist(Tabla$YearsWithCurrManager)
boxplot(Tabla$YearsWithCurrManager, horizontal = T)
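# The outlier cutoffs described above correspond to the usual 1.5*IQR boxplot rule; a minimal
# sketch to compute the upper whisker threshold explicitly ("umbral" is an illustrative helper):
umbral <- function(x) quantile(x, 0.75) + 1.5 * IQR(x)
umbral(Tabla$YearsAtCompany)
umbral(Tabla$YearsWithCurrManager)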
set.seed(600)
# Create a working copy of the table for the modifications below
tabla2<-Tabla
# Drop the EmployeeCount column
tabla2$EmployeeCount<- NULL
# Remove the observations where YearsAtCompany is greater than 18
tabla3<- tabla2[tabla2$YearsAtCompany<= 18,]
hist(tabla3$YearsAtCompany)
boxplot(tabla3$YearsAtCompany, horizontal = T)
# Remove the observations where YearsInCurrentRole is greater than 14
tabla4<- tabla3[tabla3$YearsInCurrentRole<= 14,]
hist(tabla4$YearsInCurrentRole)
boxplot(tabla4$YearsInCurrentRole, horizontal = T)
# To balance the dataset, remove about 95% of the observations where YearsSinceLastPromotion < 2
sobra<- tabla4[tabla4$YearsSinceLastPromotion== 0,]
ind <- sample(2,nrow(sobra), replace = TRUE, prob = c(0.95, 0.05) )
sobra<- sobra[ind== 2,]
tabla5<- tabla4[tabla4$YearsSinceLastPromotion> 1,]
sobra2<- tabla4[tabla4$YearsSinceLastPromotion== 1,]
ind <- sample(2,nrow(sobra2), replace = TRUE, prob = c(0.95, 0.05) )
sobra2<- sobra2[ind== 2,]
tabla5<- rbind(tabla5, sobra)
tabla5<- rbind(tabla5, sobra2)
hist(tabla5$YearsSinceLastPromotion)
boxplot(tabla5$YearsSinceLastPromotion, horizontal = T)
# To balance the dataset, remove about 70% of the observations where YearsWithCurrManager < 2
sobra<- tabla5[tabla5$YearsWithCurrManager== 0,]
ind <- sample(2,nrow(sobra), replace = TRUE, prob = c(0.7, 0.3) )
sobra<- sobra[ind== 2,]
tabla6<- tabla5[tabla5$YearsWithCurrManager> 1,]
sobra2<- tabla5[tabla5$YearsWithCurrManager== 1,]
ind <- sample(2,nrow(sobra2), replace = TRUE, prob = c(0.7, 0.3) )
sobra2<- sobra2[ind== 2,]
tabla6<- rbind(tabla6, sobra)
tabla6<- rbind(tabla6, sobra2)
hist(tabla6$YearsWithCurrManager)
boxplot(tabla6$YearsWithCurrManager, horizontal = T)
# Convert the dependent variable to a factor
tabla6$JobSatisfaction<- as.factor(tabla6$JobSatisfaction)
tabla6$MaritalStatus<- NULL
summary(tabla6$JobSatisfaction)
## 1 2 3 4
## 82 87 127 153
# Create training and test sets from the balanced dataset (tabla6)
ind <- sample(2,nrow(tabla6), replace = TRUE, prob = c(0.8, 0.2) )
entrenamiento<- tabla6[ind==1,]
test<- tabla6[ind==2,]
# Create training and test sets from the unbalanced dataset (tabla2)
tabla2$JobSatisfaction<- as.factor(tabla2$JobSatisfaction)
tabla2$MaritalStatus<- NULL
ind <- sample(2,nrow(tabla2), replace = TRUE, prob = c(0.8, 0.2) )
entrenamiento2<- tabla2[ind==1,]
test2<- tabla2[ind==2,]
# Create the over-balanced training set (60 observations per class)
e1<- entrenamiento[entrenamiento$JobSatisfaction==1,]
e2<- entrenamiento[entrenamiento$JobSatisfaction==2,]
e3<- entrenamiento[entrenamiento$JobSatisfaction==3,]
e4<- entrenamiento[entrenamiento$JobSatisfaction==4,]
entrenamiento3<- rbind(e1[1:60, ], e2[1:60, ])
entrenamiento3<- rbind(entrenamiento3, e3[1:60, ])
entrenamiento3<- rbind(entrenamiento3, e4[1:60, ])
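# Alternative (sketch): draw the 60 rows per class at random instead of taking the first 60, to
# avoid any ordering bias; assumes each class has at least 60 training rows and uses the
# illustrative name entrenamiento3_alt so the results reported below are unchanged
entrenamiento3_alt <- do.call(rbind, lapply(list(e1, e2, e3, e4),
                                            function(d) d[sample(nrow(d), 60), ]))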
# Balanced model
plot(entrenamiento$JobSatisfaction)
modelo1 <- naive_bayes(JobSatisfaction ~ ., data = entrenamiento)
# Unbalanced model
plot(entrenamiento2$JobSatisfaction)
modelo2 <- naive_bayes(JobSatisfaction ~ ., data = entrenamiento2)
# Over-balanced model
plot(entrenamiento3$JobSatisfaction)
modelo3 <- naive_bayes(JobSatisfaction ~ ., data = entrenamiento3)
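# Besides hard class labels, the fitted models can return posterior class probabilities; a
# minimal sketch using modelo1 and the test set defined above (naivebayes supports type = "prob"):
head(predict(modelo1, test, type = "prob"))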
# Prediction with the balanced model
pred<- predict(modelo1 , test)
tab <- table(test$JobSatisfaction, pred, dnn = c("Actual", "Predicha"))
confusionMatrix(tab)
## Confusion Matrix and Statistics
##
## Predicha
## Actual 1 2 3 4
## 1 3 4 1 6
## 2 1 1 3 10
## 3 5 6 2 19
## 4 3 10 7 17
##
## Overall Statistics
##
## Accuracy : 0.2347
## 95% CI : (0.155, 0.3311)
## No Information Rate : 0.5306
## P-Value [Acc > NIR] : 1.00000
##
## Kappa : -0.0839
##
## Mcnemar's Test P-Value : 0.06185
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 0.25000 0.04762 0.15385 0.3269
## Specificity 0.87209 0.81818 0.64706 0.5652
## Pos Pred Value 0.21429 0.06667 0.06250 0.4595
## Neg Pred Value 0.89286 0.75904 0.83333 0.4262
## Prevalence 0.12245 0.21429 0.13265 0.5306
## Detection Rate 0.03061 0.01020 0.02041 0.1735
## Detection Prevalence 0.14286 0.15306 0.32653 0.3776
## Balanced Accuracy 0.56105 0.43290 0.40045 0.4461
# Prediction with the unbalanced model
pred2<- predict(modelo2 , test2)
tab2 <- table(test2$JobSatisfaction, pred2, dnn = c("Actual", "Predicha"))
confusionMatrix(tab2)
## Confusion Matrix and Statistics
##
## Predicha
## Actual 1 2 3 4
## 1 3 9 22 23
## 2 2 8 23 21
## 3 8 9 33 37
## 4 7 12 37 41
##
## Overall Statistics
##
## Accuracy : 0.2881
## 95% CI : (0.2371, 0.3435)
## No Information Rate : 0.4136
## P-Value [Acc > NIR] : 1
##
## Kappa : 7e-04
##
## Mcnemar's Test P-Value : 8.994e-05
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 0.15000 0.21053 0.2870 0.3361
## Specificity 0.80364 0.82101 0.7000 0.6763
## Pos Pred Value 0.05263 0.14815 0.3793 0.4227
## Neg Pred Value 0.92857 0.87552 0.6058 0.5909
## Prevalence 0.06780 0.12881 0.3898 0.4136
## Detection Rate 0.01017 0.02712 0.1119 0.1390
## Detection Prevalence 0.19322 0.18305 0.2949 0.3288
## Balanced Accuracy 0.47682 0.51577 0.4935 0.5062
# Prediction with the over-balanced model
pred3<- predict(modelo3 , test2)
tab3 <- table(test2$JobSatisfaction, pred3, dnn = c("Actual", "Predicha"))
confusionMatrix(tab3)
## Confusion Matrix and Statistics
##
## Predicha
## Actual 1 2 3 4
## 1 18 13 19 7
## 2 10 19 17 8
## 3 14 25 34 14
## 4 25 22 31 19
##
## Overall Statistics
##
## Accuracy : 0.3051
## 95% CI : (0.253, 0.3611)
## No Information Rate : 0.3424
## P-Value [Acc > NIR] : 0.9219213
##
## Kappa : 0.0767
##
## Mcnemar's Test P-Value : 0.0002475
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 0.26866 0.24051 0.3366 0.39583
## Specificity 0.82895 0.83796 0.7268 0.68421
## Pos Pred Value 0.31579 0.35185 0.3908 0.19588
## Neg Pred Value 0.79412 0.75104 0.6779 0.85354
## Prevalence 0.22712 0.26780 0.3424 0.16271
## Detection Rate 0.06102 0.06441 0.1153 0.06441
## Detection Prevalence 0.19322 0.18305 0.2949 0.32881
## Balanced Accuracy 0.54880 0.53923 0.5317 0.54002
# Build the classification tree for the balanced model
arbol<- rpart(JobSatisfaction ~ ., data = entrenamiento, method = "class")
# Display the tree in two different ways
print(arbol)
## n= 351
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 351 235 4 (0.19373219 0.20512821 0.27065527 0.33048433)
## 2) DistanceFromHome>=2.5 246 169 3 (0.22357724 0.19918699 0.31300813 0.26422764)
## 4) DailyRate>=272 219 149 3 (0.24657534 0.16894977 0.31963470 0.26484018)
## 8) JobRole=Healthcare Representative,Laboratory Technician,Manufacturing Director,Research Scientist,Sales Executive 173 122 4 (0.27745665 0.13872832 0.28901734 0.29479769)
## 16) TotalWorkingYears>=5.5 147 99 4 (0.30612245 0.12244898 0.24489796 0.32653061)
## 32) HourlyRate>=64.5 75 45 1 (0.40000000 0.12000000 0.25333333 0.22666667)
## 64) MonthlyRate< 4751 8 1 1 (0.87500000 0.00000000 0.12500000 0.00000000) *
## 65) MonthlyRate>=4751 67 44 1 (0.34328358 0.13432836 0.26865672 0.25373134)
## 130) TrainingTimesLastYear< 3.5 56 34 1 (0.39285714 0.16071429 0.26785714 0.17857143)
## 260) YearsAtCompany< 7.5 17 5 1 (0.70588235 0.05882353 0.23529412 0.00000000) *
## 261) YearsAtCompany>=7.5 39 28 3 (0.25641026 0.20512821 0.28205128 0.25641026)
## 522) BusinessTravel=Travel_Rarely 29 19 1 (0.34482759 0.27586207 0.17241379 0.20689655)
## 1044) DailyRate>=818.5 16 8 1 (0.50000000 0.37500000 0.06250000 0.06250000) *
## 1045) DailyRate< 818.5 13 8 4 (0.15384615 0.15384615 0.30769231 0.38461538) *
## 523) BusinessTravel=Non-Travel,Travel_Frequently 10 4 3 (0.00000000 0.00000000 0.60000000 0.40000000) *
## 131) TrainingTimesLastYear>=3.5 11 4 4 (0.09090909 0.00000000 0.27272727 0.63636364) *
## 33) HourlyRate< 64.5 72 41 4 (0.20833333 0.12500000 0.23611111 0.43055556)
## 66) JobRole=Healthcare Representative,Manufacturing Director,Research Scientist 32 21 3 (0.28125000 0.12500000 0.34375000 0.25000000)
## 132) MonthlyRate< 14392.5 23 12 3 (0.30434783 0.08695652 0.47826087 0.13043478) *
## 133) MonthlyRate>=14392.5 9 4 4 (0.22222222 0.22222222 0.00000000 0.55555556) *
## 67) JobRole=Laboratory Technician,Sales Executive 40 17 4 (0.15000000 0.12500000 0.15000000 0.57500000) *
## 17) TotalWorkingYears< 5.5 26 12 3 (0.11538462 0.23076923 0.53846154 0.11538462) *
## 9) JobRole=Human Resources,Manager,Research Director,Sales Representative 46 26 3 (0.13043478 0.28260870 0.43478261 0.15217391)
## 18) EducationField=Life Sciences,Marketing,Other 26 14 2 (0.07692308 0.46153846 0.26923077 0.19230769)
## 36) DailyRate< 1166.5 17 8 2 (0.11764706 0.52941176 0.05882353 0.29411765) *
## 37) DailyRate>=1166.5 9 3 3 (0.00000000 0.33333333 0.66666667 0.00000000) *
## 19) EducationField=Medical,Technical Degree 20 7 3 (0.20000000 0.05000000 0.65000000 0.10000000) *
## 5) DailyRate< 272 27 15 2 (0.03703704 0.44444444 0.25925926 0.25925926) *
## 3) DistanceFromHome< 2.5 105 54 4 (0.12380952 0.21904762 0.17142857 0.48571429)
## 6) JobRole=Manager,Manufacturing Director,Research Director,Sales Executive,Sales Representative 54 37 2 (0.16666667 0.31481481 0.20370370 0.31481481)
## 12) PercentSalaryHike< 12.5 15 5 2 (0.06666667 0.66666667 0.20000000 0.06666667) *
## 13) PercentSalaryHike>=12.5 39 23 4 (0.20512821 0.17948718 0.20512821 0.41025641) *
## 7) JobRole=Healthcare Representative,Human Resources,Laboratory Technician,Research Scientist 51 17 4 (0.07843137 0.11764706 0.13725490 0.66666667) *
rpart.plot(arbol, extra = 4)
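# rpart also stores an importance score per predictor; a minimal sketch to inspect it for this tree:
arbol$variable.importance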
# Show information about the fitted model
printcp(arbol)
##
## Classification tree:
## rpart(formula = JobSatisfaction ~ ., data = entrenamiento, method = "class")
##
## Variables actually used in tree construction:
## [1] BusinessTravel DailyRate DistanceFromHome
## [4] EducationField HourlyRate JobRole
## [7] MonthlyRate PercentSalaryHike TotalWorkingYears
## [10] TrainingTimesLastYear YearsAtCompany
##
## Root node error: 235/351 = 0.66952
##
## n= 351
##
## CP nsplit rel error xerror xstd
## 1 0.051064 0 1.00000 1.00000 0.037501
## 2 0.031915 1 0.94894 1.05106 0.036404
## 3 0.021277 5 0.82128 1.06809 0.035984
## 4 0.019149 6 0.80000 1.04255 0.036603
## 5 0.017021 8 0.76170 1.03404 0.036795
## 6 0.012766 10 0.72766 1.02979 0.036889
## 7 0.010000 16 0.65106 0.99149 0.037661
plotcp(arbol)
# The cross-validated error (xerror) is lowest at cp = 0.010000, so the tree will be pruned at this value
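# Programmatic alternative (sketch): pick the cp with the lowest cross-validated error from the
# CP table; the same idea applies to arbol2 and arbol3 below ("cp_opt" is an illustrative name)
cp_opt <- arbol$cptable[which.min(arbol$cptable[, "xerror"]), "CP"]
cp_opt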
# Build the classification tree for the unbalanced model
arbol2<- rpart(JobSatisfaction ~ ., data = entrenamiento2, method = "class")
# Display the tree in two different ways
print(arbol2)
## n= 1175
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 1175 813 4 (0.1974468 0.1923404 0.3021277 0.3080851)
## 2) Attrition=Yes 184 127 3 (0.2826087 0.2173913 0.3097826 0.1902174) *
## 3) Attrition=No 991 664 4 (0.1816347 0.1876892 0.3007064 0.3299697)
## 6) NumCompaniesWorked>=4.5 200 144 3 (0.2100000 0.2650000 0.2800000 0.2450000)
## 12) JobRole=Human Resources,Laboratory Technician 34 15 2 (0.1176471 0.5588235 0.2058824 0.1176471) *
## 13) JobRole=Healthcare Representative,Manager,Manufacturing Director,Research Director,Research Scientist,Sales Executive,Sales Representative 166 117 3 (0.2289157 0.2048193 0.2951807 0.2710843) *
## 7) NumCompaniesWorked< 4.5 791 513 4 (0.1744627 0.1681416 0.3059418 0.3514539)
## 14) MonthlyRate>=3020.5 761 492 4 (0.1773982 0.1747700 0.2943495 0.3534823)
## 28) MonthlyRate>=24566.5 58 32 3 (0.2068966 0.1724138 0.4482759 0.1724138) *
## 29) MonthlyRate< 24566.5 703 444 4 (0.1749644 0.1749644 0.2816501 0.3684211) *
## 15) MonthlyRate< 3020.5 30 12 3 (0.1000000 0.0000000 0.6000000 0.3000000) *
rpart.plot(arbol2, extra = 4)
# Show information about the fitted model
printcp(arbol2)
##
## Classification tree:
## rpart(formula = JobSatisfaction ~ ., data = entrenamiento2, method = "class")
##
## Variables actually used in tree construction:
## [1] Attrition JobRole MonthlyRate
## [4] NumCompaniesWorked
##
## Root node error: 813/1175 = 0.69191
##
## n= 1175
##
## CP nsplit rel error xerror xstd
## 1 0.02706 0 1.00000 1.01230 0.019314
## 2 0.01353 1 0.97294 0.98770 0.019612
## 3 0.01000 5 0.91882 0.99016 0.019583
plotcp(arbol2)
# The cross-validated error (xerror) is lowest at cp = 0.01353, so the tree will be pruned at this value
# Build the classification tree for the over-balanced model
arbol3<- rpart(JobSatisfaction ~ ., data = entrenamiento3, method = "class")
# Display the tree in two different ways
print(arbol3)
## n= 240
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 240 180 1 (0.25000000 0.25000000 0.25000000 0.25000000)
## 2) EmployeeNumber>=1390 37 18 1 (0.51351351 0.45945946 0.02702703 0.00000000)
## 4) StockOptionLevel>=0.5 25 8 1 (0.68000000 0.32000000 0.00000000 0.00000000) *
## 5) StockOptionLevel< 0.5 12 3 2 (0.16666667 0.75000000 0.08333333 0.00000000) *
## 3) EmployeeNumber< 1390 203 143 4 (0.20197044 0.21182266 0.29064039 0.29556650)
## 6) EmployeeNumber>=1261.5 17 8 3 (0.17647059 0.29411765 0.52941176 0.00000000) *
## 7) EmployeeNumber< 1261.5 186 126 4 (0.20430108 0.20430108 0.26881720 0.32258065)
## 14) DistanceFromHome>=2.5 130 89 3 (0.21538462 0.20769231 0.31538462 0.26153846)
## 28) EmployeeNumber< 206 24 11 3 (0.25000000 0.08333333 0.54166667 0.12500000) *
## 29) EmployeeNumber>=206 106 75 4 (0.20754717 0.23584906 0.26415094 0.29245283)
## 58) EmployeeNumber>=927 36 21 3 (0.25000000 0.11111111 0.41666667 0.22222222)
## 116) ï..Age< 44.5 28 19 3 (0.28571429 0.10714286 0.32142857 0.28571429)
## 232) Education>=3.5 9 4 3 (0.33333333 0.11111111 0.55555556 0.00000000) *
## 233) Education< 3.5 19 11 4 (0.26315789 0.10526316 0.21052632 0.42105263) *
## 117) ï..Age>=44.5 8 2 3 (0.12500000 0.12500000 0.75000000 0.00000000) *
## 59) EmployeeNumber< 927 70 47 4 (0.18571429 0.30000000 0.18571429 0.32857143)
## 118) JobRole=Human Resources,Manager,Sales Representative 8 2 2 (0.00000000 0.75000000 0.25000000 0.00000000) *
## 119) JobRole=Healthcare Representative,Laboratory Technician,Manufacturing Director,Research Director,Research Scientist,Sales Executive 62 39 4 (0.20967742 0.24193548 0.17741935 0.37096774)
## 238) PercentSalaryHike< 13.5 25 16 1 (0.36000000 0.24000000 0.24000000 0.16000000)
## 476) RelationshipSatisfaction>=3.5 7 2 2 (0.14285714 0.71428571 0.14285714 0.00000000) *
## 477) RelationshipSatisfaction< 3.5 18 10 1 (0.44444444 0.05555556 0.27777778 0.22222222) *
## 239) PercentSalaryHike>=13.5 37 18 4 (0.10810811 0.24324324 0.13513514 0.51351351) *
## 15) DistanceFromHome< 2.5 56 30 4 (0.17857143 0.19642857 0.16071429 0.46428571)
## 30) PercentSalaryHike< 12.5 13 7 2 (0.07692308 0.46153846 0.30769231 0.15384615) *
## 31) PercentSalaryHike>=12.5 43 19 4 (0.20930233 0.11627907 0.11627907 0.55813953)
## 62) WorkLifeBalance>=2.5 30 18 4 (0.26666667 0.16666667 0.16666667 0.40000000)
## 124) MonthlyIncome>=5272.5 14 7 1 (0.50000000 0.07142857 0.21428571 0.21428571) *
## 125) MonthlyIncome< 5272.5 16 7 4 (0.06250000 0.25000000 0.12500000 0.56250000) *
## 63) WorkLifeBalance< 2.5 13 1 4 (0.07692308 0.00000000 0.00000000 0.92307692) *
rpart.plot(arbol3, extra = 4)
# Show information about the fitted model
printcp(arbol3)
##
## Classification tree:
## rpart(formula = JobSatisfaction ~ ., data = entrenamiento3, method = "class")
##
## Variables actually used in tree construction:
## [1] DistanceFromHome Education
## [3] EmployeeNumber ï..Age
## [5] JobRole MonthlyIncome
## [7] PercentSalaryHike RelationshipSatisfaction
## [9] StockOptionLevel WorkLifeBalance
##
## Root node error: 180/240 = 0.75
##
## n= 240
##
## CP nsplit rel error xerror xstd
## 1 0.105556 0 1.00000 1.11667 0.031751
## 2 0.050000 1 0.89444 0.95556 0.038783
## 3 0.038889 2 0.84444 0.93889 0.039282
## 4 0.027778 4 0.76667 0.94444 0.039120
## 5 0.022222 8 0.65000 0.93889 0.039282
## 6 0.011111 10 0.60556 0.93889 0.039282
## 7 0.010000 14 0.56111 0.96667 0.038430
plotcp(arbol3)
# The cross-validated error (xerror) does not decrease further beyond cp = 0.038889, so the tree will be pruned at this value
# Balanced model
ArbolPoda<- prune(arbol, cp = 0.010000)
printcp(ArbolPoda)
##
## Classification tree:
## rpart(formula = JobSatisfaction ~ ., data = entrenamiento, method = "class")
##
## Variables actually used in tree construction:
## [1] BusinessTravel DailyRate DistanceFromHome
## [4] EducationField HourlyRate JobRole
## [7] MonthlyRate PercentSalaryHike TotalWorkingYears
## [10] TrainingTimesLastYear YearsAtCompany
##
## Root node error: 235/351 = 0.66952
##
## n= 351
##
## CP nsplit rel error xerror xstd
## 1 0.051064 0 1.00000 1.00000 0.037501
## 2 0.031915 1 0.94894 1.05106 0.036404
## 3 0.021277 5 0.82128 1.06809 0.035984
## 4 0.019149 6 0.80000 1.04255 0.036603
## 5 0.017021 8 0.76170 1.03404 0.036795
## 6 0.012766 10 0.72766 1.02979 0.036889
## 7 0.010000 16 0.65106 0.99149 0.037661
# Unbalanced model
ArbolPoda2<- prune(arbol2, cp = 0.01353)
printcp(ArbolPoda2)
##
## Classification tree:
## rpart(formula = JobSatisfaction ~ ., data = entrenamiento2, method = "class")
##
## Variables actually used in tree construction:
## [1] Attrition JobRole MonthlyRate
## [4] NumCompaniesWorked
##
## Root node error: 813/1175 = 0.69191
##
## n= 1175
##
## CP nsplit rel error xerror xstd
## 1 0.02706 0 1.00000 1.01230 0.019314
## 2 0.01353 1 0.97294 0.98770 0.019612
## 3 0.01000 5 0.91882 0.99016 0.019583
# Over-balanced model
ArbolPoda3<- prune(arbol3, cp = 0.038889)
printcp(ArbolPoda3)
##
## Classification tree:
## rpart(formula = JobSatisfaction ~ ., data = entrenamiento3, method = "class")
##
## Variables actually used in tree construction:
## [1] EmployeeNumber
##
## Root node error: 180/240 = 0.75
##
## n= 240
##
## CP nsplit rel error xerror xstd
## 1 0.105556 0 1.00000 1.11667 0.031751
## 2 0.050000 1 0.89444 0.95556 0.038783
## 3 0.038889 2 0.84444 0.93889 0.039282
# Evaluate the balanced model
predArb<- predict(ArbolPoda, newdata = test, type = "class")
predArb
## 28 56 61 93 95 111 113 117 123 138 140 146 162 174 204
## 4 2 4 4 1 4 2 3 1 4 3 2 3 2 1
## 227 270 277 286 304 307 314 318 338 363 378 399 400 463 485
## 2 4 3 4 2 1 2 4 3 3 4 3 4 2 4
## 509 510 537 548 550 556 563 579 583 601 631 637 647 676 687
## 1 3 1 4 1 2 4 2 4 1 4 2 1 2 2
## 740 746 749 766 774 819 832 834 849 913 930 939 944 998 1096
## 4 2 2 3 4 2 3 2 3 3 4 3 4 2 4
## 1099 1100 1104 1131 1158 1165 1179 1185 1213 1221 1278 1280 1282 1286 1323
## 1 4 4 4 2 1 4 3 4 4 4 4 4 1 4
## 1353 1387 1397 1399 1415 1417 1419 1423 1445 1448 37 213 276 479 484
## 4 4 1 4 1 4 3 3 1 4 2 1 4 3 4
## 1218 580 1438 1447 442 297 129 132
## 3 4 2 4 4 2 4 1
## Levels: 1 2 3 4
tabArb <- table(test$JobSatisfaction, predArb, dnn = c("Actual", "Predicha"))
confusionMatrix(tabArb)
## Confusion Matrix and Statistics
##
## Predicha
## Actual 1 2 3 4
## 1 2 5 1 6
## 2 2 4 2 7
## 3 7 6 9 10
## 4 6 7 6 18
##
## Overall Statistics
##
## Accuracy : 0.3367
## 95% CI : (0.2444, 0.4393)
## No Information Rate : 0.4184
## P-Value [Acc > NIR] : 0.9604
##
## Kappa : 0.0825
##
## Mcnemar's Test P-Value : 0.1860
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 0.11765 0.18182 0.50000 0.4390
## Specificity 0.85185 0.85526 0.71250 0.6667
## Pos Pred Value 0.14286 0.26667 0.28125 0.4865
## Neg Pred Value 0.82143 0.78313 0.86364 0.6230
## Prevalence 0.17347 0.22449 0.18367 0.4184
## Detection Rate 0.02041 0.04082 0.09184 0.1837
## Detection Prevalence 0.14286 0.15306 0.32653 0.3776
## Balanced Accuracy 0.48475 0.51854 0.60625 0.5528
# Evaluate the unbalanced model
predArb2<- predict(ArbolPoda2, newdata = test2, type = "class")
predArb2
## 1 2 9 11 14 22 24 25 46 48 49 57 59 62 65
## 3 3 4 4 4 3 4 3 3 4 4 4 4 4 4
## 67 69 71 73 79 82 88 101 104 106 108 114 119 123 127
## 4 4 3 4 3 3 4 3 4 3 3 4 2 3 3
## 130 135 138 140 142 146 147 157 172 176 184 204 206 213 217
## 4 4 4 4 3 3 4 4 3 4 4 2 3 4 3
## 222 226 230 233 234 235 236 238 240 241 256 259 260 261 265
## 3 4 3 2 4 3 3 4 3 2 4 4 3 3 3
## 268 271 274 276 279 295 312 314 327 328 336 341 345 346 368
## 4 4 4 4 3 4 4 3 4 3 4 3 4 4 3
## 369 377 381 392 393 396 408 410 414 424 426 427 429 431 440
## 3 4 4 2 4 2 4 4 4 3 4 4 3 4 3
## 447 450 451 458 467 470 480 485 488 491 499 502 504 505 526
## 3 4 4 3 3 3 3 4 4 4 4 4 4 3 3
## 535 537 541 553 561 562 574 577 582 585 586 600 602 606 612
## 4 3 3 3 4 4 3 4 4 4 3 3 2 4 4
## 616 620 628 629 636 642 654 660 661 664 674 678 681 685 692
## 4 4 4 4 4 4 4 4 3 3 4 4 4 4 3
## 694 701 708 725 734 739 740 748 749 767 771 777 782 788 795
## 3 3 4 4 4 4 4 3 3 4 3 3 4 4 4
## 800 802 803 809 810 819 820 823 824 827 828 835 843 844 862
## 4 3 4 3 4 4 4 4 4 4 4 4 3 4 3
## 869 874 876 877 878 882 883 891 894 897 901 903 905 912 916
## 4 4 4 4 4 3 4 3 4 4 4 4 3 3 3
## 917 918 928 931 935 952 957 958 960 963 972 973 980 984 992
## 4 4 3 4 4 4 3 3 4 4 4 4 4 4 4
## 1004 1007 1008 1018 1026 1029 1031 1041 1044 1060 1061 1063 1065 1072 1073
## 4 3 3 4 4 4 4 3 4 4 3 3 4 4 4
## 1074 1075 1080 1082 1083 1089 1099 1100 1103 1110 1111 1112 1122 1127 1129
## 4 4 4 4 3 4 4 4 4 4 3 3 3 4 4
## 1133 1151 1158 1161 1168 1176 1177 1178 1186 1191 1196 1197 1212 1215 1216
## 4 3 4 4 3 4 4 3 4 3 4 4 4 4 4
## 1219 1220 1227 1236 1240 1242 1250 1252 1256 1261 1268 1271 1285 1296 1317
## 4 2 3 4 4 4 3 4 3 4 4 3 3 3 4
## 1335 1339 1340 1342 1344 1347 1349 1352 1355 1357 1362 1369 1370 1372 1376
## 4 3 3 4 4 4 4 3 3 3 4 3 3 4 3
## 1378 1381 1385 1387 1392 1394 1395 1399 1407 1412 1416 1419 1423 1426 1427
## 4 4 4 4 4 4 3 4 3 4 4 4 2 4 4
## 1429 1431 1439 1443 1445 1449 1452 1464 1465 1467
## 4 4 3 3 3 4 4 4 4 4
## Levels: 1 2 3 4
tabArb2 <- table(test2$JobSatisfaction, predArb2, dnn = c("Actual", "Predicha"))
confusionMatrix(tabArb2)
## Confusion Matrix and Statistics
##
## Predicha
## Actual 1 2 3 4
## 1 0 0 22 35
## 2 0 0 18 36
## 3 0 6 28 53
## 4 0 3 33 61
##
## Overall Statistics
##
## Accuracy : 0.3017
## 95% CI : (0.2498, 0.3576)
## No Information Rate : 0.6271
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.0161
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity NA 0.00000 0.27723 0.3297
## Specificity 0.8068 0.81119 0.69588 0.6727
## Pos Pred Value NA 0.00000 0.32184 0.6289
## Neg Pred Value NA 0.96266 0.64904 0.3737
## Prevalence 0.0000 0.03051 0.34237 0.6271
## Detection Rate 0.0000 0.00000 0.09492 0.2068
## Detection Prevalence 0.1932 0.18305 0.29492 0.3288
## Balanced Accuracy NA 0.40559 0.48655 0.5012
# Evaluate the over-balanced model
predArb3<- predict(ArbolPoda3, newdata = test2, type = "class")
predArb3
## 1 2 9 11 14 22 24 25 46 48 49 57 59 62 65
## 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## 67 69 71 73 79 82 88 101 104 106 108 114 119 123 127
## 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## 130 135 138 140 142 146 147 157 172 176 184 204 206 213 217
## 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## 222 226 230 233 234 235 236 238 240 241 256 259 260 261 265
## 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## 268 271 274 276 279 295 312 314 327 328 336 341 345 346 368
## 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## 369 377 381 392 393 396 408 410 414 424 426 427 429 431 440
## 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## 447 450 451 458 467 470 480 485 488 491 499 502 504 505 526
## 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## 535 537 541 553 561 562 574 577 582 585 586 600 602 606 612
## 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## 616 620 628 629 636 642 654 660 661 664 674 678 681 685 692
## 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## 694 701 708 725 734 739 740 748 749 767 771 777 782 788 795
## 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## 800 802 803 809 810 819 820 823 824 827 828 835 843 844 862
## 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## 869 874 876 877 878 882 883 891 894 897 901 903 905 912 916
## 4 4 4 4 4 4 4 4 4 4 4 4 3 3 3
## 917 918 928 931 935 952 957 958 960 963 972 973 980 984 992
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1
## 1004 1007 1008 1018 1026 1029 1031 1041 1044 1060 1061 1063 1065 1072 1073
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1074 1075 1080 1082 1083 1089 1099 1100 1103 1110 1111 1112 1122 1127 1129
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1133 1151 1158 1161 1168 1176 1177 1178 1186 1191 1196 1197 1212 1215 1216
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1219 1220 1227 1236 1240 1242 1250 1252 1256 1261 1268 1271 1285 1296 1317
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1335 1339 1340 1342 1344 1347 1349 1352 1355 1357 1362 1369 1370 1372 1376
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1378 1381 1385 1387 1392 1394 1395 1399 1407 1412 1416 1419 1423 1426 1427
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1429 1431 1439 1443 1445 1449 1452 1464 1465 1467
## 1 1 1 1 1 1 1 1 1 1
## Levels: 1 2 3 4
tabArb3 <- table(test2$JobSatisfaction, predArb3, dnn = c("Actual", "Predicha"))
confusionMatrix(tabArb3)
## Confusion Matrix and Statistics
##
## Predicha
## Actual 1 2 3 4
## 1 22 0 1 34
## 2 21 0 7 26
## 3 30 0 4 53
## 4 28 0 5 64
##
## Overall Statistics
##
## Accuracy : 0.3051
## 95% CI : (0.253, 0.3611)
## No Information Rate : 0.6
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0343
##
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 0.21782 NA 0.23529 0.3616
## Specificity 0.81959 0.8169 0.70144 0.7203
## Pos Pred Value 0.38596 NA 0.04598 0.6598
## Neg Pred Value 0.66807 NA 0.93750 0.4293
## Prevalence 0.34237 0.0000 0.05763 0.6000
## Detection Rate 0.07458 0.0000 0.01356 0.2169
## Detection Prevalence 0.19322 0.1831 0.29492 0.3288
## Balanced Accuracy 0.51870 NA 0.46837 0.5410
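# Before drawing conclusions, the overall accuracies of the six evaluations can be collected in
# one place; a minimal sketch reusing the contingency tables built above (list names are illustrative)
sapply(list(NB_bal = tab, NB_sin = tab2, NB_sobre = tab3,
            Arbol_bal = tabArb, Arbol_sin = tabArb2, Arbol_sobre = tabArb3),
       function(t) confusionMatrix(t)$overall["Accuracy"])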
According to the tests performed, the Bayesian classifier and the classification tree reach similar accuracy in their predictions, which suggests that both methods were implemented correctly. However, the predictive accuracy of these models is very low and does not change significantly under the balancing adjustments applied to the training data. From this we deduce that the chosen dependent variable is largely independent of the other variables in the dataset, and that independence prevents building a successful classifier, since there are no rules linking the dependent variable to the rest of the variables.

In terms of interpretability, a classification tree is much easier to understand than a Bayesian classifier once the tree is plotted, because the possible decision paths are easy to follow. On the other hand, the confidence provided by the Bayesian classifier can be higher in certain settings, since it rests on a solid statistical foundation that supports the reliability of its classifications. Regarding implementation, both models are very practical, although the classification tree takes somewhat more work because of the pruning step. Finally, we conclude that these models are powerful tools for many areas of engineering, but great care must be taken in how they are applied: as shown here, choosing the dependent variable poorly prevents them from performing well.