Employee_IBM <- read.csv("Employee-IBM.csv")
head(Employee_IBM)
## Satisfaction Age Gender HourlyRate JobInvolvement MonthlyIncome
## 1 1 41 Female 94 3 5993
## 2 4 49 Male 61 2 5130
## 3 2 37 Male 92 2 2090
## 4 3 33 Female 56 3 2909
## 5 4 27 Male 40 3 3468
## 6 3 32 Male 79 3 3068
## NumCompaniesWorked PercentSalaryHike StockOptionLevel TotalWorkingYears
## 1 8 11 0 8
## 2 1 23 1 10
## 3 6 15 0 7
## 4 1 11 0 8
## 5 9 12 1 6
## 6 0 13 0 8
## TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole
## 1 0 1 6 4
## 2 3 3 10 7
## 3 3 3 0 0
## 4 3 3 8 7
## 5 3 3 2 2
## 6 2 2 7 7
## YearsSinceLastPromotion YearsWithCurrManager
## 1 0 5
## 2 1 7
## 3 0 0
## 4 3 0
## 5 2 2
## 6 3 6
Employee_IBM$Satisfaction <- as.factor(Employee_IBM$Satisfaction)
Employee_IBM$Gender <- as.factor(Employee_IBM$Gender)
library(caret)
set.seed(2025)
folds <- createFolds(Employee_IBM$Satisfaction, k = 5)
entrenamiento <- Employee_IBM[-folds[[5]], ]
prueba <- Employee_IBM[folds[[5]], ]
# Etiquetas
entrenamiento_labels <- Employee_IBM$Satisfaction[-folds[[5]]]
prueba_labels <- Employee_IBM$Satisfaction[folds[[5]]]
# Verificar dimensiones
dim(entrenamiento)
## [1] 1176 16
dim(prueba)
## [1] 294 16
# Verificar distribución de clases
table(entrenamiento$Satisfaction)
##
## 1 2 3 4
## 220 243 367 346
prop.table(table(entrenamiento$Satisfaction))
##
## 1 2 3 4
## 0.1870748 0.2066327 0.3120748 0.2942177
table(prueba$Satisfaction)
##
## 1 2 3 4
## 56 60 92 86
prop.table(table(prueba$Satisfaction))
##
## 1 2 3 4
## 0.1904762 0.2040816 0.3129252 0.2925170
library(rpart)
modelo1 <- rpart(Satisfaction ~ ., data = entrenamiento)
modelo1
## n= 1176
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 1176 809 3 (0.1870748 0.2066327 0.3120748 0.2942177)
## 2) YearsSinceLastPromotion< 12.5 1151 789 3 (0.1894005 0.2085143 0.3145091 0.2875760)
## 4) PercentSalaryHike>=11.5 990 695 3 (0.1919192 0.2141414 0.2979798 0.2959596)
## 8) MonthlyIncome< 8386.5 779 538 4 (0.2053915 0.2079589 0.2772786 0.3093710)
## 16) Age< 52.5 752 525 4 (0.2047872 0.2101064 0.2832447 0.3018617)
## 32) NumCompaniesWorked< 4.5 601 414 4 (0.2229617 0.1980033 0.2678869 0.3111481)
## 64) TotalWorkingYears>=10.5 123 84 3 (0.2601626 0.2113821 0.3170732 0.2113821) *
## 65) TotalWorkingYears< 10.5 478 317 4 (0.2133891 0.1945607 0.2552301 0.3368201) *
## 33) NumCompaniesWorked>=4.5 151 99 3 (0.1324503 0.2582781 0.3443709 0.2649007) *
## 17) Age>=52.5 27 13 4 (0.2222222 0.1481481 0.1111111 0.5185185) *
## 9) MonthlyIncome>=8386.5 211 132 3 (0.1421801 0.2369668 0.3744076 0.2464455) *
## 5) PercentSalaryHike< 11.5 161 94 3 (0.1739130 0.1739130 0.4161491 0.2360248) *
## 3) YearsSinceLastPromotion>=12.5 25 10 4 (0.0800000 0.1200000 0.2000000 0.6000000) *
library(rpart.plot)
plot.new()
rpart.plot(modelo1)
pred <- predict(modelo1, entrenamiento, type = "class")
tt <- table(pred, entrenamiento$Satisfaction)
tt
##
## pred 1 2 3 4
## 1 0 0 0 0
## 2 0 0 0 0
## 3 110 143 237 156
## 4 110 100 130 190
library(scales)
library(dplyr)
# Convertir Gender a numérico para poder normalizar
Employee_IBM_knn <- Employee_IBM
Employee_IBM_knn$Gender <- ifelse(Employee_IBM_knn$Gender == "Male", 1, 0)
# Normalizar todas las variables excepto Satisfaction
Employee_IBM_norm <- Employee_IBM_knn %>%
select(-Satisfaction) %>%
mutate(across(everything(), rescale))
# Agregar variable respuesta
Employee_IBM_norm$Satisfaction <- as.factor(Employee_IBM_knn$Satisfaction)
# Crear conjuntos usando los mismos folds del Paso 2
entrenamiento_knn <- Employee_IBM_norm[-folds[[5]], ]
prueba_knn <- Employee_IBM_norm[folds[[5]], ]
library(kknn)
library(class)
# Convertir etiquetas a factor
entrenamiento_labels <- as.factor(Employee_IBM$Satisfaction[-folds[[5]]])
prueba_labels <- as.factor(Employee_IBM$Satisfaction[folds[[5]]])
# Buscar el mejor valor de k
modelo_knn <- train.kknn(Satisfaction ~ .,
data = entrenamiento_knn,
kmax = 50)
modelo_knn
##
## Call:
## train.kknn(formula = Satisfaction ~ ., data = entrenamiento_knn, kmax = 50)
##
## Type of response variable: nominal
## Minimal misclassification: 0.7040816
## Best kernel: optimal
## Best k: 50
# Ver el mejor k encontrado
modelo_knn$best.parameters$k
## [1] 50
# Predecir con el mejor k
pred_knn <- knn(entrenamiento_knn[, -ncol(entrenamiento_knn)],
prueba_knn[, -ncol(prueba_knn)],
cl = entrenamiento_labels,
k = modelo_knn$best.parameters$k)
# Tabla de clasificación
tabla_knn <- table(prueba_labels, pred_knn)
tabla_knn
## pred_knn
## prueba_labels 1 2 3 4
## 1 0 3 25 28
## 2 3 2 33 22
## 3 1 1 51 39
## 4 0 9 47 30
# Tasa de aciertos
TA_knn <- (sum(diag(tabla_knn))) / sum(tabla_knn)
round(TA_knn, 4)
## [1] 0.2823
# Matriz de confusión con caret
confusionMatrix(data = pred_knn, reference = prueba_labels)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4
## 1 0 3 1 0
## 2 3 2 1 9
## 3 25 33 51 47
## 4 28 22 39 30
##
## Overall Statistics
##
## Accuracy : 0.2823
## 95% CI : (0.2316, 0.3375)
## No Information Rate : 0.3129
## P-Value [Acc > NIR] : 0.8847
##
## Kappa : -0.0215
##
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 0.00000 0.033333 0.5543 0.3488
## Specificity 0.98319 0.944444 0.4802 0.5721
## Pos Pred Value 0.00000 0.133333 0.3269 0.2521
## Neg Pred Value 0.80690 0.792115 0.7029 0.6800
## Prevalence 0.19048 0.204082 0.3129 0.2925
## Detection Rate 0.00000 0.006803 0.1735 0.1020
## Detection Prevalence 0.01361 0.051020 0.5306 0.4048
## Balanced Accuracy 0.49160 0.488889 0.5173 0.4605
exactitud_knn <- numeric(length = 5)
for(i in 1:5){
prueba_cv_knn <- Employee_IBM_norm[folds[[i]], ]
entrenamiento_cv_knn <- Employee_IBM_norm[-folds[[i]], ]
entrenamiento_labels_cv <- as.factor(Employee_IBM$Satisfaction[-folds[[i]]])
prueba_labels_cv <- as.factor(Employee_IBM$Satisfaction[folds[[i]]])
pred_cv_knn <- knn(entrenamiento_cv_knn[, -ncol(entrenamiento_cv_knn)],
prueba_cv_knn[, -ncol(prueba_cv_knn)],
cl = entrenamiento_labels_cv,
k = modelo_knn$best.parameters$k)
cm <- confusionMatrix(pred_cv_knn, prueba_labels_cv)
exactitud_knn[i] <- cm$overall["Accuracy"]
cat("Fold", i, "- Exactitud:", exactitud_knn[i], "\n")
}
## Fold 1 - Exactitud: 0.2823129
## Fold 2 - Exactitud: 0.3037543
## Fold 3 - Exactitud: 0.3129252
## Fold 4 - Exactitud: 0.3288136
## Fold 5 - Exactitud: 0.3027211
# Exactitud promedio
Exactitud_promedio_knn <- round(mean(exactitud_knn), 4) * 100
paste("Exactitud promedio: ", Exactitud_promedio_knn, "%", sep = "")
## [1] "Exactitud promedio: 30.61%"
# Matriz de confusión del modelo
tabla1_knn <- table(prueba_labels, pred_knn)
tabla1_knn
## pred_knn
## prueba_labels 1 2 3 4
## 1 0 3 25 28
## 2 3 2 33 22
## 3 1 1 51 39
## 4 0 9 47 30
TA <- (sum(diag(tabla1_knn))) / sum(tabla1_knn)
round(TA, 4)
## [1] 0.2823
# Validación cruzada automática con caret
set.seed(2025)
train_control_knn <- trainControl(method = "cv", number = 10,
savePredictions = TRUE)
knn_cv <- train(Satisfaction ~ .,
data = Employee_IBM_norm,
method = "knn",
trControl = train_control_knn,
tuneGrid = data.frame(k = modelo_knn$best.parameters$k))
knn_cv
## k-Nearest Neighbors
##
## 1470 samples
## 15 predictor
## 4 classes: '1', '2', '3', '4'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1324, 1323, 1323, 1324, 1323, 1321, ...
## Resampling results:
##
## Accuracy Kappa
## 0.2931674 -0.001017946
##
## Tuning parameter 'k' was held constant at a value of 50
# Matriz de confusión usando predicciones guardadas por caret
confusionMatrix(knn_cv$pred$pred, knn_cv$pred$obs)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4
## 1 11 16 17 18
## 2 13 18 26 20
## 3 139 146 224 216
## 4 113 123 192 178
##
## Overall Statistics
##
## Accuracy : 0.2932
## 95% CI : (0.27, 0.3172)
## No Information Rate : 0.3122
## P-Value [Acc > NIR] : 0.9463
##
## Kappa : -9e-04
##
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 0.039855 0.05941 0.4880 0.4120
## Specificity 0.957286 0.94944 0.5045 0.5877
## Pos Pred Value 0.177419 0.23377 0.3090 0.2937
## Neg Pred Value 0.811790 0.79541 0.6846 0.7060
## Prevalence 0.187755 0.20612 0.3122 0.2939
## Detection Rate 0.007483 0.01224 0.1524 0.1211
## Detection Prevalence 0.042177 0.05238 0.4932 0.4122
## Balanced Accuracy 0.498571 0.50442 0.4962 0.4999
library(e1071)
modelo_nb <- naiveBayes(Satisfaction ~ ., data = entrenamiento)
modelo_nb
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## 1 2 3 4
## 0.1870748 0.2066327 0.3120748 0.2942177
##
## Conditional probabilities:
## Age
## Y [,1] [,2]
## 1 36.31818 8.789712
## 2 36.35391 8.813392
## 3 37.02997 8.828770
## 4 37.34971 9.841746
##
## Gender
## Y Female Male
## 1 0.4181818 0.5818182
## 2 0.3621399 0.6378601
## 3 0.4168937 0.5831063
## 4 0.4046243 0.5953757
##
## HourlyRate
## Y [,1] [,2]
## 1 64.70000 20.11976
## 2 67.50206 20.38359
## 3 64.72480 20.18529
## 4 66.78902 19.90001
##
## JobInvolvement
## Y [,1] [,2]
## 1 2.722727 0.7527313
## 2 2.674897 0.7196098
## 3 2.746594 0.6962688
## 4 2.765896 0.7263919
##
## MonthlyIncome
## Y [,1] [,2]
## 1 6230.464 4432.888
## 2 6418.255 4529.077
## 3 6514.967 4673.431
## 4 6457.890 4824.202
##
## NumCompaniesWorked
## Y [,1] [,2]
## 1 2.400000 2.256194
## 2 2.650206 2.442113
## 3 2.858311 2.494054
## 4 2.731214 2.635325
##
## PercentSalaryHike
## Y [,1] [,2]
## 1 15.40455 3.684470
## 2 15.45679 3.748441
## 3 15.07629 3.754729
## 4 15.15607 3.514596
##
## StockOptionLevel
## Y [,1] [,2]
## 1 0.8272727 0.8370321
## 2 0.7530864 0.7956109
## 3 0.8092643 0.8567452
## 4 0.7341040 0.8262673
##
## TotalWorkingYears
## Y [,1] [,2]
## 1 11.04545 7.459340
## 2 11.02881 7.024396
## 3 11.40599 7.784057
## 4 11.47399 8.420518
##
## TrainingTimesLastYear
## Y [,1] [,2]
## 1 2.840909 1.360620
## 2 2.744856 1.230100
## 3 2.822888 1.279822
## 4 2.803468 1.328218
##
## WorkLifeBalance
## Y [,1] [,2]
## 1 2.672727 0.7537921
## 2 2.810700 0.7020989
## 3 2.716621 0.6944916
## 4 2.783237 0.6910248
##
## YearsAtCompany
## Y [,1] [,2]
## 1 7.045455 5.861236
## 2 6.617284 5.561604
## 3 6.694823 5.951493
## 4 7.534682 6.787474
##
## YearsInCurrentRole
## Y [,1] [,2]
## 1 4.350000 3.556491
## 2 4.205761 3.593907
## 3 3.986376 3.480206
## 4 4.358382 3.667152
##
## YearsSinceLastPromotion
## Y [,1] [,2]
## 1 2.172727 3.110915
## 2 2.168724 3.023382
## 3 1.776567 2.836953
## 4 2.630058 3.660461
##
## YearsWithCurrManager
## Y [,1] [,2]
## 1 4.240909 3.524365
## 2 3.934156 3.468242
## 3 3.877384 3.523335
## 4 4.387283 3.642783
# Predicción sobre el conjunto de prueba
pred_nb <- predict(modelo_nb, prueba, type = "class")
# Tabla de clasificación (matriz de confusión manual)
tt <- table(pred_nb, prueba_labels)
tt
## prueba_labels
## pred_nb 1 2 3 4
## 1 2 2 5 6
## 2 9 6 10 6
## 3 28 39 56 52
## 4 17 13 21 22
# Tasa de aciertos
TA <- (sum(diag(tt))) / sum(tt)
paste("Tasa de aciertos: ", round(TA, 4) * 100, "%", sep = "")
## [1] "Tasa de aciertos: 29.25%"
install.packages("naivebayes")
library(naivebayes)
set.seed(2025)
train_control <- trainControl(method = "cv", number = 5, savePredictions = TRUE)
NBC_cv <- train(Satisfaction ~ .,
data = Employee_IBM,
method = "naive_bayes",
trControl = train_control)
NBC_cv
## Naive Bayes
##
## 1470 samples
## 15 predictor
## 4 classes: '1', '2', '3', '4'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1176, 1175, 1177, 1175, 1177
## Resampling results across tuning parameters:
##
## usekernel Accuracy Kappa
## FALSE 0.2918579 0.01327425
## TRUE 0.3142586 0.02111006
##
## Tuning parameter 'laplace' was held constant at a value of 0
## Tuning
## parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were laplace = 0, usekernel = TRUE
## and adjust = 1.
# Matriz de confusión con predicciones guardadas por caret
confusionMatrix(NBC_cv$pred$pred, NBC_cv$pred$obs)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4
## 1 60 51 80 69
## 2 24 21 44 46
## 3 311 356 539 478
## 4 157 178 255 271
##
## Overall Statistics
##
## Accuracy : 0.3031
## 95% CI : (0.2865, 0.32)
## No Information Rate : 0.3122
## P-Value [Acc > NIR] : 0.8632
##
## Kappa : 0.017
##
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 0.10870 0.034653 0.5871 0.31366
## Specificity 0.91625 0.951157 0.4337 0.71580
## Pos Pred Value 0.23077 0.155556 0.3201 0.31475
## Neg Pred Value 0.81642 0.791444 0.6982 0.71477
## Prevalence 0.18776 0.206122 0.3122 0.29388
## Detection Rate 0.02041 0.007143 0.1833 0.09218
## Detection Prevalence 0.08844 0.045918 0.5728 0.29286
## Balanced Accuracy 0.51247 0.492905 0.5104 0.51473
# Matriz de confusión detallada del conjunto de prueba
confusionMatrix(pred_nb, prueba_labels)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4
## 1 2 2 5 6
## 2 9 6 10 6
## 3 28 39 56 52
## 4 17 13 21 22
##
## Overall Statistics
##
## Accuracy : 0.2925
## 95% CI : (0.2411, 0.3481)
## No Information Rate : 0.3129
## P-Value [Acc > NIR] : 0.7924
##
## Kappa : 0.0034
##
## Mcnemar's Test P-Value : 8.451e-11
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 0.035714 0.10000 0.6087 0.25581
## Specificity 0.945378 0.89316 0.4109 0.75481
## Pos Pred Value 0.133333 0.19355 0.3200 0.30137
## Neg Pred Value 0.806452 0.79468 0.6975 0.71041
## Prevalence 0.190476 0.20408 0.3129 0.29252
## Detection Rate 0.006803 0.02041 0.1905 0.07483
## Detection Prevalence 0.051020 0.10544 0.5952 0.24830
## Balanced Accuracy 0.490546 0.49658 0.5098 0.50531