library(readr)
library(dplyr)
library(kknn)
library(VIM)
library(class)
library(scales)
library(caret)
Employee_IBM <- read_csv("Employee-IBM.csv")
Employee_norm <- Employee_IBM %>%
  select(-Satisfaction) %>%
  mutate(across(where(is.numeric), rescale))
Employee_IBM$Satisfaction <- factor(Employee_IBM$Satisfaction,
                                    levels = 1:4,
                                    labels = c("Very Dissatisfied", "Dissatisfied",
                                               "Satisfied", "Very Satisfied"))
Employee_norm <- na.omit(Employee_norm)
Employee_IBM <- na.omit(Employee_IBM)
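# Caveat: calling na.omit() on each object separately keeps them
# row-aligned only if NAs occur in the same rows of both; a row whose
# only NA were in Satisfaction would be dropped from Employee_IBM but
# kept in Employee_norm. A quick sanity check:
stopifnot(nrow(Employee_norm) == nrow(Employee_IBM))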
# Split into training and test sets
set.seed(2025)
folds_emp <- createFolds(Employee_IBM$Satisfaction, k = 6)
train_emp <- Employee_norm[-folds_emp[[6]],]
test_emp <- Employee_norm[folds_emp[[6]],]
# Save the class labels
train_emp_labels <- Employee_IBM$Satisfaction[-folds_emp[[6]]]
test_emp_labels <- Employee_IBM$Satisfaction[folds_emp[[6]]]
# kNN: train.kknn picks k (up to kmax) by leave-one-out cross-validation;
# the response is taken from train_emp_labels in the calling environment
train.kknn(train_emp_labels ~ ., data = train_emp, kmax = 50)
##
## Call:
## train.kknn(formula = train_emp_labels ~ ., data = train_emp, kmax = 50)
##
## Type of response variable: nominal
## Minimal misclassification: 0.7050654
## Best kernel: optimal
## Best k: 32
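# The tuning results can also be captured programmatically rather than
# read off the printout; a minimal sketch (same call as above, stored):
kknn_fit <- train.kknn(train_emp_labels ~ ., data = train_emp, kmax = 50)
kknn_fit$best.parameters$k   # 32 in the run above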
# Prediction: class::knn needs purely numeric inputs, so drop any
# non-numeric columns first
train_emp <- train_emp %>%
  select(where(is.numeric))
test_emp <- test_emp %>%
  select(where(is.numeric))
pred <- knn(train_emp, test_emp, cl = train_emp_labels, k = 32)
confusionMatrix(data = pred, reference = test_emp_labels)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Very Dissatisfied Dissatisfied Satisfied Very Satisfied
## Very Dissatisfied 1 2 2 10
## Dissatisfied 7 5 4 8
## Satisfied 21 21 49 32
## Very Satisfied 17 23 22 22
##
## Overall Statistics
##
## Accuracy : 0.313
## 95% CI : (0.2556, 0.375)
## No Information Rate : 0.313
## P-Value [Acc > NIR] : 0.524
##
## Kappa : 0.035
##
## Mcnemar's Test P-Value : 2.951e-07
##
## Statistics by Class:
##
## Class: Very Dissatisfied Class: Dissatisfied
## Sensitivity 0.021739 0.09804
## Specificity 0.930000 0.90256
## Pos Pred Value 0.066667 0.20833
## Neg Pred Value 0.805195 0.79279
## Prevalence 0.186992 0.20732
## Detection Rate 0.004065 0.02033
## Detection Prevalence 0.060976 0.09756
## Balanced Accuracy 0.475870 0.50030
## Class: Satisfied Class: Very Satisfied
## Sensitivity 0.6364 0.30556
## Specificity 0.5621 0.64368
## Pos Pred Value 0.3984 0.26190
## Neg Pred Value 0.7724 0.69136
## Prevalence 0.3130 0.29268
## Detection Rate 0.1992 0.08943
## Detection Prevalence 0.5000 0.34146
## Balanced Accuracy 0.5992 0.47462
# Try a range of smaller k values
for (k in c(3, 5, 7, 10, 15, 20)) {
  pred <- knn(train_emp, test_emp, cl = train_emp_labels, k = k)
  cat("k =", k, "- Accuracy:", mean(pred == test_emp_labels), "\n")
}
## k = 3 - Accuracy: 0.2154472
## k = 5 - Accuracy: 0.2520325
## k = 7 - Accuracy: 0.2845528
## k = 10 - Accuracy: 0.2845528
## k = 15 - Accuracy: 0.3130081
## k = 20 - Accuracy: 0.3292683
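# The sweep above scores each k on a single held-out fold. A hedged
# sketch of a steadier estimate: average accuracy over all 6 folds
# created earlier (assumes Employee_norm and Employee_IBM stayed
# row-aligned after na.omit()).
X_num <- Employee_norm %>% select(where(is.numeric))
for (k in c(3, 5, 7, 10, 15, 20, 32)) {
  accs <- sapply(folds_emp, function(idx) {
    p <- knn(X_num[-idx, ], X_num[idx, ],
             cl = Employee_IBM$Satisfaction[-idx], k = k)
    mean(p == Employee_IBM$Satisfaction[idx])
  })
  cat("k =", k, "- mean CV accuracy:", round(mean(accs), 4), "\n")
}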
library(randomForest)
# z-score normalization (kept for reference; random forests are
# scale-invariant, and the model below reuses the min-max scaled split)
Employee_norm2 <- Employee_IBM %>%
  mutate(across(where(is.numeric), \(x) as.numeric(scale(x))))
# Random Forest
rf_model <- randomForest(
  x = train_emp,
  y = train_emp_labels,
  ntree = 100,
  importance = TRUE
)
# Prediction
rf_pred <- predict(rf_model, test_emp)
# Evaluation: align factor levels explicitly; assigning with levels<-
# would relabel values in place rather than reorder the levels
common_levels <- levels(train_emp_labels)
rf_pred <- factor(rf_pred, levels = common_levels)
test_emp_labels_factor <- factor(test_emp_labels, levels = common_levels)
confusionMatrix(data = rf_pred, reference = test_emp_labels_factor)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Very Dissatisfied Dissatisfied Satisfied Very Satisfied
## Very Dissatisfied 1 3 6 9
## Dissatisfied 7 0 12 14
## Satisfied 23 27 30 26
## Very Satisfied 15 21 29 23
##
## Overall Statistics
##
## Accuracy : 0.2195
## 95% CI : (0.1694, 0.2765)
## No Information Rate : 0.313
## P-Value [Acc > NIR] : 0.999568
##
## Kappa : -0.0868
##
## Mcnemar's Test P-Value : 0.002351
##
## Statistics by Class:
##
## Class: Very Dissatisfied Class: Dissatisfied
## Sensitivity 0.021739 0.0000
## Specificity 0.910000 0.8308
## Pos Pred Value 0.052632 0.0000
## Neg Pred Value 0.801762 0.7606
## Prevalence 0.186992 0.2073
## Detection Rate 0.004065 0.0000
## Detection Prevalence 0.077236 0.1341
## Balanced Accuracy 0.465870 0.4154
## Class: Satisfied Class: Very Satisfied
## Sensitivity 0.3896 0.3194
## Specificity 0.5503 0.6264
## Pos Pred Value 0.2830 0.2614
## Neg Pred Value 0.6643 0.6899
## Prevalence 0.3130 0.2927
## Detection Rate 0.1220 0.0935
## Detection Prevalence 0.4309 0.3577
## Balanced Accuracy 0.4700 0.4729
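# The forest was fit with importance = TRUE, so per-variable importance
# can be inspected; a minimal sketch using randomForest's own helpers:
importance(rf_model)   # mean decrease in accuracy and in Gini per predictor
varImpPlot(rf_model, main = "Variable importance (random forest)")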
# SVM
library(e1071)
# Model: tune the cost parameter via cross-validation
svm_linear <- tune.svm(
  x = train_emp,
  y = train_emp_labels,
  kernel = "linear",
  cost = c(0.1, 1, 10, 100)
)
# Best model
modelo_svm <- svm_linear$best.model
# Prediction
svm_pred <- predict(modelo_svm, test_emp)
# Confusion matrix: align factor levels explicitly, as in the random
# forest evaluation above
common_levels <- levels(train_emp_labels)
svm_pred <- factor(svm_pred, levels = common_levels)
test_emp_labels_factor <- factor(test_emp_labels, levels = common_levels)
confusionMatrix(data = svm_pred, reference = test_emp_labels_factor)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Satisfied Very Satisfied Very Dissatisfied Dissatisfied
## Satisfied 34 36 60 60
## Very Satisfied 12 15 17 12
## Very Dissatisfied 0 0 0 0
## Dissatisfied 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.1992
## 95% CI : (0.1511, 0.2546)
## No Information Rate : 0.313
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0094
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Satisfied Class: Very Satisfied
## Sensitivity 0.7391 0.29412
## Specificity 0.2200 0.78974
## Pos Pred Value 0.1789 0.26786
## Neg Pred Value 0.7857 0.81053
## Prevalence 0.1870 0.20732
## Detection Rate 0.1382 0.06098
## Detection Prevalence 0.7724 0.22764
## Balanced Accuracy 0.4796 0.54193
## Class: Very Dissatisfied Class: Dissatisfied
## Sensitivity 0.000 0.0000
## Specificity 1.000 1.0000
## Pos Pred Value NaN NaN
## Neg Pred Value 0.687 0.7073
## Prevalence 0.313 0.2927
## Detection Rate 0.000 0.0000
## Detection Prevalence 0.000 0.0000
## Balanced Accuracy 0.500 0.5000
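# Note the linear SVM above never predicts two of the four classes. A
# hedged sketch of two common remedies, not run in the original analysis:
# a radial (RBF) kernel with gamma tuned alongside cost, and class
# weights inversely proportional to the training class frequencies.
w <- c(1 / table(train_emp_labels))   # named weights, one per class
svm_rbf <- tune.svm(
  x = train_emp,
  y = train_emp_labels,
  kernel = "radial",
  cost = c(0.1, 1, 10, 100),
  gamma = c(0.01, 0.1, 1),
  class.weights = w
)
svm_rbf$best.performance              # cross-validated error of the best fit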
LIBRARIES
library(rlang)
library(cluster) # silhouette
library(dendextend) # dendrogram customization
library(factoextra) # fviz_nbclust, fviz_dend, fviz_cluster
library(ggplot2)
library(dplyr)
# The CSV was uploaded through the Files pane.
Econ <- read.csv("Size_of_the_economy.csv")
cat("Dimensiones iniciales:", dim(Econ), "\n")
## Dimensiones iniciales: 210 7
agregados <- c("World",
"Lower middle income",
"Upper middle income",
"East Asia & Pacific",
"South Asia",
"High income",
"Sub-Saharan Africa",
"Europe & Central Asia",
"Low income",
"Latin America & Caribbean",
"Middle East & North Africa",
"North America")
Econ <- Econ[!Econ$Country %in% agregados, ]
Econ <- na.omit(Econ)
cat("Tras limpieza:", dim(Econ), "\n")
## Tras limpieza: 192 7
# Separate country names from the numeric variables
countries <- Econ$Country
X <- Econ[, -1]
rownames(X) <- countries
# Signed log transform: tames heavy right tails while handling zeros
X_log <- sign(X) * log1p(abs(X))
# Scale the variables (k-means is sensitive to their scale)
data.scaled <- scale(X_log)
-> HOW MANY CLUSTERS?
# Silhouette method
fviz_nbclust(data.scaled, kmeans, method = "silhouette", k.max = 10) +
  labs(title = "Optimal number of clusters",
       x = "Value of k", y = "Average silhouette width")
# Elbow method
fviz_nbclust(data.scaled, kmeans, method = "wss", k.max = 10) +
  labs(title = "Optimal number of clusters",
       x = "Value of k", y = "Within-cluster sum of squares")
# Gap statistic method
fviz_nbclust(data.scaled, kmeans, method = "gap_stat", k.max = 10) +
  labs(title = "Optimal number of clusters",
       x = "Value of k", y = "Gap statistic")
# Each method can suggest a different k, as the course material notes.
# We choose k = 3 as a compromise: three interpretable groups.
K <- 3
-> K-MEANS
set.seed(123)
km <- kmeans(data.scaled, centers = K, nstart = 25)
cat("\nTamaños de los clusters K-medias:\n")
##
## Tamaños de los clusters K-medias:
print(km$size)
## [1] 76 54 62
# Visualization on the first two principal components
fviz_cluster(km, data = data.scaled,
             geom = "point",
             ellipse.type = "convex",
             palette = c("#0073C2", "#EFC000", "#CD534C"),
             ggtheme = theme_minimal()) +
  labs(title = "K-means with k = 3")
-> HIERARCHICAL CLUSTERING
# Euclidean distance matrix
dist_mat <- dist(data.scaled, method = "euclidean")
# Ward's method (minimizes within-group variance)
hc <- hclust(dist_mat, method = "ward.D2")
# Dendrogram with the "jco" palette (as in the notes)
fviz_dend(hc, k = K,
          cex = 0.4,
          k_colors = "jco",
          rect = TRUE,
          rect_border = "jco",
          rect_fill = TRUE,
          main = "Dendrogram: hierarchical clustering (Ward)",
          ggtheme = theme_bw())
# Cut the tree into K groups
hc_clusters <- cutree(hc, k = K)
cat("\nTamaños de los clusters jerárquicos:\n")
##
## Tamaños de los clusters jerárquicos:
print(table(hc_clusters))
## hc_clusters
## 1 2 3
## 100 36 56
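# A hedged side check for the linkage choice: the agglomerative
# coefficient from cluster::agnes, where values nearer 1 indicate
# stronger clustering structure.
sapply(c("average", "single", "complete", "ward"),
       function(m) agnes(data.scaled, method = m)$ac)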
-> COMPARISON
# Cross-tabulation: how closely do the two methods agree?
cat("\nCross-table, K-means vs hierarchical:\n")
##
## Cross-table, K-means vs hierarchical:
print(table(KMeans = km$cluster, HC = hc_clusters))
## HC
## KMeans 1 2 3
## 1 70 1 5
## 2 5 0 49
## 3 25 35 2
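# The agreement in the cross-table can be summarized in one number with
# the adjusted Rand index (1 = identical partitions, ~0 = chance-level
# agreement); a sketch using mclust, an extra package not used elsewhere
# in this report.
library(mclust)
adjustedRandIndex(km$cluster, hc_clusters)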
-> CHARACTERIZATION
Econ$Cluster_KMeans <- km$cluster
Econ$Cluster_HC <- hc_clusters
# Per-cluster means (K-means)
cluster_summary <- Econ %>%
  group_by(Cluster_KMeans) %>%
  summarise(across(where(is.numeric), \(x) mean(x, na.rm = TRUE)),
            n = n()) %>%
  arrange(Cluster_KMeans)
cat("\nPerfil promedio de cada cluster:\n")
##
## Perfil promedio de cada cluster:
print(cluster_summary)
## # A tibble: 3 × 9
## Cluster_KMeans Population..millions. Surface_area Population_density
## <int> <dbl> <dbl> <dbl>
## 1 1 11.8 407. 64.4
## 2 2 1.52 8.12 728.
## 3 3 109. 1616. 406.
## # ℹ 5 more variables: Gross_national_income <dbl>,
## # Purchasing_power_parity <dbl>, Gross_domestic_product <dbl>,
## # Cluster_HC <dbl>, n <int>
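# Means of heavy-tailed variables are dominated by a few very large
# countries; a hedged alternative profile using per-cluster medians:
Econ %>%
  group_by(Cluster_KMeans) %>%
  summarise(across(where(is.numeric), \(x) median(x, na.rm = TRUE)),
            n = n())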
# A few representative countries per cluster
cat("\nExample countries per cluster:\n")
##
## Example countries per cluster:
for (c in 1:K) {
  paises_c <- Econ$Country[Econ$Cluster_KMeans == c]
  cat(sprintf("Cluster %d (n=%d): %s, ...\n",
              c, length(paises_c),
              paste(head(paises_c, 6), collapse = ", ")))
}
## Cluster 1 (n=76): Afghanistan, Albania, Angola, Armenia, Azerbaijan, Belarus, ...
## Cluster 2 (n=54): Antigua and Barbuda, Aruba, Bahamas, The, Bahrain, Barbados, Belize, ...
## Cluster 3 (n=62): Algeria, Argentina, Australia, Austria, Bangladesh, Belgium, ...