library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
library(readr)
library(tidytext)
library(stringi)
library(class)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(dplyr)
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(readxl)
library(GGally)
library(corrplot)
## corrplot 0.95 loaded
library(class)
library(nnet)
library(smotefamily)
NombresGenero <- read_csv("NombresGenero.csv")
## New names:
## Rows: 9810 Columns: 3
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): name, gender dbl (1): ...1
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
View(NombresGenero)
glass <- read_excel("Glass Identification.xlsx")
View(glass)
NombresGenero <- NombresGenero %>%
mutate(
nombre_original = name,
# Convertir caracteres especiales a ASCII
nombre = stringi::stri_trans_general(name, "Latin-ASCII"),
# Convertir a minúsculas
nombre = tolower(name),
# Eliminar números
nombre = str_remove_all(name, "[0-9]"),
# Eliminar caracteres especiales
nombre = str_remove_all(name, "[^a-z]")
)
head(NombresGenero)
## # A tibble: 6 × 5
## ...1 name gender nombre_original nombre
## <dbl> <chr> <chr> <chr> <chr>
## 1 1 "aaronit" male "aaronit" aaronit
## 2 2 "aar\\u00f3n" male "aar\\u00f3n" aarufn
## 3 3 "abaco" male "abaco" abaco
## 4 4 "abalen" male "abalen" abalen
## 5 5 "abbott" male "abbott" abbott
## 6 6 "abd al-aziz" male "abd al-aziz" abdalaziz
matriz_criterios <- NombresGenero %>%
mutate(
termina_a = ifelse(str_ends(nombre, "a"), 1, 0),
termina_o = ifelse(str_ends(nombre, "o"), 1, 0),
termina_e = ifelse(str_ends(nombre, "e"), 1, 0),
empieza_vocal = ifelse(str_detect(nombre, "^[AEIOUaeiou]"), 1, 0),
longitud_mayor6 = ifelse(nchar(nombre) > 6, 1, 0)
)
head(matriz_criterios)
## # A tibble: 6 × 10
## ...1 name gender nombre_original nombre termina_a termina_o termina_e
## <dbl> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1 "aaronit" male "aaronit" aaron… 0 0 0
## 2 2 "aar\\u00f3… male "aar\\u00f3n" aarufn 0 0 0
## 3 3 "abaco" male "abaco" abaco 0 1 0
## 4 4 "abalen" male "abalen" abalen 0 0 0
## 5 5 "abbott" male "abbott" abbott 0 0 0
## 6 6 "abd al-azi… male "abd al-aziz" abdal… 0 0 0
## # ℹ 2 more variables: empieza_vocal <dbl>, longitud_mayor6 <dbl>
ngramas <- matriz_criterios %>%
unnest_tokens(
output = ngrama,
input = nombre,
token = "character_shingles",
n = 2
)
head(ngramas)
## # A tibble: 6 × 10
## ...1 name gender nombre_original termina_a termina_o termina_e empieza_vocal
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1 aaro… male aaronit 0 0 0 1
## 2 1 aaro… male aaronit 0 0 0 1
## 3 1 aaro… male aaronit 0 0 0 1
## 4 1 aaro… male aaronit 0 0 0 1
## 5 1 aaro… male aaronit 0 0 0 1
## 6 1 aaro… male aaronit 0 0 0 1
## # ℹ 2 more variables: longitud_mayor6 <dbl>, ngrama <chr>
matriz_ngramas <- ngramas %>%
mutate(valor = 1) %>%
group_by(name, ngrama) %>%
summarise(valor = max(valor), .groups = "drop") %>%
pivot_wider(
names_from = ngrama,
values_from = valor,
values_fill = list(valor = 0)
)
head(matriz_ngramas)
## # A tibble: 6 × 514
## name di ed it ti ue ca ef fr ic ri at eg
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 "\\u0… 1 1 1 1 1 0 0 0 0 0 0 0
## 2 "\\u0… 0 0 0 0 1 1 1 1 1 1 0 0
## 3 "\\u0… 0 0 0 0 1 0 0 0 0 0 1 1
## 4 "\\u0… 0 0 0 0 1 0 0 0 0 0 1 1
## 5 "\\u0… 0 1 0 0 1 0 0 0 0 0 0 1
## 6 "\\u0… 0 0 0 0 1 0 0 0 0 0 0 0
## # ℹ 501 more variables: ga <dbl>, ta <dbl>, ha <dbl>, th <dbl>, da <dbl>,
## # gu <dbl>, ar <dbl>, el <dbl>, lv <dbl>, ro <dbl>, va <dbl>, ba <dbl>,
## # em <dbl>, mb <dbl>, en <dbl>, ge <dbl>, ng <dbl>, la <dbl>, es <dbl>,
## # le <dbl>, al <dbl>, sa <dbl>, et <dbl>, ea <dbl>, eu <dbl>, re <dbl>,
## # ur <dbl>, er <dbl>, lm <dbl>, me <dbl>, ce <dbl>, ni <dbl>, un <dbl>,
## # ap <dbl>, fa <dbl>, pu <dbl>, uf <dbl>, fi <dbl>, hu <dbl>, ih <dbl>,
## # ir <dbl>, fo <dbl>, om <dbl>, oq <dbl>, qu <dbl>, ui <dbl>, fs <dbl>, …
matriz_final <- matriz_criterios %>%
left_join(matriz_ngramas, by = "name")
head(matriz_final)
## # A tibble: 6 × 523
## ...1 name gender nombre_original nombre termina_a termina_o termina_e
## <dbl> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1 "aaronit" male "aaronit" aaron… 0 0 0
## 2 2 "aar\\u00f3… male "aar\\u00f3n" aarufn 0 0 0
## 3 3 "abaco" male "abaco" abaco 0 1 0
## 4 4 "abalen" male "abalen" abalen 0 0 0
## 5 5 "abbott" male "abbott" abbott 0 0 0
## 6 6 "abd al-azi… male "abd al-aziz" abdal… 0 0 0
## # ℹ 515 more variables: empieza_vocal <dbl>, longitud_mayor6 <dbl>, di <dbl>,
## # ed <dbl>, it <dbl>, ti <dbl>, ue <dbl>, ca <dbl>, ef <dbl>, fr <dbl>,
## # ic <dbl>, ri <dbl>, at <dbl>, eg <dbl>, ga <dbl>, ta <dbl>, ha <dbl>,
## # th <dbl>, da <dbl>, gu <dbl>, ar <dbl>, el <dbl>, lv <dbl>, ro <dbl>,
## # va <dbl>, ba <dbl>, em <dbl>, mb <dbl>, en <dbl>, ge <dbl>, ng <dbl>,
## # la <dbl>, es <dbl>, le <dbl>, al <dbl>, sa <dbl>, et <dbl>, ea <dbl>,
## # eu <dbl>, re <dbl>, ur <dbl>, er <dbl>, lm <dbl>, me <dbl>, ce <dbl>, …
glimpse(NombresGenero)
## Rows: 9,810
## Columns: 5
## $ ...1 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,…
## $ name <chr> "aaronit", "aar\\u00f3n", "abaco", "abalen", "abbott",…
## $ gender <chr> "male", "male", "male", "male", "male", "male", "male"…
## $ nombre_original <chr> "aaronit", "aar\\u00f3n", "abaco", "abalen", "abbott",…
## $ nombre <chr> "aaronit", "aarufn", "abaco", "abalen", "abbott", "abd…
datos_modelo <- matriz_final %>%
mutate(genero = as.factor(gender)) %>%
dplyr::select(-nombre,-nombre_original,-name,-1)
head(datos_modelo)
## # A tibble: 6 × 520
## gender termina_a termina_o termina_e empieza_vocal longitud_mayor6 di ed
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 male 0 0 0 1 1 0 0
## 2 male 0 0 0 1 0 0 0
## 3 male 0 1 0 1 0 0 0
## 4 male 0 0 0 1 0 0 0
## 5 male 0 0 0 1 0 0 0
## 6 male 0 0 0 1 1 0 0
## # ℹ 512 more variables: it <dbl>, ti <dbl>, ue <dbl>, ca <dbl>, ef <dbl>,
## # fr <dbl>, ic <dbl>, ri <dbl>, at <dbl>, eg <dbl>, ga <dbl>, ta <dbl>,
## # ha <dbl>, th <dbl>, da <dbl>, gu <dbl>, ar <dbl>, el <dbl>, lv <dbl>,
## # ro <dbl>, va <dbl>, ba <dbl>, em <dbl>, mb <dbl>, en <dbl>, ge <dbl>,
## # ng <dbl>, la <dbl>, es <dbl>, le <dbl>, al <dbl>, sa <dbl>, et <dbl>,
## # ea <dbl>, eu <dbl>, re <dbl>, ur <dbl>, er <dbl>, lm <dbl>, me <dbl>,
## # ce <dbl>, ni <dbl>, un <dbl>, ap <dbl>, fa <dbl>, pu <dbl>, uf <dbl>, …
X <- datos_modelo %>% dplyr::select(-gender)
y <- datos_modelo$gender
X <- data.matrix(X)
90% prueba y 10% entrenamiento
set.seed(123)
train_index <- createDataPartition(y, p = 0.9, list = FALSE)
train_x <- X[train_index, ]
test_x <- X[-train_index, ]
train_y <- y[train_index]
test_y <- y[-train_index]
nzv <- nearZeroVar(train_x)
if(length(nzv) > 0){
train_x <- train_x[, -nzv]
test_x <- test_x[, -nzv]
}
train_scaled <- scale(train_x)
test_scaled <- scale(
test_x,
center = attr(train_scaled, "scaled:center"),
scale = attr(train_scaled, "scaled:scale")
)
##Eliminar NA
train_scaled[is.na(train_scaled)] <- 0
test_scaled[is.na(test_scaled)] <- 0
##Convertir variable objetivo a Factor
train_y <- as.factor(train_y)
test_y <- as.factor(test_y)
pred <- knn(
train = train_scaled,
test = test_scaled,
cl = train_y,
k = 5
)
confusionMatrix(pred, test_y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction female male
## female 408 23
## male 27 522
##
## Accuracy : 0.949
## 95% CI : (0.9333, 0.9619)
## No Information Rate : 0.5561
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8966
##
## Mcnemar's Test P-Value : 0.6714
##
## Sensitivity : 0.9379
## Specificity : 0.9578
## Pos Pred Value : 0.9466
## Neg Pred Value : 0.9508
## Prevalence : 0.4439
## Detection Rate : 0.4163
## Detection Prevalence : 0.4398
## Balanced Accuracy : 0.9479
##
## 'Positive' Class : female
##
for(k in c(3,5,7,9,11)){
pred <- knn(train_scaled, test_scaled, train_y, k = k)
acc <- mean(pred == test_y)
print(paste("k =", k, "Accuracy =", acc))
}
## [1] "k = 3 Accuracy = 0.95"
## [1] "k = 5 Accuracy = 0.947959183673469"
## [1] "k = 7 Accuracy = 0.952040816326531"
## [1] "k = 9 Accuracy = 0.939795918367347"
## [1] "k = 11 Accuracy = 0.941836734693878"
Para la implementación del modelo de KNN, se dividió la base de datos en 90% para entrenamiento y 10% para prueba. Las variables se estandarizaron y se eliminaron aquellas con varianza cercana a 0, lo cual garantiza que columnas que en su mayoría tienen valores de 0 no introduzcan ruido en el modelo y generen problemas en la predicción.
Al calcular la matriz de confusión, los resultados arrojaron 408 nombres femeninos y 522 nombres masculinos correctamente clasificados, con solo 50 errores de clasificación. Esto permitió un accuracy del 95% en la clasificación de los nombres, así como una sensibilidad del 93,7% (identificación de nombres femeninos) y una especificidad del 95,78% (identificación de nombres masculinos).
## Eliminar variables constantes dentro de cada grupo (necesario para LDA/QDA)
var_por_grupo <- apply(train_scaled, 2, function(v){
var(v[train_y == levels(train_y)[1]]) == 0 |
var(v[train_y == levels(train_y)[2]]) == 0
})
train_scaled <- train_scaled[, !var_por_grupo]
test_scaled <- test_scaled[, !var_por_grupo]
modelo_lda <- lda(x = train_scaled, grouping = train_y)
pred_lda <- predict(modelo_lda, test_scaled)$class
confusionMatrix(pred_lda, test_y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction female male
## female 301 54
## male 134 491
##
## Accuracy : 0.8082
## 95% CI : (0.7821, 0.8324)
## No Information Rate : 0.5561
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6041
##
## Mcnemar's Test P-Value : 8.329e-09
##
## Sensitivity : 0.6920
## Specificity : 0.9009
## Pos Pred Value : 0.8479
## Neg Pred Value : 0.7856
## Prevalence : 0.4439
## Detection Rate : 0.3071
## Detection Prevalence : 0.3622
## Balanced Accuracy : 0.7964
##
## 'Positive' Class : female
##
modelo_qda <- qda(x = train_scaled, grouping = train_y)
pred_qda <- predict(modelo_qda, test_scaled)$class
confusionMatrix(pred_qda, test_y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction female male
## female 354 129
## male 81 416
##
## Accuracy : 0.7857
## 95% CI : (0.7587, 0.811)
## No Information Rate : 0.5561
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5707
##
## Mcnemar's Test P-Value : 0.001182
##
## Sensitivity : 0.8138
## Specificity : 0.7633
## Pos Pred Value : 0.7329
## Neg Pred Value : 0.8370
## Prevalence : 0.4439
## Detection Rate : 0.3612
## Detection Prevalence : 0.4929
## Balanced Accuracy : 0.7885
##
## 'Positive' Class : female
##
train_df <- data.frame(train_scaled)
train_df$gender <- train_y
test_df <- data.frame(test_scaled)
test_df$gender <- test_y
modelo_log <- glm(gender ~ ., data = train_df, family = binomial)
prob_log <- predict(modelo_log, test_df, type = "response")
pred_log <- ifelse(prob_log > 0.5, "female", "male") #Umbral de clasificación
pred_log <- as.factor(pred_log)
confusionMatrix(pred_log, test_y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction female male
## female 124 489
## male 311 56
##
## Accuracy : 0.1837
## 95% CI : (0.1599, 0.2094)
## No Information Rate : 0.5561
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.5879
##
## Mcnemar's Test P-Value : 3.902e-10
##
## Sensitivity : 0.2851
## Specificity : 0.1028
## Pos Pred Value : 0.2023
## Neg Pred Value : 0.1526
## Prevalence : 0.4439
## Detection Rate : 0.1265
## Detection Prevalence : 0.6255
## Balanced Accuracy : 0.1939
##
## 'Positive' Class : female
##
acc_knn <- confusionMatrix(pred, test_y)$overall['Accuracy']
acc_lda <- confusionMatrix(pred_lda, test_y)$overall['Accuracy']
acc_qda <- confusionMatrix(pred_qda, test_y)$overall['Accuracy']
acc_log <- confusionMatrix(pred_log, test_y)$overall['Accuracy']
data.frame(
Modelo = c("KNN","LDA","QDA","Logistica"),
Accuracy = c(acc_knn, acc_lda, acc_qda, acc_log)
)
## Modelo Accuracy
## 1 KNN 0.9418367
## 2 LDA 0.8081633
## 3 QDA 0.7857143
## 4 Logistica 0.1836735
Al comparar los 4 modelos implementados, el que presenta mejores resultados en cuanto al accuracy es el modelo de KNN, pues demuestra que con un ajuste apropiado del valor de K, arroja resultados de precisión superiores al 95%. Por su parte, los modelos de LDA y QDA fueron una buena aproximación al problema, pues arrojaron resultados de accuracy cercanos al 80%, lo cual representa una alternativa más simple pero robusta para predecir el genero de los nombres.
Se descarta el modelo de regresión logística, que arrojó una precisión de tan solo el 18%, pues para este tipo de problemas existen muchas variables predictoras que implicitamente pueden tener multicolinealidad entre si (por la separación de los nombres en n-gramas), lo cual vuelve al modelo inestable e impide el cumplimiento de sus supuestos.
glass$Type_of_glass <- as.factor(glass$Type_of_glass)
glass$Id <- as.factor(glass$Id)
str(glass)
## tibble [214 × 11] (S3: tbl_df/tbl/data.frame)
## $ Id : Factor w/ 214 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ RI : num [1:214] 1.52 1.52 1.52 1.52 1.52 ...
## $ Na : num [1:214] 13.6 13.9 13.5 13.2 13.3 ...
## $ Mg : num [1:214] 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
## $ Al : num [1:214] 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
## $ Si : num [1:214] 71.8 72.7 73 72.6 73.1 ...
## $ K : num [1:214] 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
## $ Ca : num [1:214] 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
## $ Ba : num [1:214] 0 0 0 0 0 0 0 0 0 0 ...
## $ Fe : num [1:214] 0 0 0 0 0 0.26 0 0 0 0.11 ...
## $ Type_of_glass: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
dim(glass)
## [1] 214 11
names(glass)
## [1] "Id" "RI" "Na" "Mg"
## [5] "Al" "Si" "K" "Ca"
## [9] "Ba" "Fe" "Type_of_glass"
colSums(is.na(glass))
## Id RI Na Mg Al
## 0 0 0 0 0
## Si K Ca Ba Fe
## 0 0 0 0 0
## Type_of_glass
## 0
sum(is.na(glass))
## [1] 0
table(glass$Type_of_glass)
##
## 1 2 3 5 6 7
## 70 76 17 13 9 29
prop.table(table(glass$Type_of_glass))
##
## 1 2 3 5 6 7
## 0.32710280 0.35514019 0.07943925 0.06074766 0.04205607 0.13551402
ggplot(glass, aes(x = Type_of_glass)) +
geom_bar(fill = "steelblue") +
labs(title = "Distribución de tipos de vidrio",
x = "Tipo de vidrio",
y = "Frecuencia")
glass %>%
dplyr::select(-Id, -Type_of_glass) %>%
summary()
## RI Na Mg Al
## Min. :1.511 Min. :10.73 Min. :0.000 Min. :0.290
## 1st Qu.:1.517 1st Qu.:12.91 1st Qu.:2.115 1st Qu.:1.190
## Median :1.518 Median :13.30 Median :3.480 Median :1.360
## Mean :1.518 Mean :13.41 Mean :2.685 Mean :1.445
## 3rd Qu.:1.519 3rd Qu.:13.82 3rd Qu.:3.600 3rd Qu.:1.630
## Max. :1.534 Max. :17.38 Max. :4.490 Max. :3.500
## Si K Ca Ba
## Min. :69.81 Min. :0.0000 Min. : 5.430 Min. :0.000
## 1st Qu.:72.28 1st Qu.:0.1225 1st Qu.: 8.240 1st Qu.:0.000
## Median :72.79 Median :0.5550 Median : 8.600 Median :0.000
## Mean :72.65 Mean :0.4971 Mean : 8.957 Mean :0.175
## 3rd Qu.:73.09 3rd Qu.:0.6100 3rd Qu.: 9.172 3rd Qu.:0.000
## Max. :75.41 Max. :6.2100 Max. :16.190 Max. :3.150
## Fe
## Min. :0.00000
## 1st Qu.:0.00000
## Median :0.00000
## Mean :0.05701
## 3rd Qu.:0.10000
## Max. :0.51000
glass %>%
group_by(Type_of_glass) %>%
summarise(
RI = mean(RI),
Na = mean(Na),
Mg = mean(Mg),
Al = mean(Al),
Si = mean(Si),
K = mean(K),
Ca = mean(Ca),
Ba = mean(Ba),
Fe = mean(Fe)
)
## # A tibble: 6 × 10
## Type_of_glass RI Na Mg Al Si K Ca Ba Fe
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1.52 13.2 3.55 1.16 72.6 0.447 8.80 0.0127 0.057
## 2 2 1.52 13.1 3.00 1.41 72.6 0.521 9.07 0.0503 0.0797
## 3 3 1.52 13.4 3.54 1.20 72.4 0.406 8.78 0.00882 0.0571
## 4 5 1.52 12.8 0.774 2.03 72.4 1.47 10.1 0.188 0.0608
## 5 6 1.52 14.6 1.31 1.37 73.2 0 9.36 0 0
## 6 7 1.52 14.4 0.538 2.12 73.0 0.325 8.49 1.04 0.0134
glass %>%
dplyr::select(-Id, -Type_of_glass) %>%
cor()
## RI Na Mg Al Si K
## RI 1.0000000000 -0.19188538 -0.122274039 -0.40732603 -0.54205220 -0.289832711
## Na -0.1918853790 1.00000000 -0.273731961 0.15679367 -0.06980881 -0.266086504
## Mg -0.1222740393 -0.27373196 1.000000000 -0.48179851 -0.16592672 0.005395667
## Al -0.4073260341 0.15679367 -0.481798509 1.00000000 -0.00552372 0.325958446
## Si -0.5420521997 -0.06980881 -0.165926723 -0.00552372 1.00000000 -0.193330854
## K -0.2898327111 -0.26608650 0.005395667 0.32595845 -0.19333085 1.000000000
## Ca 0.8104026963 -0.27544249 -0.443750026 -0.25959201 -0.20873215 -0.317836155
## Ba -0.0003860189 0.32660288 -0.492262118 0.47940390 -0.10215131 -0.042618059
## Fe 0.1430096093 -0.24134641 0.083059529 -0.07440215 -0.09420073 -0.007719049
## Ca Ba Fe
## RI 0.8104027 -0.0003860189 0.143009609
## Na -0.2754425 0.3266028795 -0.241346411
## Mg -0.4437500 -0.4922621178 0.083059529
## Al -0.2595920 0.4794039017 -0.074402151
## Si -0.2087322 -0.1021513105 -0.094200731
## K -0.3178362 -0.0426180594 -0.007719049
## Ca 1.0000000 -0.1128409671 0.124968219
## Ba -0.1128410 1.0000000000 -0.058691755
## Fe 0.1249682 -0.0586917554 1.000000000
corr <- glass %>%
dplyr::select(-Id, -Type_of_glass) %>%
cor()
corrplot(corr, method = "color")
glass_long <- glass %>%
pivot_longer(cols = RI:Fe,
names_to = "Variable",
values_to = "Valor")
ggplot(glass_long, aes(Type_of_glass, Valor, fill = Type_of_glass)) +
geom_boxplot() +
facet_wrap(~Variable, scales = "free")
La base de datos contiene 214 registros con 11 variables, la variable objetivo (type of glass), y las variables predictoras que describen la composición del vidrio y el índice de refracción (RI). No se evidenciaron efectivamente valores faltantes, pero se muestra un desbalance claro en la variable objetivo, pues la mayoría de muestras están concentradas en los tipos de vidrio 1 y 2, correspondientes a ventanas de edificios, la clase 4 no aparece (ventanas de vehículos), y las demás clases tienen menos observaciones.
Ahora bien, al revisar cada variable, se observa que existen variables como el RI, el SI y el Ca que no presentan tanta variabilidad entre clases, por lo cual, pueden no ser tan eficientes para discriminar las clases, mientras que variables como el Mg se encuentran más presentes en vidrios de edificios, o como el Al y K, se encuentran más presentes en contenedores y faros de automóviles. Finalmente, el Ba se encuentra muy presente en faros de automóviles y es de casi cero en el resto de vidrios.
glass <- glass %>% dplyr::select(-Id)
set.seed(123)
trainIndex <- createDataPartition(glass$Type_of_glass,
p = 0.8,
list = FALSE)
train <- glass[trainIndex, ]
test <- glass[-trainIndex, ]
table(train$Type_of_glass)
##
## 1 2 3 5 6 7
## 56 61 14 11 8 24
##Ajuste de desbalance de clases (SMOTE)
## Ajuste de desbalance de clases (SMOTE)
# Separar variables predictoras y variable objetivo
train_x <- train %>% dplyr::select(-Type_of_glass)
train_y <- train$Type_of_glass
# Aplicar SMOTE
smote_output <- SMOTE(train_x, train_y, K = 5)
# Reconstruir dataset balanceado
train_smote <- smote_output$data
train_smote$Type_of_glass <- as.factor(train_smote$class)
# eliminar columna auxiliar
train_smote$class <- NULL
# revisar distribución balanceada
table(train_smote$Type_of_glass)
##
## 1 2 3 5 6 7
## 56 61 14 11 160 24
target_size <- 64
set.seed(123)
train_balanced <- train_smote %>%
group_by(Type_of_glass) %>%
sample_n(target_size, replace = TRUE) %>%
ungroup()
table(train_balanced$Type_of_glass)
##
## 1 2 3 5 6 7
## 64 64 64 64 64 64
modelo_log <- multinom(Type_of_glass ~ ., data = train)
## # weights: 66 (50 variable)
## initial value 311.766148
## iter 10 value 201.662210
## iter 20 value 138.926468
## iter 30 value 118.031437
## iter 40 value 115.013659
## iter 50 value 112.048464
## iter 60 value 109.486167
## iter 70 value 108.285344
## iter 80 value 107.660731
## iter 90 value 106.704985
## iter 100 value 106.156879
## final value 106.156879
## stopped after 100 iterations
pred_log <- predict(modelo_log, test)
niveles <- levels(glass$Type_of_glass)
pred_log <- factor(pred_log, levels = niveles)
test$Type_of_glass <- factor(test$Type_of_glass, levels = niveles)
confusionMatrix(pred_log, test$Type_of_glass)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 5 6 7
## 1 11 9 1 0 0 0
## 2 3 5 2 0 0 0
## 3 0 0 0 0 0 0
## 5 0 1 0 1 0 0
## 6 0 0 0 0 1 0
## 7 0 0 0 1 0 5
##
## Overall Statistics
##
## Accuracy : 0.575
## 95% CI : (0.4089, 0.7296)
## No Information Rate : 0.375
## P-Value [Acc > NIR] : 0.008001
##
## Kappa : 0.3934
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 5 Class: 6 Class: 7
## Sensitivity 0.7857 0.3333 0.000 0.5000 1.000 1.0000
## Specificity 0.6154 0.8000 1.000 0.9737 1.000 0.9714
## Pos Pred Value 0.5238 0.5000 NaN 0.5000 1.000 0.8333
## Neg Pred Value 0.8421 0.6667 0.925 0.9737 1.000 1.0000
## Prevalence 0.3500 0.3750 0.075 0.0500 0.025 0.1250
## Detection Rate 0.2750 0.1250 0.000 0.0250 0.025 0.1250
## Detection Prevalence 0.5250 0.2500 0.000 0.0500 0.025 0.1500
## Balanced Accuracy 0.7005 0.5667 0.500 0.7368 1.000 0.9857
modelo_lda <- lda(Type_of_glass ~ ., data = train)
pred_lda <- predict(modelo_lda, test)
confusionMatrix(pred_lda$class, test$Type_of_glass)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 5 6 7
## 1 9 10 2 0 0 1
## 2 4 4 1 0 0 0
## 3 1 0 0 0 0 0
## 5 0 1 0 1 0 0
## 6 0 0 0 0 1 0
## 7 0 0 0 1 0 4
##
## Overall Statistics
##
## Accuracy : 0.475
## 95% CI : (0.3151, 0.6387)
## No Information Rate : 0.375
## P-Value [Acc > NIR] : 0.1271
##
## Kappa : 0.2527
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 5 Class: 6 Class: 7
## Sensitivity 0.6429 0.2667 0.0000 0.5000 1.000 0.8000
## Specificity 0.5000 0.8000 0.9730 0.9737 1.000 0.9714
## Pos Pred Value 0.4091 0.4444 0.0000 0.5000 1.000 0.8000
## Neg Pred Value 0.7222 0.6452 0.9231 0.9737 1.000 0.9714
## Prevalence 0.3500 0.3750 0.0750 0.0500 0.025 0.1250
## Detection Rate 0.2250 0.1000 0.0000 0.0250 0.025 0.1000
## Detection Prevalence 0.5500 0.2250 0.0250 0.0500 0.025 0.1250
## Balanced Accuracy 0.5714 0.5333 0.4865 0.7368 1.000 0.8857
#separar variables
train_x <- train_balanced %>% dplyr::select(-Type_of_glass)
test_x <- test %>% dplyr::select(-Type_of_glass)
#Estandarizar variables
preProc <- preProcess(train_x, method = c("center","scale"))
train_scaled <- predict(preProc, train_x)
test_scaled <- predict(preProc, test_x)
#Aplicar PCA
pca_model <- prcomp(train_scaled)
var_exp <- cumsum(pca_model$sdev^2 / sum(pca_model$sdev^2))
num_comp <- which(var_exp >= 0.95)[1]
#Crear dataset reducido
train_pca <- data.frame(pca_model$x[,1:num_comp])
train_pca$Type_of_glass <- train_balanced$Type_of_glass
test_pca <- data.frame(predict(pca_model, test_scaled)[,1:num_comp])
test_pca$Type_of_glass <- test$Type_of_glass
#Ajustar el modelo
modelo_qda <- qda(Type_of_glass ~ ., data = train_pca)
pred_qda <- predict(modelo_qda, test_pca)
cm_qda <- confusionMatrix(pred_qda$class, test_pca$Type_of_glass)
cm_qda
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 5 6 7
## 1 8 6 1 0 0 0
## 2 1 5 1 1 1 0
## 3 5 4 1 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 0 0 0
## 7 0 0 0 1 0 5
##
## Overall Statistics
##
## Accuracy : 0.475
## 95% CI : (0.3151, 0.6387)
## No Information Rate : 0.375
## P-Value [Acc > NIR] : 0.1271
##
## Kappa : 0.2971
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 5 Class: 6 Class: 7
## Sensitivity 0.5714 0.3333 0.3333 0.00 0.000 1.0000
## Specificity 0.7308 0.8400 0.7568 1.00 1.000 0.9714
## Pos Pred Value 0.5333 0.5556 0.1000 NaN NaN 0.8333
## Neg Pred Value 0.7600 0.6774 0.9333 0.95 0.975 1.0000
## Prevalence 0.3500 0.3750 0.0750 0.05 0.025 0.1250
## Detection Rate 0.2000 0.1250 0.0250 0.00 0.000 0.1250
## Detection Prevalence 0.3750 0.2250 0.2500 0.00 0.000 0.1500
## Balanced Accuracy 0.6511 0.5867 0.5450 0.50 0.500 0.9857
#Modelo KNN
#Separar variables
train_knn <- train %>% dplyr::select(-Type_of_glass)
test_knn <- test %>% dplyr::select(-Type_of_glass)
#Escalar datos
preProc <- preProcess(train_knn, method = c("center","scale"))
train_knn <- predict(preProc, train_knn)
test_knn <- predict(preProc, test_knn)
#Aplicar modelo
pred_knn <- knn(train = train_knn,
test = test_knn,
cl = train$Type_of_glass,
k = 7)
confusionMatrix(pred_knn, test$Type_of_glass)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 5 6 7
## 1 10 7 2 0 0 0
## 2 4 7 1 1 0 1
## 3 0 0 0 0 0 0
## 5 0 1 0 0 0 0
## 6 0 0 0 0 1 0
## 7 0 0 0 1 0 4
##
## Overall Statistics
##
## Accuracy : 0.55
## 95% CI : (0.3849, 0.7074)
## No Information Rate : 0.375
## P-Value [Acc > NIR] : 0.01821
##
## Kappa : 0.3431
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 5 Class: 6 Class: 7
## Sensitivity 0.7143 0.4667 0.000 0.0000 1.000 0.8000
## Specificity 0.6538 0.7200 1.000 0.9737 1.000 0.9714
## Pos Pred Value 0.5263 0.5000 NaN 0.0000 1.000 0.8000
## Neg Pred Value 0.8095 0.6923 0.925 0.9487 1.000 0.9714
## Prevalence 0.3500 0.3750 0.075 0.0500 0.025 0.1250
## Detection Rate 0.2500 0.1750 0.000 0.0000 0.025 0.1000
## Detection Prevalence 0.4750 0.3500 0.000 0.0250 0.025 0.1250
## Balanced Accuracy 0.6841 0.5933 0.500 0.4868 1.000 0.8857
resultados <- data.frame(
Modelo = c("Logistica","LDA","QDA","KNN"),
Accuracy = c(
confusionMatrix(pred_log, test$Type_of_glass)$overall["Accuracy"],
confusionMatrix(pred_lda$class, test$Type_of_glass)$overall["Accuracy"],
confusionMatrix(pred_qda$class, test$Type_of_glass)$overall["Accuracy"],
confusionMatrix(pred_knn, test$Type_of_glass)$overall["Accuracy"]
)
)
resultados
## Modelo Accuracy
## 1 Logistica 0.575
## 2 LDA 0.475
## 3 QDA 0.475
## 4 KNN 0.550
Dado el análisis exploratorio, se intentó balancear el dataset bajo el mecanismo de SMOTE, con un target value de 64 para todas las clases de vidrios. Sin embargo, este balanceo afectó el accuracy de la mayoría de modelos (exceptuando el QDA), por lo cual se decidió trabajar con el dataset original (sin balancear).
Dentro de los retos encontrados para el planteamiento del modelo, se resaltan los siguientes:
En el modelo de regresión logística, algunas relaciones entre los componentes químicos del vidrio podían ser no lineales (ej. relación entre RI y los componentes químicos). Esto pudo afectar el accuracy del modelo y disminuirlo considerablemente.
En el modelo LDA, se asume la misma matriz de covarianza para todas las clases, sin embargo, el supuesto de igualdad en el dataset no se cumple completamente.
El modelo QDA requería suficientes observaciones por clase, sin embargo, el desbalanceo afectó este supuesto, por lo cual fue necesario aplicar SMOTE y PCA, para poder predecir los resultados. Sin embargo, dada su inestabilidad, el accuracy no mejoró tanto.
Finalmente, el modelo KNN tuvo mejores resultados, sin embargo, el tamaño del dataset de solo 214 observaciones no generó resultados tan favorables como se esperaban.
Con base en los resultados obtenidos, se recomienda utilizar el modelo de regresión logística, que obtuvo el mayor accuracy, por su interpretabilidad y estabilidad, pero aplicando técnicas de regularización que permitan reducir la varianza y manejar las variables correlacionadas, así como hacer un PCA antes de implementar el modelo.