# Install once if missing: install.packages(c("ggplot2", "lattice", "caret", "DataExplorer", "dplyr"))
library(ggplot2)
library(lattice)
library(caret)
library(DataExplorer)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the M1 dataset.
# The location can be overridden with the M1_DATA_PATH environment variable so
# the script runs on other machines; when the variable is unset, the original
# hard-coded path is used.
ruta_datos <- Sys.getenv(
  "M1_DATA_PATH",
  unset = "/Users/nataliamartinez/Desktop/M1_data.csv"
)
df <- read.csv(ruta_datos)
# Quick look: dimensions, then the first rows
dim(df)
## [1] 133 22
head(df)
## trust_apple interest_computers age_computer user_pcmac appleproducts_count
## 1 No 4 8 PC 0
## 2 Yes 2 4 PC 1
## 3 Yes 5 6 PC 0
## 4 Yes 2 6 Apple 4
## 5 Yes 4 4 Apple 7
## 6 Yes 3 1 Apple 2
## familiarity_m1 f_batterylife f_price f_size f_multitasking f_noise
## 1 No 5 4 3 4 4
## 2 No 5 5 5 3 4
## 3 No 3 4 2 4 1
## 4 No 4 3 3 4 4
## 5 Yes 5 3 3 4 4
## 6 No 5 5 4 4 5
## f_performance f_neural f_synergy f_performanceloss m1_consideration
## 1 2 2 1 1 1
## 2 5 2 2 4 2
## 3 4 2 2 2 4
## 4 4 4 4 3 2
## 5 5 3 4 4 4
## 6 5 5 4 2 2
## m1_purchase gender age_group income_group status domain
## 1 Yes Male 2 2 Student Science
## 2 No Male 2 3 Employed Finance
## 3 Yes Male 2 2 Student IT & Technology
## 4 No Female 2 2 Student Arts & Culture
## 5 Yes Male 5 7 Employed Hospitality
## 6 No Female 2 2 Student Politics
# 1) Turn every character column into a factor; all other columns pass
#    through unchanged.
df[] <- lapply(df, function(columna) {
  if (is.character(columna)) as.factor(columna) else columna
})
# 2) Make sure the target is a factor as well.
df[["m1_purchase"]] <- as.factor(df[["m1_purchase"]])
# 3) Drop every row that contains at least one missing value.
df <- na.omit(df)
# 4) Class balance of the target: absolute counts ...
table(df[["m1_purchase"]])
##
## No Yes
## 45 88
# ... and relative frequencies.
prop.table(table(df[["m1_purchase"]]))
##
## No Yes
## 0.3383459 0.6616541
# Train/test split: 80% of the rows go to training, the rest to testing.
# The seed is fixed first so the same rows are drawn on every run.
set.seed(123)
renglones_entrenamiento <- createDataPartition(
  y = df[["m1_purchase"]],
  p = 0.8,
  list = FALSE
)
# Rows picked by the partition form the training set ...
entrenamiento <- df[renglones_entrenamiento, ]
# ... and every remaining row forms the test set.
prueba <- df[-renglones_entrenamiento, ]
# Sizes of both subsets
dim(entrenamiento)
## [1] 107 22
dim(prueba)
## [1] 26 22
# Resampling scheme reused by every train() call in this script: 10-fold CV.
ctrl <- trainControl(number = 10, method = "cv")

# Model 1: SVM with a linear kernel ----
# Seed the RNG immediately before training so the run is reproducible.
set.seed(123)
modelo1 <- train(
  m1_purchase ~ .,
  data = entrenamiento,
  trControl = ctrl,
  preProcess = c("center", "scale"),
  # One-row grid: C is fixed at 1, so no hyper-parameter search is run.
  tuneGrid = data.frame(C = 1),
  method = "svmLinear"
)

# Class predictions on the training rows and on the held-out rows.
resultado_entrenamiento1 <- predict(modelo1, newdata = entrenamiento)
resultado_prueba1 <- predict(modelo1, newdata = prueba)

# Confusion matrices: predicted labels versus the observed m1_purchase labels.
mcre1 <- confusionMatrix(
  data = resultado_entrenamiento1,
  reference = entrenamiento[["m1_purchase"]]
)
mcrp1 <- confusionMatrix(
  data = resultado_prueba1,
  reference = prueba[["m1_purchase"]]
)

# Show the training-set confusion matrix.
mcre1
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 30 4
## Yes 6 67
##
## Accuracy : 0.9065
## 95% CI : (0.8348, 0.9543)
## No Information Rate : 0.6636
## P-Value [Acc > NIR] : 4.281e-09
##
## Kappa : 0.7878
##
## Mcnemar's Test P-Value : 0.7518
##
## Sensitivity : 0.8333
## Specificity : 0.9437
## Pos Pred Value : 0.8824
## Neg Pred Value : 0.9178
## Prevalence : 0.3364
## Detection Rate : 0.2804
## Detection Prevalence : 0.3178
## Balanced Accuracy : 0.8885
##
## 'Positive' Class : No
##
# Show the test-set confusion matrix for model 1 (svmLinear).
mcrp1
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 3 6
## Yes 6 11
##
## Accuracy : 0.5385
## 95% CI : (0.3337, 0.7341)
## No Information Rate : 0.6538
## P-Value [Acc > NIR] : 0.9231
##
## Kappa : -0.0196
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.3333
## Specificity : 0.6471
## Pos Pred Value : 0.3333
## Neg Pred Value : 0.6471
## Prevalence : 0.3462
## Detection Rate : 0.1154
## Detection Prevalence : 0.3462
## Balanced Accuracy : 0.4902
##
## 'Positive' Class : No
##
# Model 2: SVM with a radial kernel ----
# Seed the RNG immediately before training so the run is reproducible.
set.seed(123)
modelo2 <- train(
  m1_purchase ~ .,
  data = entrenamiento,
  trControl = ctrl,
  preProcess = c("center", "scale"),
  # One-row grid: sigma and C are fixed, so no hyper-parameter search is run.
  # sigma can be adjusted here if desired.
  tuneGrid = data.frame(sigma = 0.1, C = 1),
  method = "svmRadial"
)

# Class predictions on the training rows and on the held-out rows.
resultado_entrenamiento2 <- predict(modelo2, newdata = entrenamiento)
resultado_prueba2 <- predict(modelo2, newdata = prueba)

# Confusion matrices: predicted labels versus the observed m1_purchase labels.
mcre2 <- confusionMatrix(
  data = resultado_entrenamiento2,
  reference = entrenamiento[["m1_purchase"]]
)
mcrp2 <- confusionMatrix(
  data = resultado_prueba2,
  reference = prueba[["m1_purchase"]]
)

# Show the training-set confusion matrix.
mcre2
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 34 0
## Yes 2 71
##
## Accuracy : 0.9813
## 95% CI : (0.9341, 0.9977)
## No Information Rate : 0.6636
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9576
##
## Mcnemar's Test P-Value : 0.4795
##
## Sensitivity : 0.9444
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9726
## Prevalence : 0.3364
## Detection Rate : 0.3178
## Detection Prevalence : 0.3178
## Balanced Accuracy : 0.9722
##
## 'Positive' Class : No
##
# Show the test-set confusion matrix for model 2 (svmRadial).
mcrp2
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 0 0
## Yes 9 17
##
## Accuracy : 0.6538
## 95% CI : (0.4433, 0.8279)
## No Information Rate : 0.6538
## P-Value [Acc > NIR] : 0.589398
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 0.007661
##
## Sensitivity : 0.0000
## Specificity : 1.0000
## Pos Pred Value : NaN
## Neg Pred Value : 0.6538
## Prevalence : 0.3462
## Detection Rate : 0.0000
## Detection Prevalence : 0.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : No
##
# Model 3: SVM with a polynomial kernel ----
# Seed the RNG immediately before training so the run is reproducible.
set.seed(123)
modelo3 <- train(
  m1_purchase ~ .,
  data = entrenamiento,
  trControl = ctrl,
  preProcess = c("center", "scale"),
  # One-row grid: degree, scale and C are fixed, so no search is run.
  tuneGrid = data.frame(degree = 2, scale = 1, C = 1),
  method = "svmPoly"
)

# Class predictions on the training rows and on the held-out rows.
resultado_entrenamiento3 <- predict(modelo3, newdata = entrenamiento)
resultado_prueba3 <- predict(modelo3, newdata = prueba)

# Confusion matrices: predicted labels versus the observed m1_purchase labels.
mcre3 <- confusionMatrix(
  data = resultado_entrenamiento3,
  reference = entrenamiento[["m1_purchase"]]
)
mcrp3 <- confusionMatrix(
  data = resultado_prueba3,
  reference = prueba[["m1_purchase"]]
)

# Show the training-set confusion matrix.
mcre3
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 34 0
## Yes 2 71
##
## Accuracy : 0.9813
## 95% CI : (0.9341, 0.9977)
## No Information Rate : 0.6636
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9576
##
## Mcnemar's Test P-Value : 0.4795
##
## Sensitivity : 0.9444
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9726
## Prevalence : 0.3364
## Detection Rate : 0.3178
## Detection Prevalence : 0.3178
## Balanced Accuracy : 0.9722
##
## 'Positive' Class : No
##
# Show the test-set confusion matrix for model 3 (svmPoly).
mcrp3
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1 4
## Yes 8 13
##
## Accuracy : 0.5385
## 95% CI : (0.3337, 0.7341)
## No Information Rate : 0.6538
## P-Value [Acc > NIR] : 0.9231
##
## Kappa : -0.1387
##
## Mcnemar's Test P-Value : 0.3865
##
## Sensitivity : 0.11111
## Specificity : 0.76471
## Pos Pred Value : 0.20000
## Neg Pred Value : 0.61905
## Prevalence : 0.34615
## Detection Rate : 0.03846
## Detection Prevalence : 0.19231
## Balanced Accuracy : 0.43791
##
## 'Positive' Class : No
##
# Model 4: classification tree (rpart) ----
# Seed the RNG immediately before training so the run is reproducible.
set.seed(123)
modelo4 <- train(
  m1_purchase ~ .,
  data = entrenamiento,
  trControl = ctrl,
  # Let caret build the tuning grid itself, with tuneLength set to 10.
  tuneLength = 10,
  method = "rpart"
)

# Class predictions on the training rows and on the held-out rows.
resultado_entrenamiento4 <- predict(modelo4, newdata = entrenamiento)
resultado_prueba4 <- predict(modelo4, newdata = prueba)

# Confusion matrices: predicted labels versus the observed m1_purchase labels.
mcre4 <- confusionMatrix(
  data = resultado_entrenamiento4,
  reference = entrenamiento[["m1_purchase"]]
)
mcrp4 <- confusionMatrix(
  data = resultado_prueba4,
  reference = prueba[["m1_purchase"]]
)

# Show the training-set confusion matrix.
mcre4
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 17 2
## Yes 19 69
##
## Accuracy : 0.8037
## 95% CI : (0.7158, 0.8742)
## No Information Rate : 0.6636
## P-Value [Acc > NIR] : 0.0010139
##
## Kappa : 0.5025
##
## Mcnemar's Test P-Value : 0.0004803
##
## Sensitivity : 0.4722
## Specificity : 0.9718
## Pos Pred Value : 0.8947
## Neg Pred Value : 0.7841
## Prevalence : 0.3364
## Detection Rate : 0.1589
## Detection Prevalence : 0.1776
## Balanced Accuracy : 0.7220
##
## 'Positive' Class : No
##
# Show the test-set confusion matrix for model 4 (rpart).
mcrp4
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 4 6
## Yes 5 11
##
## Accuracy : 0.5769
## 95% CI : (0.3692, 0.7665)
## No Information Rate : 0.6538
## P-Value [Acc > NIR] : 0.8485
##
## Kappa : 0.0892
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.4444
## Specificity : 0.6471
## Pos Pred Value : 0.4000
## Neg Pred Value : 0.6875
## Prevalence : 0.3462
## Detection Rate : 0.1538
## Detection Prevalence : 0.3846
## Balanced Accuracy : 0.5458
##
## 'Positive' Class : No
##
# Model 5: neural network (nnet) ----
# Seed the RNG immediately before training so the run is reproducible.
set.seed(123)
modelo5 <- train(
  m1_purchase ~ .,
  data = entrenamiento,
  trControl = ctrl,
  preProcess = c("center", "scale"),
  # Let caret build the tuning grid itself, with tuneLength set to 10.
  tuneLength = 10,
  method = "nnet",
  # Forwarded to the underlying fitting function to silence its iteration log.
  trace = FALSE
)

# Class predictions on the training rows and on the held-out rows.
resultado_entrenamiento5 <- predict(modelo5, newdata = entrenamiento)
resultado_prueba5 <- predict(modelo5, newdata = prueba)

# Confusion matrices: predicted labels versus the observed m1_purchase labels.
mcre5 <- confusionMatrix(
  data = resultado_entrenamiento5,
  reference = entrenamiento[["m1_purchase"]]
)
mcrp5 <- confusionMatrix(
  data = resultado_prueba5,
  reference = prueba[["m1_purchase"]]
)

# Show the training-set confusion matrix.
mcre5
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 34 0
## Yes 2 71
##
## Accuracy : 0.9813
## 95% CI : (0.9341, 0.9977)
## No Information Rate : 0.6636
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9576
##
## Mcnemar's Test P-Value : 0.4795
##
## Sensitivity : 0.9444
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9726
## Prevalence : 0.3364
## Detection Rate : 0.3178
## Detection Prevalence : 0.3178
## Balanced Accuracy : 0.9722
##
## 'Positive' Class : No
##
# Show the test-set confusion matrix for model 5 (nnet).
mcrp5
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 3 9
## Yes 6 8
##
## Accuracy : 0.4231
## 95% CI : (0.2335, 0.6308)
## No Information Rate : 0.6538
## P-Value [Acc > NIR] : 0.9954
##
## Kappa : -0.1818
##
## Mcnemar's Test P-Value : 0.6056
##
## Sensitivity : 0.3333
## Specificity : 0.4706
## Pos Pred Value : 0.2500
## Neg Pred Value : 0.5714
## Prevalence : 0.3462
## Detection Rate : 0.1154
## Detection Prevalence : 0.4615
## Balanced Accuracy : 0.4020
##
## 'Positive' Class : No
##
# Model 6: random forest (rf) ----
# Seed the RNG immediately before training so the run is reproducible.
set.seed(123)
modelo6 <- train(
  m1_purchase ~ .,
  data = entrenamiento,
  trControl = ctrl,
  # Candidate values of mtry compared through cross-validation.
  tuneGrid = expand.grid(mtry = c(2, 4, 6)),
  method = "rf"
)

# Class predictions on the training rows and on the held-out rows.
resultado_entrenamiento6 <- predict(modelo6, newdata = entrenamiento)
resultado_prueba6 <- predict(modelo6, newdata = prueba)

# Confusion matrices: predicted labels versus the observed m1_purchase labels.
mcre6 <- confusionMatrix(
  data = resultado_entrenamiento6,
  reference = entrenamiento[["m1_purchase"]]
)
mcrp6 <- confusionMatrix(
  data = resultado_prueba6,
  reference = prueba[["m1_purchase"]]
)

# Show the training-set confusion matrix.
mcre6
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 34 0
## Yes 2 71
##
## Accuracy : 0.9813
## 95% CI : (0.9341, 0.9977)
## No Information Rate : 0.6636
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9576
##
## Mcnemar's Test P-Value : 0.4795
##
## Sensitivity : 0.9444
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9726
## Prevalence : 0.3364
## Detection Rate : 0.3178
## Detection Prevalence : 0.3178
## Balanced Accuracy : 0.9722
##
## 'Positive' Class : No
##
# Show the test-set confusion matrix for model 6 (rf).
mcrp6
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 3 3
## Yes 6 14
##
## Accuracy : 0.6538
## 95% CI : (0.4433, 0.8279)
## No Information Rate : 0.6538
## P-Value [Acc > NIR] : 0.5894
##
## Kappa : 0.1702
##
## Mcnemar's Test P-Value : 0.5050
##
## Sensitivity : 0.3333
## Specificity : 0.8235
## Pos Pred Value : 0.5000
## Neg Pred Value : 0.7000
## Prevalence : 0.3462
## Detection Rate : 0.1154
## Detection Prevalence : 0.2308
## Balanced Accuracy : 0.5784
##
## 'Positive' Class : No
##
# Summary table: one column per model, with training accuracy in the first row
# and test accuracy in the second.
resultados <- local({
  # Pull the overall accuracy out of a confusion-matrix object.
  extraer_accuracy <- function(matriz) {
    as.numeric(matriz$overall["Accuracy"])
  }
  # One entry per model: list(training matrix, test matrix).
  matrices <- list(
    svmLinear = list(mcre1, mcrp1),
    svmRadial = list(mcre2, mcrp2),
    svmPoly = list(mcre3, mcrp3),
    rpart = list(mcre4, mcrp4),
    nnet = list(mcre5, mcrp5),
    rf = list(mcre6, mcrp6)
  )
  columnas <- lapply(matrices, function(par) {
    vapply(par, extraer_accuracy, numeric(1))
  })
  tabla <- as.data.frame(columnas)
  rownames(tabla) <- c("Precisión de entrenamiento", "Precisión de prueba")
  tabla
})
resultados
## svmLinear svmRadial svmPoly rpart nnet
## Precisión de entrenamiento 0.9065421 0.9813084 0.9813084 0.8037383 0.9813084
## Precisión de prueba 0.5384615 0.6538462 0.5384615 0.5769231 0.4230769
## rf
## Precisión de entrenamiento 0.9813084
## Precisión de prueba 0.6538462
# Columns reordered from highest to lowest test accuracy; tied models keep
# their original relative order.
resultados_ordenados <- resultados[
  ,
  order(-unlist(resultados["Precisión de prueba", ]))
]
resultados_ordenados
## svmRadial rf rpart svmLinear svmPoly
## Precisión de entrenamiento 0.9813084 0.9813084 0.8037383 0.9065421 0.9813084
## Precisión de prueba 0.6538462 0.6538462 0.5769231 0.5384615 0.5384615
## nnet
## Precisión de entrenamiento 0.9813084
## Precisión de prueba 0.4230769
# Best model by held-out test accuracy.
# Flatten the one-row data frame into a named numeric vector (names are the
# model names) instead of relying on implicit list coercion.
acc_prueba <- unlist(resultados["Precisión de prueba", ])
mejor_acc <- max(acc_prueba)
# Collect every model that reaches the maximum. Exact comparison is safe here
# because mejor_acc is itself one of the elements of acc_prueba.
modelos_empatados <- names(acc_prueba)[acc_prueba == mejor_acc]
# Keep the first winner, which is the same model which.max() would return.
mejor_modelo <- modelos_empatados[1]
# Make ties visible rather than silently reporting only the first model.
if (length(modelos_empatados) > 1) {
  message(
    "Empate en Accuracy de prueba entre: ",
    paste(modelos_empatados, collapse = ", ")
  )
}
mejor_modelo
## [1] "svmRadial"
mejor_acc
## [1] 0.6538462
En este análisis se entrenaron 6 modelos de clasificación para
predecir m1_purchase usando un split 80-20 y validación cruzada
10-fold.
Se comparó el desempeño con Accuracy y matrices de confusión en
entrenamiento y prueba.
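El mejor modelo (según Accuracy de prueba) fue: svmRadial con un
Accuracy de 0.65, empatado con rf (0.65). Sin embargo, este valor es igual
a la tasa de no información (0.65): en prueba, svmRadial predijo "Yes" para
las 26 observaciones (Kappa = 0), por lo que no distingue entre clases; rf,
con el mismo Accuracy, obtuvo Kappa = 0.17.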
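Para elegir el modelo final, además de la precisión, se revisa que la
diferencia entre entrenamiento y prueba no sea grande, para evitar
sobreajuste. En este caso la diferencia es grande en todos los modelos (por
ejemplo, 0.98 en entrenamiento frente a 0.65 en prueba para svmRadial y rf),
lo que indica sobreajuste; además, con solo 26 observaciones de prueba las
estimaciones de Accuracy son poco precisas.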