KNN

## Cargando paquete requerido: ggplot2
## Cargando paquete requerido: lattice
## 
## Adjuntando el paquete: 'MLmetrics'
## The following objects are masked from 'package:caret':
## 
##     MAE, RMSE
## The following object is masked from 'package:base':
## 
##     Recall

Preparación del Dataset

cereal <- read.csv("C:/Users/willi/GITHUB/MACHINE-LEARNING/Semana 5/cereal2.csv", sep=";")
str(cereal)
## 'data.frame':    880 obs. of  5 variables:
##  $ edad_cat: chr  "Menor de 31" "46-60" "Mas de 60" "31-45" ...
##  $ genero  : chr  "Hombre" "Hombre" "Hombre" "Mujer" ...
##  $ ecivil  : chr  "Casado" "Casado" "Casado" "Casado" ...
##  $ activo  : chr  "En activo" "Sin actividad" "Sin actividad" "En activo" ...
##  $ desayuno: chr  "Cereales" "Barrita de desayuno" "Avena" "Avena" ...
##Crea un set completo de variables dummy
cerealeswin<-dummyVars("~.",data=cereal)
cerealesfin<-as.data.frame(predict(cerealeswin,newdata=cereal))

cerealesfin$desayunoOtros <- NULL
colnames(cerealesfin) <- make.names(colnames(cerealesfin))

#Dataset de entrenamiento y prueba
set.seed(1)
#aquí se define el tamaño de la muestra, en este caso entrenamiento tendrá el 75% de los casos
sample <- sample.int(nrow(cerealesfin), floor(.75*nrow(cerealesfin)))
cereal.train <- cerealesfin[sample, ]
cereal.test <- cerealesfin[-sample, ]

cereal.train1 <- cereal.train
cereal.test1 <- cereal.test

# Volvemos la columna que queremos predecir de tipo 'Factor'
# Lo hacemos para el conjunto de entrenamiento y de prueba
cereal.train$desayunoAvena<-as.factor(cereal.train$desayunoAvena)
cereal.train1$desayunoAvena<-as.factor(cereal.train1$desayunoAvena)
cereal.test$desayunoAvena<-as.factor(cereal.test$desayunoAvena)
cereal.test1$desayunoAvena<-as.factor(cereal.test1$desayunoAvena)
str(cereal.train)
## 'data.frame':    660 obs. of  13 variables:
##  $ edad_cat31.45              : num  0 1 0 0 0 0 0 1 0 0 ...
##  $ edad_cat46.60              : num  0 0 1 1 0 1 0 0 1 1 ...
##  $ edad_catMas.de.60          : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ edad_catMenor.de.31        : num  1 0 0 0 1 0 0 0 0 0 ...
##  $ generoHombre               : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ generoMujer                : num  1 1 1 1 1 1 1 0 1 1 ...
##  $ ecivilCasado               : num  0 0 1 1 0 0 0 1 1 1 ...
##  $ ecivilSoltero              : num  1 1 0 0 1 1 1 0 0 0 ...
##  $ activoEn.activo            : num  1 0 0 0 1 0 1 0 0 0 ...
##  $ activoSin.actividad        : num  0 1 1 1 0 1 0 1 1 1 ...
##  $ desayunoAvena              : Factor w/ 2 levels "0","1": 1 1 2 1 1 2 1 1 1 1 ...
##  $ desayunoBarrita.de.desayuno: num  1 1 0 1 1 0 1 1 0 0 ...
##  $ desayunoCereales           : num  0 0 0 0 0 0 0 0 1 1 ...
head(cereal.train, 3)
##     edad_cat31.45 edad_cat46.60 edad_catMas.de.60 edad_catMenor.de.31
## 836             0             0                 0                   1
## 679             1             0                 0                   0
## 129             0             1                 0                   0
##     generoHombre generoMujer ecivilCasado ecivilSoltero activoEn.activo
## 836            0           1            0             1               1
## 679            0           1            0             1               0
## 129            0           1            1             0               0
##     activoSin.actividad desayunoAvena desayunoBarrita.de.desayuno
## 836                   0             0                           1
## 679                   1             0                           1
## 129                   1             1                           0
##     desayunoCereales
## 836                0
## 679                0
## 129                0

Validadción Cruzada

# creo parámetros de validación cruzada
set.seed(1)
cross<-trainControl(method="cv",number=10)
modeloknn1<-train(desayunoAvena~.,method="knn",
                  tuneGrid=expand.grid(k=1:30),
                  trControl=cross,
                  metric="Accuracy",
                  data=cereal.train)
modeloknn1
## k-Nearest Neighbors 
## 
## 660 samples
##  12 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 595, 594, 594, 593, 594, 594, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    1  0.9939846  0.9867794
##    2  0.9788557  0.9531672
##    3  0.9318620  0.8523510
##    4  0.9015802  0.7905758
##    5  0.8757527  0.7352349
##    6  0.8666145  0.7179015
##    7  0.8515534  0.6858298
##    8  0.8424632  0.6659877
##    9  0.8303413  0.6386273
##   10  0.8333024  0.6419212
##   11  0.8378944  0.6514291
##   12  0.8469627  0.6699604
##   13  0.8272884  0.6311444
##   14  0.8334394  0.6451307
##   15  0.8243252  0.6295701
##   16  0.8318791  0.6451070
##   17  0.8348875  0.6542830
##   18  0.8333723  0.6517494
##   19  0.8137206  0.6005838
##   20  0.8076366  0.5824939
##   21  0.8061215  0.5791374
##   22  0.8122280  0.5934821
##   23  0.8061900  0.5791310
##   24  0.8152583  0.6009389
##   25  0.8259110  0.6229384
##   26  0.8440476  0.6618645
##   27  0.8425098  0.6576124
##   28  0.8531392  0.6821861
##   29  0.8592231  0.6978200
##   30  0.8592238  0.6964133
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 1.
plot(modeloknn1)


set.seed(1)

levels(cereal.train$desayunoAvena) <- make.names(levels(factor(cereal.train$desayunoAvena)))
# creo parámetros de validación cruzada
cross<-trainControl(method="cv",number=5,
                    classProbs = TRUE,
                    summaryFunction =prSummary)

modeloknn2<-train(desayunoAvena~.,method="knn",
                  tuneGrid=expand.grid(k=1:30),
                  trControl=cross,
                  metric="AUC",
                  data=cereal.train)
modeloknn2
## k-Nearest Neighbors 
## 
## 660 samples
##  12 predictor
##   2 classes: 'X0', 'X1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 527, 529, 528, 528, 528 
## Resampling results across tuning parameters:
## 
##   k   AUC         Precision  Recall     F        
##    1  0.01531351  0.9906690  0.9813680  0.9858674
##    2  0.03975411  0.9718050  0.9627360  0.9670969
##    3  0.08794891  0.9471378  0.9137893  0.9300171
##    4  0.14852669  0.9363456  0.8857182  0.9101235
##    5  0.20134065  0.9275594  0.8670588  0.8954945
##    6  0.23262267  0.9271053  0.8577565  0.8902832
##    7  0.26626455  0.9045271  0.8367715  0.8684708
##    8  0.30199311  0.9006608  0.8345280  0.8652216
##    9  0.33676949  0.8998245  0.8368536  0.8663024
##   10  0.37544557  0.9034392  0.8531327  0.8766393
##   11  0.41977588  0.8974069  0.8389877  0.8659569
##   12  0.51278777  0.9015892  0.8063748  0.8507208
##   13  0.53533805  0.9139745  0.8133789  0.8601818
##   14  0.56322610  0.9201311  0.8297127  0.8721634
##   15  0.59509878  0.9220910  0.8437483  0.8802925
##   16  0.62541808  0.9200700  0.8391518  0.8770375
##   17  0.63911595  0.8997668  0.8344460  0.8644435
##   18  0.63629313  0.8850692  0.8367715  0.8593495
##   19  0.63411355  0.8799844  0.8367989  0.8573031
##   20  0.63569334  0.8734876  0.8461012  0.8586705
##   21  0.63641764  0.8878423  0.8461286  0.8650525
##   22  0.63885140  0.8902156  0.8530780  0.8701018
##   23  0.64129663  0.8949971  0.8601094  0.8761299
##   24  0.65043252  0.8996528  0.8647332  0.8809300
##   25  0.65415752  0.9005268  0.8764159  0.8878991
##   26  0.66272448  0.9092332  0.8787141  0.8932304
##   27  0.66408085  0.9157686  0.8764159  0.8953258
##   28  0.66631582  0.9171438  0.8905609  0.9031136
##   29  0.66988241  0.9197105  0.8952394  0.9067297
##   30  0.66898087  0.9189684  0.8858550  0.9016642
## 
## AUC was used to select the optimal model using the largest value.
## The final value used for the model was k = 29.
plot(modeloknn2)

De los anteriores de validacion cruzada, vemos que existen algunos K buenos que vamos a comparar entre sí: K= [5, 14, 30]

Modelo KNN

K=5

# MODELO CON K = 5
set.seed(1)
test_pred = knn3(desayunoAvena~., data=cereal.train1, k = 5)
# Desempeño en entrenamiento.  K = 5
predmod <- predict(test_pred, cereal.train1, type = "prob")
pronknn<-ifelse(predmod[,2] > 0.5 ,1, 0)
confknn<-confusionMatrix(as.factor(pronknn),
                          cereal.train1$desayunoAvena, positive = "1")
confknn$table
##           Reference
## Prediction   0   1
##          0 392  18
##          1  37 213
confknn$overall
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##   9.166667e-01   8.202614e-01   8.929101e-01   9.366069e-01   6.500000e-01 
## AccuracyPValue  McnemarPValue 
##   6.055453e-58   1.521924e-02
confknn$byClass
##          Sensitivity          Specificity       Pos Pred Value 
##            0.9220779            0.9137529            0.8520000 
##       Neg Pred Value            Precision               Recall 
##            0.9560976            0.8520000            0.9220779 
##                   F1           Prevalence       Detection Rate 
##            0.8856549            0.3500000            0.3227273 
## Detection Prevalence    Balanced Accuracy 
##            0.3787879            0.9179154
# Desempeño en prueba.  K = 5
predmod <- predict(test_pred, cereal.test1, type = "prob")
pronknn<-ifelse(predmod[,2] > 0.5 ,1, 0)
confknn<-confusionMatrix(as.factor(pronknn),
                          cereal.test1$desayunoAvena, positive = "1")
confknn$table
##           Reference
## Prediction   0   1
##          0 123   6
##          1  18  73
confknn$overall
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##   8.909091e-01   7.706542e-01   8.420393e-01   9.288411e-01   6.409091e-01 
## AccuracyPValue  McnemarPValue 
##   2.642856e-17   2.474467e-02
confknn$byClass
##          Sensitivity          Specificity       Pos Pred Value 
##            0.9240506            0.8723404            0.8021978 
##       Neg Pred Value            Precision               Recall 
##            0.9534884            0.8021978            0.9240506 
##                   F1           Prevalence       Detection Rate 
##            0.8588235            0.3590909            0.3318182 
## Detection Prevalence    Balanced Accuracy 
##            0.4136364            0.8981955
# CURVA ROC y AUC.  K = 5

#crear objeto de predicciones
pr<-prediction(pronknn,cereal.test$desayunoAvena)
#creacion del objeto de la curva
curvaROC<-performance(pr,measure="tpr",x.measure="fpr")
#grafico de la curva
plot(curvaROC)

#calcular el AUC
auc<-performance(pr,measure = "auc")
auc <- auc@y.values[[1]]
#ver el AUC
auc
## [1] 0.8981955

K=14

# MODELO CON K = 14
set.seed(1)
test_pred = knn3(desayunoAvena~., data=cereal.train1, k = 14)
# Desempeño en entrenamiento.  K = 14
predmod <- predict(test_pred, cereal.train1, type = "prob")
pronknn<-ifelse(predmod[,2] > 0.5 ,1, 0)
confknn<-confusionMatrix(as.factor(pronknn),
                          cereal.train1$desayunoAvena, positive = "1")
confknn$table
##           Reference
## Prediction   0   1
##          0 374  38
##          1  55 193
confknn$overall
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##   8.590909e-01   6.954813e-01   8.301839e-01   8.847384e-01   6.500000e-01 
## AccuracyPValue  McnemarPValue 
##   7.487918e-34   9.709107e-02
confknn$byClass
##          Sensitivity          Specificity       Pos Pred Value 
##            0.8354978            0.8717949            0.7782258 
##       Neg Pred Value            Precision               Recall 
##            0.9077670            0.7782258            0.8354978 
##                   F1           Prevalence       Detection Rate 
##            0.8058455            0.3500000            0.2924242 
## Detection Prevalence    Balanced Accuracy 
##            0.3757576            0.8536464
# Desempeño en prueba.  K = 14
predmod <- predict(test_pred, cereal.test1, type = "prob")
pronknn<-ifelse(predmod[,2] > 0.5 ,1, 0)
confknn<-confusionMatrix(as.factor(pronknn),
                          cereal.test1$desayunoAvena, positive = "1")
confknn$table
##           Reference
## Prediction   0   1
##          0 118  13
##          1  23  66
confknn$overall
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##   8.363636e-01   6.541183e-01   7.807270e-01   8.826822e-01   6.409091e-01 
## AccuracyPValue  McnemarPValue 
##   1.146879e-10   1.336144e-01
confknn$byClass
##          Sensitivity          Specificity       Pos Pred Value 
##            0.8354430            0.8368794            0.7415730 
##       Neg Pred Value            Precision               Recall 
##            0.9007634            0.7415730            0.8354430 
##                   F1           Prevalence       Detection Rate 
##            0.7857143            0.3590909            0.3000000 
## Detection Prevalence    Balanced Accuracy 
##            0.4045455            0.8361612
# CURVA ROC y AUC.  K = 14

#crear objeto de predicciones
pr<-prediction(pronknn,cereal.test$desayunoAvena)
#creacion del objeto de la curva
curvaROC<-performance(pr,measure="tpr",x.measure="fpr")
#grafico de la curva
plot(curvaROC)

#calcular el AUC
auc<-performance(pr,measure = "auc")
auc <- auc@y.values[[1]]
#ver el AUC
auc
## [1] 0.8361612

K=30

# MODELO CON K = 30
set.seed(1)
test_pred = knn3(desayunoAvena~., data=cereal.train1, k = 30)
# Desempeño en entrenamiento.  K = 30
predmod <- predict(test_pred, cereal.train1, type = "prob")
pronknn<-ifelse(predmod[,2] > 0.5 ,1, 0)
confknn<-confusionMatrix(as.factor(pronknn),
                          cereal.train1$desayunoAvena, positive = "1")
confknn$table
##           Reference
## Prediction   0   1
##          0 371  43
##          1  58 188
confknn$overall
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##   8.469697e-01   6.686352e-01   8.172017e-01   8.735919e-01   6.500000e-01 
## AccuracyPValue  McnemarPValue 
##   6.904146e-30   1.636040e-01
confknn$byClass
##          Sensitivity          Specificity       Pos Pred Value 
##            0.8138528            0.8648019            0.7642276 
##       Neg Pred Value            Precision               Recall 
##            0.8961353            0.7642276            0.8138528 
##                   F1           Prevalence       Detection Rate 
##            0.7882600            0.3500000            0.2848485 
## Detection Prevalence    Balanced Accuracy 
##            0.3727273            0.8393273
# Desempeño en prueba.  K = 30
predmod <- predict(test_pred, cereal.test1, type = "prob")
pronknn<-ifelse(predmod[,2] > 0.5 ,1, 0)
confknn<-confusionMatrix(as.factor(pronknn),
                          cereal.test1$desayunoAvena, positive = "1")
confknn$table
##           Reference
## Prediction   0   1
##          0 123  12
##          1  18  67
confknn$overall
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##   8.636364e-01   7.086093e-01   8.110884e-01   9.060683e-01   6.409091e-01 
## AccuracyPValue  McnemarPValue 
##   1.089003e-13   3.613104e-01
confknn$byClass
##          Sensitivity          Specificity       Pos Pred Value 
##            0.8481013            0.8723404            0.7882353 
##       Neg Pred Value            Precision               Recall 
##            0.9111111            0.7882353            0.8481013 
##                   F1           Prevalence       Detection Rate 
##            0.8170732            0.3590909            0.3045455 
## Detection Prevalence    Balanced Accuracy 
##            0.3863636            0.8602208
# CURVA ROC y AUC.  K = 30

#crear objeto de predicciones
pr<-prediction(pronknn,cereal.test$desayunoAvena)
#creacion del objeto de la curva
curvaROC<-performance(pr,measure="tpr",x.measure="fpr")
#grafico de la curva
plot(curvaROC)

#calcular el AUC
auc<-performance(pr,measure = "auc")
auc <- auc@y.values[[1]]
#ver el AUC
auc
## [1] 0.8602208

Conclusión

Reusltados finales
Reusltados finales