Modelos de Clasificacion

Subir los datos

# Clear every object from the global environment before the analysis starts.
# NOTE(review): this does not detach packages or reset options; running the
# document in a fresh R session is the more reliable way to get a clean state.
rm(list = ls())

# Run the helper script found in the working directory. Judging by the
# messages printed right after this call it attaches the packages used below
# (dplyr, lubridate, car, caret, kknn, ...).
# The dot is escaped so it matches a literal ".", and the lookup stops with a
# clear message unless exactly one script is found (source() needs one path).
archivo_librerias <- list.files(pattern = "LIBRERIAS\\.R")
if (length(archivo_librerias) != 1L) {
  stop(
    "Expected exactly one file matching 'LIBRERIAS.R' in ", getwd(),
    " but found ", length(archivo_librerias), ".",
    call. = FALSE
  )
}
source(archivo_librerias)
## Warning: replacing previous import 'vctrs::data_frame' by 'tibble::data_frame'
## when loading 'dplyr'
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## Loading required package: lattice
## 
## Attaching package: 'BSDA'
## The following objects are masked from 'package:carData':
## 
##     Vocab, Wool
## The following object is masked from 'package:datasets':
## 
##     Orange
## Warning: package 'gmodels' was built under R version 4.0.4
## Warning: package 'kknn' was built under R version 4.0.5
## 
## Attaching package: 'kknn'
## The following object is masked from 'package:caret':
## 
##     contr.dummy
# Print the current working directory, so the location from which relative
# paths are resolved is recorded in the output.
getwd()
## [1] "D:/Usuarios/OSCAGAAl/Documents/Asesorias/Clases Diego Tarquino/Modelos de Clasificacion"
# Read the input CSV and keep it as a plain data.frame (read_csv returns a
# tibble). The path is relative to the working directory printed above, whose
# "Insumos" sub-folder holds the file, so the script no longer depends on one
# user's absolute "D:/Usuarios/..." location.
ruta_datos <- file.path("Insumos", "default.csv")
df <- as.data.frame(read_csv(ruta_datos))
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   SEX = col_character(),
##   EDUCATION = col_character(),
##   MARRIAGE = col_character()
## )
## See spec(...) for full column specifications.
# Convert the categorical columns (and the response `default`) to factors in
# one pass, then list the column names of the resulting data frame.
columnas_factor <- c("SEX", "EDUCATION", "MARRIAGE", "Cluster", "default")
df[columnas_factor] <- lapply(df[columnas_factor], as.factor)
names(df)
##  [1] "ID"        "LIMIT_BAL" "SEX"       "EDUCATION" "MARRIAGE"  "AGE"      
##  [7] "PAY_0"     "PAY_2"     "PAY_3"     "PAY_4"     "PAY_5"     "PAY_6"    
## [13] "BILL_AMT1" "BILL_AMT2" "BILL_AMT3" "BILL_AMT4" "BILL_AMT5" "BILL_AMT6"
## [19] "PAY_AMT1"  "PAY_AMT2"  "PAY_AMT3"  "PAY_AMT4"  "PAY_AMT5"  "PAY_AMT6" 
## [25] "default"   "Cluster"

Elimino cluster

# Drop the Cluster column by name rather than by position (it was column 26),
# so the result stays correct if the column order of the input ever changes.
# drop = FALSE guarantees df1 is still a data frame.
df1 <- df[, names(df) != "Cluster", drop = FALSE]
names(df1)
##  [1] "ID"        "LIMIT_BAL" "SEX"       "EDUCATION" "MARRIAGE"  "AGE"      
##  [7] "PAY_0"     "PAY_2"     "PAY_3"     "PAY_4"     "PAY_5"     "PAY_6"    
## [13] "BILL_AMT1" "BILL_AMT2" "BILL_AMT3" "BILL_AMT4" "BILL_AMT5" "BILL_AMT6"
## [19] "PAY_AMT1"  "PAY_AMT2"  "PAY_AMT3"  "PAY_AMT4"  "PAY_AMT5"  "PAY_AMT6" 
## [25] "default"

¿Está la Data Desbalanceada?

# Percentage of rows in each class of the response variable.
conteo_clases <- table(df1$default)
print((conteo_clases / nrow(df1)) * 100)
## 
##        0        1 
## 77.68656 22.31344

Estrategia de Entrenamiento

Dividir la Base en Entrenamiento y Test

# Fix the random seed so the 80/20 split below is reproducible.
set.seed(12147)
# df1$Filas=row.names(df1)

# Draw the row indices of the training set (80 % of the rows, one resample,
# returned as a matrix rather than a list).
# NOTE(review): the partition is balanced on df1$ID, not on the outcome
# df1$default, so the class proportions are not explicitly preserved between
# the two sets. Switching to df1$default would change the split, and therefore
# every result reported further down; confirm before changing.
filas_entrenamiento <- createDataPartition(
  y = df1$ID,
  p = 0.8,
  list = FALSE,
  times = 1
)

# Rows not selected for training form the test set.
prueba <- df1[-filas_entrenamiento, ]
entrenamiento <- df1[filas_entrenamiento, ]

Modelo de Vecinos Más Cercanos

Es un método no paramétrico usado para clasificación y regresión. La idea básica es que un nuevo caso se clasifica según la clase que tengan sus K vecinos más cercanos. Es un concepto simple, intuitivo y fácil de implementar; por eso es un método de uso común.

Seleccionar el Número de Vecinos k

Construimos el modelo alimentándolo con los datos de aprendizaje; se le indica el valor máximo de K que el modelo puede usar y él determina el óptimo. (Es importante aclarar que el modelo debería ser calibrado para obtener el mejor resultado; aquí se corre con las opciones por defecto.)

# Variable selection: the numeric predictors plus the response `default`.
# The categorical columns (SEX, EDUCATION, MARRIAGE) and ID are left out.

V <- c(
  "LIMIT_BAL", "AGE",
  paste0("PAY_", c(0, 2:6)),   # PAY_0, PAY_2, ..., PAY_6 (there is no PAY_1)
  paste0("BILL_AMT", 1:6),
  paste0("PAY_AMT", 1:6),
  "default"
)

# Apply the same column selection to the training and the test set.
entrenamiento1 <- entrenamiento[, V]
prueba1 <- prueba[, V]

Modelo sin Variables Categóricas

# Fit the k-nearest-neighbours classifier on the training columns, letting
# train.kknn pick the number of neighbours among values up to kmax = 15.
# NOTE(review): the printed result reports "Best k: 15", i.e. the upper bound
# of the search, so a larger kmax may be worth trying.
Modelo <- train.kknn(formula = default ~ ., data = entrenamiento1, kmax = 15)
print(Modelo)
## 
## Call:
## train.kknn(formula = default ~ ., data = entrenamiento1, kmax = 15)
## 
## Type of response variable: nominal
## Minimal misclassification: 0.1953465
## Best kernel: optimal
## Best k: 15

Validamos el Algoritmo con la Base de Prueba

# List the test-set columns; the response `default` is the last one.
names(prueba1)
##  [1] "LIMIT_BAL" "AGE"       "PAY_0"     "PAY_2"     "PAY_3"     "PAY_4"    
##  [7] "PAY_5"     "PAY_6"     "BILL_AMT1" "BILL_AMT2" "BILL_AMT3" "BILL_AMT4"
## [13] "BILL_AMT5" "BILL_AMT6" "PAY_AMT1"  "PAY_AMT2"  "PAY_AMT3"  "PAY_AMT4" 
## [19] "PAY_AMT5"  "PAY_AMT6"  "default"
# Predict the class of every test row. The response column is removed by name
# (it used to be dropped as column 21), so the call keeps working if the
# selected variables change.
predictores_prueba <- prueba1[, names(prueba1) != "default", drop = FALSE]
predicciones_test <- predict(Modelo, predictores_prueba)

Matriz de confusión

# Confusion matrix on the test set. caret::confusionMatrix() takes the
# predicted classes first (`data`) and the true classes second (`reference`);
# the arguments are named here so they cannot be swapped.
# NOTE: the output printed below was produced with the two arguments reversed,
# so in it the rows are the TRUE classes and the columns the predictions, and
# sensitivity/specificity are exchanged with the predictive values. Re-run to
# regenerate it.
confusionMatrix(data = predicciones_test, reference = prueba1$default)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4258  330
##          1  840  492
##                                          
##                Accuracy : 0.8024         
##                  95% CI : (0.792, 0.8124)
##     No Information Rate : 0.8611         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.3442         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.8352         
##             Specificity : 0.5985         
##          Pos Pred Value : 0.9281         
##          Neg Pred Value : 0.3694         
##              Prevalence : 0.8611         
##          Detection Rate : 0.7193         
##    Detection Prevalence : 0.7750         
##       Balanced Accuracy : 0.7169         
##                                          
##        'Positive' Class : 0              
## 
# Per-class accuracy on the test set: of the cases that truly belong to a
# class, the share that was predicted correctly. Counts are taken from the
# confusion matrix printed above, where each row is one true class:
#   true class 0: 4258 predicted as 0, 330 predicted as 1 (4588 cases)
#   true class 1:  840 predicted as 0, 492 predicted as 1 (1332 cases)
# These row totals (77.5 % / 22.5 %) match the class balance of the data.
# The denominator is therefore the row total. Dividing by the column totals
# (4258 + 840 and 492 + 330) measures how often a prediction is right, which
# overstates how well class 1 is detected.
print("Accuracy Clase0")
## [1] "Accuracy Clase0"
accuracy_clase0 <- (4258 / (4258 + 330)) * 100
accuracy_clase0
## [1] 92.80732
print("Accuracy Clase1")
## [1] "Accuracy Clase1"
accuracy_clase1 <- (492 / (492 + 840)) * 100
accuracy_clase1
## [1] 36.93694