Load the data
rm(list=ls())
source(list.files(pattern = "LIBRERIAS.R"))
## Warning: replacing previous import 'vctrs::data_frame' by 'tibble::data_frame'
## when loading 'dplyr'
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## Loading required package: lattice
##
## Attaching package: 'BSDA'
## The following objects are masked from 'package:carData':
##
## Vocab, Wool
## The following object is masked from 'package:datasets':
##
## Orange
## Warning: package 'gmodels' was built under R version 4.0.4
## Warning: package 'kknn' was built under R version 4.0.5
##
## Attaching package: 'kknn'
## The following object is masked from 'package:caret':
##
## contr.dummy
getwd()
## [1] "D:/Usuarios/OSCAGAAl/Documents/Asesorias/Clases Diego Tarquino/Modelos de Clasificacion"
df <- as.data.frame(read_csv("D:/Usuarios/OSCAGAAl/Documents/Asesorias/Clases Diego Tarquino/Modelos de Clasificacion/Insumos/default.csv"))
## Parsed with column specification:
## cols(
## .default = col_double(),
## SEX = col_character(),
## EDUCATION = col_character(),
## MARRIAGE = col_character()
## )
## See spec(...) for full column specifications.
df$SEX=as.factor(df$SEX)
df$EDUCATION=as.factor(df$EDUCATION)
df$MARRIAGE=as.factor(df$MARRIAGE)
df$Cluster=as.factor(df$Cluster)
df$default=as.factor(df$default)
names(df)
## [1] "ID" "LIMIT_BAL" "SEX" "EDUCATION" "MARRIAGE" "AGE"
## [7] "PAY_0" "PAY_2" "PAY_3" "PAY_4" "PAY_5" "PAY_6"
## [13] "BILL_AMT1" "BILL_AMT2" "BILL_AMT3" "BILL_AMT4" "BILL_AMT5" "BILL_AMT6"
## [19] "PAY_AMT1" "PAY_AMT2" "PAY_AMT3" "PAY_AMT4" "PAY_AMT5" "PAY_AMT6"
## [25] "default" "Cluster"
Drop the Cluster column
df1=df[,-26]
names(df1)
## [1] "ID" "LIMIT_BAL" "SEX" "EDUCATION" "MARRIAGE" "AGE"
## [7] "PAY_0" "PAY_2" "PAY_3" "PAY_4" "PAY_5" "PAY_6"
## [13] "BILL_AMT1" "BILL_AMT2" "BILL_AMT3" "BILL_AMT4" "BILL_AMT5" "BILL_AMT6"
## [19] "PAY_AMT1" "PAY_AMT2" "PAY_AMT3" "PAY_AMT4" "PAY_AMT5" "PAY_AMT6"
## [25] "default"
The data are imbalanced
(table(df1$default)/nrow(df1))*100
##
## 0 1
## 77.68656 22.31344
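With about 78% of the records in class 0, a trivial rule that always predicts "no default" would already reach roughly 77.7% accuracy, so that figure is the natural benchmark for the models below. A quick check using only the data already loaded:
# Accuracy of always predicting the majority class
max(table(df1$default)) / nrow(df1)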
Split the Data into Training and Test Sets
set.seed(12147)
#df1$Filas=row.names(df1)
filas_entrenamiento=createDataPartition(df1$ID,p=0.8,list=FALSE,times=1)
entrenamiento=df1[filas_entrenamiento,]
prueba=df1[-filas_entrenamiento,]
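Note that createDataPartition is applied here to the ID column, so the split is effectively a simple random 80/20 split and is not stratified by the response. A minimal alternative sketch (hypothetical object names, not used for the results below) that stratifies on default so both partitions keep roughly the 78/22 class proportion:
# Hypothetical stratified split on the response instead of the ID
filas_strat = createDataPartition(df1$default, p = 0.8, list = FALSE, times = 1)
entrenamiento_strat = df1[filas_strat, ]
prueba_strat = df1[-filas_strat, ]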
K-Nearest Neighbors is a non-parametric method used for classification and regression. The basic idea is that a new case is classified according to the class of its K nearest neighbors. It is a simple, intuitive concept that is easy to implement, which is why it is such a commonly used method.
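As an illustration of that idea (a from-scratch sketch, not how train.kknn is implemented internally), a new case can be classified by computing its distance to every training case and taking a majority vote among the k closest ones:
# Toy majority-vote KNN for a single new case (illustrative only)
knn_un_caso = function(X_train, y_train, x_nuevo, k = 5) {
  d = sqrt(rowSums((scale(X_train, center = x_nuevo, scale = FALSE))^2))  # Euclidean distance to each training case
  vecinos = order(d)[1:k]                                                 # indices of the k nearest neighbors
  names(which.max(table(y_train[vecinos])))                               # most frequent class among them
}
Because the vote depends on distances, variables with large ranges (for example LIMIT_BAL or the BILL_AMT columns) dominate unless the predictors are put on comparable scales.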
Selecting the Number of Neighbors k
We build the model by feeding it the training data and specifying the maximum value of K it may use; the algorithm then determines the optimal value. (It is worth stressing that the model should be tuned to obtain the best result; here it is run with the default options.)
# Variable selection
V= c("LIMIT_BAL", "AGE","PAY_0","PAY_2","PAY_3","PAY_4",
"PAY_5","PAY_6","BILL_AMT1","BILL_AMT2", "BILL_AMT3",
"BILL_AMT4", "BILL_AMT5", "BILL_AMT6",
"PAY_AMT1" , "PAY_AMT2", "PAY_AMT3" ,
"PAY_AMT4" , "PAY_AMT5" ,"PAY_AMT6" , "default" )
entrenamiento1=entrenamiento[,V]
prueba1=prueba[,V]
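Given the roughly 78/22 imbalance noted earlier, one option before fitting would be to rebalance the training subset, for example by down-sampling the majority class with caret's downSample. This is a sketch only (entrenamiento1_bal is a hypothetical name; the results that follow use the unbalanced entrenamiento1):
# Optional: down-sample class 0 so both classes have equal counts (illustration only)
entrenamiento1_bal = downSample(x = entrenamiento1[, -21],
                                y = entrenamiento1$default,
                                yname = "default")
table(entrenamiento1_bal$default)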
Model without Categorical Variables
Modelo=train.kknn(default~.,data = entrenamiento1,kmax = 15)
Modelo
##
## Call:
## train.kknn(formula = default ~ ., data = entrenamiento1, kmax = 15)
##
## Type of response variable: nominal
## Minimal misclassification: 0.1953465
## Best kernel: optimal
## Best k: 15
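Note that the selected k (15) equals the kmax supplied, so the search may simply be hitting its upper bound. As mentioned above, the model should really be tuned; a sketch of a wider search over k and several weighting kernels (Modelo_tune is a hypothetical name and is not used for the validation below):
# Wider search over k and several kernels (illustration only)
Modelo_tune = train.kknn(default ~ ., data = entrenamiento1,
                         kmax = 51,
                         kernel = c("rectangular", "triangular", "optimal"))
Modelo_tune$best.parameters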
Validate the Algorithm on the Test Set
names(prueba1)
## [1] "LIMIT_BAL" "AGE" "PAY_0" "PAY_2" "PAY_3" "PAY_4"
## [7] "PAY_5" "PAY_6" "BILL_AMT1" "BILL_AMT2" "BILL_AMT3" "BILL_AMT4"
## [13] "BILL_AMT5" "BILL_AMT6" "PAY_AMT1" "PAY_AMT2" "PAY_AMT3" "PAY_AMT4"
## [19] "PAY_AMT5" "PAY_AMT6" "default"
predicciones_test=predict(Modelo,prueba1[,-21])
Confusion matrix
confusionMatrix(prueba1$default,predicciones_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4258 330
## 1 840 492
##
## Accuracy : 0.8024
## 95% CI : (0.792, 0.8124)
## No Information Rate : 0.8611
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3442
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8352
## Specificity : 0.5985
## Pos Pred Value : 0.9281
## Neg Pred Value : 0.3694
## Prevalence : 0.8611
## Detection Rate : 0.7193
## Detection Prevalence : 0.7750
## Balanced Accuracy : 0.7169
##
## 'Positive' Class : 0
##
print("Accuracy Clase0")
## [1] "Accuracy Clase0"
((4258/(4258+840))*100)
## [1] 83.52295
print("Accuracy Clase1")
## [1] "Accuracy Clase1"
((492/(492+330))*100)
## [1] 59.85401
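One caution when reading the matrix above: caret's confusionMatrix() expects the predictions as its first argument (data) and the observed classes as the second (reference). Here the observed classes were passed first, so the rows labelled "Prediction" actually hold the observed classes and the "Reference" columns hold the model's predictions. A sketch of the conventional call, also declaring "1" (default) as the positive class so that sensitivity refers to defaulters:
# Conventional argument order, with the default class as the positive class
confusionMatrix(data = predicciones_test,
                reference = prueba1$default,
                positive = "1")
The overall accuracy is unchanged, but the per-class rates then carry their usual meaning.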