library(ISLR)
library(MASS)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
In the last class we learned two classification methods: logistic regression and linear discriminant analysis (LDA).
What we will do now is evaluate the quality of the model's predictions.
To use this method of evaluating a classification model, we need to split our data into two datasets.
We will train the model using the observations in the train dataset and then evaluate it using the observations in the test dataset.
This helps us measure how the model behaves when applied to new data. The confusion matrix has the following structure, with the observed classes in the rows and the predicted classes in the columns:
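\[
\begin{array}{c|cc}
 & \text{Predicted No} & \text{Predicted Yes} \\
\hline
\text{Observed No} & TN & FP \\
\text{Observed Yes} & FN & TP
\end{array}
\]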
Overall, what percentage of the data does the model classify correctly?
\[\text{Accuracy}=\frac{TP+TN}{\text{Total}}\]
Overall, what percentage of the data does the model classify incorrectly?
\[\text{Error rate}=\frac{FP+FN}{\text{Total}}\]
When the class is positive, what percentage does the model classify correctly?
\[\text{Sensitivity}=\frac{TP}{\text{Total positives}}\]
When the class is negative, what percentage does the model classify correctly?
\[\text{Specificity}=\frac{TN}{\text{Total negatives}}\]
When the model predicts positive, what percentage is classified correctly?
\[\text{Precision}=\frac{TP}{\text{Total predicted positive}}\]
When the model predicts negative, what percentage is classified correctly? This is the negative predictive value (NPV).
\[\text{NPV}=\frac{TN}{\text{Total predicted negative}}\]
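Before applying these formulas step by step, here is a minimal helper that computes all six measures at once. This is a sketch (the name classification_metrics is ours, not part of the original code); it mirrors the computation we do manually below.

classification_metrics <- function(observed, predicted) {
  # Cells of the confusion matrix, assuming "Yes" is the positive class.
  tp <- sum(observed == "Yes" & predicted == "Yes")
  tn <- sum(observed == "No"  & predicted == "No")
  fp <- sum(observed == "No"  & predicted == "Yes")
  fn <- sum(observed == "Yes" & predicted == "No")
  total <- length(observed)
  data.frame(
    accuracy    = (tp + tn) / total,
    error_rate  = (fp + fn) / total,
    sensitivity = tp / (tp + fn),
    specificity = tn / (tn + fp),
    precision   = tp / (tp + fp),
    npv         = tn / (tn + fn)
  )
}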
names(Default)
## [1] "default" "student" "balance" "income"
summary(Default)
## default student balance income
## No :9667 No :7056 Min. : 0.0 Min. : 772
## Yes: 333 Yes:2944 1st Qu.: 481.7 1st Qu.:21340
## Median : 823.6 Median :34553
## Mean : 835.4 Mean :33517
## 3rd Qu.:1166.3 3rd Qu.:43808
## Max. :2654.3 Max. :73554
nrow(Default)
## [1] 10000
# Use 80% of the observations for training and the remaining 20% for testing.
ntrain <- nrow(Default)*0.8
ntest <- nrow(Default)*0.2
c(ntrain,ntest)
## [1] 8000 2000
set.seed(161) # fix the seed so the random split is reproducible
index_train <- sample(1:nrow(Default), size = ntrain)
train <- Default[index_train,]
test <- Default[-index_train,]
summary(train)
## default student balance income
## No :7749 No :5664 Min. : 0.0 Min. : 772
## Yes: 251 Yes:2336 1st Qu.: 478.8 1st Qu.:21391
## Median : 819.1 Median :34677
## Mean : 829.8 Mean :33556
## 3rd Qu.:1160.0 3rd Qu.:43782
## Max. :2654.3 Max. :73554
summary(test)
## default student balance income
## No :1918 No :1392 Min. : 0.0 Min. : 2541
## Yes: 82 Yes: 608 1st Qu.: 493.2 1st Qu.:20955
## Median : 837.4 Median :33878
## Mean : 857.5 Mean :33362
## 3rd Qu.:1193.3 3rd Qu.:43883
## Max. :2578.5 Max. :70701
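Note that the default rate differs slightly between train (251/8000 ≈ 3.1%) and test (82/2000 = 4.1%). If we wanted both sets to keep the same class proportions, one option is a stratified split; a minimal sketch, assuming the caret package is installed (we keep the simple random split above for the rest of the analysis):

library(caret)
# createDataPartition samples within each level of `default`, so train
# and test keep roughly the same proportion of "Yes" observations.
index_strat <- createDataPartition(Default$default, p = 0.8, list = FALSE)
train_strat <- Default[index_strat, ]
test_strat <- Default[-index_strat, ]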
# Fit a logistic regression and an LDA model on the training data.
logit_reg <- glm(default~balance+student+income, data = train, family = "binomial")
lda_fit <- lda(default~balance+student+income, data = train)
# type = "response" returns the predicted probability of default = "Yes";
# classify as "Yes" when that probability exceeds 0.5.
predicted_value <- predict(logit_reg, test, type = "response")
predicted_class <- ifelse(predicted_value > 0.5, "Yes", "No")
performance_data <- data.frame(observed = test$default,
                               predicted = predicted_class)
# Totals by observed class and by predicted class.
positive <- sum(performance_data$observed=="Yes")
negative <- sum(performance_data$observed=="No")
predicted_positive <- sum(performance_data$predicted=="Yes")
predicted_negative <- sum(performance_data$predicted=="No")
total <- nrow(performance_data)
data.frame(positive, negative,predicted_positive,predicted_negative)
## positive negative predicted_positive predicted_negative
## 1 82 1918 44 1956
# Cells of the confusion matrix: true/false positives and negatives.
tp <- sum(performance_data$observed=="Yes" & performance_data$predicted=="Yes")
tn <- sum(performance_data$observed=="No" & performance_data$predicted=="No")
fp <- sum(performance_data$observed=="No" & performance_data$predicted=="Yes")
fn <- sum(performance_data$observed=="Yes" & performance_data$predicted=="No")
data.frame(tp,tn,fp,fn)
## tp tn fp fn
## 1 32 1906 12 50
# Evaluation metrics at the 0.5 cutoff.
accuracy <- (tp+tn)/total
error_rate <- (fp+fn)/total
sensitivity <- tp/positive
specificity <- tn/negative
precision <- tp/predicted_positive
npv <- tn / predicted_negative
data.frame(accuracy,error_rate,sensitivity,specificity,precision,npv)
## accuracy error_rate sensitivity specificity precision npv
## 1 0.969 0.031 0.3902439 0.9937435 0.7272727 0.9744376
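Base R's table() builds the same confusion matrix in a single call, which is a quick way to double-check these counts:

table(observed = performance_data$observed,
      predicted = performance_data$predicted)
# Rows are observed classes, columns are predicted classes; this should
# reproduce tn = 1906, fp = 12, fn = 50, tp = 32 from above.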
# Repeat the evaluation with a lower cutoff of 0.25.
predicted_value <- predict(logit_reg, test, type = "response")
predicted_class <- ifelse(predicted_value > 0.25, "Yes", "No")
performance_data <- data.frame(observed = test$default,
                               predicted = predicted_class)
positive <- sum(performance_data$observed=="Yes")
negative <- sum(performance_data$observed=="No")
predicted_positive <- sum(performance_data$predicted=="Yes")
predicted_negative <- sum(performance_data$predicted=="No")
total <- nrow(performance_data)
tp<-sum(performance_data$observed=="Yes" & performance_data$predicted=="Yes")
tn<-sum(performance_data$observed=="No" & performance_data$predicted=="No")
fp<-sum(performance_data$observed=="No" & performance_data$predicted=="Yes")
fn<-sum(performance_data$observed=="Yes" & performance_data$predicted=="No")
accuracy <- (tp+tn)/total
error_rate <- (fp+fn)/total
sensitivity <- tp/positive
specificity <- tn/negative
precision <- tp/predicted_positive
npv <- tn / predicted_negative
data.frame(tp,tn,fp,fn)
## tp tn fp fn
## 1 46 1873 45 36
data.frame(accuracy,error_rate,sensitivity,specificity,precision,npv)
## accuracy error_rate sensitivity specificity precision npv
## 1 0.9595 0.0405 0.5609756 0.9765381 0.5054945 0.981142
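Lowering the cutoff from 0.5 to 0.25 raises sensitivity (0.39 to 0.56): we now catch 46 of the 82 defaulters instead of 32. The price is lower specificity (0.994 to 0.977) and precision (0.73 to 0.51), since false positives rise from 12 to 45. The ROC curve below traces this trade-off across the whole range of cutoffs.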
roc_data <- function(model, test, step = 0.01) {
  out <- data.frame()
  cut <- seq(step, 1, by = step)
  # The predicted probabilities do not depend on the cutoff,
  # so compute them once outside the loop.
  predicted_value <- predict(model, test, type = "response")
  for (i in cut) {
    predicted_class <- ifelse(predicted_value > i, "Yes", "No")
    performance_data <- data.frame(observed = test$default,
                                   predicted = predicted_class)
    predicted_positive <- sum(performance_data$predicted == "Yes")
    predicted_negative <- sum(performance_data$predicted == "No")
    positive <- sum(performance_data$observed == "Yes")
    negative <- sum(performance_data$observed == "No")
    tp <- sum(performance_data$observed == "Yes" & performance_data$predicted == "Yes")
    tn <- sum(performance_data$observed == "No"  & performance_data$predicted == "No")
    fp <- sum(performance_data$observed == "No"  & performance_data$predicted == "Yes")
    fn <- sum(performance_data$observed == "Yes" & performance_data$predicted == "No")
    sensitivity <- tp / positive
    specificity <- tn / negative
    # Precision is NaN at cutoffs where nothing is predicted positive.
    precision <- tp / predicted_positive
    npv <- tn / predicted_negative
    fpr <- fp / negative # false positive rate, equal to 1 - specificity
    out <- rbind(out, c(i, 1 - specificity, sensitivity, specificity, precision, npv, fpr))
  }
  names(out) <- c("cut", "1-Specificity", "Sensitivity", "Specificity", "Precision", "npv", "fpr")
  return(out)
}
roc_graph <- roc_data(logit_reg, test, step = 0.01)
# Plot the ROC curve: sensitivity against 1 - specificity.
plot(roc_graph$`1-Specificity`, roc_graph$Sensitivity,
     xlab = "1 - specificity", ylab = "Sensitivity", type = "l")
# Label every 5th point with its cutoff value.
index <- seq(1, nrow(roc_graph), by = nrow(roc_graph)*0.05)
text(roc_graph$`1-Specificity`[index], roc_graph$Sensitivity[index],
     labels = roc_graph$cut[index], cex = 0.6, pos = 4)
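A common one-number summary of the ROC curve is the area under it (AUC), where 1 means perfect separation and 0.5 corresponds to random guessing. A minimal sketch using the trapezoidal rule on the points we already computed (a rough approximation at step = 0.01; finer steps improve it):

# Order the points by false positive rate so the trapezoids line up.
roc_sorted <- roc_graph[order(roc_graph$`1-Specificity`), ]
fpr <- roc_sorted$`1-Specificity`
tpr <- roc_sorted$Sensitivity
# Trapezoidal rule: width of each slice times its average height.
auc <- sum(diff(fpr) * (head(tpr, -1) + tail(tpr, -1)) / 2)
auc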