library(ISLR)
library(ggplot2)
library(caret)
# Print the `Default` data set (loaded from ISLR) to inspect it.
Default

# Model 1: logistic regression of `default` on `balance`, fitted on all rows.
logit_reg <- glm(
  formula = default ~ balance,
  family = binomial(link = "logit"),
  data = Default
)
summary(logit_reg)
#> Call:
#> glm(formula = default ~ balance, family = binomial(link = "logit"),
#> data = Default)
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -2.2697 -0.1465 -0.0589 -0.0221 3.7589
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) -1.065e+01 3.612e-01 -29.49 <2e-16 ***
#> balance 5.499e-03 2.204e-04 24.95 <2e-16 ***
#> ---
#> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> (Dispersion parameter for binomial family taken to be 1)
#> Null deviance: 2920.6 on 9999 degrees of freedom
#> Residual deviance: 1596.5 on 9998 degrees of freedom
#> AIC: 1600.5
#> Number of Fisher Scoring iterations: 8
# Evaluate model 1 on a grid of balances spanning the observed range
# (step of 1) and collect the predicted values on the response scale.
dominio <- seq(from = min(Default$balance), to = max(Default$balance), by = 1)
pred <- predict(
  logit_reg,
  newdata = list(balance = dominio),
  type = "response"
)

# One row per grid point, with the column names used by the plot below.
plt.logit <- data.frame(Balance = dominio, Probabilidad = pred)
plt.logit

# Fitted probability curve as a function of balance.
# NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` for lines;
# `size` is kept here because the installed ggplot2 version is not known.
ggplot(plt.logit, aes(x = Balance, y = Probabilidad)) +
  geom_line(color = "blue", size = 1) +
  ggtitle("Modelo 1 - Regresion Logistica") +
  labs(x = "Balance", y = "Probabilidad") +
  theme_minimal()

# Number of observations available for the train/test split.
nrow(Default)
#> [1] 10000
# Sequential 70/30 train/test split of `Default`.
# The test rows are every row that is not a training row, so the two sets are
# disjoint. Starting the test set at row 0.7 * n instead would put that
# boundary row in both sets and leak a training observation into the
# evaluation.
n_total <- nrow(Default)
n_train <- floor(0.7 * n_total) # whole number of rows even if 0.7 * n is not
train_rows <- seq_len(n_train)
df.train <- Default[train_rows, ]
df.test <- Default[setdiff(seq_len(n_total), train_rows), ]

# Model 2: logistic regression of `default` on `balance`, fitted on the
# training rows only.
logit_reg2 <- glm(
  formula = default ~ balance,
  data = df.train,
  family = binomial(link = "logit")
)
summary(logit_reg2)
#> Call:
#> glm(formula = default ~ balance, family = binomial(link = "logit"),
#> data = df.train)
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -2.3037 -0.1474 -0.0585 -0.0219 3.6909
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) -1.069e+01 4.289e-01 -24.93 <2e-16 ***
#> balance 5.558e-03 2.625e-04 21.18 <2e-16 ***
#> ---
#> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> (Dispersion parameter for binomial family taken to be 1)
#> Null deviance: 2090.7 on 6999 degrees of freedom
#> Residual deviance: 1128.9 on 6998 degrees of freedom
#> AIC: 1132.9
#> Number of Fisher Scoring iterations: 8
# Predictions from model 2 for the held-out rows, on the response scale.
preds2 <- predict(logit_reg2, newdata = df.test, type = "response")

# Pair each test balance with its prediction; the first column keeps the
# auto-generated name `df.test.balance`.
df.results2 <- data.frame(
  df.test.balance = df.test$balance,
  Probabilidad = preds2
)
df.results2
# Resultado de evaluar la probabilidad que está dando el modelo.
# Turn the predicted probabilities into class labels with a 0.5 cut-off.
threshresults <- ifelse(df.results2$Probabilidad >= 0.5, "Yes", "No")

# The reference labels are a factor with levels "No" / "Yes", while the
# thresholded predictions are plain character strings.
class(df.test$default)
#> [1] "factor"
levels(df.test$default)
#> [1] "No" "Yes"
class(threshresults)
#> [1] "character"

# Build the prediction factor with the reference's levels rather than with
# as.factor(): as.factor() keeps only the labels that actually occur, so a
# model that predicts a single class would yield a one-level factor that no
# longer matches the reference.
pred_class <- factor(threshresults, levels = levels(df.test$default))

# NOTE(review): with no `positive` argument the metrics are reported for the
# first level ("No"); pass positive = "Yes" if they should describe
# defaulters instead.
confusionMatrix(pred_class, df.test$default)
#> Confusion Matrix and Statistics
#> Reference
#> Prediction No Yes
#> No 2895 61
#> Yes 13 32
#> Accuracy : 0.9753
#> 95% CI : (0.9691, 0.9806)
#> No Information Rate : 0.969
#> P-Value [Acc > NIR] : 0.02275
#> Kappa : 0.4527
#> Mcnemar's Test P-Value : 4.665e-08
#> Sensitivity : 0.9955
#> Specificity : 0.3441
#> Pos Pred Value : 0.9794
#> Neg Pred Value : 0.7111
#> Prevalence : 0.9690
#> Detection Rate : 0.9647
#> Detection Prevalence : 0.9850
#> Balanced Accuracy : 0.6698
#> 'Positive' Class : No