library(readxl)
fondo <- read_excel("C:/Users/User/Downloads/BD-1.xlsx")
View(fondo)
library(readxl)
library(MASS)
library(stats)
library( psych )
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(caret)
## Loading required package: lattice
summary(fondo)
## ID Year Default WCTA
## Min. : 1.0 Min. :1995 Min. :0.000 Min. :-2.24027
## 1st Qu.:192.0 1st Qu.:1998 1st Qu.:0.000 1st Qu.: 0.03117
## Median :358.0 Median :2000 Median :0.000 Median : 0.11727
## Mean :356.3 Mean :2000 Mean :0.018 Mean : 0.14251
## 3rd Qu.:521.0 3rd Qu.:2002 3rd Qu.:0.000 3rd Qu.: 0.24178
## Max. :830.0 Max. :2004 Max. :1.000 Max. : 0.76604
## RETA EBITTA METL STA
## Min. :-3.31242 Min. :-0.59182 Min. : 0.02372 Min. :0.03586
## 1st Qu.: 0.09474 1st Qu.: 0.03899 1st Qu.: 0.62349 1st Qu.:0.17084
## Median : 0.21893 Median : 0.05156 Median : 1.13646 Median :0.26104
## Mean : 0.21047 Mean : 0.05180 Mean : 1.95396 Mean :0.30365
## 3rd Qu.: 0.37342 3rd Qu.: 0.06509 3rd Qu.: 2.24429 3rd Qu.:0.36694
## Max. : 1.63965 Max. : 0.19804 Max. :60.60715 Max. :5.00777
str(fondo)
## tibble [4,000 x 8] (S3: tbl_df/tbl/data.frame)
## $ ID : num [1:4000] 1 1 1 1 1 1 2 2 2 2 ...
## $ Year : num [1:4000] 1999 2000 2001 2002 2003 ...
## $ Default: num [1:4000] 0 0 0 0 0 0 0 0 0 0 ...
## $ WCTA : num [1:4000] 0.501 0.548 0.451 0.307 0.447 ...
## $ RETA : num [1:4000] 0.307 0.322 0.225 0.192 0.217 ...
## $ EBITTA : num [1:4000] 0.0434 0.0518 0.0268 0.0301 0.0325 ...
## $ METL : num [1:4000] 0.956 1.065 0.804 0.387 0.792 ...
## $ STA : num [1:4000] 0.335 0.335 0.246 0.253 0.276 ...
set.seed(1234)
train<-createDataPartition(y=fondo$Default, p=0.8, list=FALSE,times=1)
dt_train<-fondo[train,]
dt_test<-fondo[-train,]
set.seed(1234)
glm.null.P=glm(Default ~ 1 ,data=dt_train,family=binomial(link="probit"))
model.aic.forward.P<-step(glm.null.P,direction="forward",trace=1,scope=~WCTA+RETA+EBITTA+METL+STA,family=binomial(link="probit"))
## Start: AIC=574.15
## Default ~ 1
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## + METL 1 482.56 486.56
## + RETA 1 495.18 499.18
## + EBITTA 1 526.09 530.09
## + WCTA 1 554.13 558.13
## <none> 572.15 574.15
## + STA 1 570.51 574.51
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=486.56
## Default ~ METL
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## + RETA 1 446.07 452.07
## + EBITTA 1 465.89 471.89
## + WCTA 1 478.54 484.54
## <none> 482.56 486.56
## + STA 1 481.37 487.37
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=452.07
## Default ~ METL + RETA
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## + EBITTA 1 437.33 445.33
## <none> 446.07 452.07
## + STA 1 444.70 452.70
## + WCTA 1 446.03 454.03
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=445.33
## Default ~ METL + RETA + EBITTA
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## + STA 1 434.54 444.54
## <none> 437.33 445.33
## + WCTA 1 437.18 447.18
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=444.54
## Default ~ METL + RETA + EBITTA + STA
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## <none> 434.54 444.54
## + WCTA 1 434.50 446.50
summary(glm.null.P)
##
## Call:
## glm(formula = Default ~ 1, family = binomial(link = "probit"),
## data = dt_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.1896 -0.1896 -0.1896 -0.1896 2.8383
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.10118 0.05329 -39.43 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 572.15 on 3199 degrees of freedom
## Residual deviance: 572.15 on 3199 degrees of freedom
## AIC: 574.15
##
## Number of Fisher Scoring iterations: 6
set.seed(1234)
glm.null.L=glm(Default~ 1,data=dt_train,family=binomial(link="logit"))
model.aic.forward.L<-step(glm.null.L,direction="forward",trace=1,scope=~WCTA+RETA+EBITTA+METL+STA,family=binomial(link="logit"))
## Start: AIC=574.15
## Default ~ 1
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## + METL 1 464.35 468.35
## + RETA 1 506.58 510.58
## + EBITTA 1 532.14 536.14
## + WCTA 1 554.12 558.12
## <none> 572.15 574.15
## + STA 1 570.71 574.71
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=468.35
## Default ~ METL
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## + RETA 1 440.42 446.42
## + EBITTA 1 451.23 457.23
## + WCTA 1 460.50 466.50
## <none> 464.35 468.35
## + STA 1 463.64 469.64
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=446.42
## Default ~ METL + RETA
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## + EBITTA 1 432.89 440.89
## <none> 440.42 446.42
## + STA 1 439.64 447.64
## + WCTA 1 440.32 448.32
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=440.89
## Default ~ METL + RETA + EBITTA
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## <none> 432.89 440.89
## + STA 1 431.16 441.16
## + WCTA 1 432.74 442.74
summary(glm.null.L)
##
## Call:
## glm(formula = Default ~ 1, family = binomial(link = "logit"),
## data = dt_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.1896 -0.1896 -0.1896 -0.1896 2.8383
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.0099 0.1336 -30.01 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 572.15 on 3199 degrees of freedom
## Residual deviance: 572.15 on 3199 degrees of freedom
## AIC: 574.15
##
## Number of Fisher Scoring iterations: 6
mod_HIP<-train(Default~ METL+RETA+EBITTA,data = dt_train,method="glm",family = binomial(link = "probit"))
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
mod_HIP$finalModel
##
## Call: NULL
##
## Coefficients:
## (Intercept) METL RETA EBITTA
## -1.3586 -0.5756 -0.7428 -5.2781
##
## Degrees of Freedom: 3199 Total (i.e. Null); 3196 Residual
## Null Deviance: 572.2
## Residual Deviance: 437.3 AIC: 445.3
exp(-5.2781)
## [1] 0.005102116
Predicciones1<-predict(mod_HIP,dt_test)
predicciones <-ifelse(Predicciones1>0.5,yes = 1,no=0)
matriz_confusion<-table(dt_test$Default,predicciones,dnn=c("observaciones","predicciones"))
matriz_confusion
## predicciones
## observaciones 0 1
## 0 784 1
## 1 15 0
(784+0)/800
## [1] 0.98