#anañysis of the database load the database
database<-read_excel("Downloads/credit_short.xlsx")
#analyse the information, classify it and filter it
freq<-count(database, loan_status,sort = TRUE)
freq
## # A tibble: 5 × 2
## loan_status n
## <chr> <int>
## 1 Fully Paid 728
## 2 Charged Off 145
## 3 Current 121
## 4 Late (31-120 days) 5
## 5 In Grace Period 1
dim(database)
## [1] 1000 26
This visual aid will help undersatbar the caracteristicas of the database
barchart<-barplot(freq$n,names.arg = freq$loan_status, main= "Loan status", ylab="frequency", xlab="status", las=2)
since we only need charge off and fully paid, we will filter thhis data
base to show only the obnes that have these clasiffication
data_filter<-filter(database, loan_status=="Charged Off"| loan_status=="Fully Paid")
data_filter
## # A tibble: 873 × 26
## loan_amnt term int_r…¹ insta…² grade sub_g…³ emp_t…⁴ emp_l…⁵ home_…⁶ annua…⁷
## <dbl> <chr> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 3600 36 m… 14.0 123. C C4 leadman 10+ ye… MORTGA… 55000
## 2 24700 36 m… 12.0 820. C C1 Engine… 10+ ye… MORTGA… 65000
## 3 20000 60 m… 10.8 433. B B4 truck … 10+ ye… MORTGA… 63000
## 4 10400 60 m… 22.4 290. F F1 Contra… 3 years MORTGA… 104433
## 5 11950 36 m… 13.4 405. C C3 Veteri… 4 years RENT 34000
## 6 20000 36 m… 9.17 638. B B2 Vice P… 10+ ye… MORTGA… 180000
## 7 20000 36 m… 8.49 631. B B1 road d… 10+ ye… MORTGA… 85000
## 8 10000 36 m… 6.49 306. A A2 SERVIC… 6 years RENT 85000
## 9 8000 36 m… 11.5 264. B B5 Vendor… 10+ ye… MORTGA… 42000
## 10 1400 36 m… 12.9 47.1 C C2 Logist… 3 years MORTGA… 64000
## # … with 863 more rows, 16 more variables: verification_status <chr>,
## # issue_d <chr>, loan_status <chr>, purpose <chr>, title <chr>, dti <dbl>,
## # earliest_cr_line <chr>, open_acc <dbl>, pub_rec <dbl>, revol_bal <dbl>,
## # revol_util <dbl>, total_acc <dbl>, initial_list_status <chr>,
## # application_type <chr>, mort_acc <dbl>, pub_rec_bankruptcies <dbl>, and
## # abbreviated variable names ¹int_rate, ²installment, ³sub_grade, ⁴emp_title,
## # ⁵emp_length, ⁶home_ownership, ⁷annual_inc
## # ℹ Use `print(n = ...)` to see more rows, and `colnames()` to see all variable names
db1<-na.omit(data_filter)
dim(db1)
## [1] 805 26
#Logistic regression \[ \]
##cHANGE THE DATA INTO THE NUMERIC for the regression to work alll the data must be numeric. So firts wer will change the loan stattuis to a dummy variable (1=charge iff, 0=fully paid)
default<-ifelse(db1$loan_status=="Charged Off",1,0)
db2<-cbind(db1,default)
db2<-db2[,-13]
#machine learing trading split the data in two parts: training set (80%) and the testing set (20%)
set.seed(123)
dim3<-dim(db3)
train_sample<-sample(dim3[1],dim3[1]*0.8) ##tomara el 80% de los y hara una base de datos con el
credit_train<-db3[train_sample,]
credit_test<-db3[-train_sample,]
###generate the logic in the training set
credit_model<-glm(default~.,data=credit_train,binomial())
credit_step<-step(credit_model,direction = "both",trace=F)
summary(credit_step)
##
## Call:
## glm(formula = default ~ loan_amnt + term + int_rate + emp_length +
## home_ownership + open_acc + pub_rec + total_acc + pub_rec_bankruptcies,
## family = binomial(), data = credit_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0091 -0.5964 -0.4121 -0.2649 2.6594
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.922e+00 6.241e-01 -9.489 < 2e-16 ***
## loan_amnt -3.405e-05 1.540e-05 -2.212 0.026997 *
## term 1.101e+00 3.016e-01 3.650 0.000263 ***
## int_rate 1.318e-01 2.957e-02 4.458 8.29e-06 ***
## emp_length 6.094e-02 3.061e-02 1.991 0.046478 *
## home_ownership 5.960e-01 1.597e-01 3.731 0.000191 ***
## open_acc 6.782e-02 2.873e-02 2.360 0.018257 *
## pub_rec 4.838e-01 2.319e-01 2.086 0.036992 *
## total_acc -1.876e-02 1.333e-02 -1.408 0.159215
## pub_rec_bankruptcies -6.151e-01 3.902e-01 -1.577 0.114881
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 585.66 on 643 degrees of freedom
## Residual deviance: 493.77 on 634 degrees of freedom
## AIC: 513.77
##
## Number of Fisher Scoring iterations: 5
###apply the equation in the testing set
we are going to apply the equation that we created with the training set in the testing set and check how accurately it can predict the person to fall into default
prediction<-predict(credit_step,newdata=credit_test)
head(prediction)
## 1 3 9 22 25 27
## -2.2120046 -0.2405108 0.3137358 -0.5644444 -2.4280834 -0.6962604
prob<-exp(prediction)/(1+exp(prediction))
head(prob)
## 1 3 9 22 25 27
## 0.09867764 0.44016047 0.57779687 0.36251974 0.08105611 0.33264187
mn<-mean(prob)
predp<-ifelse(prob>0.5,1,0)
head(predp)
## 1 3 9 22 25 27
## 0 0 1 0 0 0
###confusion matrix test how many defaults and full payment were predicted correlyctu and how many were not, it also calculates they accuracy
predictfactor<-factor(predp,levels = c(0,1))
test<-factor(credit_test$default,levels = c(0,1))
#confusionMatrix(predict_factor,test)