#Analysis of the database: load the database

library(readxl)   # read_excel()
library(dplyr)    # count(), filter()
database<-read_excel("Downloads/credit_short.xlsx")

#Analyse the information, classify it, and filter it

freq<-count(database, loan_status,sort = TRUE)
freq
## # A tibble: 5 × 2
##   loan_status            n
##   <chr>              <int>
## 1 Fully Paid           728
## 2 Charged Off          145
## 3 Current              121
## 4 Late (31-120 days)     5
## 5 In Grace Period        1
dim(database)
## [1] 1000   26

This visual aid will help understand the characteristics of the database.

barchart<-barplot(freq$n,names.arg = freq$loan_status, main= "Loan status", ylab="frequency", xlab="status", las=2)

Since we only need Charged Off and Fully Paid, we will filter the database to keep only the observations with these classifications.

data_filter<-filter(database, loan_status=="Charged Off"| loan_status=="Fully Paid")
data_filter
## # A tibble: 873 × 26
##    loan_amnt term  int_r…¹ insta…² grade sub_g…³ emp_t…⁴ emp_l…⁵ home_…⁶ annua…⁷
##        <dbl> <chr>   <dbl>   <dbl> <chr> <chr>   <chr>   <chr>   <chr>     <dbl>
##  1      3600 36 m…   14.0    123.  C     C4      leadman 10+ ye… MORTGA…   55000
##  2     24700 36 m…   12.0    820.  C     C1      Engine… 10+ ye… MORTGA…   65000
##  3     20000 60 m…   10.8    433.  B     B4      truck … 10+ ye… MORTGA…   63000
##  4     10400 60 m…   22.4    290.  F     F1      Contra… 3 years MORTGA…  104433
##  5     11950 36 m…   13.4    405.  C     C3      Veteri… 4 years RENT      34000
##  6     20000 36 m…    9.17   638.  B     B2      Vice P… 10+ ye… MORTGA…  180000
##  7     20000 36 m…    8.49   631.  B     B1      road d… 10+ ye… MORTGA…   85000
##  8     10000 36 m…    6.49   306.  A     A2      SERVIC… 6 years RENT      85000
##  9      8000 36 m…   11.5    264.  B     B5      Vendor… 10+ ye… MORTGA…   42000
## 10      1400 36 m…   12.9     47.1 C     C2      Logist… 3 years MORTGA…   64000
## # … with 863 more rows, 16 more variables: verification_status <chr>,
## #   issue_d <chr>, loan_status <chr>, purpose <chr>, title <chr>, dti <dbl>,
## #   earliest_cr_line <chr>, open_acc <dbl>, pub_rec <dbl>, revol_bal <dbl>,
## #   revol_util <dbl>, total_acc <dbl>, initial_list_status <chr>,
## #   application_type <chr>, mort_acc <dbl>, pub_rec_bankruptcies <dbl>, and
## #   abbreviated variable names ¹​int_rate, ²​installment, ³​sub_grade, ⁴​emp_title,
## #   ⁵​emp_length, ⁶​home_ownership, ⁷​annual_inc
## # ℹ Use `print(n = ...)` to see more rows, and `colnames()` to see all variable names
db1<-na.omit(data_filter)
dim(db1)
## [1] 805  26

#Logistic regression

\[ P(\text{default}=1 \mid x) = \frac{e^{\beta_0 + \beta_1 x_1 + \dots + \beta_k x_k}}{1 + e^{\beta_0 + \beta_1 x_1 + \dots + \beta_k x_k}} \]

##Change the data into numeric: for the regression to work, all the data must be numeric, so first we will change the loan status into a dummy variable (1 = Charged Off, 0 = Fully Paid).

default<-ifelse(db1$loan_status=="Charged Off",1,0)   # dummy variable: 1 = Charged Off, 0 = Fully Paid
db2<-cbind(db1,default)
db2<-db2[,-13]   # drop the original loan_status column (column 13), now replaced by default
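
The later steps work with a data frame `db3` in which the remaining character columns (term, emp_length, home_ownership, and so on) have also been recoded as numbers; that conversion step is not shown in this section. A minimal sketch of what it could look like, assuming each character column is simply recoded by its factor level:

# Hypothetical sketch: recode every remaining character column as numeric factor levels
# (the actual recoding used to build db3 is not shown in this section)
db3<-db2
char_cols<-sapply(db3,is.character)
db3[char_cols]<-lapply(db3[char_cols],function(x) as.numeric(factor(x)))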

#Machine learning training: split the data into two parts, a training set (80%) and a testing set (20%)

set.seed(123)
dim3<-dim(db3)
train_sample<-sample(dim3[1],dim3[1]*0.8)   # randomly take 80% of the rows for the training set
credit_train<-db3[train_sample,]            # training set (80%)
credit_test<-db3[-train_sample,]            # testing set (remaining 20%)

###Generate the logistic model on the training set

credit_model<-glm(default~.,data=credit_train,family=binomial())
credit_step<-step(credit_model,direction = "both",trace=F)   # stepwise variable selection in both directions
summary(credit_step)
## 
## Call:
## glm(formula = default ~ loan_amnt + term + int_rate + emp_length + 
##     home_ownership + open_acc + pub_rec + total_acc + pub_rec_bankruptcies, 
##     family = binomial(), data = credit_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0091  -0.5964  -0.4121  -0.2649   2.6594  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -5.922e+00  6.241e-01  -9.489  < 2e-16 ***
## loan_amnt            -3.405e-05  1.540e-05  -2.212 0.026997 *  
## term                  1.101e+00  3.016e-01   3.650 0.000263 ***
## int_rate              1.318e-01  2.957e-02   4.458 8.29e-06 ***
## emp_length            6.094e-02  3.061e-02   1.991 0.046478 *  
## home_ownership        5.960e-01  1.597e-01   3.731 0.000191 ***
## open_acc              6.782e-02  2.873e-02   2.360 0.018257 *  
## pub_rec               4.838e-01  2.319e-01   2.086 0.036992 *  
## total_acc            -1.876e-02  1.333e-02  -1.408 0.159215    
## pub_rec_bankruptcies -6.151e-01  3.902e-01  -1.577 0.114881    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 585.66  on 643  degrees of freedom
## Residual deviance: 493.77  on 634  degrees of freedom
## AIC: 513.77
## 
## Number of Fisher Scoring iterations: 5

###Apply the equation to the testing set

We are going to apply the equation created with the training set to the testing set and check how accurately it predicts whether a person will fall into default.

prediction<-predict(credit_step,newdata=credit_test)   # predictions on the log-odds (link) scale
head(prediction)
##          1          3          9         22         25         27 
## -2.2120046 -0.2405108  0.3137358 -0.5644444 -2.4280834 -0.6962604
prob<-exp(prediction)/(1+exp(prediction))   # inverse logit: convert log-odds to probabilities
head(prob)
##          1          3          9         22         25         27 
## 0.09867764 0.44016047 0.57779687 0.36251974 0.08105611 0.33264187
mn<-mean(prob)                 # average predicted default probability
predp<-ifelse(prob>0.5,1,0)    # classify as default when the predicted probability exceeds 0.5
head(predp)
##  1  3  9 22 25 27 
##  0  0  1  0  0  0
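
As a side note, the probabilities computed above can also be obtained directly from `predict()` by asking for the response scale, which is equivalent to the manual inverse-logit conversion:

prob_direct<-predict(credit_step,newdata=credit_test,type="response")   # same values as prob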

###Confusion matrix: test how many defaults and full payments were predicted correctly and how many were not; it also calculates the accuracy

predictfactor<-factor(predp,levels = c(0,1))
test<-factor(credit_test$default,levels = c(0,1))
#confusionMatrix(predictfactor,test)   # requires library(caret)
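
Since the `caret::confusionMatrix()` call is commented out above, a minimal base-R sketch of the same check (a cross-tabulation of predictions against actual defaults, plus overall accuracy) could look like this:

cm<-table(Predicted=predictfactor, Actual=test)   # confusion matrix: rows = predicted, columns = actual
cm
accuracy<-sum(diag(cm))/sum(cm)                   # share of correctly classified loans
accuracy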