Το dataset το οποίο επιλέκτηκε για την εργασία αφορά την χορήγηση ή απόρριψη αίτησης δανείου, βάσει στοιχείων. Το σετ δεδομένων αυτό αποτελεί μετεξέλιξη του Credit Risk data set επί του οποίου έχουν γίνει αλλαγές και προσθήκες μεταβλητών.
Αποτελείται από ένα αρχείο με το όνομα loan_data.csv
Link Dataset: https://www.kaggle.com/datasets/taweilo/loan-approval-classification-data
print(summary(df))
## person_age person_gender person_education person_income
## Min. : 20.00 Length:45000 Length:45000 Min. : 8000
## 1st Qu.: 24.00 Class :character Class :character 1st Qu.: 47204
## Median : 26.00 Mode :character Mode :character Median : 67048
## Mean : 27.76 Mean : 80319
## 3rd Qu.: 30.00 3rd Qu.: 95789
## Max. :144.00 Max. :7200766
## person_emp_exp person_home_ownership loan_amnt loan_intent
## Min. : 0.00 Length:45000 Min. : 500 Length:45000
## 1st Qu.: 1.00 Class :character 1st Qu.: 5000 Class :character
## Median : 4.00 Mode :character Median : 8000 Mode :character
## Mean : 5.41 Mean : 9583
## 3rd Qu.: 8.00 3rd Qu.:12237
## Max. :125.00 Max. :35000
## loan_int_rate loan_percent_income cb_person_cred_hist_length credit_score
## Min. : 5.42 Min. :0.0000 Min. : 2.000 Min. :390.0
## 1st Qu.: 8.59 1st Qu.:0.0700 1st Qu.: 3.000 1st Qu.:601.0
## Median :11.01 Median :0.1200 Median : 4.000 Median :640.0
## Mean :11.01 Mean :0.1397 Mean : 5.867 Mean :632.6
## 3rd Qu.:12.99 3rd Qu.:0.1900 3rd Qu.: 8.000 3rd Qu.:670.0
## Max. :20.00 Max. :0.6600 Max. :30.000 Max. :850.0
## previous_loan_defaults_on_file loan_status
## Length:45000 Min. :0.0000
## Class :character 1st Qu.:0.0000
## Mode :character Median :0.0000
## Mean :0.2222
## 3rd Qu.:0.0000
## Max. :1.0000
print(str(df))
## spc_tbl_ [45,000 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ person_age : num [1:45000] 22 21 25 23 24 21 26 24 24 21 ...
## $ person_gender : chr [1:45000] "female" "female" "female" "female" ...
## $ person_education : chr [1:45000] "Master" "High School" "High School" "Bachelor" ...
## $ person_income : num [1:45000] 71948 12282 12438 79753 66135 ...
## $ person_emp_exp : num [1:45000] 0 0 3 0 1 0 1 5 3 0 ...
## $ person_home_ownership : chr [1:45000] "RENT" "OWN" "MORTGAGE" "RENT" ...
## $ loan_amnt : num [1:45000] 35000 1000 5500 35000 35000 2500 35000 35000 35000 1600 ...
## $ loan_intent : chr [1:45000] "PERSONAL" "EDUCATION" "MEDICAL" "MEDICAL" ...
## $ loan_int_rate : num [1:45000] 16 11.1 12.9 15.2 14.3 ...
## $ loan_percent_income : num [1:45000] 0.49 0.08 0.44 0.44 0.53 0.19 0.37 0.37 0.35 0.13 ...
## $ cb_person_cred_hist_length : num [1:45000] 3 2 3 2 4 2 3 4 2 3 ...
## $ credit_score : num [1:45000] 561 504 635 675 586 532 701 585 544 640 ...
## $ previous_loan_defaults_on_file: chr [1:45000] "No" "Yes" "No" "No" ...
## $ loan_status : num [1:45000] 1 0 1 1 1 1 1 1 1 1 ...
## - attr(*, "spec")=
## .. cols(
## .. person_age = col_double(),
## .. person_gender = col_character(),
## .. person_education = col_character(),
## .. person_income = col_double(),
## .. person_emp_exp = col_double(),
## .. person_home_ownership = col_character(),
## .. loan_amnt = col_double(),
## .. loan_intent = col_character(),
## .. loan_int_rate = col_double(),
## .. loan_percent_income = col_double(),
## .. cb_person_cred_hist_length = col_double(),
## .. credit_score = col_double(),
## .. previous_loan_defaults_on_file = col_character(),
## .. loan_status = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
## NULL
set.seed(901)
split <- sample.split(df$loan_status, SplitRatio = 0.65)
train <- subset(df, split==TRUE)
test <- subset(df, split==FALSE)
cat("Το dataset έχει στο test set ", nrow(test), " και στο train set ", nrow(train))
## Το dataset έχει στο test set 15750 και στο train set 29250
loanLog <- glm(loan_status ~ ., data = train, family = binomial)
summary(loanLog)
##
## Call:
## glm(formula = loan_status ~ ., family = binomial, data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.920e-01 4.472e-01 -0.877 0.3807
## person_age 2.159e-02 1.354e-02 1.595 0.1108
## person_gendermale 4.555e-02 4.406e-02 1.034 0.3012
## person_educationBachelor 1.049e-02 5.835e-02 0.180 0.8574
## person_educationDoctorate 5.155e-02 1.850e-01 0.279 0.7805
## person_educationHigh School 1.100e-02 6.127e-02 0.180 0.8575
## person_educationMaster 2.004e-02 7.012e-02 0.286 0.7750
## person_income 6.764e-07 2.626e-07 2.576 0.0100 **
## person_emp_exp -2.006e-02 1.206e-02 -1.664 0.0961 .
## person_home_ownershipOTHER -1.334e-01 3.924e-01 -0.340 0.7339
## person_home_ownershipOWN -1.409e+00 1.243e-01 -11.334 < 2e-16 ***
## person_home_ownershipRENT 6.806e-01 4.967e-02 13.702 < 2e-16 ***
## loan_amnt -1.076e-04 4.963e-06 -21.692 < 2e-16 ***
## loan_intentEDUCATION -9.481e-01 7.332e-02 -12.930 < 2e-16 ***
## loan_intentHOMEIMPROVEMENT -1.432e-01 8.221e-02 -1.742 0.0815 .
## loan_intentMEDICAL -3.730e-01 7.022e-02 -5.313 1.08e-07 ***
## loan_intentPERSONAL -7.570e-01 7.505e-02 -10.086 < 2e-16 ***
## loan_intentVENTURE -1.343e+00 8.006e-02 -16.778 < 2e-16 ***
## loan_int_rate 3.372e-01 8.251e-03 40.869 < 2e-16 ***
## loan_percent_income 1.611e+01 3.863e-01 41.703 < 2e-16 ***
## cb_person_cred_hist_length -5.202e-04 1.131e-02 -0.046 0.9633
## credit_score -8.650e-03 5.109e-04 -16.930 < 2e-16 ***
## previous_loan_defaults_on_fileYes -2.035e+01 1.271e+02 -0.160 0.8728
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 30988 on 29249 degrees of freedom
## Residual deviance: 12865 on 29227 degrees of freedom
## AIC: 12911
##
## Number of Fisher Scoring iterations: 19
predictTest <- predict(loanLog, type='response', newdata=test)
tapply(predictTest, test$loan_status, mean)
## 0 1
## 0.09468006 0.67905682
Από το παραπάνω μπορούμε να υποθέσουμε πως οι ανεξάρτητες μεταβλητές με ισχυρή συσχέτηση με την εξαρτημένη είναι οι: 1. person_home_ownershipOWN 2. person_home_ownershipRENT 3. loan_amnt 4. loan_intentEDUCATION 5. loan_intentMEDICAL 6. loan_intentPERSONAL 7. loan_intentVENTURE 8. loan_int_rate 9. loan_percent_income 10. credit_score
Υπάρχουν επίσης και μεταβλητές με λιγότερο ισχυρή συσχέτηση: 1. person_income: Μεσαία ισχυή συσχέτησης 2. person_emp_exp: Σχετική συσχέτηση 3. loan_intentHOMEIMPROVEMENT: Σχετική συσχέτηση
Η predict χρησιμοποιείται για την εξέταση της εξίσωσης με βάση τις τιμές ενός άλλου σετ δεδομένων.
table_result <- table(test$loan_status, predictTest > 0.5)
accuracy <- sum(diag(table_result)) / sum(table_result)
sensitivity <- table_result[2,2] /(table_result[2,2] + table_result[2,1])
specificity <- table_result[1,1] / (table_result[1,1] + table_result[1,2])
cat("Accuracy: ", accuracy, "\nSensitivity: ", sensitivity, "\nSpecificity: ", specificity)
## Accuracy: 0.8965714
## Sensitivity: 0.7531429
## Specificity: 0.937551
table(test$loan_status)
##
## 0 1
## 12250 3500
baseline_accuracy <- max(table(test$loan_status)) / nrow(test)
print(baseline_accuracy)
## [1] 0.7777778
Στο baseline accuracy φαίνεται να έχουμε χαμηλότερες επιδόσεις αντί με το κατώφλι t = 0.5
ROCRpred <- prediction(predictTest, test$loan_status)
train2 = na.omit(train)
test2 = na.omit(test)
loanLog2 <- glm(loan_status ~ ., data = train2, family = binomial)
predictTest2 = predict(loanLog2, type = "response", newdata = test2)
cat("Το dataset έχει στο test set ", nrow(test2), " και στο train set ", nrow(train2))
## Το dataset έχει στο test set 15750 και στο train set 29250
ROCRpred2 <- prediction(predictTest2, test2$loan_status)
ROCRperf2 <- performance(ROCRpred2, 'tpr', 'fpr')
plot(ROCRperf2,colorize = TRUE)
as.numeric(performance(ROCRpred2, "auc")@y.values)
## [1] 0.9532357
Από το διάγραμμα βλέπουμε πως το διάγραμμα ROC έχει πολύ απότομη άνοδο με την τιμή του TPR παραμένει 1 για όλες τις τιμές του FPR από 0.3 και πάνω. Αυτό σημαίνει πως η επιφάνεια κάτω από την καμπύλη, το AUC, είναι μεγάλο και ως εκ τούτου έχουμε μεγάλη διαχωριστική ικανότητα. Το παραπάνω επαλιθέυεται από το auc που είναι ίσο με 0.95.
dfTree <- rpart(loan_status ~ ., data = train2, method = "class", minbucket = 25)
prp(dfTree)
summary(dfTree)
## Call:
## rpart(formula = loan_status ~ ., data = train2, method = "class",
## minbucket = 25)
## n= 29250
##
## CP nsplit rel error xerror xstd
## 1 0.16407692 0 1.0000000 1.0000000 0.010938835
## 2 0.05338462 3 0.4855385 0.4896923 0.008193841
## 3 0.01230769 4 0.4321538 0.4360000 0.007783182
## 4 0.01076923 5 0.4198462 0.4296923 0.007732670
## 5 0.01030769 6 0.4090769 0.4141538 0.007606046
## 6 0.01000000 7 0.3987692 0.4100000 0.007571652
##
## Variable importance
## previous_loan_defaults_on_file loan_percent_income
## 37 21
## loan_int_rate person_income
## 18 10
## person_home_ownership credit_score
## 8 5
## loan_amnt
## 1
##
## Node number 1: 29250 observations, complexity param=0.1640769
## predicted class=0 expected loss=0.2222222 P(node) =1
## class counts: 22750 6500
## probabilities: 0.778 0.222
## left son=2 (14914 obs) right son=3 (14336 obs)
## Primary splits:
## previous_loan_defaults_on_file splits as RL, improve=3005.3630, (0 missing)
## loan_percent_income < 0.245 to the left, improve=1777.2260, (0 missing)
## loan_int_rate < 14.355 to the left, improve=1180.8420, (0 missing)
## person_income < 42452 to the right, improve= 746.3979, (0 missing)
## person_home_ownership splits as LRLR, improve= 659.3834, (0 missing)
## Surrogate splits:
## loan_percent_income < 0.225 to the left, agree=0.583, adj=0.149, (0 split)
## loan_int_rate < 13.075 to the left, agree=0.579, adj=0.140, (0 split)
## credit_score < 632.5 to the left, agree=0.575, adj=0.132, (0 split)
## person_home_ownership splits as LRLR, agree=0.569, adj=0.121, (0 split)
## person_income < 54673.5 to the right, agree=0.566, adj=0.115, (0 split)
##
## Node number 2: 14914 observations
## predicted class=0 expected loss=0 P(node) =0.5098803
## class counts: 14914 0
## probabilities: 1.000 0.000
##
## Node number 3: 14336 observations, complexity param=0.1640769
## predicted class=0 expected loss=0.453404 P(node) =0.4901197
## class counts: 7836 6500
## probabilities: 0.547 0.453
## left son=6 (11431 obs) right son=7 (2905 obs)
## Primary splits:
## loan_percent_income < 0.245 to the left, improve=1247.2010, (0 missing)
## loan_int_rate < 13.995 to the left, improve= 877.8428, (0 missing)
## person_home_ownership splits as LRLR, improve= 685.4276, (0 missing)
## person_income < 48454.5 to the right, improve= 629.6683, (0 missing)
## loan_intent splits as RLRRLL, improve= 178.5640, (0 missing)
## Surrogate splits:
## loan_amnt < 21006 to the left, agree=0.803, adj=0.03, (0 split)
##
## Node number 6: 11431 observations, complexity param=0.1640769
## predicted class=0 expected loss=0.3482635 P(node) =0.3908034
## class counts: 7450 3981
## probabilities: 0.652 0.348
## left son=12 (8786 obs) right son=13 (2645 obs)
## Primary splits:
## loan_int_rate < 13.995 to the left, improve=997.2887, (0 missing)
## person_income < 42677 to the right, improve=383.6299, (0 missing)
## person_home_ownership splits as LLLR, improve=201.2552, (0 missing)
## loan_intent splits as RLRRLL, improve=188.3031, (0 missing)
## credit_score < 572.5 to the right, improve=120.8526, (0 missing)
## Surrogate splits:
## credit_score < 448.5 to the right, agree=0.769, adj=0.001, (0 split)
##
## Node number 7: 2905 observations, complexity param=0.01230769
## predicted class=1 expected loss=0.1328744 P(node) =0.09931624
## class counts: 386 2519
## probabilities: 0.133 0.867
## left son=14 (642 obs) right son=15 (2263 obs)
## Primary splits:
## person_home_ownership splits as LRLR, improve=303.95780, (0 missing)
## person_income < 40357.5 to the right, improve= 15.75163, (0 missing)
## loan_int_rate < 11.135 to the left, improve= 12.15318, (0 missing)
## loan_intent splits as RLLRRL, improve= 12.11617, (0 missing)
## loan_amnt < 11991 to the right, improve= 11.66541, (0 missing)
## Surrogate splits:
## person_income < 115814 to the right, agree=0.783, adj=0.017, (0 split)
## loan_int_rate < 5.58 to the left, agree=0.780, adj=0.005, (0 split)
## loan_percent_income < 0.555 to the right, agree=0.780, adj=0.003, (0 split)
##
## Node number 12: 8786 observations, complexity param=0.05338462
## predicted class=0 expected loss=0.2336672 P(node) =0.3003761
## class counts: 6733 2053
## probabilities: 0.766 0.234
## left son=24 (8351 obs) right son=25 (435 obs)
## Primary splits:
## person_income < 24446.5 to the right, improve=404.99970, (0 missing)
## loan_int_rate < 11.005 to the left, improve= 96.07259, (0 missing)
## loan_amnt < 5061 to the right, improve= 89.85364, (0 missing)
## person_home_ownership splits as LLLR, improve= 83.22105, (0 missing)
## credit_score < 567.5 to the right, improve= 83.21551, (0 missing)
## Surrogate splits:
## loan_amnt < 950 to the right, agree=0.951, adj=0.005, (0 split)
##
## Node number 13: 2645 observations
## predicted class=1 expected loss=0.2710775 P(node) =0.09042735
## class counts: 717 1928
## probabilities: 0.271 0.729
##
## Node number 14: 642 observations, complexity param=0.01076923
## predicted class=0 expected loss=0.4376947 P(node) =0.02194872
## class counts: 361 281
## probabilities: 0.562 0.438
## left son=28 (572 obs) right son=29 (70 obs)
## Primary splits:
## person_income < 25127 to the right, improve=49.68341, (0 missing)
## loan_int_rate < 14.365 to the left, improve=39.76239, (0 missing)
## loan_amnt < 6900 to the right, improve=32.63347, (0 missing)
## loan_intent splits as RLRRLL, improve=25.36245, (0 missing)
## credit_score < 625.5 to the right, improve=17.73896, (0 missing)
## Surrogate splits:
## loan_amnt < 7443 to the right, agree=0.969, adj=0.714, (0 split)
## loan_int_rate < 5.605 to the right, agree=0.894, adj=0.029, (0 split)
## loan_percent_income < 0.495 to the left, agree=0.894, adj=0.029, (0 split)
##
## Node number 15: 2263 observations
## predicted class=1 expected loss=0.01104728 P(node) =0.07736752
## class counts: 25 2238
## probabilities: 0.011 0.989
##
## Node number 24: 8351 observations
## predicted class=0 expected loss=0.1990181 P(node) =0.2855043
## class counts: 6689 1662
## probabilities: 0.801 0.199
##
## Node number 25: 435 observations
## predicted class=1 expected loss=0.1011494 P(node) =0.01487179
## class counts: 44 391
## probabilities: 0.101 0.899
##
## Node number 28: 572 observations, complexity param=0.01030769
## predicted class=0 expected loss=0.3688811 P(node) =0.01955556
## class counts: 361 211
## probabilities: 0.631 0.369
## left son=56 (447 obs) right son=57 (125 obs)
## Primary splits:
## loan_int_rate < 14.365 to the left, improve=50.96043, (0 missing)
## loan_intent splits as RLRRLL, improve=28.75421, (0 missing)
## person_home_ownership splits as R-L-, improve=22.22472, (0 missing)
## credit_score < 625.5 to the right, improve=14.20263, (0 missing)
## loan_amnt < 24848 to the left, improve=12.61550, (0 missing)
## Surrogate splits:
## person_income < 120761 to the left, agree=0.790, adj=0.040, (0 split)
## credit_score < 518 to the right, agree=0.787, adj=0.024, (0 split)
##
## Node number 29: 70 observations
## predicted class=1 expected loss=0 P(node) =0.002393162
## class counts: 0 70
## probabilities: 0.000 1.000
##
## Node number 56: 447 observations
## predicted class=0 expected loss=0.2572707 P(node) =0.01528205
## class counts: 332 115
## probabilities: 0.743 0.257
##
## Node number 57: 125 observations
## predicted class=1 expected loss=0.232 P(node) =0.004273504
## class counts: 29 96
## probabilities: 0.232 0.768
PredictCART <- predict(dfTree, newdata = test2, type = 'class')
table(test2$loan_status, PredictCART)
## PredictCART
## 0 1
## 0 11801 449
## 1 937 2563
PredictROC <- predict(dfTree, newdata = test2)
pred <- prediction(PredictROC[,2], test2$loan_status)
perf <- performance(pred, "tpr", "fpr")
plot(perf,colorize = TRUE)
as.numeric(performance(pred, "auc")@y.values)
## [1] 0.9418629
Φαίνεται πως η τιμή AUC είναι ελαφρώς χαμηλότερη από αυτή της λογιστικής παλινδρόμισης. Επίσης η γραμμή ROC έχει γωνιές εν αντιθέση με αυτης της λογιστικής παλινδρόμησης. Συνολικά η χρήση της CART παράσκει θετικά αποτελέσματα, ωστόσο αυτά είναι λιγότερο βέλτιστα από αυτά της λογιστικής παλινδρόμησης. Αξίζει να αναφερθεί πως και οι δύο καμπήλες έχουν δημιουργηθεί μέσω της χρήσης των test2 και train2, των οποιών η διαφορά μεταξύ αυτών και των test και train είναι η αφαίρεση των null και κενών τιμών.