Part 1
# Read the loan export; skip = 1 skips the first line of the file before the
# header row is parsed.
loan <- read.csv(
  "C:/Users/Ricky's Computer/Desktop/LoanStats3c.csv",
  skip = 1
)
# Drop rows 235632 and 235633 by position.
# NOTE(review): presumably trailing non-data rows of the export -- confirm.
loan1 <- loan[-(235632:235633), ]
# Seed the RNG so the sample()-based baseline classifiers further down are
# reproducible.
set.seed(1)
Part 2
# Binary indicator: 1 when the loan grade is "A" or "B", 0 otherwise.
highgrade <- with(loan1, ifelse(grade == "A" | grade == "B", 1, 0))
# Append the indicator to the loan data as a new last column `highgrade`.
loan2 <- cbind(loan1, highgrade = highgrade)
# Per-column summary of the augmented data set.
summary(loan2)
id member_id loan_amnt funded_amnt funded_amnt_inv term
: 2 Min. : 137225 Min. : 1000 Min. : 1000 Min. : 950 : 2
10004568: 1 1st Qu.:15579724 1st Qu.: 8325 1st Qu.: 8325 1st Qu.: 8325 36 months:162570
10004652: 1 Median :22953173 Median :13000 Median :13000 Median :13000 60 months: 73059
10004654: 1 Mean :24019300 Mean :14870 Mean :14870 Mean :14865
10004703: 1 3rd Qu.:31767065 3rd Qu.:20000 3rd Qu.:20000 3rd Qu.:20000
10014694: 1 Max. :40860827 Max. :35000 Max. :35000 Max. :35000
(Other) :235624 NA's :2 NA's :2 NA's :2 NA's :2
int_rate installment grade sub_grade emp_title emp_length
12.99% : 12634 Min. : 23.36 C :66565 C2 : 13965 : 13236 10+ years:79505
10.99% : 10684 1st Qu.: 265.68 B :61935 C3 : 13794 Teacher : 4569 2 years :20487
15.61% : 10310 Median : 384.12 D :42992 C1 : 13498 Manager : 3772 3 years :18267
12.49% : 9705 Mean : 442.48 A :36108 B4 : 13475 Registered Nurse: 1960 < 1 year :17982
13.98% : 8858 3rd Qu.: 578.69 E :20121 B5 : 13309 RN : 1816 1 year :14593
14.99% : 8103 Max. :1409.99 F : 6223 C4 : 13093 (Other) :210277 4 years :13528
(Other):175337 NA's :2 (Other): 1687 (Other):154497 NA's : 1 (Other) :71269
home_ownership annual_inc verification_status issue_d loan_status pymnt_plan
: 2 Min. : 3000 : 2 Oct-14 :38783 Current :152902 : 2
ANY : 1 1st Qu.: 45377 Not Verified :70659 Jul-14 :29306 Fully Paid : 59256 n:235626
MORTGAGE:119937 Median : 65000 Source Verified:97741 Nov-14 :25054 Charged Off : 16252 y: 3
OWN : 23007 Mean : 74854 Verified :67229 May-14 :19099 Late (31-120 days): 4289
RENT : 92684 3rd Qu.: 90000 Apr-14 :19071 In Grace Period : 2038
Max. :7500000 Aug-14 :18814 Late (16-30 days) : 702
NA's :2 (Other):85504 (Other) : 192
url
: 2
https://www.lendingclub.com/browse/loanDetail.action?loan_id=10004568: 1
https://www.lendingclub.com/browse/loanDetail.action?loan_id=10004652: 1
https://www.lendingclub.com/browse/loanDetail.action?loan_id=10004654: 1
https://www.lendingclub.com/browse/loanDetail.action?loan_id=10004703: 1
https://www.lendingclub.com/browse/loanDetail.action?loan_id=10014694: 1
(Other) :235624
desc purpose
:220352 debt_consolidation:143006
Borrower added on 03/17/14 > Debt consolidation<br>: 11 credit_card : 55522
Borrower added on 03/10/14 > Debt consolidation<br>: 10 home_improvement : 13045
Borrower added on 02/19/14 > Debt consolidation<br>: 9 other : 10371
Borrower added on 01/29/14 > Debt consolidation<br>: 8 major_purchase : 3858
Borrower added on 01/15/14 > Debt consolidation<br>: 7 medical : 2331
(Other) : 15234 (Other) : 7498
title zip_code addr_state dti delinq_2yrs earliest_cr_line
Debt consolidation :140624 750xx : 2546 CA : 33288 Min. : 0.00 Min. : 0.0000 Aug-01 : 1980
Credit card refinancing: 54347 945xx : 2418 NY : 19923 1st Qu.:12.02 1st Qu.: 0.0000 Aug-00 : 1945
Home improvement : 12880 112xx : 2382 TX : 18967 Median :17.63 Median : 0.0000 Sep-00 : 1719
Other : 10230 606xx : 2273 FL : 15691 Mean :18.04 Mean : 0.3445 Aug-02 : 1711
Major purchase : 3817 300xx : 2085 IL : 9628 3rd Qu.:23.76 3rd Qu.: 0.0000 Aug-99 : 1696
Medical expenses : 2303 070xx : 1936 NJ : 8863 Max. :39.99 Max. :22.0000 Oct-00 : 1658
(Other) : 11430 (Other):221991 (Other):129271 NA's :2 NA's :2 (Other):224922
inq_last_6mths mths_since_last_delinq mths_since_last_record open_acc pub_rec revol_bal
Min. :0.0000 Min. : 0.0 Min. : 0.00 Min. : 0.00 Min. : 0.0000 Min. : 0
1st Qu.:0.0000 1st Qu.: 15.0 1st Qu.: 50.00 1st Qu.: 8.00 1st Qu.: 0.0000 1st Qu.: 6336
Median :0.0000 Median : 30.0 Median : 69.00 Median :11.00 Median : 0.0000 Median : 11686
Mean :0.7558 Mean : 33.4 Mean : 70.71 Mean :11.67 Mean : 0.2225 Mean : 16508
3rd Qu.:1.0000 3rd Qu.: 49.0 3rd Qu.: 97.00 3rd Qu.:14.00 3rd Qu.: 0.0000 3rd Qu.: 20528
Max. :6.0000 Max. :188.0 Max. :121.00 Max. :84.00 Max. :63.0000 Max. :2560703
NA's :2 NA's :115883 NA's :194107 NA's :2 NA's :2 NA's :2
revol_util total_acc initial_list_status out_prncp out_prncp_inv total_pymnt total_pymnt_inv
0% : 590 Min. : 2.00 : 2 Min. : 0 Min. : 0 Min. : 0 Min. : 0
59% : 520 1st Qu.: 17.00 f:112156 1st Qu.: 0 1st Qu.: 0 1st Qu.: 5039 1st Qu.: 5038
58% : 515 Median : 24.00 w:123473 Median : 4537 Median : 4535 Median : 8040 Median : 8037
53% : 512 Mean : 26.01 Mean : 6466 Mean : 6464 Mean : 9979 Mean : 9975
48% : 485 3rd Qu.: 33.00 3rd Qu.:10401 3rd Qu.:10396 3rd Qu.:12816 3rd Qu.:12812
50% : 483 Max. :156.00 Max. :31554 Max. :31531 Max. :50457 Max. :50421
(Other):232526 NA's :2 NA's :2 NA's :2 NA's :2 NA's :2
total_rec_prncp total_rec_int total_rec_late_fee recoveries collection_recovery_fee last_pymnt_d
Min. : 0 Min. : 0.0 Min. : 0.0000 Min. : 0.00 Min. : 0.000 Feb-16 :148107
1st Qu.: 3237 1st Qu.: 937.3 1st Qu.: 0.0000 1st Qu.: 0.00 1st Qu.: 0.000 Jan-16 : 14305
Median : 5444 Median : 1759.4 Median : 0.0000 Median : 0.00 Median : 0.000 Jul-15 : 5878
Mean : 7536 Mean : 2382.9 Mean : 0.5025 Mean : 59.74 Mean : 8.441 Oct-15 : 5578
3rd Qu.: 9737 3rd Qu.: 3174.0 3rd Qu.: 0.0000 3rd Qu.: 0.00 3rd Qu.: 0.000 Dec-15 : 5299
Max. :35000 Max. :16764.0 Max. :252.8000 Max. :20502.14 Max. :3089.500 Sep-15 : 5045
NA's :2 NA's :2 NA's :2 NA's :2 NA's :2 (Other): 51419
last_pymnt_amnt next_pymnt_d last_credit_pull_d collections_12_mths_ex_med mths_since_last_major_derog
Min. : 0.0 : 75510 Feb-16 :195399 Min. : 0.00000 Min. : 0.00
1st Qu.: 305.5 Apr-16: 6771 Dec-15 : 5628 1st Qu.: 0.00000 1st Qu.: 26.00
Median : 500.2 Feb-16: 59 Jan-16 : 5380 Median : 0.00000 Median : 43.00
Mean : 2917.5 Mar-16:153291 Sep-15 : 2980 Mean : 0.01544 Mean : 43.37
3rd Qu.: 1151.3 Nov-15 : 2853 3rd Qu.: 0.00000 3rd Qu.: 60.00
Max. :36234.4 Oct-15 : 2637 Max. :20.00000 Max. :188.00
NA's :2 (Other): 20754 NA's :2 NA's :169153
policy_code application_type annual_inc_joint dti_joint verification_status_joint acc_now_delinq
Min. :1 : 2 Mode:logical Mode:logical Mode:logical Min. :0.000000
1st Qu.:1 INDIVIDUAL:235629 NA's:235631 NA's:235631 NA's:235631 1st Qu.:0.000000
Median :1 Median :0.000000
Mean :1 Mean :0.005734
3rd Qu.:1 3rd Qu.:0.000000
Max. :1 Max. :4.000000
NA's :2 NA's :2
tot_coll_amt tot_cur_bal open_acc_6m open_il_6m open_il_12m open_il_24m mths_since_rcnt_il
Min. : 0 Min. : 0 Mode:logical Mode:logical Mode:logical Mode:logical Mode:logical
1st Qu.: 0 1st Qu.: 29460 NA's:235631 NA's:235631 NA's:235631 NA's:235631 NA's:235631
Median : 0 Median : 82027
Mean : 270 Mean : 139802
3rd Qu.: 0 3rd Qu.: 209506
Max. :9152545 Max. :4026405
NA's :2 NA's :2
total_bal_il il_util open_rv_12m open_rv_24m max_bal_bc all_util total_rev_hi_lim
Mode:logical Mode:logical Mode:logical Mode:logical Mode:logical Mode:logical Min. : 0
NA's:235631 NA's:235631 NA's:235631 NA's:235631 NA's:235631 NA's:235631 1st Qu.: 13300
Median : 22800
Mean : 30709
3rd Qu.: 38400
Max. :9999999
NA's :2
inq_fi total_cu_tl inq_last_12m acc_open_past_24mths avg_cur_bal bc_open_to_buy bc_util
Mode:logical Mode:logical Mode:logical Min. : 0.000 Min. : 0 Min. : 0 Min. : 0.00
NA's:235631 NA's:235631 NA's:235631 1st Qu.: 2.000 1st Qu.: 3110 1st Qu.: 1110 1st Qu.: 45.90
Median : 4.000 Median : 7533 Median : 3625 Median : 68.90
Mean : 4.405 Mean : 13413 Mean : 8462 Mean : 64.66
3rd Qu.: 6.000 3rd Qu.: 18722 3rd Qu.: 9880 3rd Qu.: 87.50
Max. :53.000 Max. :497484 Max. :260250 Max. :255.20
NA's :2 NA's :8 NA's :2447 NA's :2613
chargeoff_within_12_mths delinq_amnt mo_sin_old_il_acct mo_sin_old_rev_tl_op mo_sin_rcnt_rev_tl_op
Min. :0.00000 Min. : 0.0 Min. : 0.0 Min. : 3.0 Min. : 0.00
1st Qu.:0.00000 1st Qu.: 0.0 1st Qu.:101.0 1st Qu.:120.0 1st Qu.: 4.00
Median :0.00000 Median : 0.0 Median :131.0 Median :168.0 Median : 8.00
Mean :0.01072 Mean : 10.2 Mean :128.5 Mean :185.7 Mean : 13.07
3rd Qu.:0.00000 3rd Qu.: 0.0 3rd Qu.:154.0 3rd Qu.:234.0 3rd Qu.: 16.00
Max. :7.00000 Max. :70076.0 Max. :561.0 Max. :842.0 Max. :372.00
NA's :2 NA's :2 NA's :7173 NA's :2 NA's :2
mo_sin_rcnt_tl mort_acc mths_since_recent_bc mths_since_recent_bc_dlq mths_since_recent_inq
Min. : 0.000 Min. : 0.000 Min. : 0.00 Min. : 0.00 Min. : 0.00
1st Qu.: 3.000 1st Qu.: 0.000 1st Qu.: 6.00 1st Qu.: 20.00 1st Qu.: 2.00
Median : 6.000 Median : 1.000 Median : 14.00 Median : 39.00 Median : 5.00
Mean : 7.997 Mean : 1.851 Mean : 24.44 Mean : 39.63 Mean : 6.92
3rd Qu.: 10.000 3rd Qu.: 3.000 3rd Qu.: 30.00 3rd Qu.: 58.00 3rd Qu.:10.00
Max. :226.000 Max. :37.000 Max. :616.00 Max. :170.00 Max. :25.00
NA's :2 NA's :2 NA's :2248 NA's :173350 NA's :21694
mths_since_recent_revol_delinq num_accts_ever_120_pd num_actv_bc_tl num_actv_rev_tl num_bc_sats
Min. : 0.00 Min. : 0.0000 Min. : 0.000 Min. : 0.000 Min. : 0.000
1st Qu.: 16.00 1st Qu.: 0.0000 1st Qu.: 2.000 1st Qu.: 4.000 1st Qu.: 3.000
Median : 32.00 Median : 0.0000 Median : 3.000 Median : 5.000 Median : 4.000
Mean : 35.46 Mean : 0.5034 Mean : 3.687 Mean : 5.805 Mean : 4.648
3rd Qu.: 52.00 3rd Qu.: 0.0000 3rd Qu.: 5.000 3rd Qu.: 7.000 3rd Qu.: 6.000
Max. :180.00 Max. :33.0000 Max. :26.000 Max. :38.000 Max. :35.000
NA's :150865 NA's :2 NA's :2 NA's :2 NA's :2
num_bc_tl num_il_tl num_op_rev_tl num_rev_accts num_rev_tl_bal_gt_0 num_sats
Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 2.0 Min. : 0.00 Min. : 0.00
1st Qu.: 5.000 1st Qu.: 4.000 1st Qu.: 5.000 1st Qu.: 9.0 1st Qu.: 4.00 1st Qu.: 8.00
Median : 8.000 Median : 7.000 Median : 7.000 Median : 14.0 Median : 5.00 Median :11.00
Mean : 8.544 Mean : 8.573 Mean : 8.277 Mean : 15.3 Mean : 5.77 Mean :11.62
3rd Qu.:11.000 3rd Qu.: 11.000 3rd Qu.:10.000 3rd Qu.: 20.0 3rd Qu.: 7.00 3rd Qu.:14.00
Max. :61.000 Max. :150.000 Max. :62.000 Max. :105.0 Max. :38.00 Max. :84.00
NA's :2 NA's :2 NA's :2 NA's :2 NA's :2 NA's :2
num_tl_120dpd_2m num_tl_30dpd num_tl_90g_dpd_24m num_tl_op_past_12m pct_tl_nvr_dlq percent_bc_gt_75
Min. :0.000 Min. :0.000000 Min. : 0.00000 Min. : 0.000 Min. : 16.70 Min. : 0.00
1st Qu.:0.000 1st Qu.:0.000000 1st Qu.: 0.00000 1st Qu.: 1.000 1st Qu.: 91.40 1st Qu.: 22.20
Median :0.000 Median :0.000000 Median : 0.00000 Median : 2.000 Median : 97.60 Median : 50.00
Mean :0.001 Mean :0.003722 Mean : 0.09458 Mean : 2.007 Mean : 94.24 Mean : 50.77
3rd Qu.:0.000 3rd Qu.:0.000000 3rd Qu.: 0.00000 3rd Qu.: 3.000 3rd Qu.:100.00 3rd Qu.: 80.00
Max. :2.000 Max. :4.000000 Max. :22.00000 Max. :26.000 Max. :100.00 Max. :100.00
NA's :7862 NA's :2 NA's :2 NA's :2 NA's :2 NA's :2559
pub_rec_bankruptcies tax_liens tot_hi_cred_lim total_bal_ex_mort total_bc_limit
Min. : 0.0000 Min. : 0.00000 Min. : 0 Min. : 0 Min. : 0
1st Qu.: 0.0000 1st Qu.: 0.00000 1st Qu.: 47800 1st Qu.: 20769 1st Qu.: 7000
Median : 0.0000 Median : 0.00000 Median : 111514 Median : 36685 Median : 13800
Mean : 0.1349 Mean : 0.05576 Mean : 170249 Mean : 48389 Mean : 20031
3rd Qu.: 0.0000 3rd Qu.: 0.00000 3rd Qu.: 247522 3rd Qu.: 60821 3rd Qu.: 26200
Max. :12.0000 Max. :63.00000 Max. :9999999 Max. :2688920 Max. :1090700
NA's :2 NA's :2 NA's :2 NA's :2 NA's :2
total_il_high_credit_limit highgrade
Min. : 0 Min. :0.0000
1st Qu.: 13592 1st Qu.:0.0000
Median : 30000 Median :0.0000
Mean : 39883 Mean :0.4161
3rd Qu.: 53566 3rd Qu.:1.0000
Max. :1241783 Max. :1.0000
NA's :2
# Share of high-grade loans among the rows that hold an actual loan record.
# The denominator is counted from the data (rows with a recorded loan amount)
# instead of being hard-coded, so it stays correct if the input file changes.
n_loans <- sum(!is.na(loan2$loan_amnt))
proportion <- sum(highgrade) / n_loans
proportion
[1] 0.4160905
# Above or Below the median income level
# na.rm = TRUE drops missing incomes before the median is taken.
median_inc <- median(loan2$annual_inc, na.rm = TRUE)
# which() keeps only the rows where the comparison is TRUE. Indexing with the
# raw logical vector would add an all-NA row for every missing income.
above_median <- loan2[which(loan2$annual_inc >= median_inc), ]
below_median <- loan2[which(loan2$annual_inc < median_inc), ]
# Two-sample t-test (Welch by default) on the share of high-grade loans in
# the two income groups.
t.test(above_median$highgrade, below_median$highgrade)
Welch Two Sample t-test
data: above_median$highgrade and below_median$highgrade
t = 45.554, df = 235520, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
0.08813389 0.09605877
sample estimates:
mean of x mean of y
0.4618931 0.3697967
# Loan request is above or below the median loan amount
# na.rm = TRUE drops missing loan amounts before the median is taken.
median_lr <- median(loan2$loan_amnt, na.rm = TRUE)
# which() keeps only the rows where the comparison is TRUE. Indexing with the
# raw logical vector would add an all-NA row for every missing loan amount.
# NOTE(review): both comparisons are strict, so loans exactly at the median
# fall in neither group -- confirm this is intended.
above_lr <- loan2[which(loan2$loan_amnt > median_lr), ]
below_lr <- loan2[which(loan2$loan_amnt < median_lr), ]
# Two-sample t-test (Welch by default) on the share of high-grade loans in
# the two loan-size groups.
t.test(above_lr$highgrade, below_lr$highgrade)
Welch Two Sample t-test
data: above_lr$highgrade and below_lr$highgrade
t = -33.23, df = 233050, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.07162810 -0.06364926
sample estimates:
mean of x mean of y
0.3814771 0.4491158
# Whether the debtor rents their home or not
# TRUE for rows whose home_ownership is exactly "RENT".
renting <- loan2$home_ownership == "RENT"
yes_rent <- loan2[renting, ]
# NOTE(review): every row that is not "RENT" lands here, including rows whose
# home_ownership is an empty string -- confirm that is intended.
no_rent <- loan2[!renting, ]
# Two-sample t-test (Welch by default) on the share of high-grade loans for
# renters versus everyone else.
t.test(yes_rent$highgrade, no_rent$highgrade)
Welch Two Sample t-test
data: yes_rent$highgrade and no_rent$highgrade
t = -14.685, df = 199440, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.03450375 -0.02637801
sample estimates:
mean of x mean of y
0.3976199 0.4280607
Part 3
# Use of GLM Command
# Logistic regression (binomial family) of the high-grade indicator on
# income, home ownership, loan amount, verification status and loan purpose.
fit.glm <- glm(
  highgrade ~ annual_inc + home_ownership + loan_amnt +
    verification_status + purpose,
  family = binomial,
  data = loan2
)
glm.fit: fitted probabilities numerically 0 or 1 occurred
# Print the coefficient table, deviances and AIC of the fitted logistic model.
summary(fit.glm)
Call:
glm(formula = highgrade ~ annual_inc + home_ownership + loan_amnt +
verification_status + purpose, family = binomial, data = loan2)
Deviance Residuals:
Min 1Q Median 3Q Max
-8.4904 -0.9499 -0.7030 1.1244 2.6029
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 8.188e+00 2.666e+01 0.307 0.7588
annual_inc 8.547e-06 1.216e-07 70.261 < 2e-16 ***
home_ownershipMORTGAGE -8.055e+00 2.666e+01 -0.302 0.7626
home_ownershipOWN -8.071e+00 2.666e+01 -0.303 0.7621
home_ownershipRENT -8.180e+00 2.666e+01 -0.307 0.7590
loan_amnt -3.895e-05 6.762e-07 -57.601 < 2e-16 ***
verification_statusSource Verified -6.533e-01 1.090e-02 -59.928 < 2e-16 ***
verification_statusVerified -9.497e-01 1.245e-02 -76.262 < 2e-16 ***
purposecredit_card 8.271e-01 4.978e-02 16.617 < 2e-16 ***
purposedebt_consolidation -8.011e-02 4.925e-02 -1.627 0.1038
purposehome_improvement -3.269e-01 5.256e-02 -6.219 5.02e-10 ***
purposehouse -2.032e+00 1.385e-01 -14.673 < 2e-16 ***
purposemajor_purchase -1.265e-01 5.963e-02 -2.121 0.0339 *
purposemedical -1.177e+00 7.063e-02 -16.659 < 2e-16 ***
purposemoving -2.159e+00 1.037e-01 -20.814 < 2e-16 ***
purposeother -1.173e+00 5.481e-02 -21.394 < 2e-16 ***
purposerenewable_energy -2.306e+00 3.299e-01 -6.990 2.74e-12 ***
purposesmall_business -1.844e+00 8.677e-02 -21.251 < 2e-16 ***
purposevacation -1.294e+00 8.797e-02 -14.712 < 2e-16 ***
purposewedding -4.688e-01 7.629e-01 -0.614 0.5389
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 319984 on 235628 degrees of freedom
Residual deviance: 290586 on 235609 degrees of freedom
(2 observations deleted due to missingness)
AIC: 290626
Number of Fisher Scoring iterations: 6
# Use of Predict Command
# Fitted probabilities for the rows glm() actually used; rows with missing
# values in the model variables were dropped when the model was fitted.
pro_vector <- predict(fit.glm, type = "response")
# Observed 0/1 response for exactly those rows, so every comparison below is
# row-aligned with pro_vector.
actual_train <- fit.glm$y
n_train <- length(actual_train)

# Classify as high grade when the fitted probability exceeds 0.49.
classifications <- ifelse(pro_vector > 0.49, 1, 0)
accuracy_training <- 1 - mean(classifications != actual_train)

# Baseline 1: always predict 0.
classifications_1 <- rep.int(0, times = n_train)
accuracy_1 <- 1 - mean(classifications_1 != actual_train)

# Baseline 2: coin flip. sample() draws from {1, 2}; subtracting 1 maps the
# draws to {0, 1}.
classifications_2 <- sample(2, size = n_train, replace = TRUE)
classifications_2 <- classifications_2 - 1
accuracy_2 <- 1 - mean(classifications_2 != actual_train)
Part 4
library(rpart)
## Train the classification tree model on training data
fit1 <- rpart(
  highgrade ~ annual_inc + home_ownership + loan_amnt +
    verification_status + purpose,
  data = loan2,
  method = "class"
)
# Draw the tree and label its splits.
plot(fit1)
text(fit1)
# This predict method will return binary values
# Passing newdata = loan2 yields exactly one prediction per row of loan2, so
# the comparison with loan2$highgrade is row-aligned and no recycling occurs.
class_predictions <- predict(fit1, newdata = loan2, type = "class")
# Training accuracy of the tree.
# NOTE(review): the recorded output that follows was produced against an
# object `loan3` that the visible script does not define; re-run to refresh it.
1 - mean(class_predictions != loan2$highgrade)
longer object length is not a multiple of shorter object lengthlonger object length is not a multiple of shorter object length
[1] 0.6475846
Part 5
# Load the hold-out file; skip = 1 skips the first line before the header.
testing <- read.csv(
  "C:/Users/Ricky's Computer/Desktop/LoanStats3d.csv",
  skip = 1
)
# Same high-grade indicator as for the training data: 1 for grade A or B.
testing$highgrade <- with(testing, ifelse(grade == "A" | grade == "B", 1, 0))
# Getting rid of any rows with factor values that aren't found
# in original training data
# (rows with an empty home_ownership are dropped as well).
testing2 <- testing[
  testing$home_ownership %in% unique(loan2$home_ownership) &
    testing$home_ownership != "" &
    testing$purpose %in% unique(loan2$purpose),
]
# Score the hold-out rows with the logistic model. `fit.glm` is the glm object
# this script fits; the previously used name `logit_model` is not assigned
# anywhere in the visible script.
test_predictions.logit <- predict(
  fit.glm,
  newdata = testing2,
  type = "response"
)
# Classify as high grade when the predicted probability exceeds 0.49.
test_classifications.logit <- ifelse(test_predictions.logit > 0.49, 1, 0)
test_accuracy.logit <- 1 -
  mean(test_classifications.logit != testing2$highgrade)
paste("Logit test accuracy:", test_accuracy.logit)
[1] "Logit test accuracy: 0.649679169021644"
# Class-probability matrix from the tree for the hold-out rows ("prob" is the
# default prediction type for a tree fitted with method = "class").
test_predictions.tree <- predict(fit1, newdata = testing2, type = "prob")
# The second column is thresholded at 0.49 to obtain 0/1 classifications.
# NOTE(review): presumably column 2 is the probability of highgrade == 1 --
# confirm against colnames(test_predictions.tree).
test_classifications.tree <- ifelse(
  test_predictions.tree[, 2] > 0.49,
  1,
  0
)
test_accuracy.tree <- 1 -
  mean(test_classifications.tree != testing2$highgrade)
paste("Tree test accuracy:", test_accuracy.tree)
[1] "Tree test accuracy: 0.629004450312757"
# Accuracy of random classifier
# One uniformly random 0/1 guess per hold-out row.
test_predictions.coin <- sample(
  c(0, 1),
  size = nrow(testing2),
  replace = TRUE
)
test_accuracy.coin <- 1 - mean(test_predictions.coin != testing2$highgrade)
paste("Random flip test accuracy:", test_accuracy.coin)
[1] "Random flip test accuracy: 0.499615287797974"
# Accuracy of 0 classifier
# Predict 0 for every hold-out row; numeric(n) is a vector of n zeros.
test_predictions.0 <- numeric(nrow(testing2))
test_accuracy.0 <- 1 - mean(test_predictions.0 != testing2$highgrade)
paste("All-zero test accuracy:", test_accuracy.0)
[1] "All-zero test accuracy: 0.546559675511881"