rm(list=ls(all=T))
options(digits=4, scipen=12)
library(magrittr)
議題:議題:使用貸款人的資料,預測他會不會還款
【1.1 基礎機率】What proportion of the loans in the dataset were not paid in full?
loan = read.csv("loans.csv")
str(loan)
'data.frame': 9578 obs. of 14 variables:
$ credit.policy : int 1 1 1 1 1 1 1 1 1 1 ...
$ purpose : Factor w/ 7 levels "all_other","credit_card",..: 3 2 3 3 2 2 3 1 5 3 ...
$ int.rate : num 0.119 0.107 0.136 0.101 0.143 ...
$ installment : num 829 228 367 162 103 ...
$ log.annual.inc : num 11.4 11.1 10.4 11.4 11.3 ...
$ dti : num 19.5 14.3 11.6 8.1 15 ...
$ fico : int 737 707 682 712 667 727 667 722 682 707 ...
$ days.with.cr.line: num 5640 2760 4710 2700 4066 ...
$ revol.bal : int 28854 33623 3511 33667 4740 50807 3839 24220 69909 5630 ...
$ revol.util : num 52.1 76.7 25.6 73.2 39.5 51 76.8 68.6 51.1 23 ...
$ inq.last.6mths : int 0 0 1 1 0 0 0 0 1 1 ...
$ delinq.2yrs : int 0 0 0 0 1 0 0 0 0 0 ...
$ pub.rec : int 0 0 0 0 0 0 1 0 0 0 ...
$ not.fully.paid : int 0 0 0 0 0 0 1 1 0 0 ...
table(loan$not.fully.paid) #0.1601
0 1
8045 1533
1533/(8045+1533)
[1] 0.1601
【1.2 檢查缺項】Which of the following variables has at least one missing observation?
summary(loan)
credit.policy purpose int.rate installment
Min. :0.000 all_other :2331 Min. :0.060 Min. : 15.7
1st Qu.:1.000 credit_card :1262 1st Qu.:0.104 1st Qu.:163.8
Median :1.000 debt_consolidation:3957 Median :0.122 Median :268.9
Mean :0.805 educational : 343 Mean :0.123 Mean :319.1
3rd Qu.:1.000 home_improvement : 629 3rd Qu.:0.141 3rd Qu.:432.8
Max. :1.000 major_purchase : 437 Max. :0.216 Max. :940.1
small_business : 619
log.annual.inc dti fico days.with.cr.line revol.bal
Min. : 7.55 Min. : 0.00 Min. :612 Min. : 179 Min. : 0
1st Qu.:10.56 1st Qu.: 7.21 1st Qu.:682 1st Qu.: 2820 1st Qu.: 3187
Median :10.93 Median :12.66 Median :707 Median : 4140 Median : 8596
Mean :10.93 Mean :12.61 Mean :711 Mean : 4562 Mean : 16914
3rd Qu.:11.29 3rd Qu.:17.95 3rd Qu.:737 3rd Qu.: 5730 3rd Qu.: 18250
Max. :14.53 Max. :29.96 Max. :827 Max. :17640 Max. :1207359
NA's :4 NA's :29
revol.util inq.last.6mths delinq.2yrs pub.rec not.fully.paid
Min. : 0.0 Min. : 0.00 Min. : 0.000 Min. :0.000 Min. :0.00
1st Qu.: 22.7 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.:0.000 1st Qu.:0.00
Median : 46.4 Median : 1.00 Median : 0.000 Median :0.000 Median :0.00
Mean : 46.9 Mean : 1.57 Mean : 0.164 Mean :0.062 Mean :0.16
3rd Qu.: 71.0 3rd Qu.: 2.00 3rd Qu.: 0.000 3rd Qu.:0.000 3rd Qu.:0.00
Max. :119.0 Max. :33.00 Max. :13.000 Max. :5.000 Max. :1.00
NA's :62 NA's :29 NA's :29 NA's :29
【1.3 決定是否要補缺項】Which of the following is the best reason to fill in the missing values for these variables instead of removing observations with missing data?
loanNATest = subset(loan, is.na(log.annual.inc) | is.na(days.with.cr.line) | is.na( revol.util) | is.na(inq.last.6mths) | is.na(delinq.2yrs) | is.na(pub.rec))
summary(loanNATest)
credit.policy purpose int.rate installment
Min. :0.000 all_other :41 Min. :0.0712 Min. : 23.4
1st Qu.:0.000 credit_card : 3 1st Qu.:0.0933 1st Qu.: 78.4
Median :0.000 debt_consolidation: 8 Median :0.1122 Median :145.9
Mean :0.387 educational : 3 Mean :0.1187 Mean :159.2
3rd Qu.:1.000 home_improvement : 1 3rd Qu.:0.1456 3rd Qu.:192.7
Max. :1.000 major_purchase : 5 Max. :0.1913 Max. :859.6
small_business : 1
log.annual.inc dti fico days.with.cr.line revol.bal
Min. : 8.29 Min. : 0.00 Min. :642 Min. : 179 Min. : 0
1st Qu.:10.10 1st Qu.: 5.15 1st Qu.:682 1st Qu.:1830 1st Qu.: 0
Median :10.64 Median :10.00 Median :707 Median :2580 Median : 0
Mean :10.56 Mean : 9.18 Mean :712 Mean :3158 Mean : 5476
3rd Qu.:11.25 3rd Qu.:11.54 3rd Qu.:741 3rd Qu.:4621 3rd Qu.: 0
Max. :13.00 Max. :22.72 Max. :802 Max. :7890 Max. :290291
NA's :4 NA's :29
revol.util inq.last.6mths delinq.2yrs pub.rec not.fully.paid
Min. : NA Min. :0.00 Min. :0.000 Min. :0.00 Min. :0.000
1st Qu.: NA 1st Qu.:0.00 1st Qu.:0.000 1st Qu.:0.00 1st Qu.:0.000
Median : NA Median :1.00 Median :0.000 Median :0.00 Median :0.000
Mean :NaN Mean :1.18 Mean :0.212 Mean :0.03 Mean :0.194
3rd Qu.: NA 3rd Qu.:2.00 3rd Qu.:0.000 3rd Qu.:0.00 3rd Qu.:0.000
Max. : NA Max. :6.00 Max. :4.000 Max. :1.00 Max. :1.000
NA's :62 NA's :29 NA's :29 NA's :29
table(loanNATest$not.fully.paid)
0 1
50 12
12/(50+12) #具有na值的資料中(62筆),有19.35%並未還完貸款,對原始資料影響不大。
[1] 0.1935
【1.4 補缺項工具】What best describes the process we just used to handle missing values?
install.packages("mice")
Error in install.packages : Updating loaded packages
library(mice)
set.seed(144)
vars.for.imputation = setdiff(names(loan), "not.fully.paid")
imputed = complete(mice(loan[vars.for.imputation]))
iter imp variable
1 1 log.annual.inc days.with.cr.line revol.util inq.last.6mths delinq.2yrs pub.rec
1 2 log.annual.inc days.with.cr.line revol.util inq.last.6mths delinq.2yrs pub.rec
1 3 log.annual.inc days.with.cr.line revol.util inq.last.6mths delinq.2yrs pub.rec
1 4 log.annual.inc days.with.cr.line revol.util inq.last.6mths delinq.2yrs pub.rec
1 5 log.annual.inc days.with.cr.line revol.util inq.last.6mths delinq.2yrs pub.rec
2 1 log.annual.inc days.with.cr.line revol.util inq.last.6mths delinq.2yrs pub.rec
2 2 log.annual.inc days.with.cr.line
Restarting R session...
【2.1 顯著性】Which independent variables are significant in our model?
library(caTools)
split = sample.split(loanImpute$not.fully.paid,SplitRatio = 0.7)
loanTrain =subset(loanImpute , split == TRUE )
loanTest =subset(loanImpute , split == FALSE )
loanLog = glm(not.fully.paid ~ .,data = loanTrain , family = binomial)
summary(loanLog)
Call:
glm(formula = not.fully.paid ~ ., family = binomial, data = loanTrain)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.205 -0.621 -0.495 -0.361 2.640
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 9.18714453 1.55446335 5.91 0.0000000034 ***
credit.policy -0.33680457 0.10106680 -3.33 0.00086 ***
purposecredit_card -0.61405712 0.13443839 -4.57 0.0000049340 ***
purposedebt_consolidation -0.32120484 0.09182890 -3.50 0.00047 ***
purposeeducational 0.13472394 0.17531177 0.77 0.44220
purposehome_improvement 0.17270095 0.14796329 1.17 0.24313
purposemajor_purchase -0.48299241 0.20088623 -2.40 0.01620 *
purposesmall_business 0.41204779 0.14186303 2.90 0.00368 **
int.rate 0.61098056 2.08454887 0.29 0.76945
installment 0.00127469 0.00020921 6.09 0.0000000011 ***
log.annual.inc -0.43367555 0.07148249 -6.07 0.0000000013 ***
dti 0.00463768 0.00550209 0.84 0.39929
fico -0.00931679 0.00171000 -5.45 0.0000000508 ***
days.with.cr.line 0.00000237 0.00001588 0.15 0.88134
revol.bal 0.00000309 0.00000117 2.64 0.00827 **
revol.util 0.00183926 0.00153463 1.20 0.23072
inq.last.6mths 0.08437328 0.01599605 5.27 0.0000001330 ***
delinq.2yrs -0.08319927 0.06560940 -1.27 0.20476
pub.rec 0.32995658 0.11385916 2.90 0.00376 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 5896.6 on 6704 degrees of freedom
Residual deviance: 5485.2 on 6686 degrees of freedom
AIC: 5523
Number of Fisher Scoring iterations: 5
【2.2 從回歸係數估計邊際效用】Consider two loan applications, which are identical other than the fact that the borrower in Application A has FICO credit score 700 while the borrower in Application B has FICO credit score 710. What is the value of Logit(A) - Logit(B)? What is the value of O(A)/O(B)?
# the difference of logits
logitA = -0.00931679*700
logitB = -0.00931679*710
logitA - logitB #0.09317
[1] 0.09317
# the ratio of odds
oddA = exp(logitA)
oddB = exp(logitB)
oddA/oddB #1.098
[1] 1.098
【2.3 混淆矩陣、正確率 vs 底線機率】What is the accuracy of the logistic regression model? What is the accuracy of the baseline model?
# test accuracy
loanTest$predicted.risk = predict(loanLog , newdata = loanTest , type = "response")
table(loanTest$not.fully.paid,loanTest$predicted.risk > 0.5)
FALSE TRUE
0 2400 13
1 457 3
test_accuracy = (2400+3)/(2400+13+457+3)
test_accuracy
[1] 0.8364
# baseline accuracy
table(loanTest$not.fully.paid)
0 1
2413 460
baseline_accuracy = 2413/(2413+460)
【2.4 ROC & AUC】Use the ROCR package to compute the test set AUC.
# test accuracy
library(ROCR)
ROCRpred = prediction(loanTest$predicted.risk ,loanTest$not.fully.paid )
as.numeric(performance(ROCRpred , "auc")@y.values)
[1] 0.6721
# baseline accuracy
【3.1 高底線模型】The variable int.rate is highly significant in the bivariate model, but it is not significant at the 0.05 level in the model trained with all the independent variables. What is the most likely explanation for this difference?
bivariate = glm(not.fully.paid ~ int.rate,data = loanTrain , family = binomial)
summary(bivariate)
Call:
glm(formula = not.fully.paid ~ int.rate, family = binomial, data = loanTrain)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.055 -0.627 -0.544 -0.436 2.291
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.673 0.169 -21.8 <2e-16 ***
int.rate 15.921 1.270 12.5 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 5896.6 on 6704 degrees of freedom
Residual deviance: 5734.8 on 6703 degrees of freedom
AIC: 5739
Number of Fisher Scoring iterations: 4
cor(loanTrain$int.rate, loanTrain$fico)
[1] -0.7117
#根據上述產生之模型,int.rate的顯著性相當高,可以做為風險預測之係數,但以前面題目產生的所有係數模型而言,int.rate的顯著性不高(沒有*或.),因此該係數對整體結果影響較低。
【3.2 高底線模型的預測值】What is the highest predicted probability of a loan not being paid in full on the testing set? With a logistic regression cutoff of 0.5, how many loans would be predicted as not being paid in full on the testing set?
predbivTest = predict(bivariate , newdata = loanTest , type = "response")
summary(predbivTest)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.062 0.116 0.151 0.160 0.189 0.427
#max = 0.427
#the max predicted probability is 0.427, less than the cutoff of 0.5 =>0
【3.3 高底線模型的辨識率】What is the test set AUC of the bivariate model?
library(ROCR)
ROCRpred = prediction(predbivTest , loanTest$not.fully.paid)
as.numeric(performance(ROCRpred , "auc")@y.values)
[1] 0.6239
【4.1 投資價值的算法】How much does a $10 investment with an annual interest rate of 6% pay back after 3 years, using continuous compounding of interest?
#c*exp(rt)
10*exp(0.06*3) #11.97
[1] 11.97
【4.2 投資獲利的算法,合約完成】While the investment has value c * exp(rt) dollars after collecting interest, the investor had to pay $c for the investment. What is the profit to the investor if the investment is paid back in full?
#投資淨利:投資總收益-投資成本
#c*exp(rt)-c
10*exp(0.06*3)-10
[1] 1.972
【4.3 投資獲利的算法,違約】Now, consider the case where the investor made a $c investment, but it was not paid back in full. Assume, conservatively, that no money was received from the borrower (often a lender will receive some but not all of the value of the loan, making this a pessimistic assumption of how much is received). What is the profit to the investor in this scenario?
# c * exp(rt) - c correct
#借款者沒有還款,且並無利息收入,故損失為投資成本C
-10
[1] -10
【5.1 計算測試資料的實際投報率】What is the maximum profit of a $10 investment in any loan in the testing set?
loanTest$profit = exp(loanTest$int.rate*3) - 1
loanTest$profit[loanTest$not.fully.paid == 1] = -1
summary(loanTest) #profit max:0.889
credit.policy purpose int.rate installment
Min. :0.000 all_other : 688 Min. :0.060 Min. : 15.7
1st Qu.:1.000 credit_card : 411 1st Qu.:0.103 1st Qu.:163.6
Median :1.000 debt_consolidation:1206 Median :0.122 Median :267.7
Mean :0.805 educational : 93 Mean :0.122 Mean :317.0
3rd Qu.:1.000 home_improvement : 186 3rd Qu.:0.139 3rd Qu.:421.9
Max. :1.000 major_purchase : 105 Max. :0.212 Max. :926.8
small_business : 184
log.annual.inc dti fico days.with.cr.line revol.bal
Min. : 8.1 Min. : 0.00 Min. :612 Min. : 179 Min. : 0
1st Qu.:10.6 1st Qu.: 7.16 1st Qu.:682 1st Qu.: 2795 1st Qu.: 3362
Median :10.9 Median :12.85 Median :707 Median : 4140 Median : 8712
Mean :10.9 Mean :12.75 Mean :711 Mean : 4494 Mean : 17198
3rd Qu.:11.3 3rd Qu.:18.30 3rd Qu.:737 3rd Qu.: 5670 3rd Qu.: 18728
Max. :13.5 Max. :29.96 Max. :822 Max. :17640 Max. :1207359
revol.util inq.last.6mths delinq.2yrs pub.rec not.fully.paid
Min. : 0.0 Min. : 0.00 Min. : 0.00 Min. :0.0000 Min. :0.00
1st Qu.: 23.4 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.:0.0000 1st Qu.:0.00
Median : 46.9 Median : 1.00 Median : 0.00 Median :0.0000 Median :0.00
Mean : 47.0 Mean : 1.58 Mean : 0.16 Mean :0.0574 Mean :0.16
3rd Qu.: 70.4 3rd Qu.: 2.00 3rd Qu.: 0.00 3rd Qu.:0.0000 3rd Qu.:0.00
Max. :108.8 Max. :24.00 Max. :13.00 Max. :3.0000 Max. :1.00
predicted.risk profit
Min. :0.0211 Min. :-1.000
1st Qu.:0.0946 1st Qu.: 0.286
Median :0.1370 Median : 0.411
Mean :0.1578 Mean : 0.209
3rd Qu.:0.1966 3rd Qu.: 0.498
Max. :0.9537 Max. : 0.889
#因為投資為10元,故乘以10
0.889*10
[1] 8.89
A simple investment strategy of equally investing in all the loans would yield profit $20.94 for a $100 investment. But this simple investment strategy does not leverage the prediction model we built earlier in this problem.
【6.1 高利率、高風險】What is the average profit of a $1 investment in one of these high-interest loans (do not include the $ sign in your answer)? What proportion of the high-interest loans were not paid back in full?
highInterest = subset(loanTest , int.rate >=0.15 )
summary(highInterest)
credit.policy purpose int.rate installment
Min. :0.000 all_other : 80 Min. :0.150 Min. : 21.6
1st Qu.:0.000 credit_card : 41 1st Qu.:0.154 1st Qu.:187.1
Median :1.000 debt_consolidation:203 Median :0.161 Median :350.0
Mean :0.606 educational : 15 Mean :0.165 Mean :407.9
3rd Qu.:1.000 home_improvement : 19 3rd Qu.:0.172 3rd Qu.:565.0
Max. :1.000 major_purchase : 12 Max. :0.212 Max. :926.8
small_business : 67
log.annual.inc dti fico days.with.cr.line revol.bal
Min. : 8.48 Min. : 0.00 Min. :612 Min. : 419 Min. : 0
1st Qu.:10.64 1st Qu.: 8.34 1st Qu.:662 1st Qu.: 2340 1st Qu.: 4490
Median :11.00 Median :14.93 Median :672 Median : 3900 Median : 10971
Mean :11.01 Mean :14.53 Mean :677 Mean : 4117 Mean : 19717
3rd Qu.:11.41 3rd Qu.:20.56 3rd Qu.:687 3rd Qu.: 5403 3rd Qu.: 22917
Max. :13.30 Max. :29.96 Max. :802 Max. :14580 Max. :226936
revol.util inq.last.6mths delinq.2yrs pub.rec not.fully.paid
Min. : 0.0 Min. : 0.00 Min. :0.000 Min. :0.0000 Min. :0.000
1st Qu.: 43.9 1st Qu.: 0.00 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.000
Median : 69.4 Median : 1.00 Median :0.000 Median :0.0000 Median :0.000
Mean : 64.0 Mean : 2.18 Mean :0.233 Mean :0.0915 Mean :0.252
3rd Qu.: 88.0 3rd Qu.: 3.00 3rd Qu.:0.000 3rd Qu.:0.0000 3rd Qu.:1.000
Max. :103.1 Max. :10.00 Max. :4.000 Max. :3.0000 Max. :1.000
predicted.risk profit
Min. :0.0687 Min. :-1.000
1st Qu.:0.1807 1st Qu.:-1.000
Median :0.2256 Median : 0.599
Mean :0.2466 Mean : 0.225
3rd Qu.:0.2967 3rd Qu.: 0.638
Max. :0.5831 Max. : 0.889
# the average profit = 0.225
# the average not.fully.paid = 0.252
【6.2 高利率之中的低風險】What is the profit of the investor, who invested $1 in each of these 100 loans? How many of 100 selected loans were not paid back in full?
cutoff = sort(highInterest$predicted.risk, decreasing=FALSE)[100]
selectedLoans = subset(highInterest , highInterest$predicted.risk <=cutoff)
summary(selectedLoans)
credit.policy purpose int.rate installment log.annual.inc
Min. :0.00 all_other : 8 Min. :0.150 Min. : 48.8 Min. : 9.57
1st Qu.:1.00 credit_card :17 1st Qu.:0.153 1st Qu.:176.2 1st Qu.:10.78
Median :1.00 debt_consolidation:60 Median :0.157 Median :309.4 Median :11.13
Mean :0.93 educational : 1 Mean :0.161 Mean :358.3 Mean :11.20
3rd Qu.:1.00 home_improvement : 1 3rd Qu.:0.165 3rd Qu.:473.1 3rd Qu.:11.67
Max. :1.00 major_purchase : 7 Max. :0.205 Max. :907.6 Max. :13.30
small_business : 6
dti fico days.with.cr.line revol.bal revol.util
Min. : 0.00 Min. :642 Min. : 1140 Min. : 0 Min. : 0.0
1st Qu.: 6.05 1st Qu.:662 1st Qu.: 2162 1st Qu.: 3768 1st Qu.:45.9
Median :12.35 Median :672 Median : 3630 Median : 9691 Median :71.7
Mean :12.19 Mean :680 Mean : 3911 Mean : 19923 Mean :65.8
3rd Qu.:18.23 3rd Qu.:692 3rd Qu.: 5010 3rd Qu.: 24534 3rd Qu.:93.8
Max. :28.15 Max. :782 Max. :13170 Max. :168496 Max. :99.7
inq.last.6mths delinq.2yrs pub.rec not.fully.paid predicted.risk
Min. : 0.00 Min. :0.00 Min. :0.00 Min. :0.00 Min. :0.0687
1st Qu.: 0.00 1st Qu.:0.00 1st Qu.:0.00 1st Qu.:0.00 1st Qu.:0.1360
Median : 0.00 Median :0.00 Median :0.00 Median :0.00 Median :0.1533
Mean : 0.89 Mean :0.33 Mean :0.03 Mean :0.19 Mean :0.1479
3rd Qu.: 1.00 3rd Qu.:0.00 3rd Qu.:0.00 3rd Qu.:0.00 3rd Qu.:0.1651
Max. :10.00 Max. :4.00 Max. :1.00 Max. :1.00 Max. :0.1763
profit
Min. :-1.000
1st Qu.: 0.582
Median : 0.599
Mean : 0.313
3rd Qu.: 0.632
Max. : 0.851
sum(selectedLoans$profit) #31.28
[1] 31.28
table(selectedLoans$not.fully.paid) #19
0 1
81 19
【Q】利用我們建好的模型,你可以設計出比上述的方法獲利更高的投資方法嗎?請詳述你的作法?
loan_lowrisk = subset(loanTest , int.rate >= 0.16)
str(loan_lowrisk)
'data.frame': 261 obs. of 16 variables:
$ credit.policy : int 1 1 1 1 1 1 1 1 1 1 ...
$ purpose : Factor w/ 7 levels "all_other","credit_card",..: 3 3 4 1 3 2 3 3 2 3 ...
$ int.rate : num 0.162 0.167 0.163 0.19 0.172 ...
$ installment : num 881 710 106 458 536 ...
$ log.annual.inc : num 11.6 11.1 10.9 11.7 11.2 ...
$ dti : num 24.84 20.09 26.05 5.84 19.83 ...
$ fico : int 682 662 672 662 667 667 672 662 662 662 ...
$ days.with.cr.line: num 7619 4110 5430 6210 3690 ...
$ revol.bal : int 51039 26255 19487 13363 52639 15977 16473 13072 22783 8517 ...
$ revol.util : num 93.8 74.3 89.2 99 79.2 83.6 94.1 90.2 93.7 71.6 ...
$ inq.last.6mths : int 0 1 2 1 3 0 2 0 3 2 ...
$ delinq.2yrs : int 1 0 0 1 0 0 2 0 1 0 ...
$ pub.rec : int 0 0 0 0 0 0 0 0 0 0 ...
$ not.fully.paid : int 1 0 0 1 0 0 0 0 0 1 ...
$ predicted.risk : num 0.219 0.263 0.223 0.193 0.25 ...
$ profit : num -1 0.651 0.632 -1 0.673 ...
summary(loan_lowrisk)
credit.policy purpose int.rate installment
Min. :0.000 all_other : 44 Min. :0.160 Min. : 21.6
1st Qu.:0.000 credit_card : 26 1st Qu.:0.163 1st Qu.:185.0
Median :1.000 debt_consolidation:117 Median :0.170 Median :367.4
Mean :0.556 educational : 10 Mean :0.173 Mean :425.0
3rd Qu.:1.000 home_improvement : 10 3rd Qu.:0.179 3rd Qu.:628.9
Max. :1.000 major_purchase : 6 Max. :0.212 Max. :926.8
small_business : 48
log.annual.inc dti fico days.with.cr.line revol.bal
Min. : 8.48 Min. : 0.0 Min. :612 Min. : 419 Min. : 0
1st Qu.:10.71 1st Qu.: 8.4 1st Qu.:662 1st Qu.: 2520 1st Qu.: 4569
Median :11.08 Median :14.8 Median :672 Median : 4110 Median : 13020
Mean :11.06 Mean :14.9 Mean :676 Mean : 4309 Mean : 22466
3rd Qu.:11.43 3rd Qu.:21.0 3rd Qu.:687 3rd Qu.: 5580 3rd Qu.: 25759
Max. :13.30 Max. :30.0 Max. :802 Max. :14580 Max. :226936
revol.util inq.last.6mths delinq.2yrs pub.rec not.fully.paid
Min. : 0.0 Min. : 0.00 Min. :0.000 Min. :0.000 Min. :0.000
1st Qu.: 46.9 1st Qu.: 0.00 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000
Median : 72.3 Median : 2.00 Median :0.000 Median :0.000 Median :0.000
Mean : 66.7 Mean : 2.36 Mean :0.249 Mean :0.115 Mean :0.283
3rd Qu.: 90.2 3rd Qu.: 3.00 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:1.000
Max. :100.6 Max. :10.00 Max. :4.000 Max. :3.000 Max. :1.000
predicted.risk profit
Min. :0.0696 Min. :-1.000
1st Qu.:0.1955 1st Qu.:-1.000
Median :0.2424 Median : 0.633
Mean :0.2624 Mean : 0.199
3rd Qu.:0.3259 3rd Qu.: 0.682
Max. :0.5702 Max. : 0.889
cutoff_lowrisk = sort(loan_lowrisk$predicted.risk, decreasing=FALSE)[50]
selectedLoans_lowrisk = subset(loan_lowrisk , loan_lowrisk$predicted.risk <=cutoff)
summary(selectedLoans_lowrisk)
credit.policy purpose int.rate installment log.annual.inc
Min. :0.000 all_other : 2 Min. :0.160 Min. :107 Min. : 9.57
1st Qu.:1.000 credit_card : 9 1st Qu.:0.163 1st Qu.:193 1st Qu.:10.79
Median :1.000 debt_consolidation:30 Median :0.167 Median :310 Median :11.22
Mean :0.913 educational : 0 Mean :0.169 Mean :383 Mean :11.27
3rd Qu.:1.000 home_improvement : 0 3rd Qu.:0.172 3rd Qu.:550 3rd Qu.:11.70
Max. :1.000 major_purchase : 4 Max. :0.205 Max. :908 Max. :13.30
small_business : 1
dti fico days.with.cr.line revol.bal revol.util
Min. : 0.00 Min. :652 Min. : 1140 Min. : 0 Min. : 0.0
1st Qu.: 6.62 1st Qu.:662 1st Qu.: 2081 1st Qu.: 5325 1st Qu.:48.8
Median :11.18 Median :667 Median : 3662 Median : 10122 Median :87.3
Mean :11.66 Mean :679 Mean : 4000 Mean : 21051 Mean :72.5
3rd Qu.:16.41 3rd Qu.:692 3rd Qu.: 5055 3rd Qu.: 24661 3rd Qu.:97.6
Max. :24.80 Max. :782 Max. :13170 Max. :168496 Max. :99.7
inq.last.6mths delinq.2yrs pub.rec not.fully.paid predicted.risk
Min. : 0 Min. :0.000 Min. :0.0000 Min. :0.000 Min. :0.0696
1st Qu.: 0 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.1333
Median : 0 Median :0.000 Median :0.0000 Median :0.000 Median :0.1514
Mean : 1 Mean :0.457 Mean :0.0435 Mean :0.174 Mean :0.1464
3rd Qu.: 1 3rd Qu.:1.000 3rd Qu.:0.0000 3rd Qu.:0.000 3rd Qu.:0.1657
Max. :10 Max. :4.000 Max. :1.0000 Max. :1.000 Max. :0.1759
profit
Min. :-1.000
1st Qu.: 0.616
Median : 0.633
Mean : 0.369
3rd Qu.: 0.663
Max. : 0.851
#依據cor()顯示,影響收益最大的係數為利率,呈現正相關,因此利用調整利率為"大於16%",並取得資料集中風險最低的第50筆資料作為標準,取出小於該風險值的資料進行計算,便可得到0.369的平均收益