rm(list=ls(all=T))
options(digits=4, scipen=12)
library(magrittr)
議題:議題:使用貸款人的資料,預測他會不會還款
setwd("~/big data/unit3/code_data/data")
The working directory was changed to C:/Users/Administrator/Documents/big data/unit3/code_data/data inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
loan=read.csv("loans.csv")
loans_imputed=read.csv("loans_imputed.csv")
【1.1 基礎機率】What proportion of the loans in the dataset were not paid in full?
table(loan$not.fully.paid)
0 1
8045 1533
1533/(8045+1533)
[1] 0.1601
【1.2 檢查缺項】Which of the following variables has at least one missing observation?
names(which(apply(loan,2,function(x)sum(is.na(x)))>1))
[1] "log.annual.inc" "days.with.cr.line" "revol.util"
[4] "inq.last.6mths" "delinq.2yrs" "pub.rec"
【1.3 決定是否要補缺項】Which of the following is the best reason to fill in the missing values for these variables instead of removing observations with missing data?
c("Ans:We want to be able to predict risk for all borrowers, instead of just the ones with all data reported")
[1] "Ans:We want to be able to predict risk for all borrowers, instead of just the ones with all data reported"
【1.4 補缺項工具】What best describes the process we just used to handle missing values?
library(mice)
Loading required package: lattice
Attaching package: 愼㸱愼㸵mice愼㸱愼㸶
The following objects are masked from 愼㸱愼㸵package:base愼㸱愼㸶:
cbind, rbind
set.seed(144)
vars.for.imputation = setdiff(names(loan), "not.fully.paid")
imputed = complete(mice(loan[vars.for.imputation]))
iter imp variable
1 1 log.annual.inc days.with.cr.line revol.util inq.last.6mths delinq.2yrs pub.rec
1 2 log.annual.inc days.with.cr.line revol.util inq.last.6mths delinq.2yrs pub.rec
1 3 log.annual.inc days.with.cr.line revol.util inq.last.6mths delinq.2yrs pub.rec
1 4 log.annual.inc days.with.cr.line revol.util inq.last.6mths delinq.2yrs pub.rec
1 5 log.annual.inc days.with.cr.line revol.util inq.last.6mths delinq.2yrs pub.rec
2 1 log.annual.inc days.with.cr.line revol.util inq.last.6mths delinq.2yrs pub.rec
2 2 log.annual.inc days.with.cr.line revol.util inq.last.6mths delinq.2yrs pub.rec
2 3 log.annual.inc days.with.cr.line revol.util inq.last.6mths delinq.2yrs pub.rec
2 4 log.annual.inc days.with.cr.line revol.util inq.last.6mths delinq.2yrs pub.rec
2 5 log.annual.inc days.with.cr.line revol.util inq.last.6mths delinq.2yrs pub.rec
3 1 log.annual.inc days.with.cr.line revol.util inq.last.6mths delinq.2yrs pub.rec
3 2 log.annual.inc days.with.cr.line revol.util inq.last.6mths delinq.2yrs pub.rec
3 3 log.annual.inc days.with.cr.line revol.util inq.last.6mths delinq.2yrs pub.rec
3 4 log.annual.inc days.with.cr.line revol.util inq.last.6mths delinq.2yrs pub.rec
3 5 log.annual.inc days.with.cr.line revol.util inq.last.6mths delinq.2yrs pub.rec
4 1 log.annual.inc days.with.cr.line revol.util inq.last.6mths delinq.2yrs pub.rec
4 2 log.annual.inc days.with.cr.line revol.util inq.last.6mths delinq.2yrs pub.rec
4 3 log.annual.inc days.with.cr.line revol.util inq.last.6mths delinq.2yrs pub.rec
4 4 log.annual.inc days.with.cr.line revol.util inq.last.6mths delinq.2yrs pub.rec
4 5 log.annual.inc days.with.cr.line revol.util inq.last.6mths delinq.2yrs pub.rec
5 1 log.annual.inc days.with.cr.line revol.util inq.last.6mths delinq.2yrs pub.rec
5 2 log.annual.inc days.with.cr.line revol.util inq.last.6mths delinq.2yrs pub.rec
5 3 log.annual.inc days.with.cr.line revol.util inq.last.6mths delinq.2yrs pub.rec
5 4 log.annual.inc days.with.cr.line revol.util inq.last.6mths delinq.2yrs pub.rec
5 5 log.annual.inc days.with.cr.line revol.util inq.last.6mths delinq.2yrs pub.rec
loan[vars.for.imputation] = imputed
c("Ans:We predicted missing variable values using the available independent variables for each observation")
[1] "Ans:We predicted missing variable values using the available independent variables for each observation"
【2.1 顯著性】Which independent variables are significant in our model?
library(caTools)
set.seed(144)
spl = sample.split(loan$not.fully.paid, 0.7)
train = subset(loan, spl == TRUE)
test = subset(loan, spl == FALSE)
mod = glm(not.fully.paid~., data=train, family="binomial")
names(which(summary(mod)$coefficients[,1]<0.05))
[1] "credit.policy" "purposecredit_card"
[3] "purposedebt_consolidation" "purposemajor_purchase"
[5] "installment" "log.annual.inc"
[7] "dti" "fico"
[9] "days.with.cr.line" "revol.bal"
[11] "revol.util" "delinq.2yrs"
【2.2 從回歸係數估計邊際效用】Consider two loan applications, which are identical other than the fact that the borrower in Application A has FICO credit score 700 while the borrower in Application B has FICO credit score 710. What is the value of Logit(A) - Logit(B)? What is the value of O(A)/O(B)?
# the difference of logits
summary(mod)$coefficients
Estimate Std. Error z value
(Intercept) 9.158007646 1.555140670 5.8889
credit.policy -0.349246579 0.100825981 -3.4639
purposecredit_card -0.614397768 0.134403507 -4.5713
purposedebt_consolidation -0.321655756 0.091828446 -3.5028
purposeeducational 0.135780792 0.175274026 0.7747
purposehome_improvement 0.174353490 0.147918419 1.1787
purposemajor_purchase -0.481418544 0.200793056 -2.3976
purposesmall_business 0.413433572 0.141832368 2.9149
int.rate 0.622113793 2.084902246 0.2984
installment 0.001272966 0.000209181 6.0855
log.annual.inc -0.431293681 0.071452618 -6.0361
dti 0.004627352 0.005499671 0.8414
fico -0.009293809 0.001708306 -5.4404
days.with.cr.line 0.000002187 0.000015878 0.1378
revol.bal 0.000003035 0.000001166 2.6016
revol.util 0.001916012 0.001532931 1.2499
inq.last.6mths 0.080739865 0.015868489 5.0881
delinq.2yrs -0.083383810 0.065542094 -1.2722
pub.rec 0.331043003 0.113758106 2.9101
Pr(>|z|)
(Intercept) 0.000000003889
credit.policy 0.000532493454
purposecredit_card 0.000004847247
purposedebt_consolidation 0.000460412410
purposeeducational 0.438530426796
purposehome_improvement 0.238512136172
purposemajor_purchase 0.016503523178
purposesmall_business 0.003557510656
int.rate 0.765405589483
installment 0.000000001161
log.annual.inc 0.000000001579
dti 0.400131100463
fico 0.000000053171
days.with.cr.line 0.890435221787
revol.bal 0.009278675923
revol.util 0.211335813305
inq.last.6mths 0.000000361740
delinq.2yrs 0.203295788542
pub.rec 0.003613584738
# the ratio of odds
【2.3 混淆矩陣、正確率 vs 底線機率】What is the accuracy of the logistic regression model? What is the accuracy of the baseline model?
test$predicted.risk = predict(mod, newdata=test, type="response")
table(test$not.fully.paid, test$predicted.risk > 0.5)
FALSE TRUE
0 2400 13
1 457 3
# test accuracy
accuracy=(2400+3)/(2400+457+13+3);accuracy
[1] 0.8364
# baseline accuracy
table(test$not.fully.paid)
0 1
2413 460
accuracy=2413/2873;accuracy
[1] 0.8399
【2.4 ROC & AUC】Use the ROCR package to compute the test set AUC.
library(ROCR)
Loading required package: gplots
Attaching package: 愼㸱愼㸵gplots愼㸱愼㸶
The following object is masked from 愼㸱愼㸵package:stats愼㸱愼㸶:
lowess
pred = prediction(test$predicted.risk, test$not.fully.paid)
as.numeric(performance(pred, "auc")@y.values)
[1] 0.6719
# test accuracy
# baseline accuracy
【3.1 高底線模型】The variable int.rate is highly significant in the bivariate model, but it is not significant at the 0.05 level in the model trained with all the independent variables. What is the most likely explanation for this difference?
bivariate = glm(not.fully.paid~int.rate, data=train, family="binomial")
summary(bivariate)
Call:
glm(formula = not.fully.paid ~ int.rate, family = "binomial",
data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.055 -0.627 -0.544 -0.436 2.291
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.673 0.169 -21.8 <2e-16 ***
int.rate 15.921 1.270 12.5 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 5896.6 on 6704 degrees of freedom
Residual deviance: 5734.8 on 6703 degrees of freedom
AIC: 5739
Number of Fisher Scoring iterations: 4
cor(train$int.rate, train$fico)
[1] -0.7117
【3.2 高底線模型的預測值】What is the highest predicted probability of a loan not being paid in full on the testing set? With a logistic regression cutoff of 0.5, how many loans would be predicted as not being paid in full on the testing set?
pred.bivariate = predict(bivariate, newdata=test, type="response")
max(summary(pred.bivariate))
[1] 0.4266
as.numeric(0)
[1] 0
【3.3 高底線模型的辨識率】What is the test set AUC of the bivariate model?
prediction.bivariate = prediction(pred.bivariate, test$not.fully.paid)
as.numeric(performance(prediction.bivariate, "auc")@y.values)
[1] 0.6239
【4.1 投資價值的算法】How much does a $10 investment with an annual interest rate of 6% pay back after 3 years, using continuous compounding of interest?
10*exp(0.06*3)
[1] 11.97
【4.2 投資獲利的算法,合約完成】While the investment has value c * exp(rt) dollars after collecting interest, the investor had to pay $c for the investment. What is the profit to the investor if the investment is paid back in full?
c("Ans:c * exp(rt) - c")
[1] "Ans:c * exp(rt) - c"
【4.3 投資獲利的算法,違約】Now, consider the case where the investor made a $c investment, but it was not paid back in full. Assume, conservatively, that no money was received from the borrower (often a lender will receive some but not all of the value of the loan, making this a pessimistic assumption of how much is received). What is the profit to the investor in this scenario?
# c * exp(rt) - c correct
c("Ans:-c")
[1] "Ans:-c"
【5.1 計算測試資料的實際投報率】What is the maximum profit of a $10 investment in any loan in the testing set?
test$profit = exp(test$int.rate*3) - 1
test$profit[test$not.fully.paid == 1] = -1
10*max(test$profit)
[1] 8.895
A simple investment strategy of equally investing in all the loans would yield profit $20.94 for a $100 investment. But this simple investment strategy does not leverage the prediction model we built earlier in this problem.
【6.1 高利率、高風險】What is the average profit of a $1 investment in one of these high-interest loans (do not include the $ sign in your answer)? What proportion of the high-interest loans were not paid back in full?
highInterest = subset(test, int.rate >= 0.15)
mean(highInterest$profit)
[1] 0.2251
table(highInterest$not.fully.paid)
0 1
327 110
110/(110+327)
[1] 0.2517
【6.2 高利率之中的低風險】What is the profit of the investor, who invested $1 in each of these 100 loans? How many of 100 selected loans were not paid back in full?
cutoff = sort(highInterest$predicted.risk, decreasing=FALSE)[100]
selectedLoans = subset(highInterest, predicted.risk <= cutoff)
sum(selectedLoans$profit)
[1] 31.28
table(selectedLoans$not.fully.paid)[2]
1
19
首先決定要使用何種因素去做決策,利用相關係數矩陣
round(cor(test[,-2]),4)
credit.policy int.rate installment log.annual.inc
credit.policy 1.0000 -0.2910 0.0915 0.0579
int.rate -0.2910 1.0000 0.2744 0.0407
installment 0.0915 0.2744 1.0000 0.4583
log.annual.inc 0.0579 0.0407 0.4583 1.0000
dti -0.0784 0.2078 0.0638 -0.0498
fico 0.3532 -0.7224 0.0838 0.1391
days.with.cr.line 0.1053 -0.1391 0.1894 0.3442
revol.bal -0.1621 0.0611 0.1901 0.3502
revol.util -0.0993 0.4590 0.0690 0.0222
inq.last.6mths -0.5127 0.1878 -0.0045 0.0704
delinq.2yrs -0.0709 0.1514 -0.0148 0.0195
pub.rec -0.0629 0.1025 -0.0258 -0.0126
not.fully.paid -0.1465 0.1691 0.0567 -0.0130
predicted.risk -0.6077 0.6113 0.1672 -0.1761
profit 0.0926 0.0140 -0.0090 0.0174
dti fico days.with.cr.line revol.bal
credit.policy -0.0784 0.3532 0.1053 -0.1621
int.rate 0.2078 -0.7224 -0.1391 0.0611
installment 0.0638 0.0838 0.1894 0.1901
log.annual.inc -0.0498 0.1391 0.3442 0.3502
dti 1.0000 -0.2298 0.0863 0.1450
fico -0.2298 1.0000 0.2717 -0.0076
days.with.cr.line 0.0863 0.2717 1.0000 0.2198
revol.bal 0.1450 -0.0076 0.2198 1.0000
revol.util 0.3439 -0.5394 -0.0150 0.1691
inq.last.6mths 0.0119 -0.1420 -0.0157 0.0494
delinq.2yrs -0.0571 -0.2206 0.0499 -0.0321
pub.rec 0.0259 -0.1436 0.0725 -0.0141
not.fully.paid 0.0107 -0.1410 -0.0058 0.0622
predicted.risk 0.1648 -0.5994 -0.1425 0.2029
profit 0.0273 0.0046 -0.0240 -0.0537
revol.util inq.last.6mths delinq.2yrs pub.rec
credit.policy -0.0993 -0.5127 -0.0709 -0.0629
int.rate 0.4590 0.1878 0.1514 0.1025
installment 0.0690 -0.0045 -0.0148 -0.0258
log.annual.inc 0.0222 0.0704 0.0195 -0.0126
dti 0.3439 0.0119 -0.0571 0.0259
fico -0.5394 -0.1420 -0.2206 -0.1436
days.with.cr.line -0.0150 -0.0157 0.0499 0.0725
revol.bal 0.1691 0.0494 -0.0321 -0.0141
revol.util 1.0000 -0.0378 -0.0185 0.0649
inq.last.6mths -0.0378 1.0000 0.0108 0.0799
delinq.2yrs -0.0185 0.0108 1.0000 -0.0191
pub.rec 0.0649 0.0799 -0.0191 1.0000
not.fully.paid 0.0847 0.1377 0.0110 0.0331
predicted.risk 0.3100 0.5300 0.0341 0.2436
profit 0.0023 -0.1022 0.0138 -0.0136
not.fully.paid predicted.risk profit
credit.policy -0.1465 -0.6077 0.0926
int.rate 0.1691 0.6113 0.0140
installment 0.0567 0.1672 -0.0090
log.annual.inc -0.0130 -0.1761 0.0174
dti 0.0107 0.1648 0.0273
fico -0.1410 -0.5994 0.0046
days.with.cr.line -0.0058 -0.1425 -0.0240
revol.bal 0.0622 0.2029 -0.0537
revol.util 0.0847 0.3100 0.0023
inq.last.6mths 0.1377 0.5300 -0.1022
delinq.2yrs 0.0110 0.0341 0.0138
pub.rec 0.0331 0.2436 -0.0136
not.fully.paid 1.0000 0.2330 -0.9802
predicted.risk 0.2330 1.0000 -0.1238
profit -0.9802 -0.1238 1.0000
table(test$inq.last.6mths)
0 1 2 3 4 5 6 7 8 9 10 11 12 13
1056 740 440 272 137 94 39 35 23 13 7 6 2 1
14 15 16 18 24
2 3 1 1 1
summary(test$inq.last.6mths)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 0.00 1.00 1.58 2.00 24.00
table(test$inq.last.6mths)
0 1 2 3 4 5 6 7 8 9 10 11 12 13
1056 740 440 272 137 94 39 35 23 13 7 6 2 1
14 15 16 18 24
2 3 1 1 1
summary(test$inq.last.6mths)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 0.00 1.00 1.58 2.00 24.00
summary(test$installment)
Min. 1st Qu. Median Mean 3rd Qu. Max.
15.7 163.6 267.7 317.0 421.9 926.8
summary(test$dti)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 7.16 12.85 12.75 18.30 29.96
summary(test$revol.util)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0 23.4 46.9 47.0 70.3 108.8
data=test[which(test$int.rate>=0.1 & test$credit.policy==1 & test$inq.last.6mths<1 & test$fico<=682),]
summary(test$predicted.risk)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0211 0.0945 0.1371 0.1579 0.1970 0.9508
summary(data[order(data$predicted.risk),"predicted.risk"][1:100])
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0561 0.1240 0.1434 0.1373 0.1540 0.1725
sum(data[order(data$predicted.risk),ncol(data)][1:100])
[1] 42.4