Author: 劉育銘
Source: https://www.openml.org/d/31
#install.packages('data.table')
#install.packages('dplyr')
#install.packages('caTools')
library(data.table)
library(dplyr)
library(stringr)
credit=fread('./data/credit.csv',stringsAsFactors=T)
credit$class=ifelse(credit$class=='good',1,0)
credit$class=credit$class %>% as.factor
credit
library(caTools)
set.seed(88)
split = sample.split(credit$class, SplitRatio = 0.75)
TR = subset(credit, split == TRUE)
TS = subset(credit, split == FALSE)
Q:模型未做變數選擇,Accuracy=?
glm1 = glm(class~., TR, family=binomial)
summary(glm1)
Call:
glm(formula = class ~ ., family = binomial, data = TR)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.6976 -0.6950 0.3526 0.6991 2.2184
Coefficients:
Estimate Std. Error
(Intercept) 2.607e+00 2.038e+00
checking_status'<0' -2.659e-01 2.521e-01
checking_status'>=200' 9.655e-01 4.337e-01
checking_status'no checking' 1.515e+00 2.761e-01
duration -3.316e-02 1.086e-02
credit_history'critical/other existing credit' 1.098e+00 5.164e-01
credit_history'delayed previously' 6.854e-01 5.585e-01
credit_history'existing paid' 4.447e-01 4.489e-01
credit_history'no credits/all paid' -2.334e-01 6.391e-01
purpose'new car' -1.406e+00 1.497e+00
purpose'used car' 2.575e-01 1.543e+00
purposebusiness -6.513e-01 1.527e+00
purposeeducation -1.462e+00 1.533e+00
purposefurniture/equipment -7.327e-01 1.503e+00
purposeother 7.629e-01 1.711e+00
purposeradio/tv -4.334e-01 1.505e+00
purposerepairs -1.520e+00 1.631e+00
purposeretraining 4.302e-01 1.902e+00
credit_amount -1.016e-04 5.148e-05
savings_status'500<=X<1000' 2.358e-01 5.522e-01
savings_status'<100' -3.122e-01 3.286e-01
savings_status'>=1000' 1.019e+00 6.598e-01
savings_status'no known savings' 7.351e-01 4.099e-01
employment'4<=X<7' 8.871e-01 3.308e-01
employment'<1' -2.032e-01 2.806e-01
employment'>=7' 2.001e-01 2.931e-01
employmentunemployed -2.414e-01 5.164e-01
installment_commitment -3.075e-01 1.037e-01
personal_status'male div/sep' -1.186e-01 4.500e-01
personal_status'male mar/wid' 1.220e-01 3.697e-01
personal_status'male single' 4.938e-01 2.470e-01
other_partiesguarantor 1.527e+00 7.069e-01
other_partiesnone 2.673e-01 5.160e-01
residence_since -6.157e-02 1.028e-01
property_magnitude'no known property' -1.687e-01 5.119e-01
property_magnitude'real estate' 2.091e-01 2.947e-01
property_magnitudecar -6.300e-04 2.690e-01
age 1.166e-02 1.089e-02
other_payment_plansnone 8.033e-01 2.810e-01
other_payment_plansstores 3.431e-01 4.974e-01
housingown 3.695e-02 5.556e-01
housingrent -2.892e-01 5.743e-01
existing_credits -1.852e-01 2.263e-01
job'unemp/unskilled non res' 2.690e-01 7.453e-01
job'unskilled resident' 1.007e-01 3.977e-01
jobskilled 9.545e-02 3.213e-01
num_dependents -5.131e-01 2.917e-01
own_telephoneyes 3.093e-01 2.324e-01
foreign_workeryes -9.220e-01 6.635e-01
z value Pr(>|z|)
(Intercept) 1.280 0.20064
checking_status'<0' -1.055 0.29153
checking_status'>=200' 2.226 0.02600 *
checking_status'no checking' 5.486 4.1e-08 ***
duration -3.054 0.00226 **
credit_history'critical/other existing credit' 2.126 0.03348 *
credit_history'delayed previously' 1.227 0.21972
credit_history'existing paid' 0.991 0.32183
credit_history'no credits/all paid' -0.365 0.71501
purpose'new car' -0.939 0.34769
purpose'used car' 0.167 0.86744
purposebusiness -0.426 0.66976
purposeeducation -0.953 0.34038
purposefurniture/equipment -0.487 0.62599
purposeother 0.446 0.65559
purposeradio/tv -0.288 0.77336
purposerepairs -0.931 0.35163
purposeretraining 0.226 0.82106
credit_amount -1.972 0.04856 *
savings_status'500<=X<1000' 0.427 0.66939
savings_status'<100' -0.950 0.34210
savings_status'>=1000' 1.545 0.12239
savings_status'no known savings' 1.793 0.07292 .
employment'4<=X<7' 2.682 0.00732 **
employment'<1' -0.724 0.46887
employment'>=7' 0.683 0.49481
employmentunemployed -0.467 0.64019
installment_commitment -2.965 0.00303 **
personal_status'male div/sep' -0.264 0.79208
personal_status'male mar/wid' 0.330 0.74135
personal_status'male single' 1.999 0.04558 *
other_partiesguarantor 2.160 0.03075 *
other_partiesnone 0.518 0.60443
residence_since -0.599 0.54938
property_magnitude'no known property' -0.330 0.74173
property_magnitude'real estate' 0.709 0.47803
property_magnitudecar -0.002 0.99813
age 1.070 0.28439
other_payment_plansnone 2.859 0.00425 **
other_payment_plansstores 0.690 0.49030
housingown 0.067 0.94697
housingrent -0.504 0.61458
existing_credits -0.819 0.41307
job'unemp/unskilled non res' 0.361 0.71812
job'unskilled resident' 0.253 0.80003
jobskilled 0.297 0.76641
num_dependents -1.759 0.07860 .
own_telephoneyes 1.331 0.18333
foreign_workeryes -1.390 0.16462
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 916.30 on 749 degrees of freedom
Residual deviance: 660.96 on 701 degrees of freedom
AIC: 758.96
Number of Fisher Scoring iterations: 5
pred = predict(glm1, TS,type='response')
mx = table(Actual=TS$class, Predict=pred > 0.5);
mx
Predict
Actual FALSE TRUE
0 41 34
1 24 151
c(accuracy = sum(diag(mx))/sum(mx),
sensitivity = mx[2,2]/sum(mx[2,]),
specificity = mx[1,1]/sum(mx[1,]))
accuracy sensitivity specificity
0.7680000 0.8628571 0.5466667
summary(pred)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.1211 0.4942 0.7892 0.7072 0.9187 0.9956
不同Threshold下的cutoff
pred_total=data.frame()
for(threshold in seq(0.13,0.9,by=0.01)){
pred = predict(glm1, TS,type='response')
mx = table(TS$class, pred > threshold)
pred_row=c(
threshold=threshold,
accuracy = sum(diag(mx))/sum(mx),
sensitivity = mx[2,2]/sum(mx[2,]),
specificity = mx[1,1]/sum(mx[1,]))
pred_total <- rbind(pred_total,pred_row)
}
colnames(pred_total)=c('threshold','accuracy','sensitvity','specificity')
pred_total
pred_total[which.max(pred_total$accuracy),]
library(rpart)
rpart1 = rpart(class ~ ., TR,
method="class")
#install.packages('rpart.plot')
library(rpart.plot)
prp(rpart1,faclen=0)
pred = predict(rpart1, TS, type = "prob")
mx = table(Actual = TS$class, Predict=pred[,2]>0.5); mx
Predict
Actual FALSE TRUE
0 33 42
1 32 143
c(accuracy = sum(diag(mx))/sum(mx),
sensitivity = mx[2,2]/sum(mx[2,]),
specificity = mx[1,1]/sum(mx[1,])
)
accuracy sensitivity specificity
0.7040000 0.8171429 0.4400000
summary(pred[,2])
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.1000 0.4118 0.8258 0.6881 0.8786 1.0000
不同Threshold下的cutoff
pred_total=data.frame()
for(threshold in seq(0.1,0.99,by=0.01)){
pred = predict(rpart1, TS, type = "prob") # predict classes
mx = table(TS$class, pred[,2] > threshold)
pred_row=c(
threshold=threshold,
accuracy = sum(diag(mx))/sum(mx),
sensitivity = mx[2,2]/sum(mx[2,]),
specificity = mx[1,1]/sum(mx[1,]))
pred_total <- rbind(pred_total,pred_row)
}
colnames(pred_total)=c('threshold','accuracy','sensitvity','specificity')
pred_total[which.max(pred_total$accuracy),]
#install.packages('caret')
library(caret)
Loading required package: lattice
Loading required package: ggplot2
#install.packages('e1071')
library(e1071)
cv1 = train(
class ~ ., TR, method = "rpart",
trControl = trainControl(method="cv", number=10), # 10 fold CV
tuneGrid=expand.grid(cp = seq(0.01,0.5,0.01)) # parameter combination
)
cv1; plot(cv1)
CART
750 samples
20 predictor
2 classes: '0', '1'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 674, 676, 674, 675, 675, 676, ...
Resampling results across tuning parameters:
cp Accuracy Kappa
0.01 0.7135884 0.25382492
0.02 0.7136586 0.24864663
0.03 0.7109379 0.21121715
0.04 0.7082722 0.19467393
0.05 0.7001294 0.09407736
0.06 0.7000213 0.00000000
0.07 0.7000213 0.00000000
0.08 0.7000213 0.00000000
0.09 0.7000213 0.00000000
0.10 0.7000213 0.00000000
0.11 0.7000213 0.00000000
0.12 0.7000213 0.00000000
0.13 0.7000213 0.00000000
0.14 0.7000213 0.00000000
0.15 0.7000213 0.00000000
0.16 0.7000213 0.00000000
0.17 0.7000213 0.00000000
0.18 0.7000213 0.00000000
0.19 0.7000213 0.00000000
0.20 0.7000213 0.00000000
0.21 0.7000213 0.00000000
0.22 0.7000213 0.00000000
0.23 0.7000213 0.00000000
0.24 0.7000213 0.00000000
0.25 0.7000213 0.00000000
0.26 0.7000213 0.00000000
0.27 0.7000213 0.00000000
0.28 0.7000213 0.00000000
0.29 0.7000213 0.00000000
0.30 0.7000213 0.00000000
0.31 0.7000213 0.00000000
0.32 0.7000213 0.00000000
0.33 0.7000213 0.00000000
0.34 0.7000213 0.00000000
0.35 0.7000213 0.00000000
0.36 0.7000213 0.00000000
0.37 0.7000213 0.00000000
0.38 0.7000213 0.00000000
0.39 0.7000213 0.00000000
0.40 0.7000213 0.00000000
0.41 0.7000213 0.00000000
0.42 0.7000213 0.00000000
0.43 0.7000213 0.00000000
0.44 0.7000213 0.00000000
0.45 0.7000213 0.00000000
0.46 0.7000213 0.00000000
0.47 0.7000213 0.00000000
0.48 0.7000213 0.00000000
0.49 0.7000213 0.00000000
0.50 0.7000213 0.00000000
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.02.
選擇CP=0.02
library(rpart)
rpart2 = rpart(class ~ ., TR, method="class", cp=0.01)
pred = predict(rpart2, TS, type='prob')[,2]
table(TS$class, pred > 0.5) %>% {sum(diag(.)) / sum(.)} # 0.704
[1] 0.704
credit %>% is.na %>% colSums()
checking_status duration credit_history
0 0 0
purpose credit_amount savings_status
0 0 0
employment installment_commitment personal_status
0 0 0
other_parties residence_since property_magnitude
0 0 0
age other_payment_plans housing
0 0 0
existing_credits job num_dependents
0 0 0
own_telephone foreign_worker class
0 0 0
敘述統計部分: https://www.openml.org/d/31
credit_cl=credit
credit_cl$purpose=ifelse(!credit$purpose %in% c("'new car'","'used car'","furniture/equipment","radio/tv"),'others',credit$purpose) %>% as.factor
library(caTools)
set.seed(88)
split = sample.split(credit_cl$class, SplitRatio = 0.75)
TR_cl = subset(credit_cl, split == TRUE)
TS_cl = subset(credit_cl, split == FALSE)
glm1= glm(class~., TR, family=binomial)
glm2 = glm(class~., TR_cl, family=binomial)
#summary(glm1)
接著看模型是否受到共線性影響
#install.packages('car')
library(car)
vif(glm1)
GVIF Df GVIF^(1/(2*Df))
checking_status 1.447904 3 1.063628
duration 1.884312 1 1.372703
credit_history 2.829653 4 1.138850
purpose 3.811492 9 1.077167
credit_amount 2.474844 1 1.573164
savings_status 1.505274 4 1.052451
employment 2.717533 4 1.133109
installment_commitment 1.428683 1 1.195275
personal_status 1.727780 3 1.095422
other_parties 1.264821 2 1.060492
residence_since 1.408847 1 1.186949
property_magnitude 4.457815 3 1.282883
age 1.484286 1 1.218313
other_payment_plans 1.363335 2 1.080564
housing 4.108105 2 1.423673
existing_credits 1.820335 1 1.349198
job 2.641865 3 1.175759
num_dependents 1.249582 1 1.117847
own_telephone 1.366808 1 1.169106
foreign_worker 1.130327 1 1.063169
vif(glm2)
GVIF Df GVIF^(1/(2*Df))
checking_status 1.398552 3 1.057499
duration 1.840747 1 1.356741
credit_history 2.570655 4 1.125267
purpose 1.878683 4 1.082011
credit_amount 2.358776 1 1.535831
savings_status 1.462802 4 1.048693
employment 2.596689 4 1.126685
installment_commitment 1.418105 1 1.190842
personal_status 1.638541 3 1.085783
other_parties 1.186797 2 1.043744
residence_since 1.399868 1 1.183160
property_magnitude 4.168607 3 1.268621
age 1.482988 1 1.217780
other_payment_plans 1.266162 2 1.060773
housing 3.938918 2 1.408783
existing_credits 1.804489 1 1.343313
job 2.506559 3 1.165502
num_dependents 1.229017 1 1.108610
own_telephone 1.348051 1 1.161056
foreign_worker 1.112051 1 1.054538
從共線性問題來看,Purpose level簡化過後較沒有問題
pred = predict(glm2, TS_cl,type='response')
mx = table(Actual=TS_cl$class, Predict=pred > 0.5);
c(accuracy = sum(diag(mx))/sum(mx),
sensitivity = mx[2,2]/sum(mx[2,]),
specificity = mx[1,1]/sum(mx[1,]))
accuracy sensitivity specificity
0.7720000 0.8742857 0.5333333
summary(glm2)
Call:
glm(formula = class ~ ., family = binomial, data = TR_cl)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.6686 -0.7374 0.3601 0.7085 2.0373
Coefficients:
Estimate Std. Error
(Intercept) 1.719e+00 1.379e+00
checking_status'<0' -2.826e-01 2.483e-01
checking_status'>=200' 9.014e-01 4.305e-01
checking_status'no checking' 1.465e+00 2.715e-01
duration -3.252e-02 1.069e-02
credit_history'critical/other existing credit' 1.029e+00 5.043e-01
credit_history'delayed previously' 6.709e-01 5.472e-01
credit_history'existing paid' 3.957e-01 4.371e-01
credit_history'no credits/all paid' -1.969e-01 6.247e-01
purpose3 1.616e+00 4.293e-01
purpose6 6.879e-01 3.028e-01
purpose8 9.823e-01 2.901e-01
purposeothers 5.765e-01 2.946e-01
credit_amount -8.732e-05 5.112e-05
savings_status'500<=X<1000' 2.769e-01 5.480e-01
savings_status'<100' -3.417e-01 3.263e-01
savings_status'>=1000' 9.110e-01 6.462e-01
savings_status'no known savings' 6.894e-01 4.067e-01
employment'4<=X<7' 8.662e-01 3.238e-01
employment'<1' -2.237e-01 2.777e-01
employment'>=7' 2.017e-01 2.895e-01
employmentunemployed -9.463e-02 5.079e-01
installment_commitment -3.025e-01 1.028e-01
personal_status'male div/sep' -1.012e-01 4.506e-01
personal_status'male mar/wid' 1.818e-01 3.614e-01
personal_status'male single' 4.933e-01 2.439e-01
other_partiesguarantor 1.261e+00 6.962e-01
other_partiesnone -1.904e-02 5.013e-01
residence_since -5.116e-02 1.016e-01
property_magnitude'no known property' -2.994e-01 4.948e-01
property_magnitude'real estate' 2.403e-01 2.902e-01
property_magnitudecar 1.202e-02 2.671e-01
age 1.113e-02 1.087e-02
other_payment_plansnone 6.761e-01 2.724e-01
other_payment_plansstores 1.966e-01 4.891e-01
housingown 1.400e-02 5.400e-01
housingrent -3.672e-01 5.606e-01
existing_credits -1.851e-01 2.242e-01
job'unemp/unskilled non res' 2.955e-01 7.384e-01
job'unskilled resident' 5.115e-02 3.921e-01
jobskilled 1.678e-02 3.192e-01
num_dependents -4.929e-01 2.877e-01
own_telephoneyes 3.355e-01 2.288e-01
foreign_workeryes -1.001e+00 6.796e-01
z value Pr(>|z|)
(Intercept) 1.247 0.212572
checking_status'<0' -1.138 0.255021
checking_status'>=200' 2.094 0.036285 *
checking_status'no checking' 5.395 6.84e-08 ***
duration -3.043 0.002345 **
credit_history'critical/other existing credit' 2.040 0.041333 *
credit_history'delayed previously' 1.226 0.220182
credit_history'existing paid' 0.905 0.365338
credit_history'no credits/all paid' -0.315 0.752649
purpose3 3.764 0.000167 ***
purpose6 2.271 0.023125 *
purpose8 3.386 0.000709 ***
purposeothers 1.957 0.050370 .
credit_amount -1.708 0.087640 .
savings_status'500<=X<1000' 0.505 0.613335
savings_status'<100' -1.047 0.294967
savings_status'>=1000' 1.410 0.158628
savings_status'no known savings' 1.695 0.090060 .
employment'4<=X<7' 2.675 0.007474 **
employment'<1' -0.805 0.420560
employment'>=7' 0.697 0.485901
employmentunemployed -0.186 0.852191
installment_commitment -2.944 0.003240 **
personal_status'male div/sep' -0.225 0.822264
personal_status'male mar/wid' 0.503 0.614961
personal_status'male single' 2.023 0.043062 *
other_partiesguarantor 1.812 0.070031 .
other_partiesnone -0.038 0.969711
residence_since -0.504 0.614475
property_magnitude'no known property' -0.605 0.545205
property_magnitude'real estate' 0.828 0.407713
property_magnitudecar 0.045 0.964112
age 1.023 0.306081
other_payment_plansnone 2.482 0.013058 *
other_payment_plansstores 0.402 0.687663
housingown 0.026 0.979319
housingrent -0.655 0.512410
existing_credits -0.825 0.409153
job'unemp/unskilled non res' 0.400 0.688999
job'unskilled resident' 0.130 0.896210
jobskilled 0.053 0.958086
num_dependents -1.714 0.086611 .
own_telephoneyes 1.466 0.142578
foreign_workeryes -1.472 0.140950
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 916.30 on 749 degrees of freedom
Residual deviance: 670.33 on 706 degrees of freedom
AIC: 758.33
Number of Fisher Scoring iterations: 5
Purpose level值減少,Purpose VIF值下降至4,準確率稍微提升0.004
其餘類別型資料level皆<5且無缺值,資料完整故不處理
接下來看模型是否還有調整空間
fullmod = glm(class~., TR, family=binomial)
CLmod=glm(class~., TR_cl, family=binomial)
#Forward
nothing <- glm(class~1, TR_cl, family=binomial)
forwards = step(nothing,
scope=list(lower=formula(nothing),upper=formula(CLmod)), direction="forward",trace=0)
#Backward:
backwards = step(CLmod,trace=0)
#Bothways
bothways=step(nothing, list(lower=formula(nothing),upper=formula(CLmod)),
direction="both",trace=0)
formula(forwards) #13 Variables
class ~ checking_status + duration + purpose + employment + credit_history +
savings_status + other_parties + installment_commitment +
other_payment_plans + housing + foreign_worker + credit_amount +
own_telephone
formula(backwards) #13 Variables
class ~ checking_status + duration + credit_history + purpose +
credit_amount + savings_status + employment + installment_commitment +
other_parties + other_payment_plans + housing + own_telephone +
foreign_worker
formula(bothways) #13 Variables
class ~ checking_status + duration + purpose + employment + credit_history +
savings_status + other_parties + installment_commitment +
other_payment_plans + housing + foreign_worker + credit_amount +
own_telephone
models=c('CLmod','forwards','backwards','bothways')
Allmodels_total=lapply(models, function(model){
pred = predict(get(model), TS_cl,type='response')
mx = table(Actual=TS_cl$class, Predict=pred > 0.5);
pred_row=data.frame(
Model=model,
accuracy = sum(diag(mx))/sum(mx),
sensitivity = mx[2,2]/sum(mx[2,]),
specificity = mx[1,1]/sum(mx[1,]))
})
do.call(rbind, Allmodels_total)
NA
我會選擇CLmod:調整過Purpose levels的模型 其他三個模型變數減少至11個,然而準確率下降2%
選定模型後,來看不同Threshold下的TPR與FPR:
#install.packages('ROCR')
library(ROCR)
ROCRpred = prediction(pred, TS_cl$class)
ROCRperf = performance(ROCRpred, "tpr", "fpr")
par(cex=0.8)
plot(ROCRperf, colorize=TRUE, print.cutoffs.at=seq(0,1,0.1))
caTools::colAUC(pred, TS_cl$class)
[,1]
0 vs. 1 0.7988571
AUC約為0.798
payoff = matrix(c(0,-5,-1,0),2,2)
cutoff = seq(0.2, 0.9, 0.01)
result = sapply(cutoff, function(p) sum(table(TS_cl$class,pred>p)*payoff) )
i = which.max(result)
par(cex=0.7)
plot(cutoff, result, type='l', col='cyan', lwd=2, main=sprintf(
"Optomal Expected Result: $%d @ %.2f",result[i],cutoff[i]))
abline(v=seq(0,1,0.05),h=seq(-11000,-6000,500),col='lightgray',lty=3)
abline(v=cutoff[i],col='red')
考慮報酬矩陣後,以Threshold=0.24做決策,花費最少的成本70
payoff = matrix(c(0,-5,-1,0),2,2)
cutoff = seq(0.2, 0.9, 0.01)
result = sapply(cutoff, function(p) sum(table(TS_cl$class,pred>p)*payoff) )
i =71
par(cex=0.7)
plot(cutoff, result, type='l', col='cyan', lwd=2, main=sprintf(
"Optomal Expected Result: $%d @ %.2f",result[i],cutoff[i]))
abline(v=seq(0,1,0.05),h=seq(-11000,-6000,500),col='lightgray',lty=3)
abline(v=cutoff[i],col='red')
payoff = matrix(c(0,-5,-1,0),2,2)
cutoff = seq(0.15, 0.9, 0.01)
result = sapply(cutoff, function(p) sum(table(TS_cl$class,pred>p)*payoff) )
i =1
par(cex=0.7)
plot(cutoff, result, type='l', col='cyan', lwd=2, main=sprintf(
"Optomal Expected Result: $%d @ %.2f",result[i],cutoff[i]))
abline(v=seq(0,1,0.05),h=seq(-11000,-6000,500),col='lightgray',lty=3)
abline(v=cutoff[i],col='red')
即使將高風險顧客誤判為低風險顧客成本高 我們仍有信心將每一位顧客視為低風險顧客