Use R for German Credit Predicting

Data set information

Data set parameter types and the first few rows of the data set

dataall<-read.csv('german_credit_data.csv')
head(dataall)
##   Age    Sex Job Housing Saving.accounts Checking.account Credit.amount
## 1  67   male   2     own            <NA>           little          1169
## 2  22 female   2     own          little         moderate          5951
## 3  49   male   1     own          little             <NA>          2096
## 4  45   male   2    free          little           little          7882
## 5  53   male   2    free          little           little          4870
## 6  35   male   1    free            <NA>             <NA>          9055
##   Duration             Purpose Risk
## 1        6            radio/TV good
## 2       48            radio/TV  bad
## 3       12           education good
## 4       42 furniture/equipment good
## 5       24                 car  bad
## 6       36           education good

数据集信息

summary(dataall)
##       Age            Sex                 Job          Housing         
##  Min.   :19.00   Length:1000        Min.   :0.000   Length:1000       
##  1st Qu.:27.00   Class :character   1st Qu.:2.000   Class :character  
##  Median :33.00   Mode  :character   Median :2.000   Mode  :character  
##  Mean   :35.55                      Mean   :1.904                     
##  3rd Qu.:42.00                      3rd Qu.:2.000                     
##  Max.   :75.00                      Max.   :3.000                     
##  Saving.accounts    Checking.account   Credit.amount      Duration   
##  Length:1000        Length:1000        Min.   :  250   Min.   : 4.0  
##  Class :character   Class :character   1st Qu.: 1366   1st Qu.:12.0  
##  Mode  :character   Mode  :character   Median : 2320   Median :18.0  
##                                        Mean   : 3271   Mean   :20.9  
##                                        3rd Qu.: 3972   3rd Qu.:24.0  
##                                        Max.   :18424   Max.   :72.0  
##    Purpose              Risk          
##  Length:1000        Length:1000       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 
#newdata = mice(dataall, method='polr',seed = 100) 
#newdata = complete(newdata,5)

Basic Information Statistics Risk Information

ggplot(dataall, aes(x=Risk, fill=Risk))+geom_bar()+labs(title='Distribution of Risk Classes')

1.Age

ggplot(dataall, aes(Age)) + geom_histogram(binwidth=4, colour="black", fill="lightgreen") +labs(x= "Age",y= "Frequency" , title = "Age Information")

2.Gender

ggplot(dataall, aes(Sex) ) + geom_bar(aes(fill = as.factor(Sex))) + scale_fill_discrete(name="Sex",labels=c( "Female","Male")) +labs(x= "Sex",y= "Frequency" , title = "Sex Information")

3.job

数值越大 技能性越高

ggplot(dataall, aes(Job) ) + geom_bar(aes(fill = as.factor(Job))) + scale_fill_discrete(name="Job",labels=c( "0","1","2","3")) +labs(x= "Job",y= "Frequency" , title = "Job Information")

4.Housing

ggplot(dataall, aes(Housing) ) + geom_bar(aes(fill = as.factor(Housing))) + scale_fill_discrete(name="Housing",labels=c("Free","Own", "Rent")) +labs(x= "Housing",y= "Frequency" , title = "Housing Information")

5.Saving accounts

ggplot(dataall, aes(Saving.accounts) ) + geom_bar(aes(fill = as.factor(Saving.accounts))) + scale_fill_discrete(name="Saving accounts",labels=c( "Little","Moderate", "Quite Rich", "Rich", "NA")) +labs(x= "Saving accounts",y= "Frequency" , title = "Saving Accounts Information")

6.Checking Account

ggplot(dataall, aes(Checking.account) ) + geom_bar(aes(fill = as.factor(Checking.account))) + scale_fill_discrete(name="Checking accounts",labels=c( "Little","Moderate", "Rich", "NA")) +labs(x= "Checking accounts",y= "Frequency" , title = "Checking Accounts Information")

7.Credit amount

ggplot(dataall, aes(Credit.amount)) + geom_histogram(binwidth=1000, colour="black", fill="lightblue") +labs(x= "Credit amount",y= "Frequency" , title = "Credit amount")

8.Duration

ggplot(dataall, aes(Duration)) + geom_histogram(binwidth=4, colour="black", fill="lightyellow") +labs(x= "Duration in Months",y= "Frequency" , title = "Duration")

9.Purpose

ggplot(dataall, aes(Purpose) ) + geom_bar(aes(fill = as.factor(Purpose))) + scale_fill_discrete(name="Purpose of Loan",labels=c( "Business","Car","Domestic Appliances","Education","Furniture/Equipment","Radio/TV","Repairs","Vacation/Others")) +labs(x= "Purpose of Loan",y= "Frequency" , title = "Plot of Loan Purpose")

简单联系分析

年龄&RISK

p1<-ggplot(dataall, aes(x=Age, fill=Risk)) +geom_density(alpha=0.5) +ggtitle('Distribution of Age by Risk') +xlab("Age") + ylab("Density") 
p2<-ggplot(dataall, aes(x=Age, fill=Risk))+geom_bar()+labs(title='Distribution of Age & Risk')
p3<-ggplot(dataall, aes(x=Age, fill=Risk))+geom_boxplot()+labs(title='Distribution of Age & Risk')
ggarrange(p1,p2,p3,ncol=1,nrow=3)

性别&RISK

ggplot(dataall, aes(x=Sex, fill=Risk))+geom_bar()+labs(title='Distribution of Sex & Risk')

Job & RISK

ggplot(dataall, aes(x=Job, fill=Risk))+geom_bar()+labs(title='Distribution of Job & Risk')

Housing & Risk

ggplot(dataall, aes(x=Housing, fill=Risk))+geom_bar()+labs(title='Distribution of Housing & Risk')

Saving accounts &Risk

ggplot(dataall, aes(x=Saving.accounts, fill=Risk))+geom_bar()+labs(title='Distribution of Saving accounts & Risk')

Checking accounts & Risk

ggplot(dataall, aes(x=Risk, fill=Checking.account))+geom_bar()+labs(title='Distribution of Checking accounts & Risk')

Credit amount & Risk

ggplot(dataall, aes(x=Credit.amount, fill=Risk)) +geom_density(alpha=0.5) +ggtitle('Distribution of Credit Amount by Risk') +xlab("Credit Amount") + ylab("Density") 

ggplot(dataall, aes(x=Risk,y=Credit.amount,fill=Risk)) +geom_boxplot() +ggtitle('Distribution of Credit Amount by Risk') +xlab("Credit Amount") + ylab("Density") 

Duration & Risk

ggplot(dataall, aes(x=Duration, fill=Risk)) +geom_density(alpha=0.5) +ggtitle('Distribution of Duration by Risk') +xlab("Duration") + ylab("Density") 

Purpose & Risk

ggplot(dataall, aes(x=Risk, fill=Purpose))+geom_bar(position='dodge2')+coord_flip()

数据数值分析

热力图

model.matrix(~0+., data=dataall) %>% cor(use="pairwise.complete.obs") %>% ggcorrplot(show.diag = T,  lab=TRUE, lab_size=1)

算法建模

1.Logistic regression model

训练集 测试集

set.seed(127)
slidata <- sample(nrow(dataall), 0.7*nrow(dataall))
train <- dataall[slidata,]
test <- dataall[-slidata,]

Logistic

logistic1= glm(formula = as.factor(train$Risk) ~.,family = binomial,data = train)
summary(logistic1)
## 
## Call:
## glm(formula = as.factor(train$Risk) ~ ., family = binomial, data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3254  -1.0752   0.5923   0.9808   2.1732  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 1.013e+00  9.328e-01   1.086  0.27760    
## Age                         6.514e-03  1.128e-02   0.577  0.56361    
## Sexmale                     4.249e-01  2.657e-01   1.599  0.10985    
## Job                         1.186e-01  1.907e-01   0.622  0.53399    
## Housingown                 -4.494e-02  4.294e-01  -0.105  0.91663    
## Housingrent                -1.387e-01  4.875e-01  -0.284  0.77604    
## Saving.accountsmoderate     1.140e-01  3.880e-01   0.294  0.76893    
## Saving.accountsquite rich  -1.454e-01  6.398e-01  -0.227  0.82025    
## Saving.accountsrich         1.011e+00  6.222e-01   1.626  0.10403    
## Checking.accountmoderate    3.619e-02  2.621e-01   0.138  0.89021    
## Checking.accountrich        1.416e+00  4.771e-01   2.968  0.00300 ** 
## Credit.amount               4.936e-05  6.120e-05   0.807  0.41992    
## Duration                   -6.134e-02  1.337e-02  -4.589 4.46e-06 ***
## Purposecar                 -6.581e-01  4.352e-01  -1.512  0.13050    
## Purposedomestic appliances -8.057e-01  1.047e+00  -0.770  0.44143    
## Purposeeducation           -1.806e+00  6.634e-01  -2.723  0.00647 ** 
## Purposefurniture/equipment -1.902e-01  4.594e-01  -0.414  0.67888    
## Purposeradio/TV            -2.653e-01  4.530e-01  -0.586  0.55804    
## Purposerepairs             -4.761e-01  7.981e-01  -0.597  0.55083    
## Purposevacation/others     -1.074e+00  1.102e+00  -0.975  0.32980    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 484.11  on 352  degrees of freedom
## Residual deviance: 424.11  on 333  degrees of freedom
##   (347 observations deleted due to missingness)
## AIC: 464.11
## 
## Number of Fisher Scoring iterations: 4

可以看p值:我们看出Saving.accountsrich,Checking.accountrich,Duration,Sexmale系数要被考虑 重新建模

logistic2= glm(formula = as.factor(train$Risk) ~Saving.accounts+Checking.account+Duration+Sex,family = binomial,data = train)
summary(logistic2)
## 
## Call:
## glm(formula = as.factor(train$Risk) ~ Saving.accounts + Checking.account + 
##     Duration + Sex, family = binomial, data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1312  -1.1205   0.6324   0.9904   1.8606  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                0.733878   0.303801   2.416  0.01571 *  
## Saving.accountsmoderate    0.149005   0.371814   0.401  0.68860    
## Saving.accountsquite rich -0.058863   0.620626  -0.095  0.92444    
## Saving.accountsrich        0.948779   0.613994   1.545  0.12228    
## Checking.accountmoderate   0.083476   0.249537   0.335  0.73798    
## Checking.accountrich       1.353742   0.456952   2.963  0.00305 ** 
## Duration                  -0.048295   0.009934  -4.862 1.16e-06 ***
## Sexmale                    0.509065   0.247438   2.057  0.03965 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 484.11  on 352  degrees of freedom
## Residual deviance: 436.92  on 345  degrees of freedom
##   (347 observations deleted due to missingness)
## AIC: 452.92
## 
## Number of Fisher Scoring iterations: 4

训练集的预测情况

#Predicting train result 
prob_pred_train = predict(logistic2,type = 'response',newdata = train)
glm_pre_train= ifelse(prob_pred_train > 0.5,"good","bad")
summary(glm_pre_train)
##    Length     Class      Mode 
##       700 character character

混淆矩阵

xtable<-table(train$Risk,glm_pre_train)
confusionMatrix(xtable)
## Confusion Matrix and Statistics
## 
##       glm_pre_train
##        bad good
##   bad   76   79
##   good  42  156
##                                           
##                Accuracy : 0.6572          
##                  95% CI : (0.6051, 0.7067)
##     No Information Rate : 0.6657          
##     P-Value [Acc > NIR] : 0.655450        
##                                           
##                   Kappa : 0.2856          
##                                           
##  Mcnemar's Test P-Value : 0.001065        
##                                           
##             Sensitivity : 0.6441          
##             Specificity : 0.6638          
##          Pos Pred Value : 0.4903          
##          Neg Pred Value : 0.7879          
##              Prevalence : 0.3343          
##          Detection Rate : 0.2153          
##    Detection Prevalence : 0.4391          
##       Balanced Accuracy : 0.6539          
##                                           
##        'Positive' Class : bad             
## 

ROC图分析

roc_train=roc(train$Risk,prob_pred_train)
## Setting levels: control = bad, case = good
## Setting direction: controls < cases
plot(roc_train,print.auc=TRUE,auc.polygon=TRUE,grid=c(0.1,0.2),grid.col=c("green","red"),max.auc.polygon=TRUE,auc.polygon.col="lightblue",print.thres="best")

测试集的预测结果

#Predicting test result 
prob_pred_test = predict(logistic2,type = 'response',newdata = test)
glm_pre_test = ifelse(prob_pred_test > 0.5,"good","bad")
summary(glm_pre_test)
##    Length     Class      Mode 
##       300 character character

混淆矩阵

xtable<-table(test$Risk, glm_pre_test)
confusionMatrix(xtable)
## Confusion Matrix and Statistics
## 
##       glm_pre_test
##        bad good
##   bad   33   43
##   good  19   74
##                                           
##                Accuracy : 0.6331          
##                  95% CI : (0.5557, 0.7058)
##     No Information Rate : 0.6923          
##     P-Value [Acc > NIR] : 0.958256        
##                                           
##                   Kappa : 0.2367          
##                                           
##  Mcnemar's Test P-Value : 0.003489        
##                                           
##             Sensitivity : 0.6346          
##             Specificity : 0.6325          
##          Pos Pred Value : 0.4342          
##          Neg Pred Value : 0.7957          
##              Prevalence : 0.3077          
##          Detection Rate : 0.1953          
##    Detection Prevalence : 0.4497          
##       Balanced Accuracy : 0.6335          
##                                           
##        'Positive' Class : bad             
## 

ROC图分析

roc_test=roc(test$Risk,prob_pred_test)
## Setting levels: control = bad, case = good
## Setting direction: controls < cases
plot(roc_test,print.auc=TRUE,auc.polygon=TRUE,grid=c(0.1,0.2),grid.col=c("green","red"),max.auc.polygon=TRUE,auc.polygon.col="lightblue",print.thres="best")

2.Tree Models

Tree_model <- rpart(as.factor(train$Risk)~.,data=train)
fancyRpartPlot(Tree_model)

防止过拟合 我们需要剪枝操作

Tree_model$cptable
##           CP nsplit rel error xerror       xstd
## 1 0.03750000      0     1.000  1.000 0.05976143
## 2 0.02500000      2     0.925  0.995 0.05967142
## 3 0.02250000      3     0.900  1.000 0.05976143
## 4 0.01333333      5     0.855  1.015 0.06002708
## 5 0.01125000      8     0.815  1.065 0.06086607
## 6 0.01000000     13     0.755  1.095 0.06133602
plotcp(Tree_model)#交叉验证误差与复杂度参数的关系图

Tree_model_1 <- prune(Tree_model, cp=0.027)
fancyRpartPlot(Tree_model_1)

来检测预测效果 训练集

Tree_pre_train<-predict(Tree_model_1,train,type='class')
Tree_prob_train<-predict(Tree_model_1,train,type='prob')[,2]
xtable<-table(Tree_pre_train,train$Risk)
confusionMatrix(xtable)
## Confusion Matrix and Statistics
## 
##               
## Tree_pre_train bad good
##           bad   29   14
##           good 171  486
##                                          
##                Accuracy : 0.7357         
##                  95% CI : (0.7014, 0.768)
##     No Information Rate : 0.7143         
##     P-Value [Acc > NIR] : 0.112          
##                                          
##                   Kappa : 0.153          
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.14500        
##             Specificity : 0.97200        
##          Pos Pred Value : 0.67442        
##          Neg Pred Value : 0.73973        
##              Prevalence : 0.28571        
##          Detection Rate : 0.04143        
##    Detection Prevalence : 0.06143        
##       Balanced Accuracy : 0.55850        
##                                          
##        'Positive' Class : bad            
## 
Tree_roc_train=roc(train$Risk,Tree_prob_train)
## Setting levels: control = bad, case = good
## Setting direction: controls < cases
plot(Tree_roc_train,print.auc=TRUE,auc.polygon=TRUE,grid=c(0.1,0.2),grid.col=c("green","red"),max.auc.polygon=TRUE,auc.polygon.col="lightblue",print.thres="best")

测试集

Tree_pre_test<-predict(Tree_model_1,test,type='class')
xtable<-table(Tree_pre_test,test$Risk)
confusionMatrix(xtable)
## Confusion Matrix and Statistics
## 
##              
## Tree_pre_test bad good
##          bad    4    6
##          good  96  194
##                                           
##                Accuracy : 0.66            
##                  95% CI : (0.6033, 0.7135)
##     No Information Rate : 0.6667          
##     P-Value [Acc > NIR] : 0.6226          
##                                           
##                   Kappa : 0.0129          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.04000         
##             Specificity : 0.97000         
##          Pos Pred Value : 0.40000         
##          Neg Pred Value : 0.66897         
##              Prevalence : 0.33333         
##          Detection Rate : 0.01333         
##    Detection Prevalence : 0.03333         
##       Balanced Accuracy : 0.50500         
##                                           
##        'Positive' Class : bad             
## 
roc_test=roc(test$Risk,as.numeric(Tree_pre_test))
## Setting levels: control = bad, case = good
## Setting direction: controls < cases
plot(roc_test,print.auc=TRUE,auc.polygon=TRUE,grid=c(0.1,0.2),grid.col=c("green","red"),max.auc.polygon=TRUE,auc.polygon.col="lightblue",print.thres="best")

3.随机森林

https://cloud.tencent.com/developer/article/1870681

library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
## 
##     importance
## The following object is masked from 'package:ggplot2':
## 
##     margin
train.data.forest<-na.omit(train)
train.forest <- randomForest(as.factor(train.data.forest$Risk) ~ ., data = train.data.forest, importance = TRUE)
train.forest
## 
## Call:
##  randomForest(formula = as.factor(train.data.forest$Risk) ~ .,      data = train.data.forest, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 36.83%
## Confusion matrix:
##      bad good class.error
## bad   81   74   0.4774194
## good  56  142   0.2828283

查看重要性参数

importance.forest <- train.forest$importance
head(importance.forest)
##                           bad         good MeanDecreaseAccuracy
## Age              -0.007751874  0.001180502         -0.002556922
## Sex              -0.002756205 -0.002263176         -0.002493479
## Job               0.006014341  0.001392746          0.003425752
## Housing          -0.003639075  0.004909288          0.001050574
## Saving.accounts   0.002235366  0.002602384          0.002401319
## Checking.account  0.016152666  0.003925401          0.009235639
##                  MeanDecreaseGini
## Age                     32.067186
## Sex                      4.776435
## Job                     10.523550
## Housing                  8.256494
## Saving.accounts          7.348468
## Checking.account        11.401583
varImpPlot(train.forest, n.var = min(15, nrow(train.forest$importance)), main = 'Top  variable importance')

筛选掉不重要的一些参数 训练集

select <- rownames(importance.forest)[1:4]
test.data.forest<-na.omit(test)
train.data.forest.4 <- train.data.forest[ ,c(select, 'Risk')]
test.data.forest.4<- test.data.forest[ ,c(select, 'Risk')]

set.seed(123)
train.forest_4 <- randomForest(as.factor(train.data.forest.4$Risk) ~ ., data = train.data.forest.4, importance = TRUE)
train.forest_4
## 
## Call:
##  randomForest(formula = as.factor(train.data.forest.4$Risk) ~      ., data = train.data.forest.4, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 46.46%
## Confusion matrix:
##      bad good class.error
## bad   48  107   0.6903226
## good  57  141   0.2878788
#plot(margin(train.forest_5, train.data.forest.5$Risk))
 
train_predict.forest <- predict(train.forest_4, train.data.forest.4)
xtable <- table(train_predict.forest, train.data.forest.4$Risk)
confusionMatrix(xtable)
## Confusion Matrix and Statistics
## 
##                     
## train_predict.forest bad good
##                 bad   94   21
##                 good  61  177
##                                           
##                Accuracy : 0.7677          
##                  95% CI : (0.7201, 0.8108)
##     No Information Rate : 0.5609          
##     P-Value [Acc > NIR] : 4.422e-16       
##                                           
##                   Kappa : 0.5148          
##                                           
##  Mcnemar's Test P-Value : 1.656e-05       
##                                           
##             Sensitivity : 0.6065          
##             Specificity : 0.8939          
##          Pos Pred Value : 0.8174          
##          Neg Pred Value : 0.7437          
##              Prevalence : 0.4391          
##          Detection Rate : 0.2663          
##    Detection Prevalence : 0.3258          
##       Balanced Accuracy : 0.7502          
##                                           
##        'Positive' Class : bad             
## 

测试集

test_predict.forest <- predict(train.forest_4, test.data.forest.4)
xtable  <- table(test.data.forest.4$Risk, test_predict.forest)
confusionMatrix(xtable)
## Confusion Matrix and Statistics
## 
##       test_predict.forest
##        bad good
##   bad   26   50
##   good  19   74
##                                           
##                Accuracy : 0.5917          
##                  95% CI : (0.5136, 0.6666)
##     No Information Rate : 0.7337          
##     P-Value [Acc > NIR] : 0.9999782       
##                                           
##                   Kappa : 0.1431          
##                                           
##  Mcnemar's Test P-Value : 0.0003043       
##                                           
##             Sensitivity : 0.5778          
##             Specificity : 0.5968          
##          Pos Pred Value : 0.3421          
##          Neg Pred Value : 0.7957          
##              Prevalence : 0.2663          
##          Detection Rate : 0.1538          
##    Detection Prevalence : 0.4497          
##       Balanced Accuracy : 0.5873          
##                                           
##        'Positive' Class : bad             
## 

4.AdaBoosting

library(adabag)
## Loading required package: foreach
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
train.data.adabag <- na.omit(train)
test.data.adabag <- na.omit(test)
train.data.adabag$Risk <- factor(train.data.adabag$Risk)
error <- as.numeric()
for(i in 1:3){
  train.adaboost <- boosting(Risk ~ ., data=train.data.adabag, mfinal=i)
  test.adaboost.pred <- predict.boosting(train.adaboost,newdata = test.data.adabag)
  error[i] <- test.adaboost.pred$error
}
error <- as.data.frame(error)
xtable<-test.adaboost.pred$confusion
confusionMatrix(xtable)
## Confusion Matrix and Statistics
## 
##                Observed Class
## Predicted Class bad good
##            bad   43   23
##            good  33   70
##                                          
##                Accuracy : 0.6686         
##                  95% CI : (0.5922, 0.739)
##     No Information Rate : 0.5503         
##     P-Value [Acc > NIR] : 0.001146       
##                                          
##                   Kappa : 0.3224         
##                                          
##  Mcnemar's Test P-Value : 0.229102       
##                                          
##             Sensitivity : 0.5658         
##             Specificity : 0.7527         
##          Pos Pred Value : 0.6515         
##          Neg Pred Value : 0.6796         
##              Prevalence : 0.4497         
##          Detection Rate : 0.2544         
##    Detection Prevalence : 0.3905         
##       Balanced Accuracy : 0.6592         
##                                          
##        'Positive' Class : bad            
## 
ggplot(error,aes(x=1:3,y=error))+geom_line(colour="red", linetype="dashed",size = 1)+geom_point(size=3, shape=18)+ylim(0,1) +xlab("the number of basic classifiers")