Using R to Predict German Credit Risk

Data set information

Data set parameter types and the first few rows of the data set

# Load the German credit data set from the working directory and preview it.
dataall <- read.csv("german_credit_data.csv")
head(dataall)
##   Age    Sex Job Housing Saving.accounts Checking.account Credit.amount
## 1  67   male   2     own            <NA>           little          1169
## 2  22 female   2     own          little         moderate          5951
## 3  49   male   1     own          little             <NA>          2096
## 4  45   male   2    free          little           little          7882
## 5  53   male   2    free          little           little          4870
## 6  35   male   1    free            <NA>             <NA>          9055
##   Duration             Purpose Risk
## 1        6            radio/TV good
## 2       48            radio/TV  bad
## 3       12           education good
## 4       42 furniture/equipment good
## 5       24                 car  bad
## 6       36           education good

数据集信息

# Column-wise summary of the raw data (text columns are still character here).
summary(dataall)
##       Age            Sex                 Job          Housing         
##  Min.   :19.00   Length:1000        Min.   :0.000   Length:1000       
##  1st Qu.:27.00   Class :character   1st Qu.:2.000   Class :character  
##  Median :33.00   Mode  :character   Median :2.000   Mode  :character  
##  Mean   :35.55                      Mean   :1.904                     
##  3rd Qu.:42.00                      3rd Qu.:2.000                     
##  Max.   :75.00                      Max.   :3.000                     
##  Saving.accounts    Checking.account   Credit.amount      Duration   
##  Length:1000        Length:1000        Min.   :  250   Min.   : 4.0  
##  Class :character   Class :character   1st Qu.: 1366   1st Qu.:12.0  
##  Mode  :character   Mode  :character   Median : 2320   Median :18.0  
##                                        Mean   : 3271   Mean   :20.9  
##                                        3rd Qu.: 3972   3rd Qu.:24.0  
##                                        Max.   :18424   Max.   :72.0  
##    Purpose              Risk          
##  Length:1000        Length:1000       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 

查看 NA 值(图中红色表示缺失)

# Earlier experiment, kept for reference (not run):
#   newdata = mice(dataall, method='polr',seed = 100)
#   newdata = complete
# Work on a copy of the raw data and turn every text column into a factor.
newdata <- dataall
factor_cols <- c(
  "Housing", "Sex", "Saving.accounts",
  "Checking.account", "Purpose", "Risk"
)
newdata[factor_cols] <- lapply(newdata[factor_cols], as.factor)
head(newdata)
##   Age    Sex Job Housing Saving.accounts Checking.account Credit.amount
## 1  67   male   2     own            <NA>           little          1169
## 2  22 female   2     own          little         moderate          5951
## 3  49   male   1     own          little             <NA>          2096
## 4  45   male   2    free          little           little          7882
## 5  53   male   2    free          little           little          4870
## 6  35   male   1    free            <NA>             <NA>          9055
##   Duration             Purpose Risk
## 1        6            radio/TV good
## 2       48            radio/TV  bad
## 3       12           education good
## 4       42 furniture/equipment good
## 5       24                 car  bad
## 6       36           education good
# Tabulate the missing-data patterns of the factor-converted data.
md.pattern(newdata)

##     Age Sex Job Housing Credit.amount Duration Purpose Risk Saving.accounts
## 522   1   1   1       1             1        1       1    1               1
## 295   1   1   1       1             1        1       1    1               1
## 84    1   1   1       1             1        1       1    1               0
## 99    1   1   1       1             1        1       1    1               0
##       0   0   0       0             0        0       0    0             183
##     Checking.account    
## 522                1   0
## 295                0   1
## 84                 1   1
## 99                 0   2
##                  394 577
# Matrix plot of the data set before imputation, used to inspect where values are missing.
matrixplot(newdata)

## 
## Click in a column to sort by the corresponding variable.
## To regain use of the VIM GUI and the R console, click outside the plot region.

判断SAVING和CHECKING是否有关系

# Margin plot of the two columns with missing values (columns 5 and 6 of the
# data frame), to see whether their missingness is related.
marginplot(newdata[c("Saving.accounts", "Checking.account")])

# Margin-plot matrix across all columns of the pre-imputation data.
marginmatrix(newdata)

数据集填充完毕!

# Impute the missing values with mice: 50 iterations, fixed seed so the
# imputation is reproducible.
datanew <- mice(
  newdata,
  maxit = 50,
  seed = 500
)
## 
##  iter imp variable
##   1   1  Saving.accounts  Checking.account
##   1   2  Saving.accounts  Checking.account
##   1   3  Saving.accounts  Checking.account
##   1   4  Saving.accounts  Checking.account
##   1   5  Saving.accounts  Checking.account
##   2   1  Saving.accounts  Checking.account
##   2   2  Saving.accounts  Checking.account
##   2   3  Saving.accounts  Checking.account
##   2   4  Saving.accounts  Checking.account
##   2   5  Saving.accounts  Checking.account
##   3   1  Saving.accounts  Checking.account
##   3   2  Saving.accounts  Checking.account
##   3   3  Saving.accounts  Checking.account
##   3   4  Saving.accounts  Checking.account
##   3   5  Saving.accounts  Checking.account
##   4   1  Saving.accounts  Checking.account
##   4   2  Saving.accounts  Checking.account
##   4   3  Saving.accounts  Checking.account
##   4   4  Saving.accounts  Checking.account
##   4   5  Saving.accounts  Checking.account
##   5   1  Saving.accounts  Checking.account
##   5   2  Saving.accounts  Checking.account
##   5   3  Saving.accounts  Checking.account
##   5   4  Saving.accounts  Checking.account
##   5   5  Saving.accounts  Checking.account
##   6   1  Saving.accounts  Checking.account
##   6   2  Saving.accounts  Checking.account
##   6   3  Saving.accounts  Checking.account
##   6   4  Saving.accounts  Checking.account
##   6   5  Saving.accounts  Checking.account
##   7   1  Saving.accounts  Checking.account
##   7   2  Saving.accounts  Checking.account
##   7   3  Saving.accounts  Checking.account
##   7   4  Saving.accounts  Checking.account
##   7   5  Saving.accounts  Checking.account
##   8   1  Saving.accounts  Checking.account
##   8   2  Saving.accounts  Checking.account
##   8   3  Saving.accounts  Checking.account
##   8   4  Saving.accounts  Checking.account
##   8   5  Saving.accounts  Checking.account
##   9   1  Saving.accounts  Checking.account
##   9   2  Saving.accounts  Checking.account
##   9   3  Saving.accounts  Checking.account
##   9   4  Saving.accounts  Checking.account
##   9   5  Saving.accounts  Checking.account
##   10   1  Saving.accounts  Checking.account
##   10   2  Saving.accounts  Checking.account
##   10   3  Saving.accounts  Checking.account
##   10   4  Saving.accounts  Checking.account
##   10   5  Saving.accounts  Checking.account
##   11   1  Saving.accounts  Checking.account
##   11   2  Saving.accounts  Checking.account
##   11   3  Saving.accounts  Checking.account
##   11   4  Saving.accounts  Checking.account
##   11   5  Saving.accounts  Checking.account
##   12   1  Saving.accounts  Checking.account
##   12   2  Saving.accounts  Checking.account
##   12   3  Saving.accounts  Checking.account
##   12   4  Saving.accounts  Checking.account
##   12   5  Saving.accounts  Checking.account
##   13   1  Saving.accounts  Checking.account
##   13   2  Saving.accounts  Checking.account
##   13   3  Saving.accounts  Checking.account
##   13   4  Saving.accounts  Checking.account
##   13   5  Saving.accounts  Checking.account
##   14   1  Saving.accounts  Checking.account
##   14   2  Saving.accounts  Checking.account
##   14   3  Saving.accounts  Checking.account
##   14   4  Saving.accounts  Checking.account
##   14   5  Saving.accounts  Checking.account
##   15   1  Saving.accounts  Checking.account
##   15   2  Saving.accounts  Checking.account
##   15   3  Saving.accounts  Checking.account
##   15   4  Saving.accounts  Checking.account
##   15   5  Saving.accounts  Checking.account
##   16   1  Saving.accounts  Checking.account
##   16   2  Saving.accounts  Checking.account
##   16   3  Saving.accounts  Checking.account
##   16   4  Saving.accounts  Checking.account
##   16   5  Saving.accounts  Checking.account
##   17   1  Saving.accounts  Checking.account
##   17   2  Saving.accounts  Checking.account
##   17   3  Saving.accounts  Checking.account
##   17   4  Saving.accounts  Checking.account
##   17   5  Saving.accounts  Checking.account
##   18   1  Saving.accounts  Checking.account
##   18   2  Saving.accounts  Checking.account
##   18   3  Saving.accounts  Checking.account
##   18   4  Saving.accounts  Checking.account
##   18   5  Saving.accounts  Checking.account
##   19   1  Saving.accounts  Checking.account
##   19   2  Saving.accounts  Checking.account
##   19   3  Saving.accounts  Checking.account
##   19   4  Saving.accounts  Checking.account
##   19   5  Saving.accounts  Checking.account
##   20   1  Saving.accounts  Checking.account
##   20   2  Saving.accounts  Checking.account
##   20   3  Saving.accounts  Checking.account
##   20   4  Saving.accounts  Checking.account
##   20   5  Saving.accounts  Checking.account
##   21   1  Saving.accounts  Checking.account
##   21   2  Saving.accounts  Checking.account
##   21   3  Saving.accounts  Checking.account
##   21   4  Saving.accounts  Checking.account
##   21   5  Saving.accounts  Checking.account
##   22   1  Saving.accounts  Checking.account
##   22   2  Saving.accounts  Checking.account
##   22   3  Saving.accounts  Checking.account
##   22   4  Saving.accounts  Checking.account
##   22   5  Saving.accounts  Checking.account
##   23   1  Saving.accounts  Checking.account
##   23   2  Saving.accounts  Checking.account
##   23   3  Saving.accounts  Checking.account
##   23   4  Saving.accounts  Checking.account
##   23   5  Saving.accounts  Checking.account
##   24   1  Saving.accounts  Checking.account
##   24   2  Saving.accounts  Checking.account
##   24   3  Saving.accounts  Checking.account
##   24   4  Saving.accounts  Checking.account
##   24   5  Saving.accounts  Checking.account
##   25   1  Saving.accounts  Checking.account
##   25   2  Saving.accounts  Checking.account
##   25   3  Saving.accounts  Checking.account
##   25   4  Saving.accounts  Checking.account
##   25   5  Saving.accounts  Checking.account
##   26   1  Saving.accounts  Checking.account
##   26   2  Saving.accounts  Checking.account
##   26   3  Saving.accounts  Checking.account
##   26   4  Saving.accounts  Checking.account
##   26   5  Saving.accounts  Checking.account
##   27   1  Saving.accounts  Checking.account
##   27   2  Saving.accounts  Checking.account
##   27   3  Saving.accounts  Checking.account
##   27   4  Saving.accounts  Checking.account
##   27   5  Saving.accounts  Checking.account
##   28   1  Saving.accounts  Checking.account
##   28   2  Saving.accounts  Checking.account
##   28   3  Saving.accounts  Checking.account
##   28   4  Saving.accounts  Checking.account
##   28   5  Saving.accounts  Checking.account
##   29   1  Saving.accounts  Checking.account
##   29   2  Saving.accounts  Checking.account
##   29   3  Saving.accounts  Checking.account
##   29   4  Saving.accounts  Checking.account
##   29   5  Saving.accounts  Checking.account
##   30   1  Saving.accounts  Checking.account
##   30   2  Saving.accounts  Checking.account
##   30   3  Saving.accounts  Checking.account
##   30   4  Saving.accounts  Checking.account
##   30   5  Saving.accounts  Checking.account
##   31   1  Saving.accounts  Checking.account
##   31   2  Saving.accounts  Checking.account
##   31   3  Saving.accounts  Checking.account
##   31   4  Saving.accounts  Checking.account
##   31   5  Saving.accounts  Checking.account
##   32   1  Saving.accounts  Checking.account
##   32   2  Saving.accounts  Checking.account
##   32   3  Saving.accounts  Checking.account
##   32   4  Saving.accounts  Checking.account
##   32   5  Saving.accounts  Checking.account
##   33   1  Saving.accounts  Checking.account
##   33   2  Saving.accounts  Checking.account
##   33   3  Saving.accounts  Checking.account
##   33   4  Saving.accounts  Checking.account
##   33   5  Saving.accounts  Checking.account
##   34   1  Saving.accounts  Checking.account
##   34   2  Saving.accounts  Checking.account
##   34   3  Saving.accounts  Checking.account
##   34   4  Saving.accounts  Checking.account
##   34   5  Saving.accounts  Checking.account
##   35   1  Saving.accounts  Checking.account
##   35   2  Saving.accounts  Checking.account
##   35   3  Saving.accounts  Checking.account
##   35   4  Saving.accounts  Checking.account
##   35   5  Saving.accounts  Checking.account
##   36   1  Saving.accounts  Checking.account
##   36   2  Saving.accounts  Checking.account
##   36   3  Saving.accounts  Checking.account
##   36   4  Saving.accounts  Checking.account
##   36   5  Saving.accounts  Checking.account
##   37   1  Saving.accounts  Checking.account
##   37   2  Saving.accounts  Checking.account
##   37   3  Saving.accounts  Checking.account
##   37   4  Saving.accounts  Checking.account
##   37   5  Saving.accounts  Checking.account
##   38   1  Saving.accounts  Checking.account
##   38   2  Saving.accounts  Checking.account
##   38   3  Saving.accounts  Checking.account
##   38   4  Saving.accounts  Checking.account
##   38   5  Saving.accounts  Checking.account
##   39   1  Saving.accounts  Checking.account
##   39   2  Saving.accounts  Checking.account
##   39   3  Saving.accounts  Checking.account
##   39   4  Saving.accounts  Checking.account
##   39   5  Saving.accounts  Checking.account
##   40   1  Saving.accounts  Checking.account
##   40   2  Saving.accounts  Checking.account
##   40   3  Saving.accounts  Checking.account
##   40   4  Saving.accounts  Checking.account
##   40   5  Saving.accounts  Checking.account
##   41   1  Saving.accounts  Checking.account
##   41   2  Saving.accounts  Checking.account
##   41   3  Saving.accounts  Checking.account
##   41   4  Saving.accounts  Checking.account
##   41   5  Saving.accounts  Checking.account
##   42   1  Saving.accounts  Checking.account
##   42   2  Saving.accounts  Checking.account
##   42   3  Saving.accounts  Checking.account
##   42   4  Saving.accounts  Checking.account
##   42   5  Saving.accounts  Checking.account
##   43   1  Saving.accounts  Checking.account
##   43   2  Saving.accounts  Checking.account
##   43   3  Saving.accounts  Checking.account
##   43   4  Saving.accounts  Checking.account
##   43   5  Saving.accounts  Checking.account
##   44   1  Saving.accounts  Checking.account
##   44   2  Saving.accounts  Checking.account
##   44   3  Saving.accounts  Checking.account
##   44   4  Saving.accounts  Checking.account
##   44   5  Saving.accounts  Checking.account
##   45   1  Saving.accounts  Checking.account
##   45   2  Saving.accounts  Checking.account
##   45   3  Saving.accounts  Checking.account
##   45   4  Saving.accounts  Checking.account
##   45   5  Saving.accounts  Checking.account
##   46   1  Saving.accounts  Checking.account
##   46   2  Saving.accounts  Checking.account
##   46   3  Saving.accounts  Checking.account
##   46   4  Saving.accounts  Checking.account
##   46   5  Saving.accounts  Checking.account
##   47   1  Saving.accounts  Checking.account
##   47   2  Saving.accounts  Checking.account
##   47   3  Saving.accounts  Checking.account
##   47   4  Saving.accounts  Checking.account
##   47   5  Saving.accounts  Checking.account
##   48   1  Saving.accounts  Checking.account
##   48   2  Saving.accounts  Checking.account
##   48   3  Saving.accounts  Checking.account
##   48   4  Saving.accounts  Checking.account
##   48   5  Saving.accounts  Checking.account
##   49   1  Saving.accounts  Checking.account
##   49   2  Saving.accounts  Checking.account
##   49   3  Saving.accounts  Checking.account
##   49   4  Saving.accounts  Checking.account
##   49   5  Saving.accounts  Checking.account
##   50   1  Saving.accounts  Checking.account
##   50   2  Saving.accounts  Checking.account
##   50   3  Saving.accounts  Checking.account
##   50   4  Saving.accounts  Checking.account
##   50   5  Saving.accounts  Checking.account
# Extract a completed data set from the mice result (default choice of
# imputation), then re-check the missing-data pattern to confirm no gaps remain.
final.data <- complete(datanew)
md.pattern(final.data)
##  /\     /\
## {  `---'  }
## {  O   O  }
## ==>  V <==  No need for mice. This data set is completely observed.
##  \  \|/  /
##   `-----'

##      Age Sex Job Housing Saving.accounts Checking.account Credit.amount
## 1000   1   1   1       1               1                1             1
##        0   0   0       0               0                0             0
##      Duration Purpose Risk  
## 1000        1       1    1 0
##             0       0    0 0
# Matrix plot of the completed data, for comparison with the pre-imputation plot.
matrixplot(final.data)

## 
## Click in a column to sort by the corresponding variable.
## To regain use of the VIM GUI and the R console, click outside the plot region.
# From here on, `dataall` refers to the imputed (complete) data set.
dataall <- final.data

Basic Information Statistics: Risk Information

# Bar chart of the number of records in each Risk class.
ggplot(dataall, aes(x = Risk, fill = Risk)) +
  geom_bar() +
  labs(title = "Distribution of Risk Classes")

1.Age

# Histogram of applicant age in 4-year bins.
ggplot(dataall, aes(Age)) +
  geom_histogram(binwidth = 4, colour = "black", fill = "lightgreen") +
  labs(x = "Age", y = "Frequency", title = "Age Information")

2.Gender

# Bar chart of record counts by sex, with capitalized legend labels.
ggplot(dataall, aes(Sex)) +
  geom_bar(aes(fill = as.factor(Sex))) +
  scale_fill_discrete(name = "Sex", labels = c("Female", "Male")) +
  labs(x = "Sex", y = "Frequency", title = "Sex Information")

3.Job

数值越大 技能性越高

# Bar chart of record counts per job code (0-3); Job is numeric, so it is
# converted to a factor for the fill colour.
ggplot(dataall, aes(Job)) +
  geom_bar(aes(fill = as.factor(Job))) +
  scale_fill_discrete(name = "Job", labels = c("0", "1", "2", "3")) +
  labs(x = "Job", y = "Frequency", title = "Job Information")

4.Housing

# Bar chart of record counts per housing type.
ggplot(dataall, aes(Housing)) +
  geom_bar(aes(fill = as.factor(Housing))) +
  scale_fill_discrete(name = "Housing", labels = c("Free", "Own", "Rent")) +
  labs(x = "Housing", y = "Frequency", title = "Housing Information")

5.Saving accounts

# Bar chart of record counts per savings-account level.
# `dataall` is the imputed data at this point, so the column has exactly four
# levels and no missing values; the legend therefore gets four labels. The
# former fifth "NA" label had no matching level.
ggplot(dataall, aes(Saving.accounts)) +
  geom_bar(aes(fill = as.factor(Saving.accounts))) +
  scale_fill_discrete(
    name = "Saving accounts",
    labels = c("Little", "Moderate", "Quite Rich", "Rich")
  ) +
  labs(
    x = "Saving accounts",
    y = "Frequency",
    title = "Saving Accounts Information"
  )

6.Checking Account

# Bar chart of record counts per checking-account level.
# `dataall` is the imputed data at this point, so the column has exactly three
# levels and no missing values; the legend therefore gets three labels. The
# former fourth "NA" label had no matching level.
ggplot(dataall, aes(Checking.account)) +
  geom_bar(aes(fill = as.factor(Checking.account))) +
  scale_fill_discrete(
    name = "Checking accounts",
    labels = c("Little", "Moderate", "Rich")
  ) +
  labs(
    x = "Checking accounts",
    y = "Frequency",
    title = "Checking Accounts Information"
  )

7.Credit amount

# Histogram of the credit amount in bins of 1000.
ggplot(dataall, aes(Credit.amount)) +
  geom_histogram(binwidth = 1000, colour = "black", fill = "lightblue") +
  labs(x = "Credit amount", y = "Frequency", title = "Credit amount")

8.Duration

# Histogram of loan duration in 4-month bins.
ggplot(dataall, aes(Duration)) +
  geom_histogram(binwidth = 4, colour = "black", fill = "lightyellow") +
  labs(x = "Duration in Months", y = "Frequency", title = "Duration")

9.Purpose

# Bar chart of record counts per loan purpose, with capitalized legend labels
# given in the order of the factor levels.
ggplot(dataall, aes(Purpose)) +
  geom_bar(aes(fill = as.factor(Purpose))) +
  scale_fill_discrete(
    name = "Purpose of Loan",
    labels = c(
      "Business", "Car", "Domestic Appliances", "Education",
      "Furniture/Equipment", "Radio/TV", "Repairs", "Vacation/Others"
    )
  ) +
  labs(x = "Purpose of Loan", y = "Frequency", title = "Plot of Loan Purpose")

简单联系分析

年龄&RISK

# Three views of Age split by Risk (density, stacked bars, box plot),
# stacked vertically in a single figure.
p1 <- ggplot(dataall, aes(x = Age, fill = Risk)) +
  geom_density(alpha = 0.5) +
  ggtitle("Distribution of Age by Risk") +
  xlab("Age") +
  ylab("Density")
p2 <- ggplot(dataall, aes(x = Age, fill = Risk)) +
  geom_bar() +
  labs(title = "Distribution of Age & Risk")
p3 <- ggplot(dataall, aes(x = Age, fill = Risk)) +
  geom_boxplot() +
  labs(title = "Distribution of Age & Risk")
ggarrange(p1, p2, p3, ncol = 1, nrow = 3)

性别&RISK

# Stacked bar chart of Risk within each sex.
ggplot(dataall, aes(x = Sex, fill = Risk)) +
  geom_bar() +
  labs(title = "Distribution of Sex & Risk")

Job & RISK

# Stacked bar chart of Risk within each job code.
ggplot(dataall, aes(x = Job, fill = Risk)) +
  geom_bar() +
  labs(title = "Distribution of Job & Risk")

Housing & Risk

# Stacked bar chart of Risk within each housing type.
ggplot(dataall, aes(x = Housing, fill = Risk)) +
  geom_bar() +
  labs(title = "Distribution of Housing & Risk")

Saving accounts &Risk

# Stacked bar chart of Risk within each savings-account level.
ggplot(dataall, aes(x = Saving.accounts, fill = Risk)) +
  geom_bar() +
  labs(title = "Distribution of Saving accounts & Risk")

Checking accounts & Risk

# Stacked bar chart of checking-account levels within each Risk class
# (here Risk is on the x axis and the account level is the fill).
ggplot(dataall, aes(x = Risk, fill = Checking.account)) +
  geom_bar() +
  labs(title = "Distribution of Checking accounts & Risk")

Credit amount & Risk

# Overlaid density curves of the credit amount for each Risk class.
ggplot(dataall, aes(x = Credit.amount, fill = Risk)) +
  geom_density(alpha = 0.5) +
  ggtitle("Distribution of Credit Amount by Risk") +
  xlab("Credit Amount") +
  ylab("Density")

# Box plot of the credit amount for each Risk class.
# The x axis is Risk and the y axis is the credit amount, so the axis labels
# say exactly that (they previously read "Credit Amount" / "Density").
ggplot(dataall, aes(x = Risk, y = Credit.amount, fill = Risk)) +
  geom_boxplot() +
  ggtitle("Distribution of Credit Amount by Risk") +
  xlab("Risk") +
  ylab("Credit Amount")

Duration & Risk

# Overlaid density curves of the loan duration for each Risk class.
ggplot(dataall, aes(x = Duration, fill = Risk)) +
  geom_density(alpha = 0.5) +
  ggtitle("Distribution of Duration by Risk") +
  xlab("Duration") +
  ylab("Density")

Purpose & Risk

# Side-by-side bars of loan purpose within each Risk class, drawn horizontally.
ggplot(dataall, aes(x = Risk, fill = Purpose)) +
  geom_bar(position = "dodge2") +
  coord_flip()

数据数值分析

热力图

# Correlation heat map: expand every column into a numeric model matrix
# (no intercept), compute pairwise correlations, and plot them with labels.
# `TRUE` is spelled out because `T` is an ordinary variable that can be reassigned.
model.matrix(~ 0 + ., data = dataall) %>%
  cor(use = "pairwise.complete.obs") %>%
  ggcorrplot(show.diag = TRUE, lab = TRUE, lab_size = 1)

算法建模

1.Logistic regression model

训练集 测试集

# Reproducible 70/30 split: sample 70% of the row indices for training and
# keep the remaining rows for testing.
set.seed(127)
slidata <- sample(nrow(dataall), 0.7 * nrow(dataall))
train <- dataall[slidata, ]
test <- dataall[-slidata, ]

Logistic

# Logistic regression of Risk on every other column of the training set.
# The response is referenced by column name so it is resolved inside `data`
# rather than tied to the global `train` object; Risk was already converted to
# a factor earlier, so no as.factor() wrapper is needed.
logistic1 <- glm(Risk ~ ., family = binomial, data = train)
summary(logistic1)
## 
## Call:
## glm(formula = as.factor(train$Risk) ~ ., family = binomial, data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4199  -1.0179   0.5972   0.8087   1.7241  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -1.189e-01  7.217e-01  -0.165  0.86910    
## Age                         1.949e-02  9.216e-03   2.115  0.03446 *  
## Sexmale                     4.245e-01  2.010e-01   2.112  0.03472 *  
## Job                         6.090e-02  1.455e-01   0.418  0.67559    
## Housingown                  4.141e-01  3.179e-01   1.302  0.19275    
## Housingrent                 7.384e-02  3.804e-01   0.194  0.84611    
## Saving.accountsmoderate     2.646e-01  2.991e-01   0.884  0.37645    
## Saving.accountsquite rich   4.323e-01  3.835e-01   1.127  0.25974    
## Saving.accountsrich         7.183e-01  4.152e-01   1.730  0.08360 .  
## Checking.accountmoderate    4.706e-01  2.001e-01   2.351  0.01871 *  
## Checking.accountrich        1.100e+00  3.602e-01   3.053  0.00226 ** 
## Credit.amount               2.971e-05  4.434e-05   0.670  0.50288    
## Duration                   -4.114e-02  9.666e-03  -4.257 2.08e-05 ***
## Purposecar                 -4.488e-02  3.393e-01  -0.132  0.89478    
## Purposedomestic appliances -5.045e-01  8.733e-01  -0.578  0.56346    
## Purposeeducation           -4.314e-01  4.663e-01  -0.925  0.35483    
## Purposefurniture/equipment  1.019e-01  3.690e-01   0.276  0.78236    
## Purposeradio/TV             3.742e-01  3.482e-01   1.075  0.28248    
## Purposerepairs             -5.664e-01  6.265e-01  -0.904  0.36589    
## Purposevacation/others     -8.336e-01  8.130e-01  -1.025  0.30520    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 837.58  on 699  degrees of freedom
## Residual deviance: 754.60  on 680  degrees of freedom
## AIC: 794.6
## 
## Number of Fisher Scoring iterations: 4

可以看p值:我们看出Saving.accountsrich,Checking.accountrich,Duration,Sexmale系数要被考虑 重新建模

# Reduced logistic regression using only the predictors singled out from the
# full model's p-values.
# The response is referenced by column name so it is resolved inside `data`
# rather than tied to the global `train` object; Risk was already converted to
# a factor earlier, so no as.factor() wrapper is needed.
logistic2 <- glm(
  Risk ~ Saving.accounts + Checking.account + Duration + Sex,
  family = binomial,
  data = train
)
summary(logistic2)
## 
## Call:
## glm(formula = as.factor(train$Risk) ~ Saving.accounts + Checking.account + 
##     Duration + Sex, family = binomial, data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3771  -1.0862   0.6173   0.8260   1.5535  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                0.905497   0.227071   3.988 6.67e-05 ***
## Saving.accountsmoderate    0.179157   0.289724   0.618 0.536331    
## Saving.accountsquite rich  0.517677   0.378815   1.367 0.171760    
## Saving.accountsrich        0.732533   0.409747   1.788 0.073813 .  
## Checking.accountmoderate   0.545914   0.190896   2.860 0.004240 ** 
## Checking.accountrich       1.231316   0.353947   3.479 0.000504 ***
## Duration                  -0.037375   0.007111  -5.256 1.47e-07 ***
## Sexmale                    0.558147   0.186524   2.992 0.002768 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 837.58  on 699  degrees of freedom
## Residual deviance: 771.08  on 692  degrees of freedom
## AIC: 787.08
## 
## Number of Fisher Scoring iterations: 4

训练集的预测情况

# Predict on the training set: response-scale predictions from the reduced
# model, labelled "good" above the 0.5 cut-off and "bad" otherwise.
prob_pred_train <- predict(logistic2, newdata = train, type = "response")
glm_pre_train <- ifelse(prob_pred_train > 0.5, "good", "bad")
summary(glm_pre_train)
##    Length     Class      Mode 
##       700 character character

混淆矩阵

# Confusion matrix for the logistic model on the training set.
# confusionMatrix() reads a table as predictions in rows and reference (actual)
# values in columns, so the predictions go first. They are also converted to a
# factor with the same levels as Risk so the table stays square even if one
# class is never predicted.
# NOTE: statistics rendered earlier with the transposed table (actual in rows)
# are not comparable; re-run this chunk to refresh them.
glm_pre_train_class <- factor(glm_pre_train, levels = levels(factor(train$Risk)))
xtable <- table(glm_pre_train_class, train$Risk)
confusionMatrix(xtable)
## Confusion Matrix and Statistics
## 
##       glm_pre_train
##        bad good
##   bad   31  169
##   good  17  483
##                                           
##                Accuracy : 0.7343          
##                  95% CI : (0.6999, 0.7667)
##     No Information Rate : 0.9314          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1567          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.64583         
##             Specificity : 0.74080         
##          Pos Pred Value : 0.15500         
##          Neg Pred Value : 0.96600         
##              Prevalence : 0.06857         
##          Detection Rate : 0.04429         
##    Detection Prevalence : 0.28571         
##       Balanced Accuracy : 0.69332         
##                                           
##        'Positive' Class : bad             
## 

ROC图分析

# ROC curve for the logistic model on the training set (actual Risk vs. predicted score).
roc_train <- roc(train$Risk, prob_pred_train)
## Setting levels: control = bad, case = good
## Setting direction: controls < cases
# Plot the training ROC curve with the AUC printed, the AUC area shaded and
# the best threshold marked.
plot(
  roc_train,
  print.auc = TRUE,
  auc.polygon = TRUE,
  grid = c(0.1, 0.2),
  grid.col = c("green", "red"),
  max.auc.polygon = TRUE,
  auc.polygon.col = "lightblue",
  print.thres = "best"
)

测试集的预测结果

# Predict on the test set: response-scale predictions from the reduced model,
# labelled "good" above the 0.5 cut-off and "bad" otherwise.
prob_pred_test <- predict(logistic2, newdata = test, type = "response")
glm_pre_test <- ifelse(prob_pred_test > 0.5, "good", "bad")
summary(glm_pre_test)
##    Length     Class      Mode 
##       300 character character

混淆矩阵

# Confusion matrix for the logistic model on the test set.
# confusionMatrix() reads a table as predictions in rows and reference (actual)
# values in columns, so the predictions go first. They are also converted to a
# factor with the same levels as Risk so the table stays square even if one
# class is never predicted.
# NOTE: statistics rendered earlier with the transposed table (actual in rows)
# are not comparable; re-run this chunk to refresh them.
glm_pre_test_class <- factor(glm_pre_test, levels = levels(factor(test$Risk)))
xtable <- table(glm_pre_test_class, test$Risk)
confusionMatrix(xtable)
## Confusion Matrix and Statistics
## 
##       glm_pre_test
##        bad good
##   bad   12   88
##   good   2  198
##                                           
##                Accuracy : 0.7             
##                  95% CI : (0.6447, 0.7513)
##     No Information Rate : 0.9533          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1401          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.85714         
##             Specificity : 0.69231         
##          Pos Pred Value : 0.12000         
##          Neg Pred Value : 0.99000         
##              Prevalence : 0.04667         
##          Detection Rate : 0.04000         
##    Detection Prevalence : 0.33333         
##       Balanced Accuracy : 0.77473         
##                                           
##        'Positive' Class : bad             
## 

ROC图分析

# ROC curve for the logistic model on the test set (actual Risk vs. predicted score).
roc_test <- roc(test$Risk, prob_pred_test)
## Setting levels: control = bad, case = good
## Setting direction: controls < cases
# Plot the test ROC curve with the AUC printed, the AUC area shaded and the
# best threshold marked.
plot(
  roc_test,
  print.auc = TRUE,
  auc.polygon = TRUE,
  grid = c(0.1, 0.2),
  grid.col = c("green", "red"),
  max.auc.polygon = TRUE,
  auc.polygon.col = "lightblue",
  print.thres = "best"
)

2.Tree Models

# Classification tree of Risk on every other column of the training set.
# The response is referenced by column name so it is resolved inside `data`
# rather than tied to the global `train` object; Risk was already converted to
# a factor earlier, so no as.factor() wrapper is needed.
Tree_model <- rpart(Risk ~ ., data = train)
fancyRpartPlot(Tree_model)

防止过拟合 我们需要剪枝操作

# Complexity-parameter table of the fitted tree (CP, number of splits, and
# relative / cross-validated error), used to choose a pruning level.
Tree_model$cptable
##        CP nsplit rel error xerror       xstd
## 1 0.03750      0     1.000  1.000 0.05976143
## 2 0.02500      2     0.925  1.035 0.06037117
## 3 0.01750      3     0.900  1.040 0.06045541
## 4 0.01500      7     0.825  1.030 0.06028622
## 5 0.01125     11     0.755  1.070 0.06094611
## 6 0.01000     16     0.695  1.080 0.06110413
plotcp(Tree_model)# plot of cross-validated error against the complexity parameter

# Prune the tree at cp = 0.014, a value lying between the CP entries listed
# for 7 and 11 splits in the cptable, then draw the pruned tree.
Tree_model_1 <- prune(Tree_model, cp = 0.014)
fancyRpartPlot(Tree_model_1)

来检测预测效果 训练集

# Evaluate the pruned tree on the training set: class predictions, the
# second-column class probability (kept for the ROC curve), and a confusion
# matrix with predictions in rows and actual Risk in columns.
Tree_pre_train <- predict(Tree_model_1, newdata = train, type = "class")
Tree_prob_train <- predict(Tree_model_1, newdata = train, type = "prob")[, 2]
xtable <- table(Tree_pre_train, train$Risk)
confusionMatrix(xtable)
## Confusion Matrix and Statistics
## 
##               
## Tree_pre_train bad good
##           bad   70   21
##           good 130  479
##                                           
##                Accuracy : 0.7843          
##                  95% CI : (0.7519, 0.8142)
##     No Information Rate : 0.7143          
##     P-Value [Acc > NIR] : 1.554e-05       
##                                           
##                   Kappa : 0.3682          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.3500          
##             Specificity : 0.9580          
##          Pos Pred Value : 0.7692          
##          Neg Pred Value : 0.7865          
##              Prevalence : 0.2857          
##          Detection Rate : 0.1000          
##    Detection Prevalence : 0.1300          
##       Balanced Accuracy : 0.6540          
##                                           
##        'Positive' Class : bad             
## 
# ROC curve for the pruned tree on the training set, from class probabilities.
Tree_roc_train <- roc(train$Risk, Tree_prob_train)
## Setting levels: control = bad, case = good
## Setting direction: controls < cases
# Plot the tree's training ROC curve with the AUC printed, the AUC area shaded
# and the best threshold marked.
plot(
  Tree_roc_train,
  print.auc = TRUE,
  auc.polygon = TRUE,
  grid = c(0.1, 0.2),
  grid.col = c("green", "red"),
  max.auc.polygon = TRUE,
  auc.polygon.col = "lightblue",
  print.thres = "best"
)

测试集

# Evaluate the pruned tree on the test set: class predictions and a confusion
# matrix with predictions in rows and actual Risk in columns.
Tree_pre_test <- predict(Tree_model_1, newdata = test, type = "class")
xtable <- table(Tree_pre_test, test$Risk)
confusionMatrix(xtable)
## Confusion Matrix and Statistics
## 
##              
## Tree_pre_test bad good
##          bad   11   14
##          good  89  186
##                                           
##                Accuracy : 0.6567          
##                  95% CI : (0.5999, 0.7103)
##     No Information Rate : 0.6667          
##     P-Value [Acc > NIR] : 0.6679          
##                                           
##                   Kappa : 0.0492          
##                                           
##  Mcnemar's Test P-Value : 3.067e-13       
##                                           
##             Sensitivity : 0.11000         
##             Specificity : 0.93000         
##          Pos Pred Value : 0.44000         
##          Neg Pred Value : 0.67636         
##              Prevalence : 0.33333         
##          Detection Rate : 0.03667         
##    Detection Prevalence : 0.08333         
##       Balanced Accuracy : 0.52000         
##                                           
##        'Positive' Class : bad             
## 
# ROC curve for the pruned tree on the test set.
# The curve is built from the predicted class probabilities (second column, the
# same column the training ROC uses) instead of the hard class labels: labels
# only provide a single operating point, so the curve and AUC would not
# reflect the model's ranking ability.
# `roc_test` is reassigned here, replacing the logistic model's test ROC object.
Tree_prob_test <- predict(Tree_model_1, newdata = test, type = "prob")[, 2]
roc_test <- roc(test$Risk, Tree_prob_test)
## Setting levels: control = bad, case = good
## Setting direction: controls < cases
# Plot the tree's test ROC curve with the AUC printed, the AUC area shaded and
# the best threshold marked.
plot(
  roc_test,
  print.auc = TRUE,
  auc.polygon = TRUE,
  grid = c(0.1, 0.2),
  grid.col = c("green", "red"),
  max.auc.polygon = TRUE,
  auc.polygon.col = "lightblue",
  print.thres = "best"
)

3.随机森林

https://cloud.tencent.com/developer/article/1870681

# Random forest of Risk on every other column of the training set, with
# variable importance recorded.
# Rows with missing values are dropped first. The response is referenced by
# column name so it is resolved inside `data`; Risk was already converted to a
# factor earlier, so the forest is fitted as a classifier without as.factor().
train.data.forest <- na.omit(train)
train.forest <- randomForest(
  Risk ~ .,
  data = train.data.forest,
  importance = TRUE
)
train.forest
## 
## Call:
##  randomForest(formula = as.factor(train.data.forest$Risk) ~ .,      data = train.data.forest, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 27.43%
## Confusion matrix:
##      bad good class.error
## bad   55  145       0.725
## good  47  453       0.094

查看重要性参数

# Pull the importance matrix out of the fitted forest and preview its first rows.
importance.forest <- train.forest[["importance"]]
head(importance.forest)
##                           bad          good MeanDecreaseAccuracy
## Age               0.005758647  0.0057862485         0.0057903076
## Sex              -0.006800140 -0.0026523966        -0.0038236225
## Job               0.001222651  0.0005228298         0.0007067322
## Housing          -0.002377569  0.0048924327         0.0028245588
## Saving.accounts   0.003816116 -0.0040903448        -0.0018307483
## Checking.account  0.017223317  0.0025708484         0.0067008988
##                  MeanDecreaseGini
## Age                     56.765824
## Sex                      7.805841
## Job                     15.715161
## Housing                 12.777167
## Saving.accounts         14.017268
## Checking.account        16.941745
# Variable-importance plot showing at most 15 variables.
varImpPlot(
  train.forest,
  n.var = min(15, nrow(train.forest$importance)),
  main = "Top  variable importance"
)

筛选掉不重要的一些参数 训练集

# Keep the four most important predictors (plus Risk) for a reduced forest.
# The importance matrix is listed in data-column order, so taking rows 1:4
# would just pick the first four columns; rank by MeanDecreaseAccuracy first.
importance_rank <- order(
  importance.forest[, "MeanDecreaseAccuracy"],
  decreasing = TRUE
)
select <- head(rownames(importance.forest)[importance_rank], 4)
test.data.forest <- na.omit(test)
train.data.forest.4 <- train.data.forest[, c(select, "Risk")]
test.data.forest.4 <- test.data.forest[, c(select, "Risk")]

# Reduced random forest on the selected predictors, with a fixed seed.
# The response is referenced by column name so it is resolved inside `data`;
# Risk was already converted to a factor earlier, so no as.factor() is needed.
set.seed(123)
train.forest_4 <- randomForest(
  Risk ~ .,
  data = train.data.forest.4,
  importance = TRUE
)
train.forest_4
## 
## Call:
##  randomForest(formula = as.factor(train.data.forest.4$Risk) ~      ., data = train.data.forest.4, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 31.71%
## Confusion matrix:
##      bad good class.error
## bad   22  178       0.890
## good  44  456       0.088
#plot(margin(train.forest_4, train.data.forest.4$Risk))
 
# Score the training rows with the reduced forest. Because newdata is
# supplied these are not out-of-bag predictions, so the figures are more
# optimistic than the OOB estimate printed with the model.
train_predict.forest <- predict(
  object = train.forest_4,
  newdata = train.data.forest.4
)
# Rows hold the predicted class, columns the observed class.
xtable <- table(train_predict.forest, train.data.forest.4$Risk)
confusionMatrix(data = xtable)
## Confusion Matrix and Statistics
## 
##                     
## train_predict.forest bad good
##                 bad   56    9
##                 good 144  491
##                                           
##                Accuracy : 0.7814          
##                  95% CI : (0.7489, 0.8115)
##     No Information Rate : 0.7143          
##     P-Value [Acc > NIR] : 3.328e-05       
##                                           
##                   Kappa : 0.3285          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.28000         
##             Specificity : 0.98200         
##          Pos Pred Value : 0.86154         
##          Neg Pred Value : 0.77323         
##              Prevalence : 0.28571         
##          Detection Rate : 0.08000         
##    Detection Prevalence : 0.09286         
##       Balanced Accuracy : 0.63100         
##                                           
##        'Positive' Class : bad             
## 

测试集

# Score the held-out test rows with the reduced forest.
test_predict.forest <- predict(train.forest_4, newdata = test.data.forest.4)
# confusionMatrix() on a table reads rows as predictions and columns as the
# reference, so the predicted classes must be the first argument (same
# layout as the training-set table). With the arguments the other way round
# sensitivity, specificity, prevalence and the no-information rate are
# computed against the predictions instead of the true labels.
# NOTE: the statistics printed below this chunk came from the transposed
# table (observed classes in rows); regenerate them after this change.
xtable <- table(test_predict.forest, test.data.forest.4$Risk)
confusionMatrix(xtable)
## Confusion Matrix and Statistics
## 
##       test_predict.forest
##        bad good
##   bad   10   90
##   good  15  185
##                                           
##                Accuracy : 0.65            
##                  95% CI : (0.5931, 0.7039)
##     No Information Rate : 0.9167          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0308          
##                                           
##  Mcnemar's Test P-Value : 5.136e-13       
##                                           
##             Sensitivity : 0.40000         
##             Specificity : 0.67273         
##          Pos Pred Value : 0.10000         
##          Neg Pred Value : 0.92500         
##              Prevalence : 0.08333         
##          Detection Rate : 0.03333         
##    Detection Prevalence : 0.33333         
##       Balanced Accuracy : 0.53636         
##                                           
##        'Positive' Class : bad             
## 

4.AdaBoosting

library(adabag)
## Loading required package: foreach
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
# Prepare complete-case training and test sets for AdaBoost.
train.data.adabag <- na.omit(train)
test.data.adabag <- na.omit(test)
# The response is converted to a factor before fitting the classifier.
train.data.adabag$Risk <- factor(train.data.adabag$Risk)

# Fix the seed so the resampling done while boosting is repeatable, in line
# with the random-forest fits that are seeded the same way.
set.seed(123)

# Test error for ensembles of 1..20 base classifiers. The vector is
# preallocated rather than grown inside the loop.
error <- numeric(20)
for (i in seq_along(error)) {
  train.adaboost <- boosting(Risk ~ ., data = train.data.adabag, mfinal = i)
  # Dispatch through the predict() generic instead of calling the method
  # by its full name.
  test.adaboost.pred <- predict(train.adaboost, newdata = test.data.adabag)
  error[i] <- test.adaboost.pred$error
}
# One-column data frame (column name "error") for plotting.
error <- as.data.frame(error)

# After the loop the model and prediction objects belong to the largest
# ensemble (mfinal = 20); report its confusion matrix on the test set.
xtable <- test.adaboost.pred$confusion
confusionMatrix(xtable)
## Confusion Matrix and Statistics
## 
##                Observed Class
## Predicted Class bad good
##            bad   31   39
##            good  69  161
##                                           
##                Accuracy : 0.64            
##                  95% CI : (0.5828, 0.6944)
##     No Information Rate : 0.6667          
##     P-Value [Acc > NIR] : 0.850963        
##                                           
##                   Kappa : 0.1243          
##                                           
##  Mcnemar's Test P-Value : 0.005262        
##                                           
##             Sensitivity : 0.3100          
##             Specificity : 0.8050          
##          Pos Pred Value : 0.4429          
##          Neg Pred Value : 0.7000          
##              Prevalence : 0.3333          
##          Detection Rate : 0.1033          
##    Detection Prevalence : 0.2333          
##       Balanced Accuracy : 0.5575          
##                                           
##        'Positive' Class : bad             
## 
# Test error against the number of base classifiers. Inside aes() the name
# `error` refers to the data frame's column, so seq_along(error) yields one
# x position per fitted ensemble and stays in sync with the number of rows
# instead of relying on a hard-coded 1:20.
# NOTE(review): on ggplot2 >= 3.4.0 line width is normally set with
# `linewidth`; `size` is kept so the call also works on older versions.
ggplot(error, aes(x = seq_along(error), y = error)) +
  geom_line(colour = "red", linetype = "dashed", size = 1) +
  geom_point(size = 3, shape = 18) +
  ylim(0, 1) +
  xlab("the number of basic classifiers")