Data set parameter types and the first few rows of the data set
dataall<-read.csv('german_credit_data.csv')
head(dataall)
## Age Sex Job Housing Saving.accounts Checking.account Credit.amount
## 1 67 male 2 own <NA> little 1169
## 2 22 female 2 own little moderate 5951
## 3 49 male 1 own little <NA> 2096
## 4 45 male 2 free little little 7882
## 5 53 male 2 free little little 4870
## 6 35 male 1 free <NA> <NA> 9055
## Duration Purpose Risk
## 1 6 radio/TV good
## 2 48 radio/TV bad
## 3 12 education good
## 4 42 furniture/equipment good
## 5 24 car bad
## 6 36 education good
数据集信息
summary(dataall)
## Age Sex Job Housing
## Min. :19.00 Length:1000 Min. :0.000 Length:1000
## 1st Qu.:27.00 Class :character 1st Qu.:2.000 Class :character
## Median :33.00 Mode :character Median :2.000 Mode :character
## Mean :35.55 Mean :1.904
## 3rd Qu.:42.00 3rd Qu.:2.000
## Max. :75.00 Max. :3.000
## Saving.accounts Checking.account Credit.amount Duration
## Length:1000 Length:1000 Min. : 250 Min. : 4.0
## Class :character Class :character 1st Qu.: 1366 1st Qu.:12.0
## Mode :character Mode :character Median : 2320 Median :18.0
## Mean : 3271 Mean :20.9
## 3rd Qu.: 3972 3rd Qu.:24.0
## Max. :18424 Max. :72.0
## Purpose Risk
## Length:1000 Length:1000
## Class :character Class :character
## Mode :character Mode :character
##
##
##
na值的查看 红色表示缺失
#newdata = mice(dataall, method='polr',seed = 100)
#newdata = complete
newdata<-dataall
newdata$Housing<-as.factor(newdata$Housing)
newdata$Sex<-as.factor(newdata$Sex)
newdata$Saving.accounts<-as.factor(newdata$Saving.accounts)
newdata$Checking.account<-as.factor(newdata$Checking.account)
newdata$Purpose<-as.factor(newdata$Purpose)
newdata$Risk<-as.factor(newdata$Risk)
head(newdata)
## Age Sex Job Housing Saving.accounts Checking.account Credit.amount
## 1 67 male 2 own <NA> little 1169
## 2 22 female 2 own little moderate 5951
## 3 49 male 1 own little <NA> 2096
## 4 45 male 2 free little little 7882
## 5 53 male 2 free little little 4870
## 6 35 male 1 free <NA> <NA> 9055
## Duration Purpose Risk
## 1 6 radio/TV good
## 2 48 radio/TV bad
## 3 12 education good
## 4 42 furniture/equipment good
## 5 24 car bad
## 6 36 education good
md.pattern(newdata)
## Age Sex Job Housing Credit.amount Duration Purpose Risk Saving.accounts
## 522 1 1 1 1 1 1 1 1 1
## 295 1 1 1 1 1 1 1 1 1
## 84 1 1 1 1 1 1 1 1 0
## 99 1 1 1 1 1 1 1 1 0
## 0 0 0 0 0 0 0 0 183
## Checking.account
## 522 1 0
## 295 0 1
## 84 1 1
## 99 0 2
## 394 577
matrixplot(newdata)
##
## Click in a column to sort by the corresponding variable.
## To regain use of the VIM GUI and the R console, click outside the plot region.
判断SAVING和CHECKING是否有关系
marginplot(newdata[c(5,6)])
marginmatrix(newdata)
数据集填充完毕!
datanew<-mice(newdata,maxit=50,seed=500)
##
## iter imp variable
## 1 1 Saving.accounts Checking.account
## 1 2 Saving.accounts Checking.account
## 1 3 Saving.accounts Checking.account
## 1 4 Saving.accounts Checking.account
## 1 5 Saving.accounts Checking.account
## 2 1 Saving.accounts Checking.account
## 2 2 Saving.accounts Checking.account
## 2 3 Saving.accounts Checking.account
## 2 4 Saving.accounts Checking.account
## 2 5 Saving.accounts Checking.account
## 3 1 Saving.accounts Checking.account
## 3 2 Saving.accounts Checking.account
## 3 3 Saving.accounts Checking.account
## 3 4 Saving.accounts Checking.account
## 3 5 Saving.accounts Checking.account
## 4 1 Saving.accounts Checking.account
## 4 2 Saving.accounts Checking.account
## 4 3 Saving.accounts Checking.account
## 4 4 Saving.accounts Checking.account
## 4 5 Saving.accounts Checking.account
## 5 1 Saving.accounts Checking.account
## 5 2 Saving.accounts Checking.account
## 5 3 Saving.accounts Checking.account
## 5 4 Saving.accounts Checking.account
## 5 5 Saving.accounts Checking.account
## 6 1 Saving.accounts Checking.account
## 6 2 Saving.accounts Checking.account
## 6 3 Saving.accounts Checking.account
## 6 4 Saving.accounts Checking.account
## 6 5 Saving.accounts Checking.account
## 7 1 Saving.accounts Checking.account
## 7 2 Saving.accounts Checking.account
## 7 3 Saving.accounts Checking.account
## 7 4 Saving.accounts Checking.account
## 7 5 Saving.accounts Checking.account
## 8 1 Saving.accounts Checking.account
## 8 2 Saving.accounts Checking.account
## 8 3 Saving.accounts Checking.account
## 8 4 Saving.accounts Checking.account
## 8 5 Saving.accounts Checking.account
## 9 1 Saving.accounts Checking.account
## 9 2 Saving.accounts Checking.account
## 9 3 Saving.accounts Checking.account
## 9 4 Saving.accounts Checking.account
## 9 5 Saving.accounts Checking.account
## 10 1 Saving.accounts Checking.account
## 10 2 Saving.accounts Checking.account
## 10 3 Saving.accounts Checking.account
## 10 4 Saving.accounts Checking.account
## 10 5 Saving.accounts Checking.account
## 11 1 Saving.accounts Checking.account
## 11 2 Saving.accounts Checking.account
## 11 3 Saving.accounts Checking.account
## 11 4 Saving.accounts Checking.account
## 11 5 Saving.accounts Checking.account
## 12 1 Saving.accounts Checking.account
## 12 2 Saving.accounts Checking.account
## 12 3 Saving.accounts Checking.account
## 12 4 Saving.accounts Checking.account
## 12 5 Saving.accounts Checking.account
## 13 1 Saving.accounts Checking.account
## 13 2 Saving.accounts Checking.account
## 13 3 Saving.accounts Checking.account
## 13 4 Saving.accounts Checking.account
## 13 5 Saving.accounts Checking.account
## 14 1 Saving.accounts Checking.account
## 14 2 Saving.accounts Checking.account
## 14 3 Saving.accounts Checking.account
## 14 4 Saving.accounts Checking.account
## 14 5 Saving.accounts Checking.account
## 15 1 Saving.accounts Checking.account
## 15 2 Saving.accounts Checking.account
## 15 3 Saving.accounts Checking.account
## 15 4 Saving.accounts Checking.account
## 15 5 Saving.accounts Checking.account
## 16 1 Saving.accounts Checking.account
## 16 2 Saving.accounts Checking.account
## 16 3 Saving.accounts Checking.account
## 16 4 Saving.accounts Checking.account
## 16 5 Saving.accounts Checking.account
## 17 1 Saving.accounts Checking.account
## 17 2 Saving.accounts Checking.account
## 17 3 Saving.accounts Checking.account
## 17 4 Saving.accounts Checking.account
## 17 5 Saving.accounts Checking.account
## 18 1 Saving.accounts Checking.account
## 18 2 Saving.accounts Checking.account
## 18 3 Saving.accounts Checking.account
## 18 4 Saving.accounts Checking.account
## 18 5 Saving.accounts Checking.account
## 19 1 Saving.accounts Checking.account
## 19 2 Saving.accounts Checking.account
## 19 3 Saving.accounts Checking.account
## 19 4 Saving.accounts Checking.account
## 19 5 Saving.accounts Checking.account
## 20 1 Saving.accounts Checking.account
## 20 2 Saving.accounts Checking.account
## 20 3 Saving.accounts Checking.account
## 20 4 Saving.accounts Checking.account
## 20 5 Saving.accounts Checking.account
## 21 1 Saving.accounts Checking.account
## 21 2 Saving.accounts Checking.account
## 21 3 Saving.accounts Checking.account
## 21 4 Saving.accounts Checking.account
## 21 5 Saving.accounts Checking.account
## 22 1 Saving.accounts Checking.account
## 22 2 Saving.accounts Checking.account
## 22 3 Saving.accounts Checking.account
## 22 4 Saving.accounts Checking.account
## 22 5 Saving.accounts Checking.account
## 23 1 Saving.accounts Checking.account
## 23 2 Saving.accounts Checking.account
## 23 3 Saving.accounts Checking.account
## 23 4 Saving.accounts Checking.account
## 23 5 Saving.accounts Checking.account
## 24 1 Saving.accounts Checking.account
## 24 2 Saving.accounts Checking.account
## 24 3 Saving.accounts Checking.account
## 24 4 Saving.accounts Checking.account
## 24 5 Saving.accounts Checking.account
## 25 1 Saving.accounts Checking.account
## 25 2 Saving.accounts Checking.account
## 25 3 Saving.accounts Checking.account
## 25 4 Saving.accounts Checking.account
## 25 5 Saving.accounts Checking.account
## 26 1 Saving.accounts Checking.account
## 26 2 Saving.accounts Checking.account
## 26 3 Saving.accounts Checking.account
## 26 4 Saving.accounts Checking.account
## 26 5 Saving.accounts Checking.account
## 27 1 Saving.accounts Checking.account
## 27 2 Saving.accounts Checking.account
## 27 3 Saving.accounts Checking.account
## 27 4 Saving.accounts Checking.account
## 27 5 Saving.accounts Checking.account
## 28 1 Saving.accounts Checking.account
## 28 2 Saving.accounts Checking.account
## 28 3 Saving.accounts Checking.account
## 28 4 Saving.accounts Checking.account
## 28 5 Saving.accounts Checking.account
## 29 1 Saving.accounts Checking.account
## 29 2 Saving.accounts Checking.account
## 29 3 Saving.accounts Checking.account
## 29 4 Saving.accounts Checking.account
## 29 5 Saving.accounts Checking.account
## 30 1 Saving.accounts Checking.account
## 30 2 Saving.accounts Checking.account
## 30 3 Saving.accounts Checking.account
## 30 4 Saving.accounts Checking.account
## 30 5 Saving.accounts Checking.account
## 31 1 Saving.accounts Checking.account
## 31 2 Saving.accounts Checking.account
## 31 3 Saving.accounts Checking.account
## 31 4 Saving.accounts Checking.account
## 31 5 Saving.accounts Checking.account
## 32 1 Saving.accounts Checking.account
## 32 2 Saving.accounts Checking.account
## 32 3 Saving.accounts Checking.account
## 32 4 Saving.accounts Checking.account
## 32 5 Saving.accounts Checking.account
## 33 1 Saving.accounts Checking.account
## 33 2 Saving.accounts Checking.account
## 33 3 Saving.accounts Checking.account
## 33 4 Saving.accounts Checking.account
## 33 5 Saving.accounts Checking.account
## 34 1 Saving.accounts Checking.account
## 34 2 Saving.accounts Checking.account
## 34 3 Saving.accounts Checking.account
## 34 4 Saving.accounts Checking.account
## 34 5 Saving.accounts Checking.account
## 35 1 Saving.accounts Checking.account
## 35 2 Saving.accounts Checking.account
## 35 3 Saving.accounts Checking.account
## 35 4 Saving.accounts Checking.account
## 35 5 Saving.accounts Checking.account
## 36 1 Saving.accounts Checking.account
## 36 2 Saving.accounts Checking.account
## 36 3 Saving.accounts Checking.account
## 36 4 Saving.accounts Checking.account
## 36 5 Saving.accounts Checking.account
## 37 1 Saving.accounts Checking.account
## 37 2 Saving.accounts Checking.account
## 37 3 Saving.accounts Checking.account
## 37 4 Saving.accounts Checking.account
## 37 5 Saving.accounts Checking.account
## 38 1 Saving.accounts Checking.account
## 38 2 Saving.accounts Checking.account
## 38 3 Saving.accounts Checking.account
## 38 4 Saving.accounts Checking.account
## 38 5 Saving.accounts Checking.account
## 39 1 Saving.accounts Checking.account
## 39 2 Saving.accounts Checking.account
## 39 3 Saving.accounts Checking.account
## 39 4 Saving.accounts Checking.account
## 39 5 Saving.accounts Checking.account
## 40 1 Saving.accounts Checking.account
## 40 2 Saving.accounts Checking.account
## 40 3 Saving.accounts Checking.account
## 40 4 Saving.accounts Checking.account
## 40 5 Saving.accounts Checking.account
## 41 1 Saving.accounts Checking.account
## 41 2 Saving.accounts Checking.account
## 41 3 Saving.accounts Checking.account
## 41 4 Saving.accounts Checking.account
## 41 5 Saving.accounts Checking.account
## 42 1 Saving.accounts Checking.account
## 42 2 Saving.accounts Checking.account
## 42 3 Saving.accounts Checking.account
## 42 4 Saving.accounts Checking.account
## 42 5 Saving.accounts Checking.account
## 43 1 Saving.accounts Checking.account
## 43 2 Saving.accounts Checking.account
## 43 3 Saving.accounts Checking.account
## 43 4 Saving.accounts Checking.account
## 43 5 Saving.accounts Checking.account
## 44 1 Saving.accounts Checking.account
## 44 2 Saving.accounts Checking.account
## 44 3 Saving.accounts Checking.account
## 44 4 Saving.accounts Checking.account
## 44 5 Saving.accounts Checking.account
## 45 1 Saving.accounts Checking.account
## 45 2 Saving.accounts Checking.account
## 45 3 Saving.accounts Checking.account
## 45 4 Saving.accounts Checking.account
## 45 5 Saving.accounts Checking.account
## 46 1 Saving.accounts Checking.account
## 46 2 Saving.accounts Checking.account
## 46 3 Saving.accounts Checking.account
## 46 4 Saving.accounts Checking.account
## 46 5 Saving.accounts Checking.account
## 47 1 Saving.accounts Checking.account
## 47 2 Saving.accounts Checking.account
## 47 3 Saving.accounts Checking.account
## 47 4 Saving.accounts Checking.account
## 47 5 Saving.accounts Checking.account
## 48 1 Saving.accounts Checking.account
## 48 2 Saving.accounts Checking.account
## 48 3 Saving.accounts Checking.account
## 48 4 Saving.accounts Checking.account
## 48 5 Saving.accounts Checking.account
## 49 1 Saving.accounts Checking.account
## 49 2 Saving.accounts Checking.account
## 49 3 Saving.accounts Checking.account
## 49 4 Saving.accounts Checking.account
## 49 5 Saving.accounts Checking.account
## 50 1 Saving.accounts Checking.account
## 50 2 Saving.accounts Checking.account
## 50 3 Saving.accounts Checking.account
## 50 4 Saving.accounts Checking.account
## 50 5 Saving.accounts Checking.account
final.data <- complete(datanew)
md.pattern(final.data)
## /\ /\
## { `---' }
## { O O }
## ==> V <== No need for mice. This data set is completely observed.
## \ \|/ /
## `-----'
## Age Sex Job Housing Saving.accounts Checking.account Credit.amount
## 1000 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0
## Duration Purpose Risk
## 1000 1 1 1 0
## 0 0 0 0
matrixplot(final.data)
##
## Click in a column to sort by the corresponding variable.
## To regain use of the VIM GUI and the R console, click outside the plot region.
dataall<-final.data
Basic Information Statistics Risk Information
ggplot(dataall, aes(x=Risk, fill=Risk))+geom_bar()+labs(title='Distribution of Risk Classes')
1.Age
ggplot(dataall, aes(Age)) + geom_histogram(binwidth=4, colour="black", fill="lightgreen") +labs(x= "Age",y= "Frequency" , title = "Age Information")
2.Gender
ggplot(dataall, aes(Sex) ) + geom_bar(aes(fill = as.factor(Sex))) + scale_fill_discrete(name="Sex",labels=c( "Female","Male")) +labs(x= "Sex",y= "Frequency" , title = "Sex Information")
3.job
数值越大 技能性越高
ggplot(dataall, aes(Job) ) + geom_bar(aes(fill = as.factor(Job))) + scale_fill_discrete(name="Job",labels=c( "0","1","2","3")) +labs(x= "Job",y= "Frequency" , title = "Job Information")
4.Housing
ggplot(dataall, aes(Housing) ) + geom_bar(aes(fill = as.factor(Housing))) + scale_fill_discrete(name="Housing",labels=c("Free","Own", "Rent")) +labs(x= "Housing",y= "Frequency" , title = "Housing Information")
5.Saving accounts
ggplot(dataall, aes(Saving.accounts) ) + geom_bar(aes(fill = as.factor(Saving.accounts))) + scale_fill_discrete(name="Saving accounts",labels=c( "Little","Moderate", "Quite Rich", "Rich", "NA")) +labs(x= "Saving accounts",y= "Frequency" , title = "Saving Accounts Information")
6.Checking Account
ggplot(dataall, aes(Checking.account) ) + geom_bar(aes(fill = as.factor(Checking.account))) + scale_fill_discrete(name="Checking accounts",labels=c( "Little","Moderate", "Rich", "NA")) +labs(x= "Checking accounts",y= "Frequency" , title = "Checking Accounts Information")
7.Credit amount
ggplot(dataall, aes(Credit.amount)) + geom_histogram(binwidth=1000, colour="black", fill="lightblue") +labs(x= "Credit amount",y= "Frequency" , title = "Credit amount")
8.Duration
ggplot(dataall, aes(Duration)) + geom_histogram(binwidth=4, colour="black", fill="lightyellow") +labs(x= "Duration in Months",y= "Frequency" , title = "Duration")
9.Purpose
ggplot(dataall, aes(Purpose) ) + geom_bar(aes(fill = as.factor(Purpose))) + scale_fill_discrete(name="Purpose of Loan",labels=c( "Business","Car","Domestic Appliances","Education","Furniture/Equipment","Radio/TV","Repairs","Vacation/Others")) +labs(x= "Purpose of Loan",y= "Frequency" , title = "Plot of Loan Purpose")
年龄&RISK
p1<-ggplot(dataall, aes(x=Age, fill=Risk)) +geom_density(alpha=0.5) +ggtitle('Distribution of Age by Risk') +xlab("Age") + ylab("Density")
p2<-ggplot(dataall, aes(x=Age, fill=Risk))+geom_bar()+labs(title='Distribution of Age & Risk')
p3<-ggplot(dataall, aes(x=Age, fill=Risk))+geom_boxplot()+labs(title='Distribution of Age & Risk')
ggarrange(p1,p2,p3,ncol=1,nrow=3)
性别&RISK
ggplot(dataall, aes(x=Sex, fill=Risk))+geom_bar()+labs(title='Distribution of Sex & Risk')
Job & RISK
ggplot(dataall, aes(x=Job, fill=Risk))+geom_bar()+labs(title='Distribution of Job & Risk')
Housing & Risk
ggplot(dataall, aes(x=Housing, fill=Risk))+geom_bar()+labs(title='Distribution of Housing & Risk')
Saving accounts &Risk
ggplot(dataall, aes(x=Saving.accounts, fill=Risk))+geom_bar()+labs(title='Distribution of Saving accounts & Risk')
Checking accounts & Risk
ggplot(dataall, aes(x=Risk, fill=Checking.account))+geom_bar()+labs(title='Distribution of Checking accounts & Risk')
Credit amount & Risk
ggplot(dataall, aes(x=Credit.amount, fill=Risk)) +geom_density(alpha=0.5) +ggtitle('Distribution of Credit Amount by Risk') +xlab("Credit Amount") + ylab("Density")
ggplot(dataall, aes(x=Risk,y=Credit.amount,fill=Risk)) +geom_boxplot() +ggtitle('Distribution of Credit Amount by Risk') +xlab("Credit Amount") + ylab("Density")
Duration & Risk
ggplot(dataall, aes(x=Duration, fill=Risk)) +geom_density(alpha=0.5) +ggtitle('Distribution of Duration by Risk') +xlab("Duration") + ylab("Density")
Purpose & Risk
ggplot(dataall, aes(x=Risk, fill=Purpose))+geom_bar(position='dodge2')+coord_flip()
热力图
model.matrix(~0+., data=dataall) %>% cor(use="pairwise.complete.obs") %>% ggcorrplot(show.diag = T, lab=TRUE, lab_size=1)
训练集 测试集
set.seed(127)
slidata <- sample(nrow(dataall), 0.7*nrow(dataall))
train <- dataall[slidata,]
test <- dataall[-slidata,]
Logistic
logistic1= glm(formula = as.factor(train$Risk) ~.,family = binomial,data = train)
summary(logistic1)
##
## Call:
## glm(formula = as.factor(train$Risk) ~ ., family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4199 -1.0179 0.5972 0.8087 1.7241
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.189e-01 7.217e-01 -0.165 0.86910
## Age 1.949e-02 9.216e-03 2.115 0.03446 *
## Sexmale 4.245e-01 2.010e-01 2.112 0.03472 *
## Job 6.090e-02 1.455e-01 0.418 0.67559
## Housingown 4.141e-01 3.179e-01 1.302 0.19275
## Housingrent 7.384e-02 3.804e-01 0.194 0.84611
## Saving.accountsmoderate 2.646e-01 2.991e-01 0.884 0.37645
## Saving.accountsquite rich 4.323e-01 3.835e-01 1.127 0.25974
## Saving.accountsrich 7.183e-01 4.152e-01 1.730 0.08360 .
## Checking.accountmoderate 4.706e-01 2.001e-01 2.351 0.01871 *
## Checking.accountrich 1.100e+00 3.602e-01 3.053 0.00226 **
## Credit.amount 2.971e-05 4.434e-05 0.670 0.50288
## Duration -4.114e-02 9.666e-03 -4.257 2.08e-05 ***
## Purposecar -4.488e-02 3.393e-01 -0.132 0.89478
## Purposedomestic appliances -5.045e-01 8.733e-01 -0.578 0.56346
## Purposeeducation -4.314e-01 4.663e-01 -0.925 0.35483
## Purposefurniture/equipment 1.019e-01 3.690e-01 0.276 0.78236
## Purposeradio/TV 3.742e-01 3.482e-01 1.075 0.28248
## Purposerepairs -5.664e-01 6.265e-01 -0.904 0.36589
## Purposevacation/others -8.336e-01 8.130e-01 -1.025 0.30520
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 837.58 on 699 degrees of freedom
## Residual deviance: 754.60 on 680 degrees of freedom
## AIC: 794.6
##
## Number of Fisher Scoring iterations: 4
可以看p值:我们看出Saving.accountsrich,Checking.accountrich,Duration,Sexmale系数要被考虑 重新建模
logistic2= glm(formula = as.factor(train$Risk) ~Saving.accounts+Checking.account+Duration+Sex,family = binomial,data = train)
summary(logistic2)
##
## Call:
## glm(formula = as.factor(train$Risk) ~ Saving.accounts + Checking.account +
## Duration + Sex, family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3771 -1.0862 0.6173 0.8260 1.5535
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.905497 0.227071 3.988 6.67e-05 ***
## Saving.accountsmoderate 0.179157 0.289724 0.618 0.536331
## Saving.accountsquite rich 0.517677 0.378815 1.367 0.171760
## Saving.accountsrich 0.732533 0.409747 1.788 0.073813 .
## Checking.accountmoderate 0.545914 0.190896 2.860 0.004240 **
## Checking.accountrich 1.231316 0.353947 3.479 0.000504 ***
## Duration -0.037375 0.007111 -5.256 1.47e-07 ***
## Sexmale 0.558147 0.186524 2.992 0.002768 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 837.58 on 699 degrees of freedom
## Residual deviance: 771.08 on 692 degrees of freedom
## AIC: 787.08
##
## Number of Fisher Scoring iterations: 4
训练集的预测情况
#Predicting train result
prob_pred_train = predict(logistic2,type = 'response',newdata = train)
glm_pre_train= ifelse(prob_pred_train > 0.5,"good","bad")
summary(glm_pre_train)
## Length Class Mode
## 700 character character
混淆矩阵
xtable<-table(train$Risk,glm_pre_train)
confusionMatrix(xtable)
## Confusion Matrix and Statistics
##
## glm_pre_train
## bad good
## bad 31 169
## good 17 483
##
## Accuracy : 0.7343
## 95% CI : (0.6999, 0.7667)
## No Information Rate : 0.9314
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1567
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.64583
## Specificity : 0.74080
## Pos Pred Value : 0.15500
## Neg Pred Value : 0.96600
## Prevalence : 0.06857
## Detection Rate : 0.04429
## Detection Prevalence : 0.28571
## Balanced Accuracy : 0.69332
##
## 'Positive' Class : bad
##
ROC图分析
roc_train=roc(train$Risk,prob_pred_train)
## Setting levels: control = bad, case = good
## Setting direction: controls < cases
plot(roc_train,print.auc=TRUE,auc.polygon=TRUE,grid=c(0.1,0.2),grid.col=c("green","red"),max.auc.polygon=TRUE,auc.polygon.col="lightblue",print.thres="best")
测试集的预测结果
#Predicting test result
prob_pred_test = predict(logistic2,type = 'response',newdata = test)
glm_pre_test = ifelse(prob_pred_test > 0.5,"good","bad")
summary(glm_pre_test)
## Length Class Mode
## 300 character character
混淆矩阵
xtable<-table(test$Risk, glm_pre_test)
confusionMatrix(xtable)
## Confusion Matrix and Statistics
##
## glm_pre_test
## bad good
## bad 12 88
## good 2 198
##
## Accuracy : 0.7
## 95% CI : (0.6447, 0.7513)
## No Information Rate : 0.9533
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1401
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.85714
## Specificity : 0.69231
## Pos Pred Value : 0.12000
## Neg Pred Value : 0.99000
## Prevalence : 0.04667
## Detection Rate : 0.04000
## Detection Prevalence : 0.33333
## Balanced Accuracy : 0.77473
##
## 'Positive' Class : bad
##
ROC图分析
roc_test=roc(test$Risk,prob_pred_test)
## Setting levels: control = bad, case = good
## Setting direction: controls < cases
plot(roc_test,print.auc=TRUE,auc.polygon=TRUE,grid=c(0.1,0.2),grid.col=c("green","red"),max.auc.polygon=TRUE,auc.polygon.col="lightblue",print.thres="best")
Tree_model <- rpart(as.factor(train$Risk)~.,data=train)
fancyRpartPlot(Tree_model)
防止过拟合 我们需要剪枝操作
Tree_model$cptable
## CP nsplit rel error xerror xstd
## 1 0.03750 0 1.000 1.000 0.05976143
## 2 0.02500 2 0.925 1.035 0.06037117
## 3 0.01750 3 0.900 1.040 0.06045541
## 4 0.01500 7 0.825 1.030 0.06028622
## 5 0.01125 11 0.755 1.070 0.06094611
## 6 0.01000 16 0.695 1.080 0.06110413
plotcp(Tree_model)#交叉验证误差与复杂度参数的关系图
Tree_model_1 <- prune(Tree_model, cp=0.014)
fancyRpartPlot(Tree_model_1)
来检测预测效果 训练集
Tree_pre_train<-predict(Tree_model_1,train,type='class')
Tree_prob_train<-predict(Tree_model_1,train,type='prob')[,2]
xtable<-table(Tree_pre_train,train$Risk)
confusionMatrix(xtable)
## Confusion Matrix and Statistics
##
##
## Tree_pre_train bad good
## bad 70 21
## good 130 479
##
## Accuracy : 0.7843
## 95% CI : (0.7519, 0.8142)
## No Information Rate : 0.7143
## P-Value [Acc > NIR] : 1.554e-05
##
## Kappa : 0.3682
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.3500
## Specificity : 0.9580
## Pos Pred Value : 0.7692
## Neg Pred Value : 0.7865
## Prevalence : 0.2857
## Detection Rate : 0.1000
## Detection Prevalence : 0.1300
## Balanced Accuracy : 0.6540
##
## 'Positive' Class : bad
##
Tree_roc_train=roc(train$Risk,Tree_prob_train)
## Setting levels: control = bad, case = good
## Setting direction: controls < cases
plot(Tree_roc_train,print.auc=TRUE,auc.polygon=TRUE,grid=c(0.1,0.2),grid.col=c("green","red"),max.auc.polygon=TRUE,auc.polygon.col="lightblue",print.thres="best")
测试集
Tree_pre_test<-predict(Tree_model_1,test,type='class')
xtable<-table(Tree_pre_test,test$Risk)
confusionMatrix(xtable)
## Confusion Matrix and Statistics
##
##
## Tree_pre_test bad good
## bad 11 14
## good 89 186
##
## Accuracy : 0.6567
## 95% CI : (0.5999, 0.7103)
## No Information Rate : 0.6667
## P-Value [Acc > NIR] : 0.6679
##
## Kappa : 0.0492
##
## Mcnemar's Test P-Value : 3.067e-13
##
## Sensitivity : 0.11000
## Specificity : 0.93000
## Pos Pred Value : 0.44000
## Neg Pred Value : 0.67636
## Prevalence : 0.33333
## Detection Rate : 0.03667
## Detection Prevalence : 0.08333
## Balanced Accuracy : 0.52000
##
## 'Positive' Class : bad
##
roc_test=roc(test$Risk,as.numeric(Tree_pre_test))
## Setting levels: control = bad, case = good
## Setting direction: controls < cases
plot(roc_test,print.auc=TRUE,auc.polygon=TRUE,grid=c(0.1,0.2),grid.col=c("green","red"),max.auc.polygon=TRUE,auc.polygon.col="lightblue",print.thres="best")
https://cloud.tencent.com/developer/article/1870681
train.data.forest<-na.omit(train)
train.forest <- randomForest(as.factor(train.data.forest$Risk) ~ ., data = train.data.forest, importance = TRUE)
train.forest
##
## Call:
## randomForest(formula = as.factor(train.data.forest$Risk) ~ ., data = train.data.forest, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 27.43%
## Confusion matrix:
## bad good class.error
## bad 55 145 0.725
## good 47 453 0.094
查看重要性参数
importance.forest <- train.forest$importance
head(importance.forest)
## bad good MeanDecreaseAccuracy
## Age 0.005758647 0.0057862485 0.0057903076
## Sex -0.006800140 -0.0026523966 -0.0038236225
## Job 0.001222651 0.0005228298 0.0007067322
## Housing -0.002377569 0.0048924327 0.0028245588
## Saving.accounts 0.003816116 -0.0040903448 -0.0018307483
## Checking.account 0.017223317 0.0025708484 0.0067008988
## MeanDecreaseGini
## Age 56.765824
## Sex 7.805841
## Job 15.715161
## Housing 12.777167
## Saving.accounts 14.017268
## Checking.account 16.941745
varImpPlot(train.forest, n.var = min(15, nrow(train.forest$importance)), main = 'Top variable importance')
筛选掉不重要的一些参数 训练集
select <- rownames(importance.forest)[1:4]
test.data.forest<-na.omit(test)
train.data.forest.4 <- train.data.forest[ ,c(select, 'Risk')]
test.data.forest.4<- test.data.forest[ ,c(select, 'Risk')]
set.seed(123)
train.forest_4 <- randomForest(as.factor(train.data.forest.4$Risk) ~ ., data = train.data.forest.4, importance = TRUE)
train.forest_4
##
## Call:
## randomForest(formula = as.factor(train.data.forest.4$Risk) ~ ., data = train.data.forest.4, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 31.71%
## Confusion matrix:
## bad good class.error
## bad 22 178 0.890
## good 44 456 0.088
#plot(margin(train.forest_5, train.data.forest.5$Risk))
train_predict.forest <- predict(train.forest_4, train.data.forest.4)
xtable <- table(train_predict.forest, train.data.forest.4$Risk)
confusionMatrix(xtable)
## Confusion Matrix and Statistics
##
##
## train_predict.forest bad good
## bad 56 9
## good 144 491
##
## Accuracy : 0.7814
## 95% CI : (0.7489, 0.8115)
## No Information Rate : 0.7143
## P-Value [Acc > NIR] : 3.328e-05
##
## Kappa : 0.3285
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.28000
## Specificity : 0.98200
## Pos Pred Value : 0.86154
## Neg Pred Value : 0.77323
## Prevalence : 0.28571
## Detection Rate : 0.08000
## Detection Prevalence : 0.09286
## Balanced Accuracy : 0.63100
##
## 'Positive' Class : bad
##
测试集
test_predict.forest <- predict(train.forest_4, test.data.forest.4)
xtable <- table(test.data.forest.4$Risk, test_predict.forest)
confusionMatrix(xtable)
## Confusion Matrix and Statistics
##
## test_predict.forest
## bad good
## bad 10 90
## good 15 185
##
## Accuracy : 0.65
## 95% CI : (0.5931, 0.7039)
## No Information Rate : 0.9167
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0308
##
## Mcnemar's Test P-Value : 5.136e-13
##
## Sensitivity : 0.40000
## Specificity : 0.67273
## Pos Pred Value : 0.10000
## Neg Pred Value : 0.92500
## Prevalence : 0.08333
## Detection Rate : 0.03333
## Detection Prevalence : 0.33333
## Balanced Accuracy : 0.53636
##
## 'Positive' Class : bad
##
library(adabag)
## Loading required package: foreach
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
train.data.adabag <- na.omit(train)
test.data.adabag <- na.omit(test)
train.data.adabag$Risk <- factor(train.data.adabag$Risk)
error <- as.numeric()
for(i in 1:20){
train.adaboost <- boosting(Risk ~ ., data=train.data.adabag, mfinal=i)
test.adaboost.pred <- predict.boosting(train.adaboost,newdata = test.data.adabag)
error[i] <- test.adaboost.pred$error
}
error <- as.data.frame(error)
xtable<-test.adaboost.pred$confusion
confusionMatrix(xtable)
## Confusion Matrix and Statistics
##
## Observed Class
## Predicted Class bad good
## bad 31 39
## good 69 161
##
## Accuracy : 0.64
## 95% CI : (0.5828, 0.6944)
## No Information Rate : 0.6667
## P-Value [Acc > NIR] : 0.850963
##
## Kappa : 0.1243
##
## Mcnemar's Test P-Value : 0.005262
##
## Sensitivity : 0.3100
## Specificity : 0.8050
## Pos Pred Value : 0.4429
## Neg Pred Value : 0.7000
## Prevalence : 0.3333
## Detection Rate : 0.1033
## Detection Prevalence : 0.2333
## Balanced Accuracy : 0.5575
##
## 'Positive' Class : bad
##
ggplot(error,aes(x=1:20,y=error))+geom_line(colour="red", linetype="dashed",size = 1)+geom_point(size=3, shape=18)+ylim(0,1) +xlab("the number of basic classifiers")