Data set parameter types and the first few rows of the data set
dataall<-read.csv('german_credit_data.csv')
head(dataall)
## Age Sex Job Housing Saving.accounts Checking.account Credit.amount
## 1 67 male 2 own <NA> little 1169
## 2 22 female 2 own little moderate 5951
## 3 49 male 1 own little <NA> 2096
## 4 45 male 2 free little little 7882
## 5 53 male 2 free little little 4870
## 6 35 male 1 free <NA> <NA> 9055
## Duration Purpose Risk
## 1 6 radio/TV good
## 2 48 radio/TV bad
## 3 12 education good
## 4 42 furniture/equipment good
## 5 24 car bad
## 6 36 education good
数据集信息
summary(dataall)
## Age Sex Job Housing
## Min. :19.00 Length:1000 Min. :0.000 Length:1000
## 1st Qu.:27.00 Class :character 1st Qu.:2.000 Class :character
## Median :33.00 Mode :character Median :2.000 Mode :character
## Mean :35.55 Mean :1.904
## 3rd Qu.:42.00 3rd Qu.:2.000
## Max. :75.00 Max. :3.000
## Saving.accounts Checking.account Credit.amount Duration
## Length:1000 Length:1000 Min. : 250 Min. : 4.0
## Class :character Class :character 1st Qu.: 1366 1st Qu.:12.0
## Mode :character Mode :character Median : 2320 Median :18.0
## Mean : 3271 Mean :20.9
## 3rd Qu.: 3972 3rd Qu.:24.0
## Max. :18424 Max. :72.0
## Purpose Risk
## Length:1000 Length:1000
## Class :character Class :character
## Mode :character Mode :character
##
##
##
#newdata = mice(dataall, method='polr',seed = 100)
#newdata = complete(newdata,5)
Basic Information Statistics Risk Information
ggplot(dataall, aes(x=Risk, fill=Risk))+geom_bar()+labs(title='Distribution of Risk Classes')
1.Age
ggplot(dataall, aes(Age)) + geom_histogram(binwidth=4, colour="black", fill="lightgreen") +labs(x= "Age",y= "Frequency" , title = "Age Information")
2.Gender
ggplot(dataall, aes(Sex) ) + geom_bar(aes(fill = as.factor(Sex))) + scale_fill_discrete(name="Sex",labels=c( "Female","Male")) +labs(x= "Sex",y= "Frequency" , title = "Sex Information")
3.job
数值越大 技能性越高
ggplot(dataall, aes(Job) ) + geom_bar(aes(fill = as.factor(Job))) + scale_fill_discrete(name="Job",labels=c( "0","1","2","3")) +labs(x= "Job",y= "Frequency" , title = "Job Information")
4.Housing
ggplot(dataall, aes(Housing) ) + geom_bar(aes(fill = as.factor(Housing))) + scale_fill_discrete(name="Housing",labels=c("Free","Own", "Rent")) +labs(x= "Housing",y= "Frequency" , title = "Housing Information")
5.Saving accounts
ggplot(dataall, aes(Saving.accounts) ) + geom_bar(aes(fill = as.factor(Saving.accounts))) + scale_fill_discrete(name="Saving accounts",labels=c( "Little","Moderate", "Quite Rich", "Rich", "NA")) +labs(x= "Saving accounts",y= "Frequency" , title = "Saving Accounts Information")
6.Checking Account
ggplot(dataall, aes(Checking.account) ) + geom_bar(aes(fill = as.factor(Checking.account))) + scale_fill_discrete(name="Checking accounts",labels=c( "Little","Moderate", "Rich", "NA")) +labs(x= "Checking accounts",y= "Frequency" , title = "Checking Accounts Information")
7.Credit amount
ggplot(dataall, aes(Credit.amount)) + geom_histogram(binwidth=1000, colour="black", fill="lightblue") +labs(x= "Credit amount",y= "Frequency" , title = "Credit amount")
8.Duration
ggplot(dataall, aes(Duration)) + geom_histogram(binwidth=4, colour="black", fill="lightyellow") +labs(x= "Duration in Months",y= "Frequency" , title = "Duration")
9.Purpose
ggplot(dataall, aes(Purpose) ) + geom_bar(aes(fill = as.factor(Purpose))) + scale_fill_discrete(name="Purpose of Loan",labels=c( "Business","Car","Domestic Appliances","Education","Furniture/Equipment","Radio/TV","Repairs","Vacation/Others")) +labs(x= "Purpose of Loan",y= "Frequency" , title = "Plot of Loan Purpose")
年龄&RISK
p1<-ggplot(dataall, aes(x=Age, fill=Risk)) +geom_density(alpha=0.5) +ggtitle('Distribution of Age by Risk') +xlab("Age") + ylab("Density")
p2<-ggplot(dataall, aes(x=Age, fill=Risk))+geom_bar()+labs(title='Distribution of Age & Risk')
p3<-ggplot(dataall, aes(x=Age, fill=Risk))+geom_boxplot()+labs(title='Distribution of Age & Risk')
ggarrange(p1,p2,p3,ncol=1,nrow=3)
性别&RISK
ggplot(dataall, aes(x=Sex, fill=Risk))+geom_bar()+labs(title='Distribution of Sex & Risk')
Job & RISK
ggplot(dataall, aes(x=Job, fill=Risk))+geom_bar()+labs(title='Distribution of Job & Risk')
Housing & Risk
ggplot(dataall, aes(x=Housing, fill=Risk))+geom_bar()+labs(title='Distribution of Housing & Risk')
Saving accounts &Risk
ggplot(dataall, aes(x=Saving.accounts, fill=Risk))+geom_bar()+labs(title='Distribution of Saving accounts & Risk')
Checking accounts & Risk
ggplot(dataall, aes(x=Risk, fill=Checking.account))+geom_bar()+labs(title='Distribution of Checking accounts & Risk')
Credit amount & Risk
ggplot(dataall, aes(x=Credit.amount, fill=Risk)) +geom_density(alpha=0.5) +ggtitle('Distribution of Credit Amount by Risk') +xlab("Credit Amount") + ylab("Density")
ggplot(dataall, aes(x=Risk,y=Credit.amount,fill=Risk)) +geom_boxplot() +ggtitle('Distribution of Credit Amount by Risk') +xlab("Credit Amount") + ylab("Density")
Duration & Risk
ggplot(dataall, aes(x=Duration, fill=Risk)) +geom_density(alpha=0.5) +ggtitle('Distribution of Duration by Risk') +xlab("Duration") + ylab("Density")
Purpose & Risk
ggplot(dataall, aes(x=Risk, fill=Purpose))+geom_bar(position='dodge2')+coord_flip()
热力图
model.matrix(~0+., data=dataall) %>% cor(use="pairwise.complete.obs") %>% ggcorrplot(show.diag = T, lab=TRUE, lab_size=1)
训练集 测试集
set.seed(127)
slidata <- sample(nrow(dataall), 0.7*nrow(dataall))
train <- dataall[slidata,]
test <- dataall[-slidata,]
Logistic
logistic1= glm(formula = as.factor(train$Risk) ~.,family = binomial,data = train)
summary(logistic1)
##
## Call:
## glm(formula = as.factor(train$Risk) ~ ., family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3254 -1.0752 0.5923 0.9808 2.1732
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.013e+00 9.328e-01 1.086 0.27760
## Age 6.514e-03 1.128e-02 0.577 0.56361
## Sexmale 4.249e-01 2.657e-01 1.599 0.10985
## Job 1.186e-01 1.907e-01 0.622 0.53399
## Housingown -4.494e-02 4.294e-01 -0.105 0.91663
## Housingrent -1.387e-01 4.875e-01 -0.284 0.77604
## Saving.accountsmoderate 1.140e-01 3.880e-01 0.294 0.76893
## Saving.accountsquite rich -1.454e-01 6.398e-01 -0.227 0.82025
## Saving.accountsrich 1.011e+00 6.222e-01 1.626 0.10403
## Checking.accountmoderate 3.619e-02 2.621e-01 0.138 0.89021
## Checking.accountrich 1.416e+00 4.771e-01 2.968 0.00300 **
## Credit.amount 4.936e-05 6.120e-05 0.807 0.41992
## Duration -6.134e-02 1.337e-02 -4.589 4.46e-06 ***
## Purposecar -6.581e-01 4.352e-01 -1.512 0.13050
## Purposedomestic appliances -8.057e-01 1.047e+00 -0.770 0.44143
## Purposeeducation -1.806e+00 6.634e-01 -2.723 0.00647 **
## Purposefurniture/equipment -1.902e-01 4.594e-01 -0.414 0.67888
## Purposeradio/TV -2.653e-01 4.530e-01 -0.586 0.55804
## Purposerepairs -4.761e-01 7.981e-01 -0.597 0.55083
## Purposevacation/others -1.074e+00 1.102e+00 -0.975 0.32980
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 484.11 on 352 degrees of freedom
## Residual deviance: 424.11 on 333 degrees of freedom
## (347 observations deleted due to missingness)
## AIC: 464.11
##
## Number of Fisher Scoring iterations: 4
可以看p值:我们看出Saving.accountsrich,Checking.accountrich,Duration,Sexmale系数要被考虑 重新建模
logistic2= glm(formula = as.factor(train$Risk) ~Saving.accounts+Checking.account+Duration+Sex,family = binomial,data = train)
summary(logistic2)
##
## Call:
## glm(formula = as.factor(train$Risk) ~ Saving.accounts + Checking.account +
## Duration + Sex, family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1312 -1.1205 0.6324 0.9904 1.8606
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.733878 0.303801 2.416 0.01571 *
## Saving.accountsmoderate 0.149005 0.371814 0.401 0.68860
## Saving.accountsquite rich -0.058863 0.620626 -0.095 0.92444
## Saving.accountsrich 0.948779 0.613994 1.545 0.12228
## Checking.accountmoderate 0.083476 0.249537 0.335 0.73798
## Checking.accountrich 1.353742 0.456952 2.963 0.00305 **
## Duration -0.048295 0.009934 -4.862 1.16e-06 ***
## Sexmale 0.509065 0.247438 2.057 0.03965 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 484.11 on 352 degrees of freedom
## Residual deviance: 436.92 on 345 degrees of freedom
## (347 observations deleted due to missingness)
## AIC: 452.92
##
## Number of Fisher Scoring iterations: 4
训练集的预测情况
#Predicting train result
prob_pred_train = predict(logistic2,type = 'response',newdata = train)
glm_pre_train= ifelse(prob_pred_train > 0.5,"good","bad")
summary(glm_pre_train)
## Length Class Mode
## 700 character character
混淆矩阵
xtable<-table(train$Risk,glm_pre_train)
confusionMatrix(xtable)
## Confusion Matrix and Statistics
##
## glm_pre_train
## bad good
## bad 76 79
## good 42 156
##
## Accuracy : 0.6572
## 95% CI : (0.6051, 0.7067)
## No Information Rate : 0.6657
## P-Value [Acc > NIR] : 0.655450
##
## Kappa : 0.2856
##
## Mcnemar's Test P-Value : 0.001065
##
## Sensitivity : 0.6441
## Specificity : 0.6638
## Pos Pred Value : 0.4903
## Neg Pred Value : 0.7879
## Prevalence : 0.3343
## Detection Rate : 0.2153
## Detection Prevalence : 0.4391
## Balanced Accuracy : 0.6539
##
## 'Positive' Class : bad
##
ROC图分析
roc_train=roc(train$Risk,prob_pred_train)
## Setting levels: control = bad, case = good
## Setting direction: controls < cases
plot(roc_train,print.auc=TRUE,auc.polygon=TRUE,grid=c(0.1,0.2),grid.col=c("green","red"),max.auc.polygon=TRUE,auc.polygon.col="lightblue",print.thres="best")
测试集的预测结果
#Predicting test result
prob_pred_test = predict(logistic2,type = 'response',newdata = test)
glm_pre_test = ifelse(prob_pred_test > 0.5,"good","bad")
summary(glm_pre_test)
## Length Class Mode
## 300 character character
混淆矩阵
xtable<-table(test$Risk, glm_pre_test)
confusionMatrix(xtable)
## Confusion Matrix and Statistics
##
## glm_pre_test
## bad good
## bad 33 43
## good 19 74
##
## Accuracy : 0.6331
## 95% CI : (0.5557, 0.7058)
## No Information Rate : 0.6923
## P-Value [Acc > NIR] : 0.958256
##
## Kappa : 0.2367
##
## Mcnemar's Test P-Value : 0.003489
##
## Sensitivity : 0.6346
## Specificity : 0.6325
## Pos Pred Value : 0.4342
## Neg Pred Value : 0.7957
## Prevalence : 0.3077
## Detection Rate : 0.1953
## Detection Prevalence : 0.4497
## Balanced Accuracy : 0.6335
##
## 'Positive' Class : bad
##
ROC图分析
roc_test=roc(test$Risk,prob_pred_test)
## Setting levels: control = bad, case = good
## Setting direction: controls < cases
plot(roc_test,print.auc=TRUE,auc.polygon=TRUE,grid=c(0.1,0.2),grid.col=c("green","red"),max.auc.polygon=TRUE,auc.polygon.col="lightblue",print.thres="best")
Tree_model <- rpart(as.factor(train$Risk)~.,data=train)
fancyRpartPlot(Tree_model)
防止过拟合 我们需要剪枝操作
Tree_model$cptable
## CP nsplit rel error xerror xstd
## 1 0.03750000 0 1.000 1.000 0.05976143
## 2 0.02500000 2 0.925 0.995 0.05967142
## 3 0.02250000 3 0.900 1.000 0.05976143
## 4 0.01333333 5 0.855 1.015 0.06002708
## 5 0.01125000 8 0.815 1.065 0.06086607
## 6 0.01000000 13 0.755 1.095 0.06133602
plotcp(Tree_model)#交叉验证误差与复杂度参数的关系图
Tree_model_1 <- prune(Tree_model, cp=0.027)
fancyRpartPlot(Tree_model_1)
来检测预测效果 训练集
Tree_pre_train<-predict(Tree_model_1,train,type='class')
Tree_prob_train<-predict(Tree_model_1,train,type='prob')[,2]
xtable<-table(Tree_pre_train,train$Risk)
confusionMatrix(xtable)
## Confusion Matrix and Statistics
##
##
## Tree_pre_train bad good
## bad 29 14
## good 171 486
##
## Accuracy : 0.7357
## 95% CI : (0.7014, 0.768)
## No Information Rate : 0.7143
## P-Value [Acc > NIR] : 0.112
##
## Kappa : 0.153
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.14500
## Specificity : 0.97200
## Pos Pred Value : 0.67442
## Neg Pred Value : 0.73973
## Prevalence : 0.28571
## Detection Rate : 0.04143
## Detection Prevalence : 0.06143
## Balanced Accuracy : 0.55850
##
## 'Positive' Class : bad
##
Tree_roc_train=roc(train$Risk,Tree_prob_train)
## Setting levels: control = bad, case = good
## Setting direction: controls < cases
plot(Tree_roc_train,print.auc=TRUE,auc.polygon=TRUE,grid=c(0.1,0.2),grid.col=c("green","red"),max.auc.polygon=TRUE,auc.polygon.col="lightblue",print.thres="best")
测试集
Tree_pre_test<-predict(Tree_model_1,test,type='class')
xtable<-table(Tree_pre_test,test$Risk)
confusionMatrix(xtable)
## Confusion Matrix and Statistics
##
##
## Tree_pre_test bad good
## bad 4 6
## good 96 194
##
## Accuracy : 0.66
## 95% CI : (0.6033, 0.7135)
## No Information Rate : 0.6667
## P-Value [Acc > NIR] : 0.6226
##
## Kappa : 0.0129
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.04000
## Specificity : 0.97000
## Pos Pred Value : 0.40000
## Neg Pred Value : 0.66897
## Prevalence : 0.33333
## Detection Rate : 0.01333
## Detection Prevalence : 0.03333
## Balanced Accuracy : 0.50500
##
## 'Positive' Class : bad
##
roc_test=roc(test$Risk,as.numeric(Tree_pre_test))
## Setting levels: control = bad, case = good
## Setting direction: controls < cases
plot(roc_test,print.auc=TRUE,auc.polygon=TRUE,grid=c(0.1,0.2),grid.col=c("green","red"),max.auc.polygon=TRUE,auc.polygon.col="lightblue",print.thres="best")
https://cloud.tencent.com/developer/article/1870681
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
##
## importance
## The following object is masked from 'package:ggplot2':
##
## margin
train.data.forest<-na.omit(train)
train.forest <- randomForest(as.factor(train.data.forest$Risk) ~ ., data = train.data.forest, importance = TRUE)
train.forest
##
## Call:
## randomForest(formula = as.factor(train.data.forest$Risk) ~ ., data = train.data.forest, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 36.83%
## Confusion matrix:
## bad good class.error
## bad 81 74 0.4774194
## good 56 142 0.2828283
查看重要性参数
importance.forest <- train.forest$importance
head(importance.forest)
## bad good MeanDecreaseAccuracy
## Age -0.007751874 0.001180502 -0.002556922
## Sex -0.002756205 -0.002263176 -0.002493479
## Job 0.006014341 0.001392746 0.003425752
## Housing -0.003639075 0.004909288 0.001050574
## Saving.accounts 0.002235366 0.002602384 0.002401319
## Checking.account 0.016152666 0.003925401 0.009235639
## MeanDecreaseGini
## Age 32.067186
## Sex 4.776435
## Job 10.523550
## Housing 8.256494
## Saving.accounts 7.348468
## Checking.account 11.401583
varImpPlot(train.forest, n.var = min(15, nrow(train.forest$importance)), main = 'Top variable importance')
筛选掉不重要的一些参数 训练集
select <- rownames(importance.forest)[1:4]
test.data.forest<-na.omit(test)
train.data.forest.4 <- train.data.forest[ ,c(select, 'Risk')]
test.data.forest.4<- test.data.forest[ ,c(select, 'Risk')]
set.seed(123)
train.forest_4 <- randomForest(as.factor(train.data.forest.4$Risk) ~ ., data = train.data.forest.4, importance = TRUE)
train.forest_4
##
## Call:
## randomForest(formula = as.factor(train.data.forest.4$Risk) ~ ., data = train.data.forest.4, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 46.46%
## Confusion matrix:
## bad good class.error
## bad 48 107 0.6903226
## good 57 141 0.2878788
#plot(margin(train.forest_5, train.data.forest.5$Risk))
train_predict.forest <- predict(train.forest_4, train.data.forest.4)
xtable <- table(train_predict.forest, train.data.forest.4$Risk)
confusionMatrix(xtable)
## Confusion Matrix and Statistics
##
##
## train_predict.forest bad good
## bad 94 21
## good 61 177
##
## Accuracy : 0.7677
## 95% CI : (0.7201, 0.8108)
## No Information Rate : 0.5609
## P-Value [Acc > NIR] : 4.422e-16
##
## Kappa : 0.5148
##
## Mcnemar's Test P-Value : 1.656e-05
##
## Sensitivity : 0.6065
## Specificity : 0.8939
## Pos Pred Value : 0.8174
## Neg Pred Value : 0.7437
## Prevalence : 0.4391
## Detection Rate : 0.2663
## Detection Prevalence : 0.3258
## Balanced Accuracy : 0.7502
##
## 'Positive' Class : bad
##
测试集
test_predict.forest <- predict(train.forest_4, test.data.forest.4)
xtable <- table(test.data.forest.4$Risk, test_predict.forest)
confusionMatrix(xtable)
## Confusion Matrix and Statistics
##
## test_predict.forest
## bad good
## bad 26 50
## good 19 74
##
## Accuracy : 0.5917
## 95% CI : (0.5136, 0.6666)
## No Information Rate : 0.7337
## P-Value [Acc > NIR] : 0.9999782
##
## Kappa : 0.1431
##
## Mcnemar's Test P-Value : 0.0003043
##
## Sensitivity : 0.5778
## Specificity : 0.5968
## Pos Pred Value : 0.3421
## Neg Pred Value : 0.7957
## Prevalence : 0.2663
## Detection Rate : 0.1538
## Detection Prevalence : 0.4497
## Balanced Accuracy : 0.5873
##
## 'Positive' Class : bad
##
library(adabag)
## Loading required package: foreach
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
train.data.adabag <- na.omit(train)
test.data.adabag <- na.omit(test)
train.data.adabag$Risk <- factor(train.data.adabag$Risk)
error <- as.numeric()
for(i in 1:3){
train.adaboost <- boosting(Risk ~ ., data=train.data.adabag, mfinal=i)
test.adaboost.pred <- predict.boosting(train.adaboost,newdata = test.data.adabag)
error[i] <- test.adaboost.pred$error
}
error <- as.data.frame(error)
xtable<-test.adaboost.pred$confusion
confusionMatrix(xtable)
## Confusion Matrix and Statistics
##
## Observed Class
## Predicted Class bad good
## bad 43 23
## good 33 70
##
## Accuracy : 0.6686
## 95% CI : (0.5922, 0.739)
## No Information Rate : 0.5503
## P-Value [Acc > NIR] : 0.001146
##
## Kappa : 0.3224
##
## Mcnemar's Test P-Value : 0.229102
##
## Sensitivity : 0.5658
## Specificity : 0.7527
## Pos Pred Value : 0.6515
## Neg Pred Value : 0.6796
## Prevalence : 0.4497
## Detection Rate : 0.2544
## Detection Prevalence : 0.3905
## Balanced Accuracy : 0.6592
##
## 'Positive' Class : bad
##
ggplot(error,aes(x=1:3,y=error))+geom_line(colour="red", linetype="dashed",size = 1)+geom_point(size=3, shape=18)+ylim(0,1) +xlab("the number of basic classifiers")