对数据进行初步的分析。首先将layer小于等于3的供应商记为1,layer大于3的供应商记为2。好坏供应商的占比为:
# Restore the objects saved in the .Rdata file (the code below expects a data
# frame named `supplier`), then count the rows per value of label1.
load(file = "/Users/milin/采购判别/supplier.Rdata")
table(supplier[["label1"]])
##
## 0 1
## 7487 427569
查看数据的分布:
library(ggplot2)
# Density of supplier_level, one panel per value of label1.
# geom_density() is the dedicated density geom; geom_histogram(stat = "density")
# forwards binning arguments that the density stat ignores with a warning.
ggplot(data = supplier, aes(x = supplier_level)) +
  geom_density() +
  facet_grid(~ label1)
## Warning: Removed 88751 rows containing non-finite values (stat_density).
# Density of trade_medal, one panel per value of label1.
# geom_density() is the dedicated density geom; geom_histogram(stat = "density")
# forwards binning arguments that the density stat ignores with a warning.
ggplot(data = supplier, aes(x = trade_medal)) +
  geom_density() +
  facet_grid(~ label1)
## Warning: Removed 162645 rows containing non-finite values (stat_density).
# Density of satis_degree, one panel per value of label1.
# geom_density() is the dedicated density geom; geom_histogram(stat = "density")
# forwards binning arguments that the density stat ignores with a warning.
ggplot(data = supplier, aes(x = satis_degree)) +
  geom_density() +
  facet_grid(~ label1)
## Warning: Removed 151416 rows containing non-finite values (stat_density).
# Density of description, one panel per value of label1.
# geom_density() is the dedicated density geom; geom_histogram(stat = "density")
# forwards binning arguments that the density stat ignores with a warning.
ggplot(data = supplier, aes(x = description)) +
  geom_density() +
  facet_grid(~ label1)
## Warning: Removed 127283 rows containing non-finite values (stat_density).
# Density of resp_speed, one panel per value of label1.
# geom_density() is the dedicated density geom; geom_histogram(stat = "density")
# forwards binning arguments that the density stat ignores with a warning.
ggplot(data = supplier, aes(x = resp_speed)) +
  geom_density() +
  facet_grid(~ label1)
## Warning: Removed 128953 rows containing non-finite values (stat_density).
# Density of deli_speed, one panel per value of label1.
# geom_density() is the dedicated density geom; geom_histogram(stat = "density")
# forwards binning arguments that the density stat ignores with a warning.
ggplot(data = supplier, aes(x = deli_speed)) +
  geom_density() +
  facet_grid(~ label1)
## Warning: Removed 138959 rows containing non-finite values (stat_density).
# Density of repu_rate, one panel per value of label1.
# geom_density() is the dedicated density geom; geom_histogram(stat = "density")
# forwards binning arguments that the density stat ignores with a warning.
ggplot(data = supplier, aes(x = repu_rate)) +
  geom_density() +
  facet_grid(~ label1)
## Warning: Removed 181608 rows containing non-finite values (stat_density).
从中可以看到,好坏数据之间的分布非常相似,也就是说这些数据对于供应商的好坏没有区分能力。
但是还是尝试建立模型:
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.4
library(caret)
## Warning: package 'caret' was built under R version 3.4.4
# Keep only the rows without any missing value.
supplier_noNA <- na.omit(supplier)
# Convert both label columns to factors in a single step.
supplier_noNA[c("label1", "label2")] <- lapply(
  supplier_noNA[c("label1", "label2")],
  as.factor
)
# Rows with label1 == 0 form the `good` group.
good <- filter(supplier_noNA, label1 == 0)
## Warning: package 'bindrcpp' was built under R version 3.4.4
# Seed the RNG before the first random draw so that both the down-sampling
# below and the train/test split are reproducible.
set.seed(1)
# Rows with label1 == 1 form the `bad` group, down-sampled to 7000 rows
# (presumably to bring it to a size comparable with `good`).
bad <- filter(supplier_noNA, label1 == 1)
bad <- sample_n(bad, size = 7000)
alldata <- rbind(good, bad)
library(party)
library(caret)
# 70/30 train/test split. The row count is taken from the data rather than
# hard-coded, and the training size is floored explicitly.
n_rows <- nrow(alldata)
id <- sample(seq_len(n_rows), size = floor(n_rows * 0.7))
train <- alldata[id, ]
test <- alldata[-id, ]
# NOTE(review): columns are dropped by position. Presumably column 1 is an
# identifier and columns 9 and 10 are label1 and label2; confirm against
# names(train).
# Conditional inference tree for label2. It has its own name so the random
# forest below no longer overwrites it; it is not evaluated in this section.
tree_model <- ctree(formula = label2 ~ ., data = train[, -c(1, 9)])
# Random forest for label1; this is the model scored on the held-out rows.
model <- randomForest::randomForest(
  formula = label1 ~ .,
  data = train[, -c(1, 10)]
)
p <- predict(model, newdata = test[, -c(1, 10)])
confusionMatrix(data = p, reference = test$label1)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1262 699
## 1 511 1442
##
## Accuracy : 0.6909
## 95% CI : (0.6761, 0.7053)
## No Information Rate : 0.547
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3818
## Mcnemar's Test P-Value : 7.621e-08
##
## Sensitivity : 0.7118
## Specificity : 0.6735
## Pos Pred Value : 0.6435
## Neg Pred Value : 0.7384
## Prevalence : 0.4530
## Detection Rate : 0.3224
## Detection Prevalence : 0.5010
## Balanced Accuracy : 0.6927
##
## 'Positive' Class : 0
##
可以发现,模型的效果并不是很好(准确率约为0.69,Kappa约为0.38)。