盲采供应商分层

对数据进行初步的分析:首先将 layer 小于等于 3 的供应商记为 1,layer 大于 3 的供应商记为 2。好坏供应商的数量分布为(注:下表中 label1 的实际取值为 0 和 1,与此处所述的 1/2 编码不一致,需核实):

# Restore the objects stored in the .Rdata file (the code that follows expects
# a data frame named `supplier` among them), then count the rows per `label1`
# value.
load("/Users/milin/采购判别/supplier.Rdata")
table(supplier[["label1"]])
## 
##      0      1 
##   7487 427569

查看数据的分布:

library(ggplot2)
# Density of `supplier_level`, drawn in one panel per `label1` value.
# NOTE: geom_histogram() is used with stat = "density", so the histogram-only
# arguments it forwards (binwidth, bins, pad) are ignored by the density stat;
# that is what the first warning printed after this call reports. Rows with a
# missing/non-finite `supplier_level` are dropped (second warning).
ggplot(supplier, aes(supplier_level)) +
  geom_histogram(stat = "density") +
  facet_grid(. ~ label1)
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Removed 88751 rows containing non-finite values (stat_density).

# Same per-label density view for `trade_medal` (one panel per `label1` value).
# The "unknown parameters" warning comes from combining geom_histogram() with
# stat = "density".
ggplot(supplier, aes(trade_medal)) +
  geom_histogram(stat = "density") +
  facet_grid(. ~ label1)
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Removed 162645 rows containing non-finite values (stat_density).

# Density of `satis_degree`, faceted into columns by `label1`.
# geom_histogram() + stat = "density" is what triggers the "unknown
# parameters" warning.
ggplot(supplier, aes(satis_degree)) +
  geom_histogram(stat = "density") +
  facet_grid(. ~ label1)
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Removed 151416 rows containing non-finite values (stat_density).

# Density of `description`, faceted into columns by `label1`.
# The histogram geom is paired with the density stat, hence the "unknown
# parameters" warning.
ggplot(supplier, aes(description)) +
  geom_histogram(stat = "density") +
  facet_grid(. ~ label1)
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Removed 127283 rows containing non-finite values (stat_density).

# Density of `resp_speed`, one panel per `label1` value.
# Pairing geom_histogram() with stat = "density" produces the "unknown
# parameters" warning.
ggplot(supplier, aes(resp_speed)) +
  geom_histogram(stat = "density") +
  facet_grid(. ~ label1)
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Removed 128953 rows containing non-finite values (stat_density).

# Density of `deli_speed`, one panel per `label1` value.
# The "unknown parameters" warning is a side effect of using geom_histogram()
# with stat = "density".
ggplot(supplier, aes(deli_speed)) +
  geom_histogram(stat = "density") +
  facet_grid(. ~ label1)
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Removed 138959 rows containing non-finite values (stat_density).

# Density of `repu_rate`, one panel per `label1` value.
# geom_histogram() combined with stat = "density" is the source of the
# "unknown parameters" warning.
ggplot(supplier, aes(repu_rate)) +
  geom_histogram(stat = "density") +
  facet_grid(. ~ label1)
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Removed 181608 rows containing non-finite values (stat_density).

从中可以看到,好坏数据之间的分布非常相似,也就是说这些数据对于供应商的好坏没有区分能力。

但是还是尝试建立模型

决策树与随机森林模型

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.4
library(caret)
## Warning: package 'caret' was built under R version 3.4.4
# Work only with rows that have no missing value in any column.
supplier_noNA <- na.omit(supplier)

# Both label columns are converted to factors.
supplier_noNA[["label1"]] <- as.factor(supplier_noNA[["label1"]])
supplier_noNA[["label2"]] <- as.factor(supplier_noNA[["label2"]])

# Rows with label1 == 0 form the `good` group.
good <- supplier_noNA %>%
  filter(label1 == 0)
## Warning: package 'bindrcpp' was built under R version 3.4.4
# Rows with label1 == 1 form the `bad` group.
bad <- supplier_noNA %>%
  filter(label1 == 1)

# Undersample the `bad` group, combine it with `good`, and split the result
# into a 70% training set and a 30% test set.
#
# The RNG is seeded before the first random draw so that both the
# undersampling and the train/test split are reproducible. Previously the seed
# was set only after sample_n(), so the 7000 sampled rows changed on every run.
set.seed(1)
bad <- sample_n(bad, size = 7000)
alldata <- rbind(good, bad)
library(party)
library(caret)
# Derive the row count from the data instead of hard-coding it, so the split
# stays correct if the number of complete `good` rows changes.
n_rows <- nrow(alldata)
id <- sample(seq_len(n_rows), size = floor(n_rows * 0.7))
train <- alldata[id, ]
test <- alldata[-id, ]

# Conditional inference tree for `label2`. It is stored under its own name:
# previously it was assigned to `model` and immediately overwritten by the
# random forest below, so the fit was computed and thrown away.
# Columns 1 and 9 are excluded from the predictors -- presumably an identifier
# and the other label column; TODO confirm against the column order of
# `supplier`.
tree_model <- ctree(formula = label2 ~ ., data = train[, -c(1, 9)])

# Random forest for `label1`; this is the model used for prediction and
# evaluation afterwards. Columns 1 and 10 are excluded from the predictors
# (presumably an identifier and `label2`; TODO confirm).
model <- randomForest::randomForest(
  formula = label1 ~ .,
  data = train[, -c(1, 10)]
)

# Predict `label1` for the held-out rows, excluding the same columns (1 and 10)
# that were excluded when the model was fitted.
p <- predict(object = model, newdata = test[, -c(1, 10)])

# Compare the predictions with the observed `label1` values of the test set.
confusionMatrix(data = p, reference = test[["label1"]])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1262  699
##          1  511 1442
##                                           
##                Accuracy : 0.6909          
##                  95% CI : (0.6761, 0.7053)
##     No Information Rate : 0.547           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3818          
##  Mcnemar's Test P-Value : 7.621e-08       
##                                           
##             Sensitivity : 0.7118          
##             Specificity : 0.6735          
##          Pos Pred Value : 0.6435          
##          Neg Pred Value : 0.7384          
##              Prevalence : 0.4530          
##          Detection Rate : 0.3224          
##    Detection Prevalence : 0.5010          
##       Balanced Accuracy : 0.6927          
##                                           
##        'Positive' Class : 0               
## 

可以发现,模型的效果并不是很好(准确率约为 0.69,Kappa 约为 0.38)。