# !diagnostics offdescription <- structure(list(sku = structure(c(9L, 6L, 5L, 2L, 3L, 4L, 17L,
18L, 19L, 20L, 8L, 14L, 13L, 12L, 11L, 7L, 1L, 10L, 15L, 21L,
16L, 22L), .Label = c("deck_risk ", "forecast_3_month ", "forecast_6_month ",
"forecast_9_month ", "in_transit_qty ", "lead_time ", "local_bo_qty ",
"min_bank ", "national_inv ", "oe_constraint ", "perf_12_month_avg ",
"perf_6_month_avg ", "pieces_past_due ", "potential_issue ",
"ppap_risk ", "rev_stop ", "sales_1_month ", "sales_3_month ",
"sales_6_month ", "sales_9_month ", "stop_auto_buy ", "went_on_backorder "
), class = "factor"), Random.ID.for.the.product = structure(c(3L,
18L, 1L, 4L, 5L, 6L, 11L, 12L, 13L, 14L, 7L, 15L, 9L, 17L, 16L,
2L, 8L, 8L, 8L, 8L, 8L, 10L), .Label = c(" Amount of product in transit from source",
" Amount of stock orders overdue", " Current inventory level for the part",
" Forecast sales for the next 3 months", " Forecast sales for the next 6 months",
" Forecast sales for the next 9 months", " Minimum recommend amount to stock",
" Part risk flag", " Parts overdue from source", " Product actually went on backorder. This is the target value.",
" Sales quantity for the prior 1 month time period", " Sales quantity for the prior 3 month time period",
" Sales quantity for the prior 6 month time period", " Sales quantity for the prior 9 month time period",
" Source issue for part identified", " Source performance for prior 12 month period",
" Source performance for prior 6 month period", " Transit time for product (if available)"
), class = "factor")), .Names = c("sku", "Random.ID.for.the.product"
), class = "data.frame", row.names = c(NA, -22L))#sapply(all, function(x) sum(is.na(x)))
all[is.na(all)] = 0
all$lead_time %<>% as.numeric()
all <- as.data.frame(unclass(all))all2 <- bind_rows(all %>% filter(went_on_backorder == "Yes") %>% sample_n(1000),
all %>% filter(went_on_backorder == "No") %>% sample_n(1000)) %>% tbl_df()
train <- all2 %>% filter(datatype == "train") %>% select(-datatype)
train <- bind_rows(train %>% filter(went_on_backorder == "Yes") %>% sample_n(100),
train %>% filter(went_on_backorder == "No") %>% sample_n(100)
) %>% select(-sku)
test <- all2 %>% filter(datatype == "test") %>% select(-datatype)
test <- bind_rows(test %>% filter(went_on_backorder == "Yes") %>% sample_n(50),
test %>% filter(went_on_backorder == "No") %>% sample_n(50)
) %>% select(-sku)#all_h2o <- cbind(weights, all[, c(which(colvars$variance > 50))])
all_h2o <- as.h2o(all2, destination_frame="prostate.hex")
|
| | 0%
|
|=================================================================| 100%
h2o.describe(all_h2o) %>% # excluding the weights column
gather(x, y, Zeros:Sigma) %>%
mutate(group = ifelse(x %in% c("Min", "Max", "Mean"), "min, mean, max",
ifelse(x %in% c("NegInf", "PosInf"), "Inf", "sigma, zeros"))) %>%
# separating them into facets makes them easier to see %>%
mutate(Label = factor(Label, levels = colnames(all_h2o[, -1]))) %>%
ggplot2::ggplot(aes(x = Label %>% as.character() %>% to_title(), y = as.numeric(y), color = x))+
geom_point(size = 3, alpha = 0.6) +
scale_color_brewer(palette = "Set1") +
my_theme() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
facet_grid(group ~ ., scales = "free") +
labs(x = "Feature",
y = "Value",
color = "Metrics: ",
title = "Exoplanet: Descriptive Measures") +
theme(legend.title = element_text(face = "bold")) +
theme(plot.title = element_text(face="bold", hjust = 0.5)) +
theme(legend.background = element_rect(size = 0.1),
legend.position = "right",
legend.direction = "vertical",
axis.text.x = element_text(size=8))XGBT <- train(went_on_backorder ~ .,
data=train,
method="xgbTree",
preProcess = NULL,
tuneLength=5,
trControl = trainControl(method = "repeatedcv",
number=5,
repeats=5,
verboseIter = FALSE,
summaryFunction=multiClassSummary)
)
#Support vector machine with Linear Kernel model
SVM <- train(went_on_backorder ~ .,
data=train,
method="svmLinear",
preProcess = NULL,
tuneLength=5,
trControl = trainControl(method = "repeatedcv",
number=5,
repeats=5,
verboseIter = FALSE,
summaryFunction=multiClassSummary)
)confusionMatrix(XGBT,positive ="Yes")Cross-Validated (5 fold, repeated 5 times) Confusion Matrix
(entries are percentual average cell counts across resamples)
Reference
Prediction No Yes
No 41.2 8.8
Yes 8.8 41.2
Accuracy (average) : 0.824
confusionMatrix(SVM,positive ="Yes")Cross-Validated (5 fold, repeated 5 times) Confusion Matrix
(entries are percentual average cell counts across resamples)
Reference
Prediction No Yes
No 36.9 9.0
Yes 13.1 41.0
Accuracy (average) : 0.779
resXGB <- XGBT$resample %>%
separate(Resample,into = c("Fold", "Iter"), sep = ".Rep")
resXGB = dplyr::rename(resXGB,
ACC=Accuracy,
KAP=Kappa,
SEN=Sensitivity,
SPEC=Specificity,
PPV=Pos_Pred_Value,
NPV=Neg_Pred_Value,
BAC=Balanced_Accuracy) %>%
.[,-7]
resXGB$Iteration=c(1:nrow(resXGB))
resXGB$model="XGB"
resSVM<-SVM$resample %>%
separate(Resample,into = c("Fold", "Iter"), sep = ".Rep")
resSVM = dplyr::rename(resSVM,
ACC=Accuracy,
KAP=Kappa,
SEN=Sensitivity,
SPEC=Specificity,
PPV=Pos_Pred_Value,
NPV=Neg_Pred_Value,
BAC=Balanced_Accuracy) %>%
.[,-7]
resSVM$Iteration=c(1:nrow(resSVM))
resSVM$model="SVM"
resXGB=gather(resXGB,ACC:BAC,key="Metric",value="Value")
resSVM=gather(resSVM,ACC:BAC,key="Metric",value="Value")
resdf=rbind(resSVM,resXGB)grid.arrange(
resdf %>%
ggplot(aes(x=Metric,y=Value,fill=model,color=model)) +
stat_summary(fun.ymin=min,fun.ymax=max,fun.y="median",shape=21,size=1) +
facet_grid(~model) +
coord_flip() +
scale_fill_brewer(palette = "Set1") +
labs(title = "SVM & XGB Comparison: Line Chart") +
theme(plot.title = element_text(face="bold", hjust = 0.5),
legend.background = element_rect(size = 0.1),
legend.position = "right",
legend.direction = "vertical",
legend.title = element_text(face = "bold")),
resdf %>%
ggplot(aes(x=Metric,y=Value))+
geom_boxplot(aes(fill=model),alpha=0.6)+
coord_flip()+facet_wrap(~Metric,scales="free",ncol=2) +
theme(axis.text.y=element_blank()) +
scale_fill_brewer(palette = "Set1")+
scale_color_brewer(palette = "Set1") +
labs(title = "SVM & XGB Comparison: Box Chart") +
theme(plot.title = element_text(face="bold", hjust = 0.5),
legend.background = element_rect(size = 0.1),
legend.position = "right",
legend.direction = "vertical",
legend.title = element_text(face = "bold"))
)grid.arrange(
resdf %>%
ggplot(aes(x=Iteration,y=Value))+
geom_path(aes(color=model),size=1,alpha=0.7)+
facet_wrap(~Metric,scales="free",ncol=2)+
scale_color_brewer(palette = "Set1") +
labs(title = "SVM & XGB Comparison") +
theme(plot.title = element_text(face="bold", hjust = 0.5),
legend.background = element_rect(size = 0.1),
legend.position = "right",
legend.direction = "vertical",
legend.title = element_text(face = "bold")),
resdf %>%
ggplot(aes(x=Iter,y=Value,group=model))+
geom_line(aes(color=model),size=1,show.legend=F)+
facet_grid(Fold~Metric,scales="free") +
scale_color_brewer(palette = "Set1") +
labs(title = "Metrics by Fold") +
theme(plot.title = element_text(face="bold", hjust = 0.5),
legend.background = element_rect(size = 0.1),
legend.position = "right",
legend.direction = "vertical",
legend.title = element_text(face = "bold"))
)Extreme Gradient Boosting: Train v. Test Confusion Matrix
predxgb <- predict(XGBT,test,type="prob") %>% cbind(test,.)
predxgb$Predicted=predict(XGBT,test)
predxgb$Model="XGBT"
predsvm <- test %>% mutate(.,Predicted=predict(SVM,.))
predsvm$Model="SVM"
confusionMatrix(predxgb$Predicted,
reference=predxgb$went_on_backorder,
positive ="Yes",
mode="everything")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 42 6
Yes 8 44
Accuracy : 0.86
95% CI : (0.7763, 0.9213)
No Information Rate : 0.5
P-Value [Acc > NIR] : 0.00000000000004142
Kappa : 0.72
Mcnemar's Test P-Value : 0.7893
Sensitivity : 0.8800
Specificity : 0.8400
Pos Pred Value : 0.8462
Neg Pred Value : 0.8750
Precision : 0.8462
Recall : 0.8800
F1 : 0.8627
Prevalence : 0.5000
Detection Rate : 0.4400
Detection Prevalence : 0.5200
Balanced Accuracy : 0.8600
'Positive' Class : Yes
XGBoost has an accuracy of 86%.
Support Vector Machines: Train v. Test Confusion Matrix
confusionMatrix(predsvm$Predicted,
reference=predsvm$went_on_backorder,
positive ="Yes",
mode="everything")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 45 5
Yes 5 45
Accuracy : 0.9
95% CI : (0.8238, 0.951)
No Information Rate : 0.5
P-Value [Acc > NIR] : <0.0000000000000002
Kappa : 0.8
Mcnemar's Test P-Value : 1
Sensitivity : 0.90
Specificity : 0.90
Pos Pred Value : 0.90
Neg Pred Value : 0.90
Precision : 0.90
Recall : 0.90
F1 : 0.90
Prevalence : 0.50
Detection Rate : 0.45
Detection Prevalence : 0.50
Balanced Accuracy : 0.90
'Positive' Class : Yes
Support Vector Machines has an accuracy of 90%.
features <- train %>% select(-went_on_backorder) %>%
apply(.,2,function(x) as.numeric(as.character(x)))%>%
as.matrix()%>%
colnames()
xgboost::xgb.importance(features,XGBT$finalModel) %>%
xgboost::xgb.ggplot.importance()datalog <- all2 %>%
select_if(is.numeric) %>% select(-sku) %>% log2(.) %>%
mutate(.,Outcome=all2$went_on_backorder,
PredXGBT=predict(XGBT,all2 %>% select(-sku)),
PredSVM=predict(SVM,all2 %>% select(-sku))
)
#colnames(datalog) = colnames(datalog) %>% to_title()
plotfuncLow <- function(data,mapping){
p <- ggplot(data = data,mapping=mapping)+
geom_point(aes(fill=datalog$PredXGBT),shape=21,color="black")+
stat_density2d(geom="polygon",aes(fill=datalog$PredXGBT,alpha = ..level..))+
scale_fill_manual(values=c("blue","red"))
p
}
plotfuncUp <- function(data,mapping){
p <- ggplot(data = data,mapping=mapping)+
geom_point(aes(fill=datalog$PredSVM),shape=21,color="black")+
stat_density2d(geom="polygon",aes(fill=datalog$PredSVM,alpha = ..level..))+
scale_fill_manual(values=c("violet","gold"))
p
}
library(GGally)
ggpairs(datalog,
columns = 1:9,
lower = list(continuous=plotfuncLow),
diag=NULL,
upper = list(continuous=plotfuncUp))all2 <- bind_rows(all %>% filter(went_on_backorder == "Yes") %>% sample_n(1000),
all %>% filter(went_on_backorder == "No") %>% sample_n(1000)) %>% tbl_df()
train <- all2 %>% filter(datatype == "train") %>% select(-datatype)
train <- bind_rows(train %>% filter(went_on_backorder == "Yes") %>% sample_n(300),
train %>% filter(went_on_backorder == "No") %>% sample_n(300)
) %>% select(-sku)
test <- all2 %>% filter(datatype == "test") %>% select(-datatype)
test <- bind_rows(test %>% filter(went_on_backorder == "Yes") %>% sample_n(100),
test %>% filter(went_on_backorder == "No") %>% sample_n(100)
) %>% select(-sku)Control=trainControl(method= "repeatedcv",
number=10,
repeats=10,
classProbs=TRUE,
sampling = "smote",
summaryFunction = twoClassSummary)
cart1=caret::train(went_on_backorder~.,
data=train,
method="rpart",
trControl=Control,
tuneLength=10)
cart1CART
600 samples
21 predictor
2 classes: 'No', 'Yes'
No pre-processing
Resampling: Cross-Validated (10 fold, repeated 10 times)
Summary of sample sizes: 540, 540, 540, 540, 540, 540, ...
Addtional sampling using SMOTE
Resampling results across tuning parameters:
cp ROC Sens Spec
0.00000000 0.8582333 0.77933333 0.8256667
0.06222222 0.8315389 0.81233333 0.8326667
0.12444444 0.7834500 0.76466667 0.8010000
0.18666667 0.7766444 0.75733333 0.7943333
0.24888889 0.7192944 0.61633333 0.8210000
0.31111111 0.7116667 0.59700000 0.8263333
0.37333333 0.7100000 0.58866667 0.8313333
0.43555556 0.6961667 0.54266667 0.8496667
0.49777778 0.5561667 0.15466667 0.9576667
0.56000000 0.5048333 0.01233333 0.9973333
ROC was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.
library(partykit)
library(party)
fancyRpartPlot(cart1$finalModel,palettes="RdPu", sub="", main="Method:CP Tuning Based Decision Tree")confusionMatrix(cart1,positive ="Yes")Cross-Validated (10 fold, repeated 10 times) Confusion Matrix
(entries are percentual average cell counts across resamples)
Reference
Prediction No Yes
No 39.0 8.7
Yes 11.0 41.3
Accuracy (average) : 0.8025
pred1 <- predict(cart1,
test ,
type="prob") %>% cbind(test,.)
pred1$Predicted=predict(cart1,test)
confusionMatrix(pred1$Predicted,
reference=pred1$went_on_backorder,
positive ="Yes",
mode="everything")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 83 21
Yes 17 79
Accuracy : 0.81
95% CI : (0.7487, 0.8619)
No Information Rate : 0.5
P-Value [Acc > NIR] : <0.0000000000000002
Kappa : 0.62
Mcnemar's Test P-Value : 0.6265
Sensitivity : 0.7900
Specificity : 0.8300
Pos Pred Value : 0.8229
Neg Pred Value : 0.7981
Precision : 0.8229
Recall : 0.7900
F1 : 0.8061
Prevalence : 0.5000
Detection Rate : 0.3950
Detection Prevalence : 0.4800
Balanced Accuracy : 0.8100
'Positive' Class : Yes
rpart_results_comparison <- data.frame(c("CP Based", "Max Tree Based", "Refined"))
rpart_results_comparison$Accuracy = 0
colnames(rpart_results_comparison)[1] <- "rPart_Method"
rpart_results_comparison$Accuracy[1] <- confusionMatrix(pred1$Predicted,
reference=pred1$went_on_backorder,
positive ="Yes",
mode="everything")$overall[[1]] %>% as.numeric()CP Tuning Based Tree has an accuracy of 0.81.
cart2=caret::train(went_on_backorder~.,
data=train,
method = "rpart2",
trControl=Control,
tuneLength=10)note: only 6 possible values of the max tree depth from the initial fit.
Truncating the grid to 6 .
cart2CART
600 samples
21 predictor
2 classes: 'No', 'Yes'
No pre-processing
Resampling: Cross-Validated (10 fold, repeated 10 times)
Summary of sample sizes: 540, 540, 540, 540, 540, 540, ...
Addtional sampling using SMOTE
Resampling results across tuning parameters:
maxdepth ROC Sens Spec
1 0.7038333 0.5866667 0.8210000
2 0.8185111 0.8100000 0.8206667
4 0.8501833 0.8120000 0.8476667
10 0.8590111 0.7953333 0.8590000
12 0.8596611 0.7950000 0.8616667
15 0.8596611 0.7950000 0.8616667
ROC was used to select the optimal model using the largest value.
The final value used for the model was maxdepth = 12.
fancyRpartPlot(cart2$finalModel,palettes="RdPu", main="Method: Max Tree Depth Based Decision Tree", sub="")confusionMatrix(cart2,positive ="Yes")Cross-Validated (10 fold, repeated 10 times) Confusion Matrix
(entries are percentual average cell counts across resamples)
Reference
Prediction No Yes
No 39.8 6.9
Yes 10.2 43.1
Accuracy (average) : 0.8283
pred2 <- predict(cart2,
test,
type="prob") %>% cbind(test,.)
pred2$Predicted=predict(cart2,test)
confusionMatrix(pred2$Predicted,
reference=pred2$went_on_backorder,
positive ="Yes",
mode="everything")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 79 14
Yes 21 86
Accuracy : 0.825
95% CI : (0.7651, 0.875)
No Information Rate : 0.5
P-Value [Acc > NIR] : <0.0000000000000002
Kappa : 0.65
Mcnemar's Test P-Value : 0.3105
Sensitivity : 0.8600
Specificity : 0.7900
Pos Pred Value : 0.8037
Neg Pred Value : 0.8495
Precision : 0.8037
Recall : 0.8600
F1 : 0.8309
Prevalence : 0.5000
Detection Rate : 0.4300
Detection Prevalence : 0.5350
Balanced Accuracy : 0.8250
'Positive' Class : Yes
rpart_results_comparison$Accuracy[2] <- confusionMatrix(pred2$Predicted,
reference=pred2$went_on_backorder,
positive ="Yes",
mode="everything")$overall[[1]] %>% as.numeric()Max Tree Based Tuning has an accuracy of 0.825.
task4f=makeClassifTask(id="backorder",data=train,target="went_on_backorder",positive = "Yes")
tasktest=makeClassifTask(id="backorder",data=test,target="went_on_backorder",positive = "Yes")
learner = makeLearner("classif.rpart", predict.type = "prob")learner$par.set Type len Def Constr Req Tunable Trafo
minsplit integer - 20 1 to Inf - TRUE -
minbucket integer - - 1 to Inf - TRUE -
cp numeric - 0.01 0 to 1 - TRUE -
maxcompete integer - 4 0 to Inf - TRUE -
maxsurrogate integer - 5 0 to Inf - TRUE -
usesurrogate discrete - 2 0,1,2 - TRUE -
surrogatestyle discrete - 0 0,1 - TRUE -
maxdepth integer - 30 1 to 30 - TRUE -
xval integer - 10 0 to Inf - FALSE -
parms untyped - - - - TRUE -
Optimal combination of CP and MaxtreeDepth will be choosen for our algorithm.
ps=makeParamSet(makeDiscreteParam("maxdepth",values = c(1,2,3,4,5,6,7)),makeNumericParam("cp",lower=0.01,upper=0.1))
ctrlgrid = makeTuneControlGrid()
rdesc = makeResampleDesc("RepCV",reps = 10,folds=10)
res=tuneParams(learner,
task=task4f,
resampling=rdesc,
par.set=ps,
control=ctrlgrid,
measures = list(mmce,bac))res$x$maxdepth
[1] 6
$cp
[1] 0.07
The optimal tuning is a max depth of 6 and a cp of 0.07.
resdf=generateHyperParsEffectData(res)
resdata=resdf$data%>%tbl_df()
resdata %>%
ggplot(aes(x=cp,y=maxdepth))+
geom_point(aes(size=mmce.test.mean,fill=mmce.test.mean),alpha=0.6,shape=21)+
geom_vline(xintercept=res$x$cp,color="red",size=0.7)+
geom_hline(yintercept=res$x$maxdepth,color="red",size=0.7)+
scale_fill_gradient(high="purple",low="#ff0033") +
theme(plot.title = element_text(face="bold", hjust = 0.5),
legend.background = element_rect(size = 0.1),
legend.position = "right",
legend.direction = "vertical",
legend.title = element_text(face = "bold")) +
labs(title = "Mean Misclassification Error (MMCE) Grid Tuning")resdata %>%
ggplot(aes(x=cp,y=maxdepth))+
geom_point(aes(size=bac.test.mean,fill=bac.test.mean),alpha=0.6,shape=21)+
geom_vline(xintercept=res$x$cp,color="blue",size=0.7)+
geom_hline(yintercept=res$x$maxdepth,color="blue",size=0.7)+
scale_fill_gradient(low="purple",high="#ff0033") +
theme(plot.title = element_text(face="bold", hjust = 0.5),
legend.background = element_rect(size = 0.1),
legend.position = "right",
legend.direction = "vertical",
legend.title = element_text(face = "bold")) +
labs(title = "Balance Accuracy (BAC) Grid Tuning")learner2=setHyperPars(learner,par.vals = res$x)
cartmlr=mlr::train(learner2,task4f)
predmlr=predict(cartmlr,tasktest)
mets=list(auc,bac,tpr,tnr,mmce,ber,fpr,fnr)
performance(predmlr, measures =mets) auc bac tpr tnr mmce ber fpr fnr
0.87095 0.87500 0.84000 0.91000 0.12500 0.12500 0.09000 0.16000
truth=predmlr$data$truth
confusionMatrix(predmlr$data$response,
reference=predmlr$data$truth,
positive ="Yes",
mode="everything")Confusion Matrix and Statistics
Reference
Prediction No Yes
No 91 16
Yes 9 84
Accuracy : 0.875
95% CI : (0.821, 0.9174)
No Information Rate : 0.5
P-Value [Acc > NIR] : <0.0000000000000002
Kappa : 0.75
Mcnemar's Test P-Value : 0.2301
Sensitivity : 0.8400
Specificity : 0.9100
Pos Pred Value : 0.9032
Neg Pred Value : 0.8505
Precision : 0.9032
Recall : 0.8400
F1 : 0.8705
Prevalence : 0.5000
Detection Rate : 0.4200
Detection Prevalence : 0.4650
Balanced Accuracy : 0.8750
'Positive' Class : Yes
rpart_results_comparison$Accuracy[3] <- confusionMatrix(predmlr$data$response,
reference=predmlr$data$truth,
positive ="Yes",
mode="everything")$overall[[1]]fancyRpartPlot(cartmlr$learner.model,palettes="RdPu", sub="", main = "Refined Grid Tuning Decision Tree")
rpart_results_comparison %<>% arrange(desc(Accuracy))
time_calib = (proc.time() - start_time)Our Refined Tuning has the best result with 5% higher accuracy than next method.
colnames(rpart_results_comparison)[1] <- "rPart Method"
rpart_results_comparison rPart Method Accuracy
1 Refined 0.875
2 Max Tree Based 0.825
3 CP Based 0.810
This rMarkdown script took 14 minutes to run.