library(dplyr)
data(sales, package="DMwR2")
sales
summary(sales)
ID Prod Quant Val
v431 : 10159 p1125 : 3923 Min. : 100 Min. : 1005
v54 : 6017 p3774 : 1824 1st Qu.: 107 1st Qu.: 1345
v426 : 3902 p1437 : 1720 Median : 168 Median : 2675
v1679 : 3016 p1917 : 1702 Mean : 8442 Mean : 14617
v1085 : 3001 p4089 : 1598 3rd Qu.: 738 3rd Qu.: 8680
v1183 : 2642 p2742 : 1519 Max. :473883883 Max. :4642955
(Other):372409 (Other):388860 NA's :13842 NA's :1182
Insp
ok : 14462
unkn :385414
fraud: 1270
nlevels(sales$ID)
[1] 6016
nlevels(sales$Prod)
[1] 4548
filter(sales,is.na(Quant),is.na(Val))
We can look at the proportions of the missing values
table(sales$Insp)/nrow(sales) * 100
ok unkn fraud
3.605171 96.078236 0.316593
library(ggplot2)
ggplot(group_by(sales,ID) %>% summarize(nTrans=n()),aes(x=ID,y=nTrans)) +
geom_bar(stat="identity") +
theme(axis.text.x = element_blank(), axis.ticks.x=element_blank()) +
xlab("Salesmen") + ylab("Nr. of Transactions") +
ggtitle("Nr. of Transactions per Salesman")
ggplot(group_by(sales,Prod) %>% summarize(nTrans=n()),aes(x=Prod,y=nTrans)) +
geom_bar(stat="identity") +
theme(axis.text.x = element_blank(), axis.ticks.x=element_blank()) +
xlab("Product") + ylab("Nr. of Transactions") +
ggtitle("Nr. of Transactions per Product")
ggplot(group_by(sales,ID) %>% summarize(nTrans=n()),aes(x=ID,y=nTrans)) +
geom_bar(stat="identity") +
theme(axis.text.x = element_blank(), axis.ticks.x=element_blank()) +
xlab("Product") + ylab("Nr. of Transactions") +
ggtitle("Nr. of Transactions per Salesman")
ggplot(group_by(sales,Prod) %>% summarize(nTrans=n()),aes(x=Prod,y=nTrans)) +
geom_bar(stat="identity") +
theme(axis.text.x = element_blank(), axis.ticks.x=element_blank()) +
xlab("Salesmen") + ylab("Nr. of Transactions") +
ggtitle("Nr. of Transactions per Product")
sales <- mutate(sales,Uprice=Val/Quant)
sales
summary(sales$Uprice)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 8.46 11.89 20.30 19.11 26460.70 14136
prods <- group_by(sales,Prod)
mpProds <- summarize(prods,medianPrice=median(Uprice,na.rm=TRUE))
bind_cols(mpProds %>% arrange(medianPrice) %>% slice(1:5),
mpProds %>% arrange(desc(medianPrice)) %>% slice(1:5))
New names:
library(ggplot2)
library(forcats)
ggplot(filter(sales,Prod %in% c("p3689","p560")),aes(x=fct_drop(Prod),y=Uprice)) +
geom_boxplot() + scale_y_log10() +
xlab("") + ylab("log10(UnitPrice)")
ids <- group_by(sales,ID)
tvIDs <- summarize(ids,totalVal=sum(Val,na.rm=TRUE))
bind_cols(tvIDs %>% arrange(totalVal) %>% slice(1:5),
tvIDs %>% arrange(desc(totalVal)) %>% slice(1:5))
New names:
arrange(tvIDs,desc(totalVal)) %>% slice(1:100) %>%
summarize(t100=sum(totalVal)) /
(summarize(tvIDs,sum(totalVal))) * 100
arrange(tvIDs,totalVal) %>% slice(1:2000) %>%
summarize(b2000=sum(totalVal)) /
(summarize(tvIDs,sum(totalVal))) * 100
prods <- group_by(sales,Prod)
qtProds <- summarize(prods,totalQty=sum(Quant,na.rm=TRUE))
bind_cols(qtProds %>% arrange(desc(totalQty)) %>% slice(1:5),
qtProds %>% arrange(totalQty) %>% slice(1:5))
New names:
arrange(qtProds,desc(totalQty)) %>% slice(1:100) %>%
summarize(t100=sum(as.numeric(totalQty))) /
(summarize(qtProds,sum(as.numeric(totalQty)))) * 100
arrange(qtProds,totalQty) %>% slice(1:4000) %>%
summarize(b4000=sum(as.numeric(totalQty))) /
(summarize(qtProds,sum(as.numeric(totalQty)))) * 100
nouts <- function(x) length(boxplot.stats(x)$out)
noutsProds <- summarise(prods,nOut=nouts(Uprice))
arrange(noutsProds,desc(nOut))
summarize(noutsProds,totalOuts=sum(nOut))
summarize(noutsProds,totalOuts=sum(nOut))/nrow(sales)*100
prop.naQandV <- function(q,v) 100*sum(is.na(q) & is.na(v))/length(q)
summarise(ids,nProbs=prop.naQandV(Quant,Val)) %>% arrange(desc(nProbs))
summarise(prods,nProbs=prop.naQandV(Quant,Val)) %>% arrange(desc(nProbs))
sales <- filter(sales,!(is.na(Quant) & is.na(Val)))
prop.nas <- function(x) 100*sum(is.na(x))/length(x)
summarise(prods,propNA.Q=prop.nas(Quant)) %>% arrange(desc(propNA.Q))
filter(sales, Prod %in% c("p2442","p2443")) %>%
group_by(Insp) %>% count()
sales <- droplevels(filter(sales,!(Prod %in% c("p2442", "p2443"))))
summarise(ids,propNA.Q=prop.nas(Quant)) %>% arrange(desc(propNA.Q))
summarise(prods,propNA.V=prop.nas(Val)) %>% arrange(desc(propNA.V))
summarise(ids,propNA.V=prop.nas(Val)) %>% arrange(desc(propNA.V))
tPrice <- filter(sales, Insp != "fraud") %>%
group_by(Prod) %>%
summarise(medianPrice = median(Uprice,na.rm=TRUE))
noQuantMedPrices <- filter(sales, is.na(Quant)) %>%
inner_join(tPrice) %>%
select(medianPrice)
Joining with `by = join_by(Prod)`
noValMedPrices <- filter(sales, is.na(Val)) %>%
inner_join(tPrice) %>%
select(medianPrice)
Joining with `by = join_by(Prod)`
noQuant <- which(is.na(sales$Quant))
noVal <- which(is.na(sales$Val))
sales[noQuant,'Quant'] <- ceiling(sales[noQuant,'Val'] /noQuantMedPrices)
sales[noVal,'Val'] <- sales[noVal,'Quant'] * noValMedPrices
sales$Uprice <- sales$Val/sales$Quant
save(sales, file = "salesClean.Rdata")
ms <- filter(sales,Insp != "fraud") %>%
group_by(Prod) %>%
summarize(median=median(Uprice,na.rm=TRUE),
iqr=IQR(Uprice,na.rm=TRUE),
nTrans=n(),
fewTrans=ifelse(nTrans>20,FALSE,TRUE))
ms
Properties of the distribution of unit prices
ggplot(ms,aes(x=median,y=iqr,color=fewTrans)) +
geom_point() +
xlab("Median") + ylab("IQR")
ggplot(ms,aes(x=median,y=iqr,color=fewTrans)) +
geom_point() +
scale_y_log10() + scale_x_log10() +
xlab("log(Median)") + ylab("log(IQR)")
ms <- mutate(ms,smedian=scale(median),siqr=scale(iqr))
smalls <- which(ms$fewTrans)
nsmalls <- as.character(ms$Prod[smalls])
similar <- matrix(NA,length(smalls),7,
dimnames=list(nsmalls,
c("RowSimProd", "ks.stat", "ks.p", "medP", "iqrP", "medS","iqrS")))
xprods <- tapply(sales$Uprice, sales$Prod, list)
for(i in seq_along(smalls)) {
d <- scale(ms[,c("smedian","siqr")],
c(ms$smedian[smalls[i]],ms$siqr[smalls[i]]),
FALSE)
d <- sqrt(drop(d^2 %*% rep(1, ncol(d))))
stat <- ks.test(xprods[[nsmalls[i]]], xprods[[order(d)[2]]])
similar[i, ] <- c(order(d)[2], stat$statistic, stat$p.value,
ms$median[smalls[i]],ms$iqr[smalls[i]],
ms$median[order(d)[2]],ms$iqr[order(d)[2]])
}
Warning: p-value will be approximate in the presence of tiesWarning: p-value will be approximate in the presence of tiesWarning: p-value will be approximate in the presence of ties
head(similar)
RowSimProd ks.stat ks.p medP iqrP medS iqrS
p8 2827 0.4339623 0.04073992 3.850211 0.7282168 3.868306 0.7938557
p18 213 0.2568922 0.21381724 5.187266 8.0359968 5.274884 7.8207052
p38 1044 0.3650794 0.08445708 5.490758 6.4162095 5.651818 6.2436224
p39 3418 0.2214286 0.69337166 7.986486 1.4229755 8.005181 1.5625650
p40 1335 0.3760000 0.02941385 9.674797 1.6104511 9.711538 1.6505602
p47 1387 0.3125000 0.35500258 2.504092 2.5625835 2.413498 2.6402087
bind_rows(filter(ms,Prod==rownames(similar)[1]),
ms[similar[1,1],])
nrow(similar[similar[, "ks.p"] >= 0.9, ])
[1] 72
sum(similar[, "ks.p"] >= 0.9)
[1] 72
save(similar, file = "similarProducts.Rdata")
library(ROCR)
data(ROCR.simple)
pred <- prediction(ROCR.simple$predictions, ROCR.simple$labels)
perf <- performance(pred, "prec", "rec")
plot(perf)
PRcurve <- function(preds, trues, ...) {
require(ROCR, quietly = TRUE)
pd <- prediction(preds, trues)
pf <- performance(pd, "prec", "rec")
pf@y.values <- lapply(pf@y.values, function(x) rev(cummax(rev(x))))
plot(pf, ...)
}
PRcurve(ROCR.simple$predictions, ROCR.simple$labels)
library(ROCR)
data(ROCR.simple)
pred <- prediction(ROCR.simple$predictions, ROCR.simple$labels)
perf <- performance(pred, "prec", "rec")
par( mfrow=c(1,2) )
plot(perf)
PRcurve(ROCR.simple$predictions, ROCR.simple$labels)
par( mfrow=c(1,1) )
pred <- prediction(ROCR.simple$predictions, ROCR.simple$labels)
perf <- performance(pred, "lift", "rpp")
plot(perf, main = "Lift Chart")
CRchart <- function(preds, trues, ...) {
require(ROCR, quietly = T)
pd <- prediction(preds, trues)
pf <- performance(pd, "rec", "rpp")
plot(pf, ...)
}
CRchart(ROCR.simple$predictions, ROCR.simple$labels,
main='Cumulative Recall Chart')
avgNDTP <- function(toInsp,train,stats) {
if (missing(train) && missing(stats))
stop('Provide either the training data or the product stats')
if (missing(stats)) {
stats <- as.matrix(filter(train,Insp != 'fraud') %>%
group_by(Prod) %>%
summarise(median=median(Uprice),iqr=IQR(Uprice)) %>%
select(median,iqr))
rownames(stats) <- levels(train$Prod)
stats[which(stats[,'iqr']==0),'iqr'] <- stats[which(stats[,'iqr']==0),'median']
}
return(mean(abs(toInsp$Uprice-stats[toInsp$Prod,'median']) /
stats[toInsp$Prod,'iqr']))
}
evalOutlierRanking <- function(testSet,rankOrder,Threshold,statsProds,...)
{
ordTS <- testSet[rankOrder,]
N <- nrow(testSet)
nF <- if (Threshold < 1) as.integer(Threshold*N) else Threshold
cm <- table(c(rep('fraud',nF),rep('ok',N-nF)),ordTS$Insp)
prec <- cm['fraud','fraud']/sum(cm['fraud',])
rec <- cm['fraud','fraud']/sum(cm[,'fraud'])
AVGndtp <- avgNDTP(ordTS[1:nF,],stats=statsProds)
return(c(Precision=prec,Recall=rec,avgNDTP=AVGndtp))
}
BPrule.wf <- function(form,train,test,...) {
require(dplyr, quietly=TRUE)
ms <- as.matrix(filter(train,Insp != 'fraud') %>%
group_by(Prod) %>%
summarise(median=median(Uprice),iqr=IQR(Uprice)) %>%
select(median,iqr))
rownames(ms) <- levels(train$Prod)
ms[which(ms[,'iqr']==0),'iqr'] <- ms[which(ms[,'iqr']==0),'median']
ORscore <- abs(test$Uprice-ms[test$Prod,'median']) /
ms[test$Prod,'iqr']
rankOrder <- order(ORscore,decreasing=TRUE)
res <- list(testSet=test,rankOrder=rankOrder,
probs=matrix(c(ORscore,ifelse(test$Insp=='fraud',1,0)),
ncol=2))
res
}
library(dplyr)
globalStats <- as.matrix(filter(sales,Insp != 'fraud') %>%
group_by(Prod) %>%
summarise(median=median(Uprice),iqr=IQR(Uprice)) %>%
select(median,iqr))
rownames(globalStats) <- levels(sales$Prod)
globalStats[which(globalStats[,'iqr']==0),'iqr'] <-
globalStats[which(globalStats[,'iqr']==0),'median']
head(globalStats,3)
median iqr
p1 11.34615 8.563580
p2 10.87786 5.609731
p3 10.00000 4.809092
library(performanceEstimation)
bp.res <- performanceEstimation(
PredTask(Insp ~ ., sales),
Workflow("BPrule.wf"),
EstimationTask(metrics=c("Precision","Recall","avgNDTP"),
method=Holdout(nReps=3, hldSz=0.3, strat=TRUE),
evaluator="evalOutlierRanking",
evaluator.pars=list(Threshold=0.1, statsProds=globalStats))
)
##### PERFORMANCE ESTIMATION USING HOLD OUT #####
** PREDICTIVE TASK :: sales.Insp
++ MODEL/WORKFLOW :: BPrule.wf
Task for estimating Precision,Recall,avgNDTP using
Stratified 3 x 70 % / 30 % Holdout
Run with seed = 1234
Iteration : 1 2 3
summary(bp.res)
== Summary of a Hold Out Performance Estimation Experiment ==
Task for estimating Precision,Recall,avgNDTP using
Stratified 3 x 70 % / 30 % Holdout
Run with seed = 1234
* Predictive Tasks :: sales.Insp
* Workflows :: BPrule.wf
-> Task: sales.Insp
*Workflow: BPrule.wf
Precision Recall avgNDTP
avg 0.0178244211 0.56315789 11.3639416
std 0.0007261201 0.02294157 0.7633666
med 0.0181575879 0.57368421 11.1037531
iqr 0.0006663335 0.02105263 0.7293525
min 0.0169915042 0.53684211 10.7646833
max 0.0183241712 0.57894737 12.2233884
invalid 0.0000000000 0.00000000 0.0000000
ps.bp <- sapply(getIterationsInfo(bp.res),function(i) i$probs[,1])
ts.bp <- sapply(getIterationsInfo(bp.res),function(i) i$probs[,2])
PRcurve(ps.bp,ts.bp,main="PR curve",avg="vertical")
CRchart(ps.bp,ts.bp,main='Cumulative Recall curve',avg='vertical')
par(mfrow=c(1,2))
ps.bp <- sapply(getIterationsInfo(bp.res), function(i) i$probs[,1])
ts.bp <- sapply(getIterationsInfo(bp.res), function(i) i$probs[,2])
PRcurve(ps.bp, ts.bp, main="PR curve", avg="vertical")
CRchart(ps.bp, ts.bp, main='Cumulative Recall curve', avg='vertical')
LOF.wf <- function(form, train, test, k, ...) {
require(DMwR2, quietly=TRUE)
ntr <- nrow(train)
all <- as.data.frame(rbind(train,test))
N <- nrow(all)
ups <- split(all$Uprice,all$Prod)
r <- list(length=ups)
for(u in seq(along=ups))
r[[u]] <- if (NROW(ups[[u]]) > 3)
lofactor(ups[[u]],min(k,NROW(ups[[u]]) %/% 2))
else if (NROW(ups[[u]])) rep(0,NROW(ups[[u]]))
else NULL
all$lof <- vector(length=N)
split(all$lof,all$Prod) <- r
all$lof[which(!(is.infinite(all$lof) | is.nan(all$lof)))] <-
SoftMax(all$lof[which(!(is.infinite(all$lof) | is.nan(all$lof)))])
res <- list(testSet=test,
rankOrder=order(all[(ntr+1):N,'lof'],decreasing=TRUE),
probs=as.matrix(cbind(all[(ntr+1):N,'lof'],
ifelse(test$Insp=='fraud',1,0))))
res
}
lof.res <- performanceEstimation(
PredTask(Insp ~ . , sales),
Workflow("LOF.wf", k=7),
EstimationTask(metrics=c("Precision","Recall","avgNDTP"),
method=Holdout(nReps=3, hldSz=0.3, strat=TRUE),
evaluator="evalOutlierRanking",
evaluator.pars=list(Threshold=0.1, statsProds=globalStats))
)
##### PERFORMANCE ESTIMATION USING HOLD OUT #####
** PREDICTIVE TASK :: sales.Insp
++ MODEL/WORKFLOW :: LOF.wf
Task for estimating Precision,Recall,avgNDTP using
Stratified 3 x 70 % / 30 % Holdout
Run with seed = 1234
Iteration : 1
summary(lof.res)
== Summary of a Hold Out Performance Estimation Experiment ==
Task for estimating Precision,Recall,avgNDTP using
Stratified 3 x 70 % / 30 % Holdout
Run with seed = 1234
* Predictive Tasks :: sales.Insp
* Workflows :: LOF.wf
-> Task: sales.Insp
*Workflow: LOF.wf
Precision Recall avgNDTP
avg 0.0225442834 0.712280702 8.9724516
std 0.0002925108 0.009241802 0.7111686
med 0.0225720473 0.713157895 8.7667068
iqr 0.0002915209 0.009210526 0.6884858
min 0.0222388806 0.702631579 8.3868382
max 0.0228219224 0.721052632 9.7638097
invalid 0.0000000000 0.000000000 0.0000000
ps.lof <- sapply(getIterationsInfo(lof.res), function(i) i$probs[,1])
ts.lof <- sapply(getIterationsInfo(lof.res), function(i) i$probs[,2])
PRcurve(ps.bp,ts.bp,main="PR curve",lty=1,
xlim=c(0,1),ylim=c(0,1),avg="vertical")
#PRcurve(ps.lof,ts.lof,add=TRUE,lty=2,avg='vertical')
legend('topright',c('BPrule','LOF'),lty=c(1,2))
CRchart(ps.bp,ts.bp,main='Cumulative Recall curve',
lty=1,xlim=c(0,1),ylim=c(0,1),avg='vertical')
#CRchart(ps.lof,ts.lof,add=TRUE,lty=2,avg='vertical')
legend('bottomright',c('BPrule','LOF'),lty=c(1,2))
par(mfrow=c(1,2))
ps.lof <- sapply(getIterationsInfo(lof.res), function(i) i$probs[,1])
ts.lof <- sapply(getIterationsInfo(lof.res), function(i) i$probs[,2])
PRcurve(ps.bp, ts.bp,main="PR curve",lty=1,
xlim=c(0,1),ylim=c(0,1),avg="vertical")
#PRcurve(ps.lof, ts.lof,add=T,lty=2,avg='vertical')
legend('topright',c('BPrule','LOF'),lty=c(1,2))
CRchart(ps.bp,ts.bp,main='Cumulative Recall curve',
lty=1,xlim=c(0,1),ylim=c(0,1),avg='vertical')
#CRchart(ps.lof,ts.lof,add=T,lty=2,avg='vertical')
legend('bottomright',c('BPrule','LOF'),lty=c(1,2))
ORh.wf <- function(form, train, test, ...) {
require(DMwR2, quietly=TRUE)
ntr <- nrow(train)
all <- as.data.frame(rbind(train,test))
N <- nrow(all)
ups <- split(all$Uprice,all$Prod)
r <- list(length=ups)
for(u in seq(along=ups))
r[[u]] <- if (NROW(ups[[u]]) > 3)
outliers.ranking(ups[[u]])$prob.outliers
else if (NROW(ups[[u]])) rep(0,NROW(ups[[u]]))
else NULL
all$orh <- vector(length=N)
split(all$orh,all$Prod) <- r
all$orh[which(!(is.infinite(all$orh) | is.nan(all$orh)))] <-
SoftMax(all$orh[which(!(is.infinite(all$orh) | is.nan(all$orh)))])
res <- list(testSet=test,
rankOrder=order(all[(ntr+1):N,'orh'],decreasing=TRUE),
probs=as.matrix(cbind(all[(ntr+1):N,'orh'],
ifelse(test$Insp=='fraud',1,0))))
res
}
orh.res <- performanceEstimation(
PredTask(Insp ~ . , sales),
Workflow("ORh.wf"),
EstimationTask(metrics=c("Precision","Recall","avgNDTP"),
method=Holdout(nReps=3, hldSz=0.3, strat=TRUE),
evaluator="evalOutlierRanking",
evaluator.pars=list(Threshold=0.1, statsProds=globalStats))
)
##### PERFORMANCE ESTIMATION USING HOLD OUT #####
** PREDICTIVE TASK :: sales.Insp
++ MODEL/WORKFLOW :: ORh.wf
Task for estimating Precision,Recall,avgNDTP using
Stratified 3 x 70 % / 30 % Holdout
Run with seed = 1234
Iteration : 1 2 3
summary(orh.res)
== Summary of a Hold Out Performance Estimation Experiment ==
Task for estimating Precision,Recall,avgNDTP using
Stratified 3 x 70 % / 30 % Holdout
Run with seed = 1234
* Predictive Tasks :: sales.Insp
* Workflows :: ORh.wf
-> Task: sales.Insp
*Workflow: ORh.wf
Precision Recall avgNDTP
avg 0.0222388806 0.70263158 9.1207378
std 0.0006006249 0.01897659 0.9773273
med 0.0220722972 0.69736842 8.6315065
iqr 0.0005830418 0.01842105 0.8807146
min 0.0217391304 0.68684211 8.4846389
max 0.0229052141 0.72368421 10.2460681
invalid 0.0000000000 0.00000000 0.0000000
ps.orh <- sapply(getIterationsInfo(orh.res), function(i) i$probs[,1])
ts.orh <- sapply(getIterationsInfo(orh.res), function(i) i$probs[,2])
PRcurve(ps.bp,ts.bp,main="PR curve",lty=1,
xlim=c(0,1),ylim=c(0,1),avg="vertical")
#PRcurve(ps.lof,ts.lof,add=TRUE,lty=2,avg='vertical')
PRcurve(ps.orh,ts.orh,add=TRUE,lty=1,col='grey', avg='vertical')
legend('topright',c('BPrule','LOF','ORh'),lty=c(1,2,1),
col=c('black','black','grey'))
CRchart(ps.bp,ts.bp,main='Cumulative Recall curve',
lty=1,xlim=c(0,1),ylim=c(0,1),avg='vertical')
#CRchart(ps.lof,ts.lof,add=TRUE,lty=2,avg='vertical')
CRchart(ps.orh,ts.orh,add=TRUE,lty=1,col='grey',avg='vertical')
legend('bottomright',c('BPrule','LOF','ORh'),lty=c(1,2,1),
col=c('black','black','grey'))
par(mfrow=c(1,2))
ps.orh <- sapply(getIterationsInfo(orh.res), function(i) i$probs[,1])
ts.orh <- sapply(getIterationsInfo(orh.res), function(i) i$probs[,2])
PRcurve(ps.bp,ts.bp,main="PR curve",lty=1,
xlim=c(0,1),ylim=c(0,1),avg="vertical")
#PRcurve(ps.lof,ts.lof,add=T,lty=2,avg='vertical')
PRcurve(ps.orh,ts.orh,add=T,lty=1,col='grey', avg='vertical')
legend('topright',c('BPrule','LOF','ORh'),lty=c(1,2,1),
col=c('black','black','grey'))
CRchart(ps.bp,ts.bp,main='Cumulative Recall curve',
lty=1,xlim=c(0,1),ylim=c(0,1),avg='vertical')
#CRchart(ps.lof,ts.lof,add=T,lty=2,avg='vertical')
CRchart(ps.orh,ts.orh,add=T,lty=1,col='grey',avg='vertical')
legend('bottomright',c('BPrule','LOF','ORh'),lty=c(1,2,1),
col=c('black','black','grey'))
library(UBL)
data(iris)
data <- iris[, c(1, 2, 5)]
data$Species <- factor(ifelse(data$Species == "setosa", "rare","common"))
table(data$Species)
common rare
100 50
newData <- SmoteClassif(Species ~ ., data, C.perc = "balance")
table(newData$Species)
common rare
75 75
newData2 <- SmoteClassif(Species ~ ., data, C.perc = list(common = 1,rare = 6))
table(newData2$Species)
common rare
100 300
library(ggplot2)
ggplot(data,aes(x=Sepal.Length,y=Sepal.Width,color=Species)) +
geom_point() + ggtitle("Original Data")
ggplot(newData2,aes(x=Sepal.Length,y=Sepal.Width,color=Species)) +
geom_point() + ggtitle("SMOTE'd Data")
NB.wf <- function(form,train,test,...) {
require(e1071,quietly=TRUE)
sup <- which(train$Insp != 'unkn')
data <- as.data.frame(train[sup,c('ID','Prod','Uprice','Insp')])
data$Insp <- factor(data$Insp,levels=c('ok','fraud'))
model <- naiveBayes(Insp ~ .,data, ...)
preds <- predict(model,test[,c('ID','Prod','Uprice','Insp')], type='raw')
rankOrder <- order(preds[,'fraud'], decreasing=TRUE)
rankScore <- preds[,'fraud']
res <- list(testSet=test,
rankOrder=rankOrder,
probs=as.matrix(cbind(rankScore,
ifelse(test$Insp=='fraud',1,0))))
res
}
nb.res <- performanceEstimation(
PredTask(Insp ~ . , sales),
Workflow("NB.wf"),
EstimationTask(metrics=c("Precision","Recall","avgNDTP"),
method=Holdout(nReps=3,hldSz=0.3,strat=TRUE),
evaluator="evalOutlierRanking",
evaluator.pars=list(Threshold=0.1,
statsProds=globalStats))
)
##### PERFORMANCE ESTIMATION USING HOLD OUT #####
** PREDICTIVE TASK :: sales.Insp
++ MODEL/WORKFLOW :: NB.wf
Task for estimating Precision,Recall,avgNDTP using
Stratified 3 x 70 % / 30 % Holdout
Run with seed = 1234
Iteration : 1 2 3
summary(nb.res)
== Summary of a Hold Out Performance Estimation Experiment ==
Task for estimating Precision,Recall,avgNDTP using
Stratified 3 x 70 % / 30 % Holdout
Run with seed = 1234
* Predictive Tasks :: sales.Insp
* Workflows :: NB.wf
-> Task: sales.Insp
*Workflow: NB.wf
Precision Recall avgNDTP
avg 0.0134932534 0.42631579 6.403751
std 0.0009009374 0.02846488 1.070227
med 0.0137431284 0.43421053 6.175571
iqr 0.0008745627 0.02763158 1.051825
min 0.0124937531 0.39473684 5.466015
max 0.0142428786 0.45000000 7.569665
invalid 0.0000000000 0.00000000 0.000000
ps.nb <- sapply(getIterationsInfo(nb.res), function(i) i$probs[,1])
ts.nb <- sapply(getIterationsInfo(nb.res), function(i) i$probs[,2])
PRcurve(ps.nb,ts.nb,main="PR curve",lty=1,
xlim=c(0,1),ylim=c(0,1),avg="vertical")
PRcurve(ps.orh,ts.orh,add=TRUE,lty=2,avg='vertical')
legend('topright',c('NaiveBayes','ORh'),lty=1,col=c('black','grey'))
CRchart(ps.nb,ts.nb,main='Cumulative Recall curve',
lty=1,xlim=c(0,1),ylim=c(0,1),avg='vertical')
CRchart(ps.orh,ts.orh,add=TRUE,lty=2,avg='vertical')
legend('bottomright',c('NaiveBayes','ORh'),lty=1,col=c('black','grey'))
par(mfrow=c(1,2))
ps.nb <- sapply(getIterationsInfo(nb.res), function(i) i$probs[,1])
ts.nb <- sapply(getIterationsInfo(nb.res), function(i) i$probs[,2])
PRcurve(ps.nb,ts.nb,main="PR curve",lty=1,
xlim=c(0,1),ylim=c(0,1),avg="vertical")
PRcurve(ps.orh,ts.orh,add=T,lty=2,avg='vertical')
legend('topright',c('NaiveBayes','ORh'),lty=1,col=c('black','grey'))
CRchart(ps.nb,ts.nb,main='Cumulative Recall curve',
lty=1,xlim=c(0,1),ylim=c(0,1),avg='vertical')
CRchart(ps.orh,ts.orh,add=T,lty=2,avg='vertical')
legend('bottomright',c('NaiveBayes','ORh'),lty=1,col=c('black','grey'))
NBsm.wf <- function(form,train,test,C.perc="balance",dist="HEOM",...) {
require(e1071,quietly=TRUE)
require(UBL,quietly=TRUE)
sup <- which(train$Insp != 'unkn')
data <- as.data.frame(train[sup,c('ID','Prod','Uprice','Insp')])
data$Insp <- factor(data$Insp,levels=c('ok','fraud'))
newData <- SmoteClassif(Insp ~ .,data,C.perc=C.perc,dist=dist,...)
model <- naiveBayes(Insp ~ .,newData)
preds <- predict(model,test[,c('ID','Prod','Uprice','Insp')],type='raw')
rankOrder <- order(preds[,'fraud'],decreasing=T)
rankScore <- preds[,'fraud']
res <- list(testSet=test,
rankOrder=rankOrder,
probs=as.matrix(cbind(rankScore,
ifelse(test$Insp=='fraud',1,0))))
res
}
nbs.res <- performanceEstimation(
PredTask(Insp ~ ., sales),
Workflow("NBsm.wf"),
EstimationTask(metrics=c("Precision","Recall","avgNDTP"),
method=Holdout(nReps=3,hldSz=0.3,strat=TRUE),
evaluator="evalOutlierRanking",
evaluator.pars=list(Threshold=0.1,
statsProds=globalStats))
)
##### PERFORMANCE ESTIMATION USING HOLD OUT #####
** PREDICTIVE TASK :: sales.Insp
++ MODEL/WORKFLOW :: NBsm.wf
Task for estimating Precision,Recall,avgNDTP using
Stratified 3 x 70 % / 30 % Holdout
Run with seed = 1234
Iteration : 1 2 3
summary(nbs.res)
== Summary of a Hold Out Performance Estimation Experiment ==
Task for estimating Precision,Recall,avgNDTP using
Stratified 3 x 70 % / 30 % Holdout
Run with seed = 1234
* Predictive Tasks :: sales.Insp
* Workflows :: NBsm.wf
-> Task: sales.Insp
*Workflow: NBsm.wf
Precision Recall avgNDTP
avg 0.0145205175 0.45877193 6.7723410
std 0.0009866931 0.03117431 0.8098474
med 0.0139930035 0.44210526 6.3507932
iqr 0.0008745627 0.02763158 0.7228941
min 0.0139097118 0.43947368 6.2602207
max 0.0156588372 0.49473684 7.7060090
invalid 0.0000000000 0.00000000 0.0000000
ps.nbs <- sapply(getIterationsInfo(nbs.res), function(i) i$probs[,1])
ts.nbs <- sapply(getIterationsInfo(nbs.res), function(i) i$probs[,2])
PRcurve(ps.nb,ts.nb,main="PR curve",lty=1,
xlim=c(0,1),ylim=c(0,1), avg="vertical")
PRcurve(ps.orh,ts.orh,add=TRUE,lty=2, avg='vertical')
PRcurve(ps.nbs,ts.nbs,add=TRUE,lty=1, col='grey',avg='vertical')
legend('topright',c('NaiveBayes','ORh','smoteNaiveBayes'),lty=c(1,2,1),
col=c('black','black','grey'))
CRchart(ps.nb,ts.nb,main='Cumulative Recall curve',
lty=1,xlim=c(0,1),ylim=c(0,1),avg='vertical')
CRchart(ps.orh,ts.orh,add=TRUE,lty=2,avg='vertical')
CRchart(ps.nbs,ts.nbs,add=TRUE,lty=1,col='grey',avg='vertical')
legend('bottomright',c('NaiveBayes','ORh','smoteNaiveBayes'),lty=c(1,2,1),
col=c('black','black','grey'))
par(mfrow=c(1,2))
ps.nbs <- sapply(getIterationsInfo(nbs.res), function(i) i$probs[,1])
ts.nbs <- sapply(getIterationsInfo(nbs.res), function(i) i$probs[,2])
PRcurve(ps.nb,ts.nb,main="PR curve",lty=1,
xlim=c(0,1),ylim=c(0,1), avg="vertical")
PRcurve(ps.orh,ts.orh,add=T,lty=2,avg='vertical')
PRcurve(ps.nbs,ts.nbs,add=T,lty=1,col='grey',avg='vertical')
legend('topright',c('NaiveBayes','ORh','smoteNaiveBayes'),
lty=c(1,2,1),col=c('black','black','grey'))
CRchart(ps.nb,ts.nb,main='Cumulative Recall curve',
lty=1,xlim=c(0,1),ylim=c(0,1),avg='vertical')
CRchart(ps.orh,ts.orh,add=T,lty=2,avg='vertical')
CRchart(ps.nbs,ts.nbs,add=T,lty=1,col='grey',avg='vertical')
legend('bottomright',c('NaiveBayes','ORh','smoteNaiveBayes'),
lty=c(1,2,1),col=c('black','black','grey'))
library(DMwR2)
library(e1071)
data(iris)
set.seed(1234)
idx <- sample(150, 100)
tr <- iris[idx, ]
ts <- iris[-idx, ]
nb <- naiveBayes(Species ~ ., tr)
table(predict(nb, ts), ts$Species)
setosa versicolor virginica
setosa 18 0 0
versicolor 0 16 1
virginica 0 2 13
trST <- tr
nas <- sample(100, 90)
trST[nas, "Species"] <- NA
func <- function(m, d) {
p <- predict(m, d, type = "raw")
data.frame(cl = colnames(p)[ apply(p, 1, which.max) ],
p = apply(p, 1, max))
}
nbSTbase <- naiveBayes(Species ~ ., trST[-nas, ])
table(predict(nbSTbase, ts), ts$Species)
setosa versicolor virginica
setosa 18 0 0
versicolor 0 16 2
virginica 0 2 12
nbST <- SelfTrain(Species ~ ., trST,
learner="naiveBayes", learner.pars=list(),
pred="func")
table(predict(nbST, ts), ts$Species)
setosa versicolor virginica
setosa 18 0 0
versicolor 0 16 1
virginica 0 2 13
pred.nb <- function(m,d) {
p <- predict(m,d,type='raw')
data.frame(cl=colnames(p)[apply(p,1,which.max)],
p=apply(p,1,max)
)
}
nb.st.wf <- function(form,train,test,...) {
require(e1071,quietly=TRUE)
require(DMwR2, quietly=TRUE)
train <- as.data.frame(train[,c('ID','Prod','Uprice','Insp')])
train[which(train$Insp == 'unkn'),'Insp'] <- NA
train$Insp <- factor(train$Insp,levels=c('ok','fraud'))
model <- SelfTrain(form,train,
learner='naiveBayes', learner.pars=list(),
pred='pred.nb')
preds <- predict(model,test[,c('ID','Prod','Uprice','Insp')],
type='raw')
rankOrder <- order(preds[,'fraud'],decreasing=TRUE)
rankScore <- preds[,"fraud"]
res <- list(testSet=test,
rankOrder=rankOrder,
probs=as.matrix(cbind(rankScore,
ifelse(test$Insp=='fraud',1,0))))
res
}
nb.st.res <- performanceEstimation(
PredTask(Insp ~ .,sales),
Workflow("nb.st.wf"),
EstimationTask(metrics=c("Precision","Recall","avgNDTP"),
method=Holdout(nReps=3,hldSz=0.3,strat=TRUE),
evaluator="evalOutlierRanking",
evaluator.pars=list(Threshold=0.1,
statsProds=globalStats))
)
##### PERFORMANCE ESTIMATION USING HOLD OUT #####
** PREDICTIVE TASK :: sales.Insp
++ MODEL/WORKFLOW :: nb.st.wf
Task for estimating Precision,Recall,avgNDTP using
Stratified 3 x 70 % / 30 % Holdout
Run with seed = 1234
Iteration : 1 2 3
summary(nb.st.res)
== Summary of a Hold Out Performance Estimation Experiment ==
Task for estimating Precision,Recall,avgNDTP using
Stratified 3 x 70 % / 30 % Holdout
Run with seed = 1234
* Predictive Tasks :: sales.Insp
* Workflows :: nb.st.wf
-> Task: sales.Insp
*Workflow: nb.st.wf
Precision Recall avgNDTP
avg 0.0132989061 0.420175439 7.1697293
std 0.0002544603 0.008039606 0.6635963
med 0.0132433783 0.418421053 6.9139197
iqr 0.0002498751 0.007894737 0.6255247
min 0.0130767949 0.413157895 6.6721094
max 0.0135765451 0.428947368 7.9231588
invalid 0.0000000000 0.000000000 0.0000000
ps.nb.st <- sapply(getIterationsInfo(nb.st.res), function(i) i$probs[,1])
ts.nb.st <- sapply(getIterationsInfo(nb.st.res), function(i) i$probs[,2])
PRcurve(ps.nb,ts.nb,main="PR curve",lty=1,
xlim=c(0,1),ylim=c(0,1), avg="vertical")
PRcurve(ps.orh,ts.orh,add=TRUE,lty=1, color='grey', avg='vertical')
PRcurve(ps.nb.st,ts.nb.st,add=TRUE,lty=2,avg='vertical')
legend('topright',c('NaiveBayes','ORh','NaiveBayes-ST'),
lty=c(1,1,2),col=c('black','grey','black'))
CRchart(ps.nb,ts.nb,main='Cumulative Recall curve',
lty=1,xlim=c(0,1),ylim=c(0,1),avg='vertical')
CRchart(ps.orh,ts.orh,add=TRUE,lty=1,color='grey',avg='vertical')
CRchart(ps.nb.st,ts.nb.st,add=TRUE,lty=2,avg='vertical')
legend('bottomright',c('NaiveBayes','ORh','NaiveBayes-ST'),
lty=c(1,1,2),col=c('black','grey','grey'))
par(mfrow=c(1,2))
ps.nb.st <- sapply(getIterationsInfo(nb.st.res), function(i) i$probs[,1])
ts.nb.st <- sapply(getIterationsInfo(nb.st.res), function(i) i$probs[,2])
PRcurve(ps.nb,ts.nb,main="PR curve",lty=1,
xlim=c(0,1),ylim=c(0,1), avg="vertical")
PRcurve(ps.orh,ts.orh,add=T,lty=1, color='grey', avg='vertical')
PRcurve(ps.nb.st,ts.nb.st,add=T,lty=2,avg='vertical')
legend('topright',c('NaiveBayes','ORh','NaiveBayes-ST'),
lty=c(1,1,2),col=c('black','grey','black'))
CRchart(ps.nb,ts.nb,main='Cumulative Recall curve',
lty=1,xlim=c(0,1),ylim=c(0,1),avg='vertical')
CRchart(ps.orh,ts.orh,add=T,lty=1,color='grey',avg='vertical')
CRchart(ps.nb.st,ts.nb.st,add=T,lty=2,avg='vertical')
legend('bottomright',c('NaiveBayes','ORh','NaiveBayes-ST'),
lty=c(1,1,2),col=c('black','grey','grey'))