load("./R/data_models_20220110.RData")
library(dplyr)
library(tidyr)
library(tidytext)
library(rpart)
library(randomForest)
library(e1071)
library(caret)
library(rminer)
library(gbm)
library(FSelectorRcpp)
library(ggplot2)
library(ggpubr)
- CT: Classification Tree
- RF: Random Forest
- LR: Logistic Regression
- NB: Naive Bayes
- SVM: Support Vector Machine
- GBM: Gradient Boosting Machines
eval_ALL(subset, data, dependent, kk, rr)

- subset: character vector with the names of the explanatory variables
- data: a data.frame
- dependent: target variable (a factor with levels Yes and No), given as a string
- kk: number of folds (k)
- rr: number of repetitions
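A minimal call looks like the sketch below (df, y, x1 and x2 are placeholder names, just for illustration; the actual invocation on datR2.1 follows the function definition):

res <- eval_ALL(subset = c("x1", "x2"), data = df, dependent = "y", kk = 10, rr = 5)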
eval_ALL <- function(subset=NULL, data=NULL, dependent = NULL, kk=10, rr=5){
# argument checks
if(is.null(subset) | is.null(data) | is.null(dependent)){stop("missing argument(s)")}
if(!"data.frame" %in% class(data)){stop("data is not a data.frame")}
if(!all(subset %in% names(data))){stop("some variables in subset are not in data")}
if(!dependent %in% names(data)){stop("invalid dependent variable")}
if(!"Yes" %in% levels(data[[dependent]])){stop("dependent is not a factor with levels Yes/No")}
# AUC via the Mann-Whitney U statistic: AUC = U / (n_pos * n_neg)
my.AUC <- function(labels, scores){
labels <- as.logical(labels)
pos <- scores[labels]   # scores of the positive cases
neg <- scores[!labels]  # scores of the negative cases
if(length(pos) == 0 | length(neg) == 0){
auc <- NA
} else {
U <- as.numeric(wilcox.test(pos, neg)$statistic)
auc <- U/(length(pos) * length(neg))
}
return(auc)
}
fitControl <- trainControl(method = "cv",  # alternatively "repeatedcv"
number = 10,
preProcOptions = list(thresh = 0.95),
classProbs = TRUE,
summaryFunction = twoClassSummary)
grid <- expand.grid(sigma = c(0.01, 0.015, 0.2),
C = c(1, 10, 20, 50))
init <- Sys.time()
# tune SVM and GBM hyperparameters once, on the full dataset
print("tuning SVM")
svm.tune.d <- train(as.formula(paste(dependent, "~ .")), data = data,
method = "svmRadial",  # radial kernel
tuneGrid = grid,
trControl = fitControl)
print("tuning GBM")
gbm.tune.1e <- train(as.formula(paste(dependent, "~ .")), data = data,
method = "gbm", trControl = fitControl, verbose = FALSE,
tuneLength = 5)
durt <- as.numeric(Sys.time() - init)  # tuning time
lstHypPr <- list(SVM = svm.tune.d$bestTune, GBM = gbm.tune.1e$bestTune)  # best hyperparameters found
k <- kk
trr <- 0  # 0 until the first block of results has been stored
for(R in 1:rr){
cat("\n", R, ": ")
# randomly assign each observation to one of the k folds via uniform draws
splits <- runif(nrow(data))
for(h in 1:k){
cat(h)
test.idx <- (splits >= (h - 1) / k) & (splits < h / k)
train.idx <- !test.idx
test <- data[test.idx, , drop = FALSE]
train <- data[train.idx, , drop = FALSE]
# ----- CT: classification tree
tree <- rpart(to_formula(subset, dependent), train)
cmt <- confusionMatrix(predict(tree, test, type = "c"), test[[dependent]], positive = "Yes", mode = "everything")
TN <- cmt$table[1,1]; TP <- cmt$table[2,2]; FP <- cmt$table[2,1]; FN <- cmt$table[1,2]
# Matthews correlation coefficient from the confusion matrix counts
MCC <- ((TP*TN)-(FP*FN))/sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
pred <- predict(tree, test, type = "p")[,2]  # probability of class Yes on the test fold
dpred <- data.frame(id=1:nrow(data), pred=predict(tree, data, type = "p")[,2]) %>% mutate(method="CT", giro=R, K=h)
vrimp <- tree$variable.importance
vrimp <- data.frame(variable=names(vrimp), importance=vrimp) %>% mutate(method="CT", giro=R, K=h)
auc <- my.AUC(as.numeric(test[,which(names(test)==dependent)])-1, pred)
risuT2 <- list(cmt$overall["Accuracy"], cmt$overall["Kappa"],
cmt$byClass["F1"], cmt$byClass["Precision"], cmt$byClass["Recall"],
cmt$byClass["Specificity"], cmt$byClass["Balanced Accuracy"], MCC, auc, RR=R, KK=h)
rrr2 <- as.data.frame(matrix(unlist(risuT2), ncol = 11, byrow = TRUE))
rrr2$vvars <- paste(subset, collapse = ",")
rrr2$nvars <- length(subset)
rrr2$method <- "CT"
if(trr==0){
resultsT2 <- rrr2; resImp <- vrimp; dfpred <- dpred; trr <- 1
} else {
resultsT2 <- rbind(resultsT2, rrr2); resImp <- rbind(resImp, vrimp); dfpred <- rbind(dfpred, dpred)
}
# ----- RF: random forest
tree <- randomForest(to_formula(subset, dependent), train)
cmt <- confusionMatrix(predict(tree, test, type = "c"), test[[dependent]], positive = "Yes", mode = "everything")
TN <- cmt$table[1,1]; TP <- cmt$table[2,2]; FP <- cmt$table[2,1]; FN <- cmt$table[1,2]
MCC <- ((TP*TN)-(FP*FN))/sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
pred <- predict(tree, test, type = "p")[,2]
dpred <- data.frame(id=1:nrow(data), pred=predict(tree, data, type = "p")[,2]) %>% mutate(method="RF", giro=R, K=h)
vrimp <- varImp(tree)  # mean decrease in Gini (caret::varImp on a randomForest)
vrimp <- data.frame(variable=rownames(vrimp), importance=vrimp$Overall) %>% mutate(method="RF", giro=R, K=h)
rownames(vrimp) <- NULL
auc <- my.AUC(as.numeric(test[,which(names(test)==dependent)])-1,pred)
risuT2 <- list(cmt$overall["Accuracy"],cmt$overall["Kappa"],
cmt$byClass["F1"],cmt$byClass["Precision"],cmt$byClass["Recall"],
cmt$byClass["Specificity"],cmt$byClass["Balanced Accuracy"],MCC,auc,RR=R,KK=h)
rrr2 <- as.data.frame(matrix(unlist(risuT2),ncol = 11,byrow = T))
rrr2$vvars <- paste(subset,collapse = ",")
rrr2$nvars <- length(subset)
rrr2$method <- "RF"
resultsT2=rbind(resultsT2,rrr2)
resImp=rbind(resImp,vrimp)
dfpred=rbind(dfpred,dpred)
# ----- SVM: support vector machine with the tuned cost and gamma
tree <- svm(to_formula(subset, dependent), train, cost = svm.tune.d$bestTune$C, gamma = svm.tune.d$bestTune$sigma, probability = TRUE)
cmt <- confusionMatrix(predict(tree, test), test[[dependent]], positive = "Yes", mode = "everything")
TN <- cmt$table[1,1]; TP <- cmt$table[2,2]; FP <- cmt$table[2,1]; FN <- cmt$table[1,2]
MCC <- ((TP*TN)-(FP*FN))/sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
pred <- as.data.frame(attr(predict(tree, test, probability = TRUE), "probabilities"))$Yes
dpred <- data.frame(id=1:nrow(data), pred=as.data.frame(attr(predict(tree, data, probability = TRUE), "probabilities"))$Yes) %>% mutate(method="SVM", giro=R, K=h)
# variable importance via rminer sensitivity analysis; assumes the dependent is the first column of train
M <- rminer::fit(to_formula(subset, dependent), data = train, model="svm", kpar=list(sigma = svm.tune.d$bestTune$sigma), C=svm.tune.d$bestTune$C)
tmpI <- Importance(M, data=train)
vrimp <- data.frame(variable=colnames(train)[-1], importance=tmpI$imp[-1]) %>% mutate(method="SVM", giro=R, K=h)
auc <- my.AUC(as.numeric(test[,which(names(test)==dependent)])-1,pred)
risuT2 <- list(cmt$overall["Accuracy"],cmt$overall["Kappa"],
cmt$byClass["F1"],cmt$byClass["Precision"],cmt$byClass["Recall"],
cmt$byClass["Specificity"],cmt$byClass["Balanced Accuracy"],MCC,auc,RR=R,KK=h)
rrr2 <- as.data.frame(matrix(unlist(risuT2),ncol = 11,byrow = T))
rrr2$vvars <- paste(subset,collapse = ",")
rrr2$nvars <- length(subset)
rrr2$method <- "SVM"
resultsT2=rbind(resultsT2,rrr2)
resImp=rbind(resImp,vrimp)
dfpred=rbind(dfpred,dpred)
# ----- GBM: gradient boosting with the tuned hyperparameters (gbm wants a 0/1 response)
xtrainr <- as.data.frame(train)
xtrainr[,dependent] <- as.numeric(xtrainr[,dependent])-1
tree <- gbm(as.formula(paste(dependent, "~ .")), distribution = "bernoulli", data = xtrainr,
n.trees=gbm.tune.1e$bestTune$n.trees, interaction.depth=gbm.tune.1e$bestTune$interaction.depth,
shrinkage=gbm.tune.1e$bestTune$shrinkage, n.minobsinnode=gbm.tune.1e$bestTune$n.minobsinnode)
best.iter <- gbm.tune.1e$bestTune$n.trees
mpp <- predict(tree, newdata = test, n.trees = best.iter, type = "response")
# fix the factor levels so confusionMatrix works even when a fold predicts a single class
prdt <- factor(ifelse(mpp > 0.5, "Yes", "No"), levels = levels(test[[dependent]]))
cmt <- confusionMatrix(prdt,test[[dependent]],positive = "Yes",mode = "everything")
TN <- cmt$table[1,1]; TP <- cmt$table[2,2]; FP <- cmt$table[2,1]; FN <- cmt$table[1,2]
MCC <- ((TP*TN)-(FP*FN))/sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
pred <- mpp
dpred <- data.frame(id=1:nrow(data),pred=predict(tree, newdata = data, n.trees = best.iter, type = "response")) %>% mutate(method="GBM",giro=R,K=h)
tmpI <- summary(tree,plot=F)
vrimp <- data.frame(variable=tmpI$var,importance=tmpI$rel.inf) %>% mutate(method="GBM",giro=R,K=h)
auc <- my.AUC(as.numeric(test[,which(names(test)==dependent)])-1,pred)
risuT2 <- list(cmt$overall["Accuracy"],cmt$overall["Kappa"],
cmt$byClass["F1"],cmt$byClass["Precision"],cmt$byClass["Recall"],
cmt$byClass["Specificity"],cmt$byClass["Balanced Accuracy"],MCC,auc,RR=R,KK=h)
rrr2 <- as.data.frame(matrix(unlist(risuT2),ncol = 11,byrow = T))
rrr2$vvars <- paste(subset,collapse = ",")
rrr2$nvars <- length(subset)
rrr2$method <- "GBM"
resultsT2=rbind(resultsT2,rrr2)
resImp=rbind(resImp,vrimp)
dfpred=rbind(dfpred,dpred)
# ----- LR: logistic regression
tree <- glm(to_formula(subset, dependent), train, family = "binomial")
# as above, fix the levels so confusionMatrix never fails on one-class predictions
prdt <- factor(ifelse(predict(tree, newdata=test, type="response") < 0.5, "No", "Yes"), levels = levels(test[[dependent]]))
cmt <- confusionMatrix(prdt,test[[dependent]],positive = "Yes",mode = "everything")
TN <- cmt$table[1,1]; TP <- cmt$table[2,2]; FP <- cmt$table[2,1]; FN <- cmt$table[1,2]
MCC <- ((TP*TN)-(FP*FN))/sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
pred <- predict(tree,newdata=test,type="response")
dpred <- data.frame(id=1:nrow(data),pred=predict(tree,newdata=data,type="response")) %>% mutate(method="LR",giro=R,K=h)
M <- rminer::fit(to_formula(subset, dependent) ,data = train, model="lr")
tmpI <- Importance(M, data=train)
vrimp <- data.frame(variable=colnames(train)[-1],importance=tmpI$imp[-1]) %>% mutate(method="LR",giro=R,K=h)
auc <- my.AUC(as.numeric(test[,which(names(test)==dependent)])-1,pred)
risuT2 <- list(cmt$overall["Accuracy"],cmt$overall["Kappa"],
cmt$byClass["F1"],cmt$byClass["Precision"],cmt$byClass["Recall"],
cmt$byClass["Specificity"],cmt$byClass["Balanced Accuracy"],MCC,auc,RR=R,KK=h)
rrr2 <- as.data.frame(matrix(unlist(risuT2),ncol = 11,byrow = T))
rrr2$vvars <- paste(subset,collapse = ",")
rrr2$nvars <- length(subset)
rrr2$method <- "LR"
resultsT2=rbind(resultsT2,rrr2)
resImp=rbind(resImp,vrimp)
dfpred=rbind(dfpred,dpred)
# ----- NB: naive Bayes
tree <- naiveBayes(to_formula(subset, dependent), train)
cmt <- confusionMatrix(predict(tree, test, type = "c"),test[[dependent]],positive = "Yes",mode = "everything")
TN <- cmt$table[1,1]; TP <- cmt$table[2,2]; FP <- cmt$table[2,1]; FN <- cmt$table[1,2]
MCC <- ((TP*TN)-(FP*FN))/sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
pred <- as.data.frame(predict(tree,newdata=test,type="raw"))$Yes
dpred <- data.frame(id=1:nrow(data),pred=as.data.frame(predict(tree,newdata=data,type="raw"))$Yes) %>% mutate(method="NB",giro=R,K=h)
M <- rminer::fit(to_formula(subset, dependent),data = train, model="naiveBayes")
tmpI <- Importance(M, data=train)
vrimp <- data.frame(variable=colnames(train)[-1],importance=tmpI$imp[-1]) %>% mutate(method="NB",giro=R,K=h)
auc <- my.AUC(as.numeric(test[,which(names(test)==dependent)])-1,pred)
risuT2 <- list(cmt$overall["Accuracy"],cmt$overall["Kappa"],
cmt$byClass["F1"],cmt$byClass["Precision"],cmt$byClass["Recall"],
cmt$byClass["Specificity"],cmt$byClass["Balanced Accuracy"],MCC,auc,RR=R,KK=h)
rrr2 <- as.data.frame(matrix(unlist(risuT2),ncol = 11,byrow = T))
rrr2$vvars <- paste(subset,collapse = ",")
rrr2$nvars <- length(subset)
rrr2$method <- "NB"
resultsT2=rbind(resultsT2,rrr2)
resImp=rbind(resImp,vrimp)
dfpred=rbind(dfpred,dpred)
}
}
names(resultsT2)[1:11] <- c("Accuracy","Kappa","F1","Precision","Recall","Specificity","Balanced.Accuracy","MCC","AUC","r","k")
# mean of each index by method (and variable set)
medie <- aggregate(resultsT2[,1:9], by = list(resultsT2$method, resultsT2$vvars), FUN = "mean", na.rm = TRUE)
colnames(medie)[1] <- "method"
colnames(medie)[2] <- "vvars"
importance.avg <- resImp %>% group_by(variable,method) %>% summarise(importance=mean(importance), .groups = "drop") %>% arrange(method,-importance)
predic.avg <- dfpred %>% group_by(id,method) %>% summarise(probYes=mean(pred), .groups = "drop")
durtot <- as.numeric(Sys.time() - init)  # total elapsed time
# original data plus one column of averaged Yes probabilities per method
tdata <- bind_cols(data, predic.avg %>% spread(method, probYes))
return(list(medie = medie, totale = resultsT2,
importance.all = resImp, importance.avg = importance.avg,
predic.all = dfpred, predic.avg = predic.avg,
tdata = tdata,
lstHypPr = lstHypPr))  # the timings durt/durtot are computed but not returned
}
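As a side note, the rank-based AUC used in my.AUC equals the proportion of (positive, negative) score pairs ordered correctly. A minimal sketch on simulated data (not part of the pipeline) shows the equivalence:

set.seed(1)
lab <- rbinom(50, 1, 0.4)        # toy 0/1 labels
sco <- runif(50) + 0.5 * lab     # scores mildly related to the labels
pos <- sco[lab == 1]; neg <- sco[lab == 0]
mean(outer(pos, neg, ">") + 0.5 * outer(pos, neg, "=="))  # direct pairwise AUC (ties count 1/2)
as.numeric(wilcox.test(pos, neg)$statistic) / (length(pos) * length(neg))  # same value, as in my.AUC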
datR2.1 <- datR2.1 %>% select(-Breed)  # drop Breed from the predictors
res <- eval_ALL(subset = names(datR2.1)[-1], dependent = "Hypothyroidism", data = datR2.1, kk = 10, rr = 5)
## [1] "tuning SVM"
## [1] "tuning GBM"
##
## 1 : 12345678910
## 2 : 12345678910
## 3 : 12345678910
## 4 : 12345678910
## 5 : 12345678910
The function returns a list with:

- the mean of the indices (Accuracy, F1, Kappa, MCC, …) per algorithm (res$medie)
- the index values for each algorithm, fold and repetition (res$totale)
- variable importance for each algorithm, fold and repetition (res$importance.all)
- mean variable importance per algorithm (res$importance.avg)
- the probability of belonging to class Yes for each algorithm, fold and repetition (res$predic.all)
- the mean probability of belonging to class Yes per algorithm (res$predic.avg)
- the original data.frame with the mean probability of belonging to class Yes per algorithm (res$tdata)
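For example, a quick way to rank the algorithms by mean MCC (a small convenience on the res object, not part of the original script):

res$medie %>% arrange(desc(MCC)) %>% select(method, MCC, AUC, F1)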
res$medie %>% select(-vvars) %>% as_tibble()
## # A tibble: 6 × 10
## method Accuracy Kappa F1 Precision Recall Specificity Balance…¹ MCC AUC
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 CT 0.819 0.493 0.608 0.717 0.552 0.917 0.735 0.511 0.769
## 2 GBM 0.848 0.576 0.675 0.772 0.620 0.933 0.776 0.591 0.858
## 3 LR 0.865 0.622 0.709 0.813 0.658 0.942 0.800 0.640 0.866
## 4 NB 0.867 0.639 0.727 0.783 0.709 0.927 0.818 0.652 0.877
## 5 RF 0.863 0.599 0.697 0.845 0.595 0.961 0.778 0.623 0.878
## 6 SVM 0.835 0.555 0.667 0.702 0.658 0.892 0.775 0.563 0.871
## # … with abbreviated variable name ¹Balanced.Accuracy
res$totale %>% select(-vvars) %>% as_tibble()
## # A tibble: 300 × 13
## Accuracy Kappa F1 Precision Recall Speci…¹ Balan…² MCC AUC r k
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.871 0.587 0.667 0.667 0.667 0.92 0.793 0.587 0.72 1 1
## 2 0.871 0.527 0.6 0.75 0.5 0.96 0.73 0.542 0.95 1 1
## 3 0.806 0.450 0.571 0.5 0.667 0.84 0.753 0.457 0.887 1 1
## 4 0.839 0.448 0.545 0.6 0.5 0.92 0.71 0.451 0.813 1 1
## 5 0.871 0.587 0.667 0.667 0.667 0.92 0.793 0.587 0.927 1 1
## 6 0.903 0.708 0.769 0.714 0.833 0.92 0.877 0.712 0.947 1 1
## 7 0.857 0.580 0.667 0.833 0.556 0.962 0.759 0.600 0.675 1 2
## 8 0.914 0.748 0.8 1 0.667 1 0.833 0.773 0.932 1 2
## 9 0.8 0.457 0.588 0.625 0.556 0.885 0.720 0.458 0.885 1 2
## 10 0.857 0.580 0.667 0.833 0.556 0.962 0.759 0.600 0.906 1 2
## # … with 290 more rows, 2 more variables: nvars <int>, method <chr>, and
## # abbreviated variable names ¹Specificity, ²Balanced.Accuracy
plot.effect(obj, x, method, gridn, smooth.spar, size.dot)

- obj: output of the eval_ALL function
- x: independent variable whose effect is evaluated (as a string, e.g. "Hct")
- method: algorithm ("CT", "GBM", "LR", "NB", "RF", "SVM")
- gridn: for numeric variables, the number of intervals; either a number or the name of a binning rule ("FD", "Scott", "Sturges"); see the short illustration after this list
- smooth.spar: for numeric variables, smoothing parameter with a value in (0, 1]
- size.dot: for factor variables, size of the points
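Since gridn is passed straight to hist() as breaks, both forms are valid; a toy illustration on a made-up vector xx:

xx <- rnorm(200)
length(hist(xx, breaks = "FD", plot = FALSE)$mids)  # bins chosen by the Freedman-Diaconis rule
length(hist(xx, breaks = 20, plot = FALSE)$mids)    # about 20 bins (a number is only a suggestion for hist)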
plot.effect <- function(obj = NULL, x = NULL, method = NULL, gridn = "FD", smooth.spar = 0.5, size.dot = 2){
# plots the marginal effect of an independent variable on the probability of belonging to class Yes
# obj         output of the eval_ALL function
# x           independent variable whose effect is evaluated (as a string, e.g. "Hct")
# method      algorithm ("CT", "GBM", "LR", "NB", "RF", "SVM")
# gridn       for numeric variables, the number of intervals; a number or a binning rule ("FD", "Scott", "Sturges")
# smooth.spar for numeric variables, smoothing parameter with a value in (0, 1]
# size.dot    for factor variables, size of the points
#
# returns a ggplot object
#
if(is.null(obj) | is.null(x) | is.null(method)){stop("missing arguments")}
if(!"predic.avg" %in% names(obj)){stop("obj is not valid")}
tdata <- obj$tdata
pv <- which(colnames(tdata) == x)
pm <- which(colnames(tdata) == method)
if(length(pv) == 0 | length(pm) == 0){stop("error in x or method")}
if(is.numeric(tdata[[pv]])){
# bin x, average the predicted Yes probability within each bin, then smooth
hh <- hist(tdata[[pv]], breaks = gridn, plot = FALSE)
tmp <- tdata %>% select(varb = all_of(pv), meth = all_of(pm))
tmp2 <- tmp %>% mutate(x = cut(varb, breaks = hh$breaks, include.lowest = TRUE)) %>%
group_by(x, .drop = FALSE) %>%
summarise(Yhat = mean(meth)) %>%
mutate(xx = hh$mids)
tmp2 <- na.omit(tmp2)
smoothingSpline <- smooth.spline(tmp2$xx, tmp2$Yhat, spar = smooth.spar)
# clamp the smoothed curve to [0, 1], since it estimates a probability
smoot <- ifelse(smoothingSpline$y > 1, 1, smoothingSpline$y)
smoot <- ifelse(smoot < 0, 0, smoot)
gr <- tmp2 %>% mutate(smoo = smoot) %>%
ggplot(aes(x = xx, y = Yhat)) + geom_point() + geom_line(aes(y = smoo), color = "blue", size = 1) +
ylim(c(0, 1)) + xlab(x)
}
if(is.factor(tdata[[pv]])){
# mean predicted Yes probability for each level of the factor
tmp <- tdata %>% select(varb = all_of(pv), meth = all_of(pm))
gr <- tmp %>%
group_by(varb, .drop = FALSE) %>%
summarise(Yhat = mean(meth)) %>%
ggplot(aes(x = varb, y = Yhat)) + geom_point(color = "blue", size = size.dot) +
ylim(c(0, 1)) + xlab(x)
}
return(gr)
}
plot.effect(obj = res,x = "Cholesterol_mg.dl",method = "RF",gridn = 20,smooth.spar = 0.5)+theme_bw()
plot.effect(obj = res,x = "Dermatopathy",method = "GBM",gridn = 20,smooth.spar = 0.5)+theme_minimal()
res$importance.avg %>%
group_by(method) %>%
top_n(10, importance) %>%  # top 10 variables within each method
ungroup() %>%
mutate(variable = reorder_within(variable, importance, method)) %>%
ggplot(aes(x = variable, y = importance)) +
geom_bar(stat = "identity") + coord_flip() + scale_x_reordered() +
facet_wrap(~method, scales = "free", ncol = 3) +
theme_minimal() + xlab("") + ylab("") +
ggtitle("Variable importance by method")
res$totale %>% ggplot(aes(x=method,y=MCC))+
stat_boxplot(geom = "errorbar",width = 0.15) +geom_boxplot()+theme_minimal()
res$totale %>% select(method,r,k,MCC,F1,AUC,Accuracy) %>% gather("index","value",4:7) %>%
ggplot(aes(x=method,y=value))+
stat_boxplot(geom = "errorbar",width = 0.15)+
geom_boxplot()+facet_wrap(~index)+theme_minimal()
p1 <- plot.effect(obj = res,x = "Cholesterol_mg.dl",method = "RF",gridn = 20,smooth.spar = 0.6)+theme_minimal()+ggtitle("RF")
p2 <- plot.effect(obj = res,x = "Dermatopathy",method = "GBM",gridn = 20,smooth.spar = 0.5)+theme_minimal()+ggtitle("GBM")
p3 <- plot.effect(obj = res,x = "Hct",method = "GBM",gridn = 20,smooth.spar = 0.7)+theme_minimal()+ggtitle("GBM")
p4 <- plot.effect(obj = res,x = "Hct",method = "RF",gridn = 20,smooth.spar = 0.7)+theme_minimal()+ggtitle("RF")
ggarrange(p1,p2,p3,p4)