load("./R/data_models_20220110.RData")
library(dplyr)
library(tidyr)
library(tidytext)
library(rpart)
library(randomForest)
library(e1071)
library(caret)
library(rminer)
library(gbm)
library(FSelectorRcpp)
library(ggplot2)
library(ggpubr)


Function to compare, via repeated k-fold cross-validation, the predictive performance of the algorithms:

eval_ALL(subset, data, dependent, kk, rr)
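
The function relies on a small helper, to_formula(), whose definition does not appear in this section; a minimal sketch consistent with how it is called (it builds the model formula dependent ~ v1 + v2 + ... from a character vector of predictor names):

# Minimal sketch of the to_formula() helper used inside eval_ALL (assumed,
# since its definition is not shown in this section)
to_formula <- function(subset, dependent){
  as.formula(paste(dependent, "~", paste(subset, collapse = " + ")))
}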


eval_ALL <- function(subset=NULL, data=NULL, dependent = NULL, kk=10, rr=5){
  # subset      character vector with the names of the predictors
  # dependent   name of the dependent variable (a factor with levels No/Yes)
  # kk          number of folds; rr number of repetitions
  if(is.null(subset) | is.null(data) | is.null(dependent)){stop("subset, data and dependent are all required")}
  if(!inherits(data, "data.frame")){stop("data is not a data.frame")}
  if(sum(subset %in% names(data)) != length(subset)){stop("some variables in subset are not in data")}
  if(!dependent %in% names(data)){stop("dependent is not a variable of data")}
  if(!"Yes" %in% levels(data[[dependent]])){stop("the dependent variable is not a factor with levels Yes/No")}
  
  # AUC computed through the Mann-Whitney U statistic: AUC = U / (n_pos * n_neg)
  my.AUC <- function(labels, scores){
    labels <- as.logical(labels)
    pos <- scores[labels]    # scores of the positive cases
    neg <- scores[!labels]   # scores of the negative cases
    if(length(pos)==0 | length(neg)==0){
      auc <- NA   # AUC is undefined when a fold contains a single class
    } else {
      U <- as.numeric(wilcox.test(pos, neg)$statistic)
      auc <- U/(length(pos) * length(neg))
    }
    return(auc)
  }
  
  fitControl <- trainControl(method = "cv",
                             number = 10,
                             preProcOptions = list(thresh = 0.95),
                             classProbs = TRUE,
                             summaryFunction = twoClassSummary)
  grid <- expand.grid(sigma = c(0.01, 0.015, 0.2),
                      C = c(1, 10, 20, 50))
  init <- Sys.time()
  print("tuning SVM")
  svm.tune.d <- train(as.formula(paste(dependent, "~ .")), data = data,
                      method = "svmRadial",   # Radial kernel
                      tuneGrid = grid,
                      trControl=fitControl)
  
  print("tuning GBM")
  gbm.tune.1e <- train(as.formula(paste(dependent, "~ .")), data = data,
                       method = "gbm", trControl = fitControl, verbose = FALSE,
                       tuneLength=5)
  durt <- as.numeric(Sys.time()-init)
  lstHypPr <- list(svm.tune.d$bestTune,gbm.tune.1e$bestTune)                    
  k <- kk
  trr <- 0
  
  for(R in 1:rr){
    cat("\n",R,": ")
    splits <- runif(nrow(data))  # random (unstratified) assignment of rows to the k folds
    for(h in 1:k){
      cat(h)
      test.idx <- (splits >= (h - 1) / k) & (splits < h / k)
      train.idx <- !test.idx
      test <- data[test.idx, , drop = FALSE]
      train <- data[train.idx, , drop = FALSE]
      # ----- CT
      tree <- rpart(to_formula(subset, dependent), train)
      cmt <- confusionMatrix(predict(tree, test, type = "class"),test[[dependent]],positive = "Yes",mode = "everything")
      TN <- cmt$table[1,1]; TP <- cmt$table[2,2]; FP <- cmt$table[2,1]; FN <- cmt$table[1,2]
      MCC <- ((TP*TN)-(FP*FN))/sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
      pred <- predict(tree, test, type = "prob")[,2]
      dpred <- data.frame(id=1:nrow(data),pred=predict(tree, data, type = "prob")[,2]) %>% mutate(method="CT",giro=R,K=h)
      vrimp <- tree$variable.importance
      vrimp <- data.frame(variable=names(vrimp),importance=vrimp) %>% mutate(method="CT",giro=R,K=h)
      auc <- my.AUC(test[[dependent]] == "Yes", pred)  # TRUE marks the positive class
      risuT2 <- list(cmt$overall["Accuracy"],cmt$overall["Kappa"],
                     cmt$byClass["F1"],cmt$byClass["Precision"],cmt$byClass["Recall"],
                     cmt$byClass["Specificity"],cmt$byClass["Balanced Accuracy"],MCC,auc,RR=R,KK=h)
      rrr2 <- as.data.frame(matrix(unlist(risuT2),ncol = 11,byrow = T))
      rrr2$vvars <- paste(subset,collapse = ",")
      rrr2$nvars <- length(subset)
      rrr2$method <- "CT"
      if(trr==0){
        resultsT2 <- rrr2; resImp <- vrimp; dfpred <- dpred; trr <- 1
      } else {
        resultsT2 <- rbind(resultsT2,rrr2); resImp <- rbind(resImp,vrimp); dfpred <- rbind(dfpred,dpred)
      }
      # ----- RF
      tree <- randomForest(to_formula(subset, dependent), train)
      cmt <- confusionMatrix(predict(tree, test, type = "class"),test[[dependent]],positive = "Yes",mode = "everything")
      TN <- cmt$table[1,1]; TP <- cmt$table[2,2]; FP <- cmt$table[2,1]; FN <- cmt$table[1,2]
      MCC <- ((TP*TN)-(FP*FN))/sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
      pred <- predict(tree, test, type = "prob")[,2]
      dpred <- data.frame(id=1:nrow(data),pred=predict(tree, data, type = "prob")[,2]) %>% mutate(method="RF",giro=R,K=h)
      vrimp <- varImp(tree)
      vrimp <- data.frame(variable=rownames(vrimp),importance=vrimp$Overall) %>% mutate(method="RF",giro=R,K=h)
      rownames(vrimp) <- NULL
      auc <- my.AUC(test[[dependent]] == "Yes", pred)
      risuT2 <- list(cmt$overall["Accuracy"],cmt$overall["Kappa"],
                     cmt$byClass["F1"],cmt$byClass["Precision"],cmt$byClass["Recall"],
                     cmt$byClass["Specificity"],cmt$byClass["Balanced Accuracy"],MCC,auc,RR=R,KK=h)
      rrr2 <- as.data.frame(matrix(unlist(risuT2),ncol = 11,byrow = T))
      rrr2$vvars <- paste(subset,collapse = ",")
      rrr2$nvars <- length(subset)
      rrr2$method <- "RF"
      resultsT2=rbind(resultsT2,rrr2)
      resImp=rbind(resImp,vrimp)
      dfpred=rbind(dfpred,dpred)
      # ----- SVM
      # kernlab's sigma plays the role of e1071's gamma in the radial kernel
      tree <- svm(to_formula(subset, dependent), train, cost = svm.tune.d$bestTune$C, gamma = svm.tune.d$bestTune$sigma, probability = TRUE)
      cmt <- confusionMatrix(predict(tree, test),test[[dependent]],positive = "Yes",mode = "everything")
      TN <- cmt$table[1,1]; TP <- cmt$table[2,2]; FP <- cmt$table[2,1]; FN <- cmt$table[1,2]
      MCC <- ((TP*TN)-(FP*FN))/sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
      pred <- as.data.frame(attr(predict(tree, test,probability = T),"probabilities"))$Yes
      dpred <- data.frame(id=1:nrow(data),pred=as.data.frame(attr(predict(tree, data,probability = T),"probabilities"))$Yes) %>% mutate(method="SVM",giro=R,K=h)
      M <- rminer::fit(to_formula(subset, dependent),data = train, model="svm", kpar=list(sigma = svm.tune.d$bestTune$sigma), C=svm.tune.d$bestTune$C)
      tmpI <- Importance(M, data=train)
      # dropping the first element assumes the dependent variable is the first column of train (same for LR and NB below)
      vrimp <- data.frame(variable=colnames(train)[-1],importance=tmpI$imp[-1]) %>% mutate(method="SVM",giro=R,K=h)
      auc <- my.AUC(test[[dependent]] == "Yes", pred)
      risuT2 <- list(cmt$overall["Accuracy"],cmt$overall["Kappa"],
                     cmt$byClass["F1"],cmt$byClass["Precision"],cmt$byClass["Recall"],
                     cmt$byClass["Specificity"],cmt$byClass["Balanced Accuracy"],MCC,auc,RR=R,KK=h)
      rrr2 <- as.data.frame(matrix(unlist(risuT2),ncol = 11,byrow = T))
      rrr2$vvars <- paste(subset,collapse = ",")
      rrr2$nvars <- length(subset)
      rrr2$method <- "SVM"
      resultsT2=rbind(resultsT2,rrr2)
      resImp=rbind(resImp,vrimp)
      dfpred=rbind(dfpred,dpred)
      # ----- GBM
      xtrainr <- as.data.frame(train)
      xtrainr[,dependent] <- as.numeric(xtrainr[,dependent])-1  # gbm's bernoulli distribution requires a 0/1 response
      tree <- gbm(as.formula(paste(dependent,"~.")), distribution = "bernoulli", data = xtrainr, 
                  n.trees=gbm.tune.1e$bestTune$n.trees,interaction.depth=gbm.tune.1e$bestTune$interaction.depth,
                  shrinkage=gbm.tune.1e$bestTune$shrinkage,n.minobsinnode=gbm.tune.1e$bestTune$n.minobsinnode)
      best.iter <- gbm.tune.1e$bestTune$n.trees
      mpp <- predict(tree, newdata = test, n.trees = best.iter, type = "response")
      prdt <- factor(ifelse(mpp > 0.5, "Yes", "No"), levels = levels(test[[dependent]]))  # fix the levels so confusionMatrix works even when a fold predicts a single class
      cmt <- confusionMatrix(prdt,test[[dependent]],positive = "Yes",mode = "everything")
      TN <- cmt$table[1,1]; TP <- cmt$table[2,2]; FP <- cmt$table[2,1]; FN <- cmt$table[1,2]
      MCC <- ((TP*TN)-(FP*FN))/sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
      pred <- mpp
      dpred <- data.frame(id=1:nrow(data),pred=predict(tree, newdata = data, n.trees = best.iter, type = "response")) %>% mutate(method="GBM",giro=R,K=h)
      tmpI <- summary(tree, plotit = FALSE)
      vrimp <- data.frame(variable=tmpI$var,importance=tmpI$rel.inf) %>% mutate(method="GBM",giro=R,K=h)
      auc <- my.AUC(test[[dependent]] == "Yes", pred)
      risuT2 <- list(cmt$overall["Accuracy"],cmt$overall["Kappa"],
                     cmt$byClass["F1"],cmt$byClass["Precision"],cmt$byClass["Recall"],
                     cmt$byClass["Specificity"],cmt$byClass["Balanced Accuracy"],MCC,auc,RR=R,KK=h)
      rrr2 <- as.data.frame(matrix(unlist(risuT2),ncol = 11,byrow = T))
      rrr2$vvars <- paste(subset,collapse = ",")
      rrr2$nvars <- length(subset)
      rrr2$method <- "GBM"
      resultsT2=rbind(resultsT2,rrr2)
      resImp=rbind(resImp,vrimp)
      dfpred=rbind(dfpred,dpred)
      # ----- LR
      tree <- glm(to_formula(subset, dependent), data = train, family = "binomial")
      prdt <- factor(ifelse(predict(tree,newdata=test,type="response")<0.5,"No","Yes"), levels = levels(test[[dependent]]))
      cmt <- confusionMatrix(prdt,test[[dependent]],positive = "Yes",mode = "everything")
      TN <- cmt$table[1,1]; TP <- cmt$table[2,2]; FP <- cmt$table[2,1]; FN <- cmt$table[1,2]
      MCC <- ((TP*TN)-(FP*FN))/sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
      pred <- predict(tree,newdata=test,type="response")
      dpred <- data.frame(id=1:nrow(data),pred=predict(tree,newdata=data,type="response")) %>% mutate(method="LR",giro=R,K=h)
      M <- rminer::fit(to_formula(subset, dependent) ,data = train, model="lr")
      tmpI <- Importance(M, data=train)
      vrimp <- data.frame(variable=colnames(train)[-1],importance=tmpI$imp[-1]) %>% mutate(method="LR",giro=R,K=h)
      auc <- my.AUC(test[[dependent]] == "Yes", pred)
      risuT2 <- list(cmt$overall["Accuracy"],cmt$overall["Kappa"],
                     cmt$byClass["F1"],cmt$byClass["Precision"],cmt$byClass["Recall"],
                     cmt$byClass["Specificity"],cmt$byClass["Balanced Accuracy"],MCC,auc,RR=R,KK=h)
      rrr2 <- as.data.frame(matrix(unlist(risuT2),ncol = 11,byrow = T))
      rrr2$vvars <- paste(subset,collapse = ",")
      rrr2$nvars <- length(subset)
      rrr2$method <- "LR"
      resultsT2=rbind(resultsT2,rrr2)
      resImp=rbind(resImp,vrimp)
      dfpred=rbind(dfpred,dpred)
      # ----- NB
      tree <- naiveBayes(to_formula(subset, dependent), train)
      cmt <- confusionMatrix(predict(tree, test, type = "class"),test[[dependent]],positive = "Yes",mode = "everything")
      TN <- cmt$table[1,1]; TP <- cmt$table[2,2]; FP <- cmt$table[2,1]; FN <- cmt$table[1,2]
      MCC <- ((TP*TN)-(FP*FN))/sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
      pred <- as.data.frame(predict(tree,newdata=test,type="raw"))$Yes
      dpred <- data.frame(id=1:nrow(data),pred=as.data.frame(predict(tree,newdata=data,type="raw"))$Yes) %>% mutate(method="NB",giro=R,K=h)
      M <- rminer::fit(to_formula(subset, dependent),data = train, model="naiveBayes")
      tmpI <- Importance(M, data=train)
      vrimp <- data.frame(variable=colnames(train)[-1],importance=tmpI$imp[-1]) %>% mutate(method="NB",giro=R,K=h)
      auc <- my.AUC(test[[dependent]] == "Yes", pred)
      risuT2 <- list(cmt$overall["Accuracy"],cmt$overall["Kappa"],
                     cmt$byClass["F1"],cmt$byClass["Precision"],cmt$byClass["Recall"],
                     cmt$byClass["Specificity"],cmt$byClass["Balanced Accuracy"],MCC,auc,RR=R,KK=h)
      rrr2 <- as.data.frame(matrix(unlist(risuT2),ncol = 11,byrow = T))
      rrr2$vvars <- paste(subset,collapse = ",")
      rrr2$nvars <- length(subset)
      rrr2$method <- "NB"
      resultsT2=rbind(resultsT2,rrr2)
      resImp=rbind(resImp,vrimp)
      dfpred=rbind(dfpred,dpred)
    }
  }
  names(resultsT2)[1:11] <- c("Accuracy","Kappa","F1","Precision","Recall","Specificity","Balanced.Accuracy","MCC","AUC","r","k")
  medie=aggregate(resultsT2[,c(1:9)],by = list(resultsT2$method,resultsT2$vvars),FUN = "mean",na.rm=T)
  colnames(medie)[1] <- "method"
  colnames(medie)[2] <- "vvars"
  importance.avg <- resImp %>% group_by(variable,method) %>% summarise(importance=mean(importance)) %>% arrange(method,-importance)
  predic.avg=dfpred %>% group_by(id,method) %>% summarise(probYes=mean(pred))
  durtot <- as.numeric(Sys.time()-init)
  tdata <- bind_cols(data,predic.avg %>% spread(method,probYes))
  return(list(medie=medie,totale=resultsT2,
              importance.all=resImp,importance.avg=importance.avg,
              predic.all=dfpred, predic.avg=predic.avg,
              tdata=tdata,
              lstHypPr=lstHypPr))
}
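
The block that extracts the confusion-matrix cells, computes the MCC and assembles the one-row result is repeated verbatim for all six algorithms; a hypothetical helper along these lines (fold_metrics is not part of the original code) would remove the duplication:

# Hypothetical refactoring (not in the original code): gathers the per-fold
# metrics that the six algorithm blocks above compute identically
fold_metrics <- function(cmt, auc, R, h, subset, method){
  TN <- cmt$table[1,1]; TP <- cmt$table[2,2]; FP <- cmt$table[2,1]; FN <- cmt$table[1,2]
  MCC <- ((TP*TN)-(FP*FN))/sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
  data.frame(Accuracy = cmt$overall["Accuracy"], Kappa = cmt$overall["Kappa"],
             F1 = cmt$byClass["F1"], Precision = cmt$byClass["Precision"],
             Recall = cmt$byClass["Recall"], Specificity = cmt$byClass["Specificity"],
             Balanced.Accuracy = cmt$byClass["Balanced Accuracy"],
             MCC = MCC, AUC = auc, r = R, k = h,
             vvars = paste(subset, collapse = ","), nvars = length(subset),
             method = method, row.names = NULL)
}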


datR2.1 <- datR2.1 %>% select(-Breed)

res <- eval_ALL(subset = names(datR2.1)[-1],dependent = "Hypothyroidism",data = datR2.1,kk = 10,rr = 5)
## [1] "tuning SVM"
## [1] "tuning GBM"
## 
##  1 : 12345678910
##  2 : 12345678910
##  3 : 12345678910
##  4 : 12345678910
##  5 : 12345678910


The function returns a list with the elements medie (metric averages per method), totale (all per-fold results), importance.all and importance.avg (variable importances), predic.all and predic.avg (predicted probabilities of class Yes), tdata (the data with the averaged predictions bound as columns) and lstHypPr (the hyperparameters selected by the tuning).
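
For a one-line-per-element overview of the returned object:

str(res, max.level = 1)

The two main tables, medie (averages over all folds and repetitions) and totale (per-fold values):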


res$medie %>% select(-vvars) %>% as_tibble()
## # A tibble: 6 × 10
##   method Accuracy Kappa    F1 Precision Recall Specificity Balance…¹   MCC   AUC
##   <chr>     <dbl> <dbl> <dbl>     <dbl>  <dbl>       <dbl>     <dbl> <dbl> <dbl>
## 1 CT        0.819 0.493 0.608     0.717  0.552       0.917     0.735 0.511 0.769
## 2 GBM       0.848 0.576 0.675     0.772  0.620       0.933     0.776 0.591 0.858
## 3 LR        0.865 0.622 0.709     0.813  0.658       0.942     0.800 0.640 0.866
## 4 NB        0.867 0.639 0.727     0.783  0.709       0.927     0.818 0.652 0.877
## 5 RF        0.863 0.599 0.697     0.845  0.595       0.961     0.778 0.623 0.878
## 6 SVM       0.835 0.555 0.667     0.702  0.658       0.892     0.775 0.563 0.871
## # … with abbreviated variable name ¹​Balanced.Accuracy
res$totale %>% select(-vvars) %>% as_tibble()
## # A tibble: 300 × 13
##    Accuracy Kappa    F1 Precision Recall Speci…¹ Balan…²   MCC   AUC     r     k
##       <dbl> <dbl> <dbl>     <dbl>  <dbl>   <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl>
##  1    0.871 0.587 0.667     0.667  0.667   0.92    0.793 0.587 0.72      1     1
##  2    0.871 0.527 0.6       0.75   0.5     0.96    0.73  0.542 0.95      1     1
##  3    0.806 0.450 0.571     0.5    0.667   0.84    0.753 0.457 0.887     1     1
##  4    0.839 0.448 0.545     0.6    0.5     0.92    0.71  0.451 0.813     1     1
##  5    0.871 0.587 0.667     0.667  0.667   0.92    0.793 0.587 0.927     1     1
##  6    0.903 0.708 0.769     0.714  0.833   0.92    0.877 0.712 0.947     1     1
##  7    0.857 0.580 0.667     0.833  0.556   0.962   0.759 0.600 0.675     1     2
##  8    0.914 0.748 0.8       1      0.667   1       0.833 0.773 0.932     1     2
##  9    0.8   0.457 0.588     0.625  0.556   0.885   0.720 0.458 0.885     1     2
## 10    0.857 0.580 0.667     0.833  0.556   0.962   0.759 0.600 0.906     1     2
## # … with 290 more rows, 2 more variables: nvars <int>, method <chr>, and
## #   abbreviated variable names ¹​Specificity, ²​Balanced.Accuracy
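
The hyperparameters selected by the initial caret tuning are returned in lstHypPr (SVM first, GBM second):

res$lstHypPr[[1]]  # best svmRadial parameters: sigma and C
res$lstHypPr[[2]]  # best gbm parameters: n.trees, interaction.depth, shrinkage, n.minobsinnode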


Function to plot the marginal effects of the independent variables:


plot.effect(obj, x, method, gridn, smooth.spar, size.dot)


plot.effect <- function(obj = NULL, x = NULL, method = NULL, gridn = "FD", smooth.spar=0.5, size.dot = 2){
  # plots the marginal effect of an independent variable on the predicted probability of class Yes
  # obj           output of eval_ALL
  # x             independent variable whose effect is evaluated (as a string, e.g. "Hct")
  # method        algorithm ("CT", "GBM", "LR", "NB", "RF", "SVM")
  # gridn         for numeric variables, the number of intervals; either a number or the name of
  #               an algorithm used to compute it ("FD", "Scott", "Sturges")
  # smooth.spar   for numeric variables, smoothing parameter with value in (0,1]
  # size.dot      for factor variables, size of the points
  #
  # returns a ggplot object
  #
  if(is.null(obj) | is.null(x) | is.null(method)){stop("obj, x and method are all required")}
  if(!"predic.avg" %in% names(obj)){stop("obj is not a valid eval_ALL output")}
  tdata <- obj$tdata
  pv <- which(colnames(tdata)==x)
  pm <- which(colnames(tdata)==method)
  if(length(pv)==0 | length(pm)==0){stop("x or method not found in the data")}
  if(is.numeric(tdata[[pv]])){
    hh <- hist(tdata[[pv]], breaks = gridn, plot = FALSE)
    tmp <- tdata %>% select(varb=pv,meth=all_of(pm))
    tmp2 <- tmp %>% mutate(x=cut(varb,breaks=hh$breaks,include.lowest=TRUE)) %>% 
      group_by(x,.drop = FALSE) %>% 
      summarise(Yhat=mean(meth)) %>% 
      mutate(xx=hh$mids)
    tmp2 <- na.omit(tmp2)
    smoothingSpline <- smooth.spline(tmp2$xx, tmp2$Yhat, spar=smooth.spar)
    smoot <- pmin(pmax(smoothingSpline$y, 0), 1)  # clamp the spline to [0,1]
    gr <- tmp2 %>% mutate(smoo=smoot) %>% 
      ggplot(aes(x=xx,y=Yhat))+geom_point()+ geom_line(aes(y=smoo),color="blue",size=1)+
      ylim(c(0,1))+xlab(x)
  }
  
  if(is.factor(tdata[[pv]])){
    tmp <- tdata %>% select(varb=pv,meth=all_of(pm))
    gr <- tmp %>% 
      group_by(varb,.drop = F) %>% 
      summarise(Yhat=mean(meth)) %>% 
      ggplot(aes(x=varb,y=Yhat))+geom_point(color="blue",size=size.dot)+
      ylim(c(0,1))+xlab(x)
  }
  return(gr)
}
plot.effect(obj = res,x = "Cholesterol_mg.dl",method = "RF",gridn = 20,smooth.spar = 0.5)+theme_bw()

plot.effect(obj = res,x = "Dermatopathy",method = "GBM",gridn = 20,smooth.spar = 0.5)+theme_minimal()
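
gridn also accepts the name of a hist() break algorithm instead of a fixed number of intervals, e.g.:

plot.effect(obj = res,x = "Cholesterol_mg.dl",method = "RF",gridn = "FD",smooth.spar = 0.5)+theme_bw()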

More examples:

res$importance.avg %>% 
  group_by(method) %>%
  top_n(10, importance) %>%  # top 10 variables within each method
  ungroup() %>%
  mutate(variable=reorder_within(variable,importance,method)) %>%
  arrange(method,-importance) %>%
  ggplot(aes(x=variable,y=importance))+
  geom_bar(stat = "identity")+coord_flip()+scale_x_reordered() +
  facet_wrap(~method,scales = "free",ncol = 3)+
  theme_minimal()+xlab("")+ylab("")+
  ggtitle("Variable importance by method")

res$totale %>% ggplot(aes(x=method,y=MCC))+
  stat_boxplot(geom = "errorbar",width = 0.15) +geom_boxplot()+theme_minimal()

res$totale %>% select(method,r,k,MCC,F1,AUC,Accuracy) %>% gather("index","value",4:7) %>% 
  ggplot(aes(x=method,y=value))+
  stat_boxplot(geom = "errorbar",width = 0.15)+
  geom_boxplot()+facet_wrap(~index)+theme_minimal()

p1 <- plot.effect(obj = res,x = "Cholesterol_mg.dl",method = "RF",gridn = 20,smooth.spar = 0.6)+theme_minimal()+ggtitle("RF")
p2 <- plot.effect(obj = res,x = "Dermatopathy",method = "GBM",gridn = 20,smooth.spar = 0.5)+theme_minimal()+ggtitle("GBM")
p3 <- plot.effect(obj = res,x = "Hct",method = "GBM",gridn = 20,smooth.spar = 0.7)+theme_minimal()+ggtitle("GBM")
p4 <- plot.effect(obj = res,x = "Hct",method = "RF",gridn = 20,smooth.spar = 0.7)+theme_minimal()+ggtitle("RF")

ggarrange(p1,p2,p3,p4)