Preprocessing

Generating a Function to Partition Dataset

# Partitions a dataset in preparation for the cross-validation process.
# Arguments: the dataset, the percentage of records desired for the training set,
# and the percentage of records desired for the CV set.
# Returns a list of three data frames containing dttrain, dtcv and dttest, in that order.
splitDataSetForCV<-function(dataSet,nTraining,nCV){
  nregistros<-ceiling(nrow(dataSet)*(nTraining/100))
  indexRows<-sample(nrow(dataSet),nregistros,replace=F)
  dttrain<-dataSet[indexRows,]
  dttemp<-dataSet[-indexRows,]
  tope<-ceiling(nrow(dataSet)*(nCV/100))
  dtcv<-dttemp[1:tope,]
  dttest<-dttemp[(tope+1):nrow(dttemp),]
  listDF<-list(dttrain,dtcv,dttest)
  return(listDF)
}

Getting the data

Breast Cancer database

breast_cancer<-read.csv("https://raw.githubusercontent.com/arturo-laflor/breast-cancer/master/breast_cancer_nn/breast_cancer.csv",header = TRUE,sep = ",")

# Drop the first (identifier) column, keep only complete records,
# and remove rows where v6 contains the missing-value marker "?"
ds<-breast_cancer[-1]
ds<-ds[complete.cases(ds),]
ds<-ds[ds$v6!="?",]

# Recode the target R from the original 2/4 labels to 0/1
ds[which(ds$R==2),10]<-0
ds[which(ds$R==4),10]<-1

# Coerce every column to numeric
ds<-as.data.frame(sapply(ds,as.numeric))
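
Before splitting, a quick sanity check (purely illustrative, not part of the original pipeline) can confirm how many records survived the cleaning and whether the recoded target is reasonably balanced:

# Illustrative check: remaining records and class balance of the target R
dim(ds)
table(ds$R)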

lstDS<-splitDataSetForCV(ds,70,15)
ds<-lstDS[[1]]
dstt<-lstDS[[3]]
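
Note that splitDataSetForCV() draws the training rows with sample(), so each run yields a different partition and slightly different figures below. A minimal optional step, with an arbitrary seed value of 123, would fix the RNG state right before the call so the split is reproducible:

# Hypothetical: call immediately before splitDataSetForCV() for a reproducible partition
set.seed(123)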

Generating the Model Formula

# Builds a model formula of the form target ~ v1 + v2 + ... from a vector of
# column names, excluding the target variable from the right-hand side
get_formula<-function(vec_names,var_objetivo){
 var_objetivo_f<-paste(var_objetivo,"~",sep = "")
 f<-as.formula(paste(var_objetivo_f,paste(vec_names[!vec_names %in% var_objetivo],collapse = " + ")))
 return(f)
}
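
As an illustrative call, passing a short vector of names together with the target produces the usual R modelling formula:

# Illustrative call; the target is excluded from the right-hand side
get_formula(c("v1","v2","v3","R"),"R")
# yields: R ~ v1 + v2 + v3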

Analyzing With Different Algorithms

Elastic Net + LASSO Regression

Where:

  • x: the matrix of predictive features
  • y: the vector of labels
  • alpha: parameter controlling the trade-off between the ridge and LASSO penalties. It takes values in [0,1]: alpha = 1 gives the LASSO, alpha = 0 gives ridge regression, and intermediate values give the elastic net.
  • nlambda: lambda is the regularization parameter. The algorithm trains over a sequence of lambdas and chooses the optimal one at the end; nlambda is the number of lambdas the practitioner wants the algorithm to evaluate.
  • family: parameter specifying the type of response; "binomial" is used for a two-class target.
  • type.measure: the loss used during cross-validation; "class" reports the misclassification error.
  • ds: the dataset used for training
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-13
dblAlpha=0.9
iLambda=20

# Predictor matrix and label vector (the target is the last column)
x<-as.matrix(ds[,1:(dim(ds)[2]-1)])
y<-as.matrix(ds[,(dim(ds)[2])])

# Cross-validated fit of the penalized logistic regression
ModelLR<-cv.glmnet(x,y, alpha = dblAlpha, nlambda = iLambda, family = "binomial",type.measure = "class")

# Coefficients at the lambda with the smallest cross-validation error
lmin<-ModelLR$lambda.min
coefm<-coef.cv.glmnet(ModelLR,s=lmin)
matrixcoef<-as.matrix(coefm)
plot(ModelLR)

print(matrixcoef)
##                       1
## (Intercept) -7.37691722
## v1           0.40469056
## v2           0.03980427
## v3           0.33243689
## v4           0.27904242
## v5           0.13480465
## v6           0.15098408
## v7           0.44584055
## v8           0.07211616
## v9           0.00000000
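
cv.glmnet also stores lambda.1se, the largest lambda whose cross-validation error is within one standard error of the minimum; it typically yields a sparser, more conservative model. A small optional sketch of how those coefficients could be inspected:

# Optional: coefficients at the more conservative lambda.1se
l1se<-ModelLR$lambda.1se
print(as.matrix(coef(ModelLR,s=l1se)))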

Random Forest

library(randomForest)
library(ggplot2)

f<-get_formula(names(ds),names(ds)[dim(ds)[2]])

ModelRF<-randomForest(f, data=ds, ntree=500,importance=TRUE)

print(ModelRF)
## 
## Call:
##  randomForest(formula = f, data = ds, ntree = 500, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 0.02379069
##                     % Var explained: 89.55
print(importance(ModelRF,type = 2))
##    IncNodePurity
## v1     6.1847400
## v2    25.6336821
## v3    29.1888487
## v4     4.8676966
## v5     8.8472852
## v6     5.1402952
## v7    18.1808556
## v8     8.0814899
## v9     0.2944162
varImpPlot(ModelRF)

plot(ModelRF)
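
Because the target column was coerced to numeric, randomForest() fitted a regression forest (note "Type of random forest: regression" in the output above), and its predictions have to be rounded later on. A hedged alternative sketch (dsRF and ModelRFc are illustrative names): converting R to a factor before the call turns this into a classification forest, so print() reports an OOB error rate and confusion matrix instead of squared residuals.

# Sketch: classification forest on the same data, with the target as a factor
dsRF<-ds
dsRF$R<-as.factor(dsRF$R)
ModelRFc<-randomForest(f, data=dsRF, ntree=500, importance=TRUE)
print(ModelRFc)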

XGBoost

library(xgboost)
library(DiagrammeR)
library(Matrix)   # sparse.model.matrix() comes from the Matrix package

f<-get_formula(names(ds),names(ds)[dim(ds)[2]])

# Build a sparse design matrix and the numeric label vector, then fit 50 boosting rounds
ds3B<-ds
sm<-sparse.model.matrix(f,data = ds3B)
target_vec<-(as.numeric(ds3B[,dim(ds3B)[2]]))
ModelXGB <- xgboost(data = sm, label = target_vec, max_depth = 2,
                    eta = 1, nthread = 2, nrounds = 50,objective = "binary:logistic")
## [1]  train-error:0.041754 
## [2]  train-error:0.027140 
## [3]  train-error:0.016701 
## [4]  train-error:0.012526 
## [5]  train-error:0.012526 
## [6]  train-error:0.012526 
## [7]  train-error:0.014614 
## [8]  train-error:0.010438 
## [9]  train-error:0.008351 
## [10] train-error:0.010438 
## [11] train-error:0.010438 
## [12] train-error:0.008351 
## [13] train-error:0.010438 
## [14] train-error:0.006263 
## [15] train-error:0.006263 
## [16] train-error:0.006263 
## [17] train-error:0.006263 
## [18] train-error:0.006263 
## [19] train-error:0.006263 
## [20] train-error:0.004175 
## [21] train-error:0.004175 
## [22] train-error:0.004175 
## [23] train-error:0.002088 
## [24] train-error:0.004175 
## [25] train-error:0.002088 
## [26] train-error:0.002088 
## [27] train-error:0.002088 
## [28] train-error:0.002088 
## [29] train-error:0.002088 
## [30] train-error:0.002088 
## [31] train-error:0.002088 
## [32] train-error:0.002088 
## [33] train-error:0.002088 
## [34] train-error:0.002088 
## [35] train-error:0.002088 
## [36] train-error:0.002088 
## [37] train-error:0.002088 
## [38] train-error:0.002088 
## [39] train-error:0.002088 
## [40] train-error:0.002088 
## [41] train-error:0.002088 
## [42] train-error:0.002088 
## [43] train-error:0.002088 
## [44] train-error:0.002088 
## [45] train-error:0.002088 
## [46] train-error:0.002088 
## [47] train-error:0.002088 
## [48] train-error:0.002088 
## [49] train-error:0.000000 
## [50] train-error:0.002088
matrixcoef<-xgb.importance(feature_names = colnames(sm), model = ModelXGB)
xgb.ggplot.importance(importance_matrix = matrixcoef,
                          rel_to_first = TRUE,xlab="Relative Importance",n_clusters = c(1:4))

xgb.plot.tree(feature_names = names(ds3B), model = ModelXGB)
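
The number of rounds (nrounds = 50) was fixed by hand, and the training error above falls essentially to zero, which says nothing about generalization. One way to choose nrounds, sketched here with the same parameters and an assumed 5-fold split, is xgboost's built-in cross-validation:

# Sketch: 5-fold CV to monitor held-out error across boosting rounds
cvXGB<-xgb.cv(data = sm, label = target_vec, max_depth = 2, eta = 1,
              nthread = 2, nrounds = 50, nfold = 5,
              objective = "binary:logistic", metrics = "error")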

Artificial Neural Network

  library(neuralnet)
  f<-get_formula(names(ds),names(ds)[dim(ds)[2]])
  # One hidden layer with 4 units; linear.output = F because the target is binary
  ModelNN<-neuralnet(f,data = ds,hidden=c(4),linear.output = F)
  plot(ModelNN)
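
neuralnet is sensitive to the scale of its inputs; the predictors here already lie on a comparable 1-10 scale, so the network trains as is, but a common precaution is min-max normalisation before fitting. A sketch under that assumption (dsScaled and ModelNNs are illustrative names):

# Sketch: min-max scale the predictors to [0,1] before training; the target is left unchanged
dsScaled<-ds
pred_cols<-1:(dim(ds)[2]-1)
dsScaled[,pred_cols]<-lapply(ds[,pred_cols],function(col) (col-min(col))/(max(col)-min(col)))
ModelNNs<-neuralnet(f,data = dsScaled,hidden=c(4),linear.output = F)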

Support Vector Machine

Building the cost vector

# Builds a vector of candidate cost values for the SVM grid search,
# growing geometrically (x3) from costo_min and capped at costo_max
construye_vec_costos<-function(costo_min,costo_max){

  vecc<-costo_min
  cost_temp<-costo_min
  while(cost_temp<costo_max){
    cost_temp<-cost_temp*3
    vecc<-c(vecc,cost_temp)
  }
  # replace the last value (which overshoots costo_max) with costo_max itself
  vecc<-vecc[1:(length(vecc)-1)]
  vecc<-c(vecc,costo_max)
  return(vecc)
}
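
For example, the call used below expands deterministically into a geometric grid that is capped at the maximum cost:

# Illustrative call: costs grow by a factor of 3 from 0.01 and are capped at 10
construye_vec_costos(0.01,10)
# returns: 0.01 0.03 0.09 0.27 0.81 2.43 7.29 10.00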

Implementing SVM

  library(e1071)
  f<-get_formula(names(ds),names(ds)[dim(ds)[2]])
  vec_costo<-construye_vec_costos(0.01,10)
  kernel<-"radial"
  
  
  tune_svm<-tune(svm,f,data = ds,kernel=kernel,degree=1,
       ranges=list(cost=vec_costo,gamma=c(0.5,1,1.5,2,2.5,3.0)))

  mejor_costo<-tune_svm$best.model$cost
  mejor_gamma<-tune_svm$best.model$gamma
  mejor_epsilon<-tune_svm$best.model$epsilon
  performance<-tune_svm$performances
  menor_error<-min(performance[,3])
  ModelSVM<-tune_svm$best.model
  
    X<-cbind.data.frame(C=performance[,1],Gamma=as.factor(performance[,2]),Error=performance[,3]) 
    
    if(identical(kernel,'linear')){
      X<-X[which(X$Gamma==mejor_gamma),]
    }
    
    gg<-ggplot(data=X)+
      geom_point(aes(x=ifelse(C<1,-1/log2(C),log2(C)),y=Error,shape=Gamma, color=Gamma))+
     labs(x='C',y='Error')
     gg

     dfparam<-cbind.data.frame(Parameters=c("C","Gamma","Error"),
                               Optimal.Values=c(mejor_costo,mejor_gamma,menor_error))
     dfparam
##   Parameters Optimal.Values
## 1          C  0.81000000000
## 2      Gamma  0.50000000000
## 3      Error  0.03286164096
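
The optimal gamma (0.5) sits at the edge of the grid passed to tune(), so a hedged follow-up step, not performed here, would be a finer search around the best coarse-grid values (tune_svm2 is an illustrative name):

# Sketch: refine the search around C = 0.81 and gamma = 0.5, including smaller gammas
tune_svm2<-tune(svm,f,data = ds,kernel="radial",
     ranges=list(cost=seq(0.3,1.5,by=0.3),gamma=seq(0.1,0.7,by=0.2)))
summary(tune_svm2)
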
library(magrittr)


# Observed labels and predictor columns of the hold-out test set dstt
vec_obs<-dstt[,(dim(dstt)[2])]
dsTest<-dstt[,1:(dim(dstt)[2]-1)]

#Logistic Regression
dsTestLR<-as.matrix(dsTest)
prLR<-predict(ModelLR,newx=dsTestLR,type="class",s="lambda.min")
mean(vec_obs==prLR)
## [1] 0.9603960396
table(vec_obs,prLR)
##        prLR
## vec_obs  0  1
##       0 73  3
##       1  1 24
#Random Forest
prRF<-round(predict(ModelRF,dsTest),digits = 0)
mean(vec_obs==prRF)
## [1] 0.9603960396
table(vec_obs,prRF)
##        prRF
## vec_obs  0  1
##       0 73  3
##       1  1 24
#NeuralNet
compNN<-compute(ModelNN,dsTest)
prNN<-round(compNN$net.result)
mean(vec_obs==prNN)
## [1] 0.9504950495
table(vec_obs,prNN)
##        prNN
## vec_obs  0  1
##       0 72  4
##       1  1 24
#XGBOOST
xgbTest<-cbind.data.frame(dsTest,R=vec_obs)
stm<-sparse.model.matrix(R~.,data = xgbTest)
prXGB<-round(predict(ModelXGB,stm),digits = 0)
mean(vec_obs==prXGB)
## [1] 0.9702970297
table(vec_obs,prXGB)
##        prXGB
## vec_obs  0  1
##       0 73  3
##       1  0 25
#SVM
prSVM<-round(predict(ModelSVM,dsTest))
mean(vec_obs==prSVM)
## [1] 0.9603960396
table(vec_obs,prSVM)
##        prSVM
## vec_obs  0  1
##       0 72  4
##       1  0 25
means <- matrix(c(mean(vec_obs==prLR),mean(vec_obs==prRF),mean(vec_obs==prNN),mean(vec_obs==prXGB),mean(vec_obs==prSVM)), nrow=5, ncol=1)

rownames(means)<-c("Regression", "Random Forest", "Neural Net", "XGBoost", "SVM")
colnames(means)<-"Accuracy"

as.data.frame(means)
##                   Accuracy
## Regression    0.9603960396
## Random Forest 0.9603960396
## Neural Net    0.9504950495
## XGBoost       0.9702970297
## SVM           0.9603960396
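
The comparison above is based on accuracy alone. With roughly three benign cases for every malignant one in this test split, per-class rates add useful detail; an illustrative sketch for the XGBoost predictions:

# Illustrative: sensitivity (recall on the malignant class) and specificity from the confusion matrix
cm<-table(vec_obs,prXGB)
sensitivity<-cm["1","1"]/sum(cm["1",])
specificity<-cm["0","0"]/sum(cm["0",])
c(Sensitivity=sensitivity,Specificity=specificity)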