# Partitions a dataset in preparation for cross validation.
# Arguments: the dataset, the percentage of rows to assign to the training set,
# and the percentage of rows to assign to the cross-validation (CV) set.
# Returns a list of three data frames: dttrain, dtcv and dttest, in that order.
splitDataSetForCV<-function(dataSet,nTraining,nCV){
nregistros<-ceiling(nrow(dataSet)*(nTraining/100))
indexRows<-sample(nrow(dataSet),nregistros,replace=FALSE)
dttrain<-dataSet[indexRows,]
dttemp<-dataSet[-indexRows,]
tope<-ceiling(nrow(dataSet)*(nCV/100))
dtcv<-dttemp[1:tope,]
dttest<-dttemp[(tope+1):nrow(dttemp),]
listDF<-list(dttrain,dtcv,dttest)
return(listDF)
}
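# Quick sanity check of the partition sizes (an illustrative sketch only; the
# 200-row toy data frame below is not part of the analysis). With 70/15, the
# remaining 15% of the rows ends up in the test set.
toy<-data.frame(x=rnorm(200),y=rnorm(200))
sapply(splitDataSetForCV(toy,70,15),nrow) # 140, 30 and 30 rows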
# Load the breast cancer data and drop the first column (the sample id).
breast_cancer<-read.csv("https://raw.githubusercontent.com/arturo-laflor/breast-cancer/master/breast_cancer_nn/breast_cancer.csv",header = TRUE,sep = ",")
ds<-breast_cancer[-1]
# Remove incomplete rows and rows where v6 holds the missing-value marker "?".
ds<-ds[complete.cases(ds),]
ds<-ds[ds$v6!="?",]
# Recode the target R (column 10) from 2/4 (benign/malignant) to 0/1.
ds[which(ds$R==2),10]<-0
ds[which(ds$R==4),10]<-1
ds<-as.data.frame(sapply(ds,as.numeric))
# Split into training (70%), CV (15%) and test (~15%) partitions.
lstDS<-splitDataSetForCV(ds,70,15)
ds<-lstDS[[1]]
dstt<-lstDS[[3]]
# Builds a model formula of the form target ~ predictor1 + predictor2 + ...
# from a vector of column names and the name of the target variable.
get_formula<-function(vec_names,var_objetivo){
var_objetivo_f<-paste(var_objetivo,"~",sep = "")
f<-as.formula(paste(var_objetivo_f,paste(vec_names[!vec_names %in% var_objetivo],collapse = " + ")))
return(f)
}
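# Usage example (a sketch): applied to this dataset, where the last column is
# the target R, the helper produces the formula
# R ~ v1 + v2 + v3 + v4 + v5 + v6 + v7 + v8 + v9.
f_example<-get_formula(names(ds),names(ds)[dim(ds)[2]])
f_example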
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-13
# Elastic-net logistic regression, with lambda chosen by cross validation
# (alpha = 0.9 mixes the lasso and ridge penalties; up to nlambda = 20 values are tried).
dblAlpha=0.9
iLambda=20
x<-as.matrix(ds[,1:(dim(ds)[2]-1)])
y<-as.matrix(ds[,(dim(ds)[2])])
ModelLR<-cv.glmnet(x,y, alpha = dblAlpha, nlambda = iLambda, family = "binomial",type.measure = "class")
lmin<-ModelLR$lambda.min
coefm<-coef(ModelLR,s=lmin)
matrixcoef<-as.matrix(coefm)
plot(ModelLR)
print(matrixcoef)
## 1
## (Intercept) -7.37691722
## v1 0.40469056
## v2 0.03980427
## v3 0.33243689
## v4 0.27904242
## v5 0.13480465
## v6 0.15098408
## v7 0.44584055
## v8 0.07211616
## v9 0.00000000
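# A small follow-up sketch: list the predictors whose coefficients the
# elastic-net penalty shrank exactly to zero (per the printout above, only v9).
rownames(matrixcoef)[matrixcoef[,1]==0]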
library(randomForest)
library(ggplot2)
# Because the target column is numeric (0/1), randomForest fits a regression
# forest here; its predictions are rounded back to 0/1 further below.
f<-get_formula(names(ds),names(ds)[dim(ds)[2]])
ModelRF<-randomForest(f, data=ds, ntree=500,importance=TRUE)
print(ModelRF)
##
## Call:
## randomForest(formula = f, data = ds, ntree = 500, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 0.02379069
## % Var explained: 89.55
print(importance(ModelRF,type = 2))
## IncNodePurity
## v1 6.1847400
## v2 25.6336821
## v3 29.1888487
## v4 4.8676966
## v5 8.8472852
## v6 5.1402952
## v7 18.1808556
## v8 8.0814899
## v9 0.2944162
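# Since importance=TRUE was requested above, permutation-based importance
# (type = 1, %IncMSE) is also available as a complement to node purity:
print(importance(ModelRF,type = 1))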
varImpPlot(ModelRF)
plot(ModelRF)
library(xgboost)
library(DiagrammeR)
f<-get_formula(names(ds),names(ds)[dim(ds)[2]])
ds3B<-ds
# Build a sparse design matrix of the predictors and a numeric 0/1 label vector.
sm<-sparse.model.matrix(f,data = ds3B)
target_vec<-(as.numeric(ds3B[,dim(ds3B)[2]]))
# Gradient-boosted trees: 50 shallow trees (max_depth = 2) with learning rate eta = 1.
ModelXGB <- xgboost(data = sm, label = target_vec, max_depth = 2,
eta = 1, nthread = 2, nrounds = 50,objective = "binary:logistic")
## [1] train-error:0.041754
## [2] train-error:0.027140
## [3] train-error:0.016701
## [4] train-error:0.012526
## [5] train-error:0.012526
## [6] train-error:0.012526
## [7] train-error:0.014614
## [8] train-error:0.010438
## [9] train-error:0.008351
## [10] train-error:0.010438
## [11] train-error:0.010438
## [12] train-error:0.008351
## [13] train-error:0.010438
## [14] train-error:0.006263
## [15] train-error:0.006263
## [16] train-error:0.006263
## [17] train-error:0.006263
## [18] train-error:0.006263
## [19] train-error:0.006263
## [20] train-error:0.004175
## [21] train-error:0.004175
## [22] train-error:0.004175
## [23] train-error:0.002088
## [24] train-error:0.004175
## [25] train-error:0.002088
## [26] train-error:0.002088
## [27] train-error:0.002088
## [28] train-error:0.002088
## [29] train-error:0.002088
## [30] train-error:0.002088
## [31] train-error:0.002088
## [32] train-error:0.002088
## [33] train-error:0.002088
## [34] train-error:0.002088
## [35] train-error:0.002088
## [36] train-error:0.002088
## [37] train-error:0.002088
## [38] train-error:0.002088
## [39] train-error:0.002088
## [40] train-error:0.002088
## [41] train-error:0.002088
## [42] train-error:0.002088
## [43] train-error:0.002088
## [44] train-error:0.002088
## [45] train-error:0.002088
## [46] train-error:0.002088
## [47] train-error:0.002088
## [48] train-error:0.002088
## [49] train-error:0.000000
## [50] train-error:0.002088
matrixcoef<-xgb.importance(feature_names = colnames(sm), model = ModelXGB)
xgb.ggplot.importance(importance_matrix = matrixcoef,
rel_to_first = TRUE,xlab="Relative Importance",n_clusters = c(1:4))
xgb.plot.tree(feature_names = names(ds3B), model = ModelXGB)
library(neuralnet)
f<-get_formula(names(ds),names(ds)[dim(ds)[2]])
# Single hidden layer with 4 units; linear.output = F applies the activation
# function to the output, as appropriate for classification.
ModelNN<-neuralnet(f,data = ds,hidden=c(4),linear.output = F)
plot(ModelNN)
# Builds a grid of candidate SVM cost values: successive values triple from
# costo_min; the first value to exceed costo_max is dropped and costo_max
# itself is appended instead.
construye_vec_costos<-function(costo_min,costo_max){
vecc<-costo_min
cost_temp<-costo_min
while(cost_temp<costo_max){
cost_temp<-cost_temp*3
vecc<-c(vecc,cost_temp)
}
vecc<-vecc[1:(length(vecc)-1)]
vecc<-c(vecc,costo_max)
return(vecc)
}
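# Example: for the range used below the helper yields the cost grid
# 0.01, 0.03, 0.09, 0.27, 0.81, 2.43, 7.29, 10 (the optimal C reported
# further down, 0.81, is one of these candidates).
construye_vec_costos(0.01,10)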
library(e1071)
f<-get_formula(names(ds),names(ds)[dim(ds)[2]])
vec_costo<-construye_vec_costos(0.01,10)
kernel<-"radial"
# Grid search over the SVM cost and gamma parameters with a radial kernel
# (degree is only used by the polynomial kernel and is ignored here).
tune_svm<-tune(svm,f,data = ds,kernel=kernel,degree=1,
ranges=list(cost=vec_costo,gamma=c(0.5,1,1.5,2,2.5,3.0)))
mejor_costo<-tune_svm$best.model$cost
mejor_gamma<-tune_svm$best.model$gamma
mejor_epsilon<-tune_svm$best.model$epsilon
performance<-tune_svm$performances
menor_error<-min(performance[,3])
ModelSVM<-tune_svm$best.model
X<-cbind.data.frame(C=performance[,1],Gamma=as.factor(performance[,2]),Error=performance[,3])
if(identical(kernel,'linear')){
X<-X[which(X$Gamma==mejor_gamma),]
}
gg<-ggplot(data=X)+
geom_point(aes(x=ifelse(C<1,-1/log2(C),log2(C)),y=Error,shape=Gamma, color=Gamma))+
labs(x='C',y='Error')
gg
dfparam<-cbind.data.frame(Parameters=c("C","Gamma","Error"),
Optimal.Values=c(mejor_costo,mejor_gamma,menor_error))
dfparam
## Parameters Optimal.Values
## 1 C 0.81000000000
## 2 Gamma 0.50000000000
## 3 Error 0.03286164096
# Hold-out evaluation: separate the observed labels from the test-set predictors.
vec_obs<-dstt[,(dim(dstt)[2])]
dsTest<-dstt[,1:(dim(dstt)[2]-1)]
#Logistic Regression
dsTestLR<-as.matrix(dsTest)
prLR<-predict(ModelLR,newx=dsTestLR,type="class",s="lambda.min")
mean(vec_obs==prLR)
## [1] 0.9603960396
table(vec_obs,prLR)
## prLR
## vec_obs 0 1
## 0 73 3
## 1 1 24
# Random Forest
prRF<-round(predict(ModelRF,dsTest),digits = 0)
mean(vec_obs==prRF)
## [1] 0.9603960396
table(vec_obs,prRF)
## prRF
## vec_obs 0 1
## 0 73 3
## 1 1 24
#NeuralNet
compNN<-compute(ModelNN,dsTest)
prNN<-round(compNN$net.result)
mean(vec_obs==prNN)
## [1] 0.9504950495
table(vec_obs,prNN)
## prNN
## vec_obs 0 1
## 0 72 4
## 1 1 24
# XGBoost
xgbTest<-cbind.data.frame(dsTest,R=vec_obs)
stm<-sparse.model.matrix(R~.,data = xgbTest)
prXGB<-round(predict(ModelXGB,stm),digits = 0)
mean(vec_obs==prXGB)
## [1] 0.9702970297
table(vec_obs,prXGB)
## prXGB
## vec_obs 0 1
## 0 73 3
## 1 0 25
#SVM
prSVM<-round(predict(ModelSVM,dsTest))
mean(vec_obs==prSVM)
## [1] 0.9603960396
table(vec_obs,prSVM)
## prSVM
## vec_obs 0 1
## 0 72 4
## 1 0 25
means <- matrix(c(mean(vec_obs==prLR),mean(vec_obs==prRF),mean(vec_obs==prNN),mean(vec_obs==prXGB),mean(vec_obs==prSVM)), nrow=5, ncol=1)
rownames(means)<-c("Regression", "Random Forest", "Neural Net", "XGBoost", "SVM")
colnames(means)<-"Accuracy"
as.data.frame(means)
##                   Accuracy
## Regression    0.9603960396
## Random Forest 0.9603960396
## Neural Net    0.9504950495
## XGBoost       0.9702970297
## SVM           0.9603960396
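# The five evaluation blocks above repeat the same accuracy/confusion-table
# pattern; a minimal helper (a sketch, reusing the vec_obs vector defined
# above) could fold each of them into a single call:
evalua_modelo<-function(pred,obs=vec_obs){
list(accuracy=mean(obs==pred),confusion=table(obs,pred))
}
evalua_modelo(prXGB)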