Conjunto de datos vinos
wines <- read.csv('wine.txt', header=FALSE, sep = ',')
head(wines)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14
## 1 1 14.23 1.71 2.43 15.6 127 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065
## 2 1 13.20 1.78 2.14 11.2 100 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050
## 3 1 13.16 2.36 2.67 18.6 101 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185
## 4 1 14.37 1.95 2.50 16.8 113 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480
## 5 1 13.24 2.59 2.87 21.0 118 2.80 2.69 0.39 1.82 4.32 1.04 2.93 735
## 6 1 14.20 1.76 2.45 15.2 112 3.27 3.39 0.34 1.97 6.75 1.05 2.85 1450
names(wines) <- c("label","Alcohol","Malic_acid","Ash","Alcalinity_of_ash","Magnesium","Total_phenols","Flavanoids","Nonflavanoid_phenols","Proanthocyanins","Color_intensity","Hue","OD280_OD315_of_diluted_wines","Proline")
summary(wines)
## label Alcohol Malic_acid Ash
## Min. :1.000 Min. :11.03 Min. :0.740 Min. :1.360
## 1st Qu.:1.000 1st Qu.:12.36 1st Qu.:1.603 1st Qu.:2.210
## Median :2.000 Median :13.05 Median :1.865 Median :2.360
## Mean :1.938 Mean :13.00 Mean :2.336 Mean :2.367
## 3rd Qu.:3.000 3rd Qu.:13.68 3rd Qu.:3.083 3rd Qu.:2.558
## Max. :3.000 Max. :14.83 Max. :5.800 Max. :3.230
## Alcalinity_of_ash Magnesium Total_phenols Flavanoids
## Min. :10.60 Min. : 70.00 Min. :0.980 Min. :0.340
## 1st Qu.:17.20 1st Qu.: 88.00 1st Qu.:1.742 1st Qu.:1.205
## Median :19.50 Median : 98.00 Median :2.355 Median :2.135
## Mean :19.49 Mean : 99.74 Mean :2.295 Mean :2.029
## 3rd Qu.:21.50 3rd Qu.:107.00 3rd Qu.:2.800 3rd Qu.:2.875
## Max. :30.00 Max. :162.00 Max. :3.880 Max. :5.080
## Nonflavanoid_phenols Proanthocyanins Color_intensity Hue
## Min. :0.1300 Min. :0.410 Min. : 1.280 Min. :0.4800
## 1st Qu.:0.2700 1st Qu.:1.250 1st Qu.: 3.220 1st Qu.:0.7825
## Median :0.3400 Median :1.555 Median : 4.690 Median :0.9650
## Mean :0.3619 Mean :1.591 Mean : 5.058 Mean :0.9574
## 3rd Qu.:0.4375 3rd Qu.:1.950 3rd Qu.: 6.200 3rd Qu.:1.1200
## Max. :0.6600 Max. :3.580 Max. :13.000 Max. :1.7100
## OD280_OD315_of_diluted_wines Proline
## Min. :1.270 Min. : 278.0
## 1st Qu.:1.938 1st Qu.: 500.5
## Median :2.780 Median : 673.5
## Mean :2.612 Mean : 746.9
## 3rd Qu.:3.170 3rd Qu.: 985.0
## Max. :4.000 Max. :1680.0
Exploración del conjunto de datos
str(wines)
## 'data.frame': 178 obs. of 14 variables:
## $ label : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Alcohol : num 14.2 13.2 13.2 14.4 13.2 ...
## $ Malic_acid : num 1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
## $ Ash : num 2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
## $ Alcalinity_of_ash : num 15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
## $ Magnesium : int 127 100 101 113 118 112 96 121 97 98 ...
## $ Total_phenols : num 2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
## $ Flavanoids : num 3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
## $ Nonflavanoid_phenols : num 0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
## $ Proanthocyanins : num 2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
## $ Color_intensity : num 5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
## $ Hue : num 1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
## $ OD280_OD315_of_diluted_wines: num 3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
## $ Proline : int 1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
pairs(wines[2:14], col=wines$label)

Preprocesado de datos: dummy coding
library(dummies)
## dummies-1.5.6 provided by Decision Patterns
wines_D <- dummy.data.frame(wines, names = 'label')
head(wines_D)
## label1 label2 label3 Alcohol Malic_acid Ash Alcalinity_of_ash Magnesium
## 1 1 0 0 14.23 1.71 2.43 15.6 127
## 2 1 0 0 13.20 1.78 2.14 11.2 100
## 3 1 0 0 13.16 2.36 2.67 18.6 101
## 4 1 0 0 14.37 1.95 2.50 16.8 113
## 5 1 0 0 13.24 2.59 2.87 21.0 118
## 6 1 0 0 14.20 1.76 2.45 15.2 112
## Total_phenols Flavanoids Nonflavanoid_phenols Proanthocyanins
## 1 2.80 3.06 0.28 2.29
## 2 2.65 2.76 0.26 1.28
## 3 2.80 3.24 0.30 2.81
## 4 3.85 3.49 0.24 2.18
## 5 2.80 2.69 0.39 1.82
## 6 3.27 3.39 0.34 1.97
## Color_intensity Hue OD280_OD315_of_diluted_wines Proline
## 1 5.64 1.04 3.92 1065
## 2 4.38 1.05 3.40 1050
## 3 5.68 1.03 3.17 1185
## 4 7.80 0.86 3.45 1480
## 5 4.32 1.04 2.93 735
## 6 6.75 1.05 2.85 1450
Escalado de datos
min_ <- apply(wines_D,2,min)
max_ <- apply(wines_D,2,max)
wines_D_S <- as.data.frame(scale(wines_D, center = min_, scale = max_ - min_))
summary(wines_D_S)
## label1 label2 label3 Alcohol
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.3507
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.5316
## Mean :0.3315 Mean :0.3989 Mean :0.2697 Mean :0.5186
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.6967
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Malic_acid Ash Alcalinity_of_ash Magnesium
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.1705 1st Qu.:0.4545 1st Qu.:0.3402 1st Qu.:0.1957
## Median :0.2223 Median :0.5348 Median :0.4588 Median :0.3043
## Mean :0.3155 Mean :0.5382 Mean :0.4585 Mean :0.3233
## 3rd Qu.:0.4629 3rd Qu.:0.6404 3rd Qu.:0.5619 3rd Qu.:0.4022
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Total_phenols Flavanoids Nonflavanoid_phenols Proanthocyanins
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.2629 1st Qu.:0.1825 1st Qu.:0.2642 1st Qu.:0.2650
## Median :0.4741 Median :0.3787 Median :0.3962 Median :0.3612
## Mean :0.4535 Mean :0.3564 Mean :0.4375 Mean :0.3725
## 3rd Qu.:0.6276 3rd Qu.:0.5348 3rd Qu.:0.5802 3rd Qu.:0.4858
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Color_intensity Hue OD280_OD315_of_diluted_wines
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.1655 1st Qu.:0.2459 1st Qu.:0.2445
## Median :0.2910 Median :0.3943 Median :0.5531
## Mean :0.3224 Mean :0.3882 Mean :0.4915
## 3rd Qu.:0.4198 3rd Qu.:0.5203 3rd Qu.:0.6960
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## Proline
## Min. :0.0000
## 1st Qu.:0.1587
## Median :0.2821
## Mean :0.3344
## 3rd Qu.:0.5043
## Max. :1.0000
Red Neuronal neuralnet
library(neuralnet)
var_ind <- paste(names(wines_D_S)[1:3], collapse = '+')
var_ind <- paste(var_ind,'~')
var_dep <- paste(names(wines_D_S)[4:16], collapse = '+')
formula <- as.formula(paste(var_ind,var_dep))
formula
## label1 + label2 + label3 ~ Alcohol + Malic_acid + Ash + Alcalinity_of_ash +
## Magnesium + Total_phenols + Flavanoids + Nonflavanoid_phenols +
## Proanthocyanins + Color_intensity + Hue + OD280_OD315_of_diluted_wines +
## Proline
# act.fct = 'logistic' Usamos la función logística o sigmoide como función de activación. También se puede aplicar la función tangente hiperbólica 'tanh'. No existe la función rampa 'ReLU' en este paquete.
# linear.output = FALSE Con esto indicamos que quiero aplicar la función de activación y no estoy realizando una tarea de regresión
NN <- neuralnet(formula, data = wines_D_S, hidden = c(13,10,3), act.fct = 'logistic', linear.output = FALSE, lifesign = 'minimal')
## hidden: 13, 10, 3 thresh: 0.01 rep: 1/1 steps: 78 error: 0.01569 time: 0.09 secs
plot(NN)
Predicción con neuralnet
pr_nn <- compute(NN, wines_D_S[,4:16])
head(pr_nn$net.result)
## [,1] [,2] [,3]
## 1 0.9963691045 0.002834947165 0.0007161293934
## 2 0.9943113162 0.004392518861 0.0008681763015
## 3 0.9961658875 0.003094125294 0.0006691898905
## 4 0.9968189819 0.002416539709 0.0006922611838
## 5 0.9877134886 0.010014475950 0.0008267414043
## 6 0.9967506916 0.002465114009 0.0006960019506
Evaluando la precisión de la red neuronal
prediction_nn <- max.col(pr_nn$net.result)
original_values <-max.col(wines_D_S[,1:3])
mean(prediction_nn==original_values)
## [1] 1
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
confusionMatrix(as.factor(prediction_nn),as.factor(original_values))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3
## 1 59 0 0
## 2 0 71 0
## 3 0 0 48
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.9794892, 1)
## No Information Rate : 0.3988764
## P-Value [Acc > NIR] : < 0.00000000000000022204
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3
## Sensitivity 1.0000000 1.0000000 1.0000000
## Specificity 1.0000000 1.0000000 1.0000000
## Pos Pred Value 1.0000000 1.0000000 1.0000000
## Neg Pred Value 1.0000000 1.0000000 1.0000000
## Prevalence 0.3314607 0.3988764 0.2696629
## Detection Rate 0.3314607 0.3988764 0.2696629
## Detection Prevalence 0.3314607 0.3988764 0.2696629
## Balanced Accuracy 1.0000000 1.0000000 1.0000000
Validación cruzada del clasificador con bucle for
set.seed(500)
k <- 10 # número de sobres
proportion <- 0.8 # proporcion para la extracción del conjunto de entrenamiento
kappa_cv <- NULL
accuracy_cv <- NULL
for(i in 1:k) {
index <- sample(1:nrow(wines_D_S), round(proportion*nrow(wines_D_S)))
train_cv <- wines_D_S[index,]
test_cv <- wines_D_S[-index,]
NN_cv <- neuralnet(formula, train_cv, hidden=c(13,10,3), act.fct = 'logistic', linear.output = FALSE, lifesign = 'minimal')
pred_cv <- compute(NN_cv, test_cv[,4:16])
prediction_nn <- max.col(pred_cv$net.result)
original_values <- max.col(test_cv[,1:3])
object_mat <-confusionMatrix(as.factor(prediction_nn), as.factor(original_values))
accuracy_cv[i] <- object_mat$overall[1]
kappa_cv[i] <- object_mat$overall[2]
}
## hidden: 13, 10, 3 thresh: 0.01 rep: 1/1 steps: 86 error: 0.02138 time: 0.13 secs
## hidden: 13, 10, 3 thresh: 0.01 rep: 1/1 steps: 91 error: 0.01018 time: 0.05 secs
## hidden: 13, 10, 3 thresh: 0.01 rep: 1/1 steps: 82 error: 0.01941 time: 0.05 secs
## hidden: 13, 10, 3 thresh: 0.01 rep: 1/1 steps: 101 error: 0.02347 time: 0.06 secs
## hidden: 13, 10, 3 thresh: 0.01 rep: 1/1 steps: 107 error: 0.01408 time: 0.06 secs
## hidden: 13, 10, 3 thresh: 0.01 rep: 1/1 steps: 85 error: 0.0195 time: 0.05 secs
## hidden: 13, 10, 3 thresh: 0.01 rep: 1/1 steps: 75 error: 0.01748 time: 0.05 secs
## hidden: 13, 10, 3 thresh: 0.01 rep: 1/1 steps: 136 error: 0.02131 time: 0.08 secs
## hidden: 13, 10, 3 thresh: 0.01 rep: 1/1 steps: 80 error: 0.01305 time: 0.05 secs
## hidden: 13, 10, 3 thresh: 0.01 rep: 1/1 steps: 68 error: 0.01995 time: 0.04 secs
accuracy_cv
## [1] 1.0000000000 1.0000000000 1.0000000000 0.9444444444 1.0000000000
## [6] 1.0000000000 1.0000000000 1.0000000000 0.9722222222 1.0000000000
mean(accuracy_cv)
## [1] 0.9916666667
kappa_cv
## [1] 1.0000000000 1.0000000000 1.0000000000 0.9150943396 1.0000000000
## [6] 1.0000000000 1.0000000000 1.0000000000 0.9560439560 1.0000000000
mean(kappa_cv)
## [1] 0.9871138296