Conjunto de datos vinos

wines <- read.csv('wine.txt', header=FALSE, sep = ',')
head(wines)
##   V1    V2   V3   V4   V5  V6   V7   V8   V9  V10  V11  V12  V13  V14
## 1  1 14.23 1.71 2.43 15.6 127 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065
## 2  1 13.20 1.78 2.14 11.2 100 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050
## 3  1 13.16 2.36 2.67 18.6 101 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185
## 4  1 14.37 1.95 2.50 16.8 113 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480
## 5  1 13.24 2.59 2.87 21.0 118 2.80 2.69 0.39 1.82 4.32 1.04 2.93  735
## 6  1 14.20 1.76 2.45 15.2 112 3.27 3.39 0.34 1.97 6.75 1.05 2.85 1450
names(wines) <- c("label","Alcohol","Malic_acid","Ash","Alcalinity_of_ash","Magnesium","Total_phenols","Flavanoids","Nonflavanoid_phenols","Proanthocyanins","Color_intensity","Hue","OD280_OD315_of_diluted_wines","Proline")
summary(wines)
##      label          Alcohol        Malic_acid         Ash       
##  Min.   :1.000   Min.   :11.03   Min.   :0.740   Min.   :1.360  
##  1st Qu.:1.000   1st Qu.:12.36   1st Qu.:1.603   1st Qu.:2.210  
##  Median :2.000   Median :13.05   Median :1.865   Median :2.360  
##  Mean   :1.938   Mean   :13.00   Mean   :2.336   Mean   :2.367  
##  3rd Qu.:3.000   3rd Qu.:13.68   3rd Qu.:3.083   3rd Qu.:2.558  
##  Max.   :3.000   Max.   :14.83   Max.   :5.800   Max.   :3.230  
##  Alcalinity_of_ash   Magnesium      Total_phenols     Flavanoids   
##  Min.   :10.60     Min.   : 70.00   Min.   :0.980   Min.   :0.340  
##  1st Qu.:17.20     1st Qu.: 88.00   1st Qu.:1.742   1st Qu.:1.205  
##  Median :19.50     Median : 98.00   Median :2.355   Median :2.135  
##  Mean   :19.49     Mean   : 99.74   Mean   :2.295   Mean   :2.029  
##  3rd Qu.:21.50     3rd Qu.:107.00   3rd Qu.:2.800   3rd Qu.:2.875  
##  Max.   :30.00     Max.   :162.00   Max.   :3.880   Max.   :5.080  
##  Nonflavanoid_phenols Proanthocyanins Color_intensity       Hue        
##  Min.   :0.1300       Min.   :0.410   Min.   : 1.280   Min.   :0.4800  
##  1st Qu.:0.2700       1st Qu.:1.250   1st Qu.: 3.220   1st Qu.:0.7825  
##  Median :0.3400       Median :1.555   Median : 4.690   Median :0.9650  
##  Mean   :0.3619       Mean   :1.591   Mean   : 5.058   Mean   :0.9574  
##  3rd Qu.:0.4375       3rd Qu.:1.950   3rd Qu.: 6.200   3rd Qu.:1.1200  
##  Max.   :0.6600       Max.   :3.580   Max.   :13.000   Max.   :1.7100  
##  OD280_OD315_of_diluted_wines    Proline      
##  Min.   :1.270                Min.   : 278.0  
##  1st Qu.:1.938                1st Qu.: 500.5  
##  Median :2.780                Median : 673.5  
##  Mean   :2.612                Mean   : 746.9  
##  3rd Qu.:3.170                3rd Qu.: 985.0  
##  Max.   :4.000                Max.   :1680.0

Exploración del conjunto de datos

str(wines)
## 'data.frame':    178 obs. of  14 variables:
##  $ label                       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Alcohol                     : num  14.2 13.2 13.2 14.4 13.2 ...
##  $ Malic_acid                  : num  1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
##  $ Ash                         : num  2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
##  $ Alcalinity_of_ash           : num  15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
##  $ Magnesium                   : int  127 100 101 113 118 112 96 121 97 98 ...
##  $ Total_phenols               : num  2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
##  $ Flavanoids                  : num  3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
##  $ Nonflavanoid_phenols        : num  0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
##  $ Proanthocyanins             : num  2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
##  $ Color_intensity             : num  5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
##  $ Hue                         : num  1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
##  $ OD280_OD315_of_diluted_wines: num  3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
##  $ Proline                     : int  1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
pairs(wines[2:14], col=wines$label)

Preprocesado de datos: dummy coding

library(dummies)
## dummies-1.5.6 provided by Decision Patterns
wines_D <- dummy.data.frame(wines, names = 'label')
head(wines_D)
##   label1 label2 label3 Alcohol Malic_acid  Ash Alcalinity_of_ash Magnesium
## 1      1      0      0   14.23       1.71 2.43              15.6       127
## 2      1      0      0   13.20       1.78 2.14              11.2       100
## 3      1      0      0   13.16       2.36 2.67              18.6       101
## 4      1      0      0   14.37       1.95 2.50              16.8       113
## 5      1      0      0   13.24       2.59 2.87              21.0       118
## 6      1      0      0   14.20       1.76 2.45              15.2       112
##   Total_phenols Flavanoids Nonflavanoid_phenols Proanthocyanins
## 1          2.80       3.06                 0.28            2.29
## 2          2.65       2.76                 0.26            1.28
## 3          2.80       3.24                 0.30            2.81
## 4          3.85       3.49                 0.24            2.18
## 5          2.80       2.69                 0.39            1.82
## 6          3.27       3.39                 0.34            1.97
##   Color_intensity  Hue OD280_OD315_of_diluted_wines Proline
## 1            5.64 1.04                         3.92    1065
## 2            4.38 1.05                         3.40    1050
## 3            5.68 1.03                         3.17    1185
## 4            7.80 0.86                         3.45    1480
## 5            4.32 1.04                         2.93     735
## 6            6.75 1.05                         2.85    1450

Escalado de datos

min_ <- apply(wines_D,2,min)
max_ <- apply(wines_D,2,max)
wines_D_S <- as.data.frame(scale(wines_D, center = min_, scale = max_ - min_))
summary(wines_D_S)
##      label1           label2           label3          Alcohol      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.3507  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.5316  
##  Mean   :0.3315   Mean   :0.3989   Mean   :0.2697   Mean   :0.5186  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.6967  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##    Malic_acid          Ash         Alcalinity_of_ash   Magnesium     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000    Min.   :0.0000  
##  1st Qu.:0.1705   1st Qu.:0.4545   1st Qu.:0.3402    1st Qu.:0.1957  
##  Median :0.2223   Median :0.5348   Median :0.4588    Median :0.3043  
##  Mean   :0.3155   Mean   :0.5382   Mean   :0.4585    Mean   :0.3233  
##  3rd Qu.:0.4629   3rd Qu.:0.6404   3rd Qu.:0.5619    3rd Qu.:0.4022  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000    Max.   :1.0000  
##  Total_phenols      Flavanoids     Nonflavanoid_phenols Proanthocyanins 
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000       Min.   :0.0000  
##  1st Qu.:0.2629   1st Qu.:0.1825   1st Qu.:0.2642       1st Qu.:0.2650  
##  Median :0.4741   Median :0.3787   Median :0.3962       Median :0.3612  
##  Mean   :0.4535   Mean   :0.3564   Mean   :0.4375       Mean   :0.3725  
##  3rd Qu.:0.6276   3rd Qu.:0.5348   3rd Qu.:0.5802       3rd Qu.:0.4858  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000       Max.   :1.0000  
##  Color_intensity       Hue         OD280_OD315_of_diluted_wines
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000              
##  1st Qu.:0.1655   1st Qu.:0.2459   1st Qu.:0.2445              
##  Median :0.2910   Median :0.3943   Median :0.5531              
##  Mean   :0.3224   Mean   :0.3882   Mean   :0.4915              
##  3rd Qu.:0.4198   3rd Qu.:0.5203   3rd Qu.:0.6960              
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000              
##     Proline      
##  Min.   :0.0000  
##  1st Qu.:0.1587  
##  Median :0.2821  
##  Mean   :0.3344  
##  3rd Qu.:0.5043  
##  Max.   :1.0000

Red Neuronal neuralnet

library(neuralnet)
var_ind <- paste(names(wines_D_S)[1:3], collapse = '+')
var_ind <- paste(var_ind,'~')
var_dep <- paste(names(wines_D_S)[4:16], collapse = '+')
formula <- as.formula(paste(var_ind,var_dep))
formula
## label1 + label2 + label3 ~ Alcohol + Malic_acid + Ash + Alcalinity_of_ash + 
##     Magnesium + Total_phenols + Flavanoids + Nonflavanoid_phenols + 
##     Proanthocyanins + Color_intensity + Hue + OD280_OD315_of_diluted_wines + 
##     Proline
# act.fct = 'logistic' Usamos la función logística o sigmoide como función de activación. También se puede aplicar la función tangente hiperbólica 'tanh'. No existe la función rampa 'ReLU' en este paquete.
# linear.output = FALSE Con esto indicamos que quiero aplicar la función de activación y no estoy realizando una tarea de regresión

NN <- neuralnet(formula, data = wines_D_S, hidden = c(13,10,3), act.fct = 'logistic', linear.output = FALSE, lifesign = 'minimal') 
## hidden: 13, 10, 3    thresh: 0.01    rep: 1/1    steps:      78  error: 0.01569  time: 0.09 secs
plot(NN)

Predicción con neuralnet

pr_nn <- compute(NN, wines_D_S[,4:16])
head(pr_nn$net.result)
##           [,1]           [,2]            [,3]
## 1 0.9963691045 0.002834947165 0.0007161293934
## 2 0.9943113162 0.004392518861 0.0008681763015
## 3 0.9961658875 0.003094125294 0.0006691898905
## 4 0.9968189819 0.002416539709 0.0006922611838
## 5 0.9877134886 0.010014475950 0.0008267414043
## 6 0.9967506916 0.002465114009 0.0006960019506

Evaluando la precisión de la red neuronal

prediction_nn <- max.col(pr_nn$net.result)
original_values <-max.col(wines_D_S[,1:3])
mean(prediction_nn==original_values)
## [1] 1
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
confusionMatrix(as.factor(prediction_nn),as.factor(original_values))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3
##          1 59  0  0
##          2  0 71  0
##          3  0  0 48
## 
## Overall Statistics
##                                                   
##                Accuracy : 1                       
##                  95% CI : (0.9794892, 1)          
##     No Information Rate : 0.3988764               
##     P-Value [Acc > NIR] : < 0.00000000000000022204
##                                                   
##                   Kappa : 1                       
##  Mcnemar's Test P-Value : NA                      
## 
## Statistics by Class:
## 
##                       Class: 1  Class: 2  Class: 3
## Sensitivity          1.0000000 1.0000000 1.0000000
## Specificity          1.0000000 1.0000000 1.0000000
## Pos Pred Value       1.0000000 1.0000000 1.0000000
## Neg Pred Value       1.0000000 1.0000000 1.0000000
## Prevalence           0.3314607 0.3988764 0.2696629
## Detection Rate       0.3314607 0.3988764 0.2696629
## Detection Prevalence 0.3314607 0.3988764 0.2696629
## Balanced Accuracy    1.0000000 1.0000000 1.0000000

Validación cruzada del clasificador con bucle for

set.seed(500)
k <- 10 # número de sobres
proportion <- 0.8 # proporcion para la extracción del conjunto de entrenamiento
kappa_cv <- NULL
accuracy_cv <- NULL
for(i in 1:k) {
  index <- sample(1:nrow(wines_D_S), round(proportion*nrow(wines_D_S)))
  train_cv <- wines_D_S[index,]
  test_cv <- wines_D_S[-index,]
  NN_cv <- neuralnet(formula, train_cv, hidden=c(13,10,3), act.fct = 'logistic', linear.output = FALSE, lifesign = 'minimal')
  pred_cv <- compute(NN_cv, test_cv[,4:16])
  prediction_nn <- max.col(pred_cv$net.result)
  original_values <- max.col(test_cv[,1:3])
  object_mat <-confusionMatrix(as.factor(prediction_nn), as.factor(original_values))
  accuracy_cv[i] <- object_mat$overall[1]
  kappa_cv[i] <- object_mat$overall[2]
}
## hidden: 13, 10, 3    thresh: 0.01    rep: 1/1    steps:      86  error: 0.02138  time: 0.13 secs
## hidden: 13, 10, 3    thresh: 0.01    rep: 1/1    steps:      91  error: 0.01018  time: 0.05 secs
## hidden: 13, 10, 3    thresh: 0.01    rep: 1/1    steps:      82  error: 0.01941  time: 0.05 secs
## hidden: 13, 10, 3    thresh: 0.01    rep: 1/1    steps:     101  error: 0.02347  time: 0.06 secs
## hidden: 13, 10, 3    thresh: 0.01    rep: 1/1    steps:     107  error: 0.01408  time: 0.06 secs
## hidden: 13, 10, 3    thresh: 0.01    rep: 1/1    steps:      85  error: 0.0195   time: 0.05 secs
## hidden: 13, 10, 3    thresh: 0.01    rep: 1/1    steps:      75  error: 0.01748  time: 0.05 secs
## hidden: 13, 10, 3    thresh: 0.01    rep: 1/1    steps:     136  error: 0.02131  time: 0.08 secs
## hidden: 13, 10, 3    thresh: 0.01    rep: 1/1    steps:      80  error: 0.01305  time: 0.05 secs
## hidden: 13, 10, 3    thresh: 0.01    rep: 1/1    steps:      68  error: 0.01995  time: 0.04 secs
accuracy_cv
##  [1] 1.0000000000 1.0000000000 1.0000000000 0.9444444444 1.0000000000
##  [6] 1.0000000000 1.0000000000 1.0000000000 0.9722222222 1.0000000000
mean(accuracy_cv)
## [1] 0.9916666667
kappa_cv
##  [1] 1.0000000000 1.0000000000 1.0000000000 0.9150943396 1.0000000000
##  [6] 1.0000000000 1.0000000000 1.0000000000 0.9560439560 1.0000000000
mean(kappa_cv)
## [1] 0.9871138296