vines_dataset <- readr::read_csv("datasets/vine_train_newnames.csv", col_types = readr::cols())
#vines_dataset
The training dataset has 77 observations and 58 variables (once the `ID` column is dropped below).
#removed_numeric_var<-c("ubi_x","ubi_y","ID")
removed_numeric_var<-c("ID")
vines_dataset <- vines_dataset %>% select(-removed_numeric_var)
vines_dataset_factors <- vines_dataset %>% select(where(is.character))
names(vines_dataset_factors) %>% as.data.frame()
skimr::skim(vines_dataset_factors %>% select(-Calidad)) %>% knitr::kable() %>% kableExtra::kable_styling(font_size = 9)
skim_type | skim_variable | n_missing | complete_rate | character.min | character.max | character.empty | character.n_unique | character.whitespace |
---|---|---|---|---|---|---|---|---|
character | ubi_Finca | 0 | 1.000000 | 5 | 22 | 0 | 29 | 0 |
character | ubi_Zona | 0 | 1.000000 | 5 | 12 | 0 | 3 | 0 |
character | ubi_Distrito | 0 | 1.000000 | 6 | 16 | 0 | 18 | 0 |
character | eco_Estacion datos clima | 0 | 1.000000 | 8 | 11 | 0 | 6 | 0 |
character | suelo_Textura | 3 | 0.961039 | 6 | 16 | 0 | 6 | 0 |
character | eco_Winkler historico | 0 | 1.000000 | 2 | 3 | 0 | 2 | 0 |
vines_dataset_numeric <- vines_dataset %>% select(where(is.numeric))
names(vines_dataset_numeric) %>% as.data.frame()
skimr::skim(vines_dataset_numeric) %>% knitr::kable() %>% kableExtra::kable_styling(font_size = 9)
skim_type | skim_variable | n_missing | complete_rate | numeric.mean | numeric.sd | numeric.p0 | numeric.p25 | numeric.p50 | numeric.p75 | numeric.p100 | numeric.hist |
---|---|---|---|---|---|---|---|---|---|---|---|
numeric | uva_L hollejo | 0 | 1.0000000 | 2.874569e+01 | 5.459099e+00 | 1.821657e+01 | 2.550730e+01 | 2.807872e+01 | 3.218875e+01 | 4.261734e+01 | ▅▇▇▅▁ |
numeric | uva_a hollejo | 0 | 1.0000000 | 6.289838e+01 | 4.801958e+00 | 5.251411e+01 | 6.018530e+01 | 6.263834e+01 | 6.652651e+01 | 7.421597e+01 | ▃▅▇▆▂ |
numeric | uva_b hollejo | 0 | 1.0000000 | 4.555514e+01 | 6.056351e+00 | 3.140652e+01 | 4.208936e+01 | 4.653096e+01 | 5.020659e+01 | 5.634098e+01 | ▂▃▆▇▃ |
numeric | uva_h hollejo | 0 | 1.0000000 | 3.579853e+01 | 2.875735e+00 | 2.421982e+01 | 3.424246e+01 | 3.669681e+01 | 3.772302e+01 | 3.934650e+01 | ▁▁▂▃▇ |
numeric | uva_C hollejo | 0 | 1.0000000 | 7.776013e+01 | 6.661614e+00 | 6.121770e+01 | 7.405367e+01 | 7.862206e+01 | 8.297825e+01 | 8.889917e+01 | ▂▃▆▇▆ |
numeric | uva_FT(mg/g hollejo) | 0 | 1.0000000 | 6.815455e+00 | 1.973878e+00 | 2.730000e+00 | 5.420000e+00 | 7.010000e+00 | 8.170000e+00 | 1.315000e+01 | ▂▇▇▂▁ |
numeric | uva_FT(mg/g baya) | 0 | 1.0000000 | 5.184416e-01 | 1.816957e-01 | 2.300000e-01 | 4.000000e-01 | 5.200000e-01 | 6.400000e-01 | 1.200000e+00 | ▇▇▅▁▁ |
numeric | uva_FT(mg/baya) | 0 | 1.0000000 | 1.138442e+00 | 3.360023e-01 | 4.300000e-01 | 9.300000e-01 | 1.160000e+00 | 1.350000e+00 | 2.090000e+00 | ▂▆▇▃▁ |
numeric | uva_TAN(mg/g hollejo) | 0 | 1.0000000 | 1.475195e+00 | 4.707262e-01 | 4.200000e-01 | 1.170000e+00 | 1.450000e+00 | 1.850000e+00 | 2.580000e+00 | ▃▇▇▆▂ |
numeric | uva_TAN(mg/g baya) | 0 | 1.0000000 | 1.109091e-01 | 3.970890e-02 | 4.000000e-02 | 8.000000e-02 | 1.000000e-01 | 1.300000e-01 | 2.400000e-01 | ▃▇▅▁▁ |
numeric | uva_TAN(mg/baya) | 0 | 1.0000000 | 2.463636e-01 | 7.803350e-02 | 6.000000e-02 | 1.900000e-01 | 2.400000e-01 | 3.000000e-01 | 4.300000e-01 | ▂▅▇▅▂ |
numeric | uva_ANT(mg/g hollejo) | 0 | 1.0000000 | 3.022987e+00 | 1.508207e+00 | 5.300000e-01 | 1.610000e+00 | 3.090000e+00 | 4.390000e+00 | 5.810000e+00 | ▇▆▇▇▅ |
numeric | uva_ANT(mg/g baya) | 0 | 1.0000000 | 2.301299e-01 | 1.188928e-01 | 3.000000e-02 | 1.200000e-01 | 2.200000e-01 | 3.400000e-01 | 4.900000e-01 | ▇▆▆▇▂ |
numeric | uva_ANT(mg/baya) | 0 | 1.0000000 | 5.041558e-01 | 2.554993e-01 | 9.000000e-02 | 2.500000e-01 | 5.100000e-01 | 7.100000e-01 | 9.400000e-01 | ▇▅▇▅▆ |
numeric | uva_Acti antirr hollejo | 0 | 1.0000000 | 3.902727e-01 | 1.546606e-01 | 1.810000e-01 | 2.520000e-01 | 3.410000e-01 | 4.970000e-01 | 7.610000e-01 | ▇▃▅▃▂ |
numeric | ubi_x | 0 | 1.0000000 | 4.944155e+05 | 1.516253e+04 | 4.722227e+05 | 4.824171e+05 | 4.895404e+05 | 5.108191e+05 | 5.211357e+05 | ▆▇▅▂▆ |
numeric | ubi_y | 0 | 1.0000000 | 6.321765e+06 | 6.935472e+04 | 6.259165e+06 | 6.270718e+06 | 6.292645e+06 | 6.330100e+06 | 6.482843e+06 | ▇▃▁▁▂ |
numeric | planta_Nro de Racimos | 1 | 0.9870130 | 3.216886e+01 | 1.252138e+01 | 8.750000e+00 | 2.475000e+01 | 3.237500e+01 | 3.993750e+01 | 6.066667e+01 | ▅▇▇▅▂ |
numeric | planta_Rendimiento | 0 | 1.0000000 | 2.745681e+00 | 1.299585e+00 | 7.000000e-01 | 1.791250e+00 | 2.703750e+00 | 3.798333e+00 | 6.385000e+00 | ▇▇▇▂▁ |
numeric | planta_Peso racimo | 1 | 0.9870130 | 8.598426e+01 | 2.326719e+01 | 4.317500e+01 | 6.995833e+01 | 8.418750e+01 | 9.958125e+01 | 1.489500e+02 | ▃▇▇▂▁ |
numeric | planta_Peso de poda | 0 | 1.0000000 | 5.560946e-01 | 3.414147e-01 | 9.250000e-02 | 3.500000e-01 | 4.450000e-01 | 6.400000e-01 | 1.775000e+00 | ▇▇▂▁▁ |
numeric | planta_Long Cordon | 0 | 1.0000000 | 9.831293e-01 | 2.207139e-01 | 4.525000e-01 | 8.500000e-01 | 9.666667e-01 | 1.110000e+00 | 1.570000e+00 | ▂▃▇▃▁ |
numeric | planta_Ravaz | 0 | 1.0000000 | 6.870817e+00 | 4.079626e+00 | 1.199712e+00 | 4.444548e+00 | 6.121651e+00 | 8.740419e+00 | 2.237703e+01 | ▇▇▃▁▁ |
numeric | planta_Peso de baya | 0 | 1.0000000 | 1.876786e+00 | 1.078832e+00 | 9.943200e-01 | 1.582320e+00 | 1.769080e+00 | 1.981160e+00 | 1.084352e+01 | ▇▁▁▁▁ |
numeric | planta_Bayas/racimo | 1 | 0.9870130 | 4.936153e+01 | 1.610240e+01 | 9.500000e+00 | 3.750000e+01 | 4.875000e+01 | 5.668750e+01 | 9.775000e+01 | ▁▇▇▂▁ |
numeric | uva_Brix | 0 | 1.0000000 | 2.495065e+01 | 1.113981e+00 | 2.270000e+01 | 2.420000e+01 | 2.480000e+01 | 2.560000e+01 | 2.830000e+01 | ▂▇▅▁▁ |
numeric | uva_pH mosto | 0 | 1.0000000 | 3.650494e+00 | 2.499957e-01 | 3.291000e+00 | 3.550000e+00 | 3.620000e+00 | 3.678000e+00 | 5.581000e+00 | ▇▁▁▁▁ |
numeric | uva_Acidez Mosto | 0 | 1.0000000 | 3.797727e+00 | 9.571079e-01 | 2.250000e+00 | 3.375000e+00 | 3.750000e+00 | 4.125000e+00 | 9.875000e+00 | ▇▆▁▁▁ |
numeric | ubi_Altura s.n.m. | 0 | 1.0000000 | 1.115339e+03 | 1.676664e+02 | 8.050000e+02 | 9.673831e+02 | 1.094949e+03 | 1.237089e+03 | 1.405014e+03 | ▂▇▇▂▅ |
numeric | planta_Aspecto | 0 | 1.0000000 | 1.600122e+02 | 8.213981e+01 | 6.343495e+01 | 9.567704e+01 | 1.233805e+02 | 2.045968e+02 | 3.482716e+02 | ▇▂▃▁▂ |
numeric | suelo_Pendiente | 0 | 1.0000000 | 1.684270e+00 | 9.458563e-01 | 6.482576e-01 | 8.594052e-01 | 1.314315e+00 | 2.293767e+00 | 3.923592e+00 | ▇▃▂▂▁ |
numeric | suelo_Indice fondo Valle | 0 | 1.0000000 | 2.565787e+00 | 1.444410e+00 | 0.000000e+00 | 2.000000e+00 | 2.000000e+00 | 3.000000e+00 | 6.000000e+00 | ▅▇▇▁▃ |
numeric | planta_ndvi | 0 | 1.0000000 | 4.405046e-01 | 1.189833e-01 | 1.998800e-01 | 3.588160e-01 | 4.481651e-01 | 5.113118e-01 | 6.595923e-01 | ▃▃▇▇▃ |
numeric | suelo_Vol. Sedimentacion (%) | 3 | 0.9610390 | 8.367568e+01 | 1.162909e+01 | 6.800000e+01 | 7.600000e+01 | 8.000000e+01 | 8.800000e+01 | 1.160000e+02 | ▇▃▅▂▁ |
numeric | eco_GDA historico | 0 | 1.0000000 | 1.826931e+03 | 1.415219e+02 | 1.680219e+03 | 1.714931e+03 | 1.788594e+03 | 1.837215e+03 | 2.180023e+03 | ▇▆▂▁▂ |
numeric | eco_Temp. Mínima °C Temp 2020 | 0 | 1.0000000 | 1.143646e+01 | 1.306108e+00 | 1.019000e+01 | 1.019000e+01 | 1.125000e+01 | 1.187000e+01 | 1.478638e+01 | ▇▆▁▃▁ |
numeric | eco_Temp. Media °C Temp 2020 | 0 | 1.0000000 | 1.913327e+01 | 9.194734e-01 | 1.819000e+01 | 1.857000e+01 | 1.882000e+01 | 1.922000e+01 | 2.097658e+01 | ▇▇▁▁▃ |
numeric | eco_Temp. Máxima °C Temp 2020 | 0 | 1.0000000 | 2.670542e+01 | 1.390381e+00 | 2.486000e+01 | 2.558000e+01 | 2.656103e+01 | 2.704000e+01 | 2.926187e+01 | ▇▃▇▁▃ |
numeric | eco_Amplitud Térmica °C Temp 2020 | 0 | 1.0000000 | 1.512831e+01 | 1.500230e+00 | 1.177465e+01 | 1.434000e+01 | 1.468000e+01 | 1.685000e+01 | 1.685000e+01 | ▁▃▇▃▇ |
numeric | eco_Precipitaciones (mm) Temp 2020 | 16 | 0.7922078 | 2.291230e+02 | 9.511294e+01 | 8.500000e+01 | 1.780000e+02 | 2.195000e+02 | 3.350000e+02 | 3.350000e+02 | ▃▆▃▁▇ |
numeric | eco_Grados Días Acum. Temp 2020 | 0 | 1.0000000 | 1.969156e+03 | 1.905096e+02 | 1.777550e+03 | 1.851820e+03 | 1.900400e+03 | 1.992390e+03 | 2.349915e+03 | ▇▇▁▁▃ |
numeric | eco_Evapotranspiración mm Temp 2020 | 13 | 0.8311688 | 9.567577e+02 | 6.524567e+01 | 8.660000e+02 | 8.918000e+02 | 9.766000e+02 | 9.807800e+02 | 1.162590e+03 | ▃▇▁▁▁ |
numeric | eco_I. de Fresco Nocturno Temp 2020 | 0 | 1.0000000 | 1.286563e+01 | 1.497985e+00 | 1.149000e+01 | 1.149000e+01 | 1.254000e+01 | 1.348000e+01 | 1.667097e+01 | ▇▆▁▃▁ |
numeric | eco_Temp. Media Marzo °C Temp 2020 | 0 | 1.0000000 | 2.033224e+01 | 1.034615e+00 | 1.924000e+01 | 1.981000e+01 | 1.992000e+01 | 2.039000e+01 | 2.240323e+01 | ▇▇▁▁▃ |
numeric | eco_Ampl. Térmica Marzo °C Temp 2020 | 0 | 1.0000000 | 1.533956e+01 | 1.535967e+00 | 1.138710e+01 | 1.431000e+01 | 1.452000e+01 | 1.722000e+01 | 1.722000e+01 | ▁▁▇▃▅ |
numeric | eco_Horas de Frío Temp 2020 | 29 | 0.6233766 | 1.450938e+03 | 8.676488e+01 | 1.313000e+03 | 1.313000e+03 | 1.494000e+03 | 1.516000e+03 | 1.516000e+03 | ▅▁▁▃▇ |
numeric | eco_Días con Temp.>35°C Temp 2020 | 0 | 1.0000000 | 6.792208e+00 | 7.824283e+00 | 0.000000e+00 | 1.000000e+00 | 6.000000e+00 | 7.000000e+00 | 2.300000e+01 | ▇▇▁▁▃ |
numeric | eco_Días con Heladas Temp 2020 | 0 | 1.0000000 | 8.922078e+00 | 5.609593e+00 | 0.000000e+00 | 7.000000e+00 | 7.000000e+00 | 1.300000e+01 | 1.700000e+01 | ▅▁▇▇▃ |
numeric | eco_Radiacíon W/m2 Temp 2020 | 0 | 1.0000000 | 2.554018e+02 | 2.479439e+01 | 2.263900e+02 | 2.333100e+02 | 2.426128e+02 | 2.887100e+02 | 2.887100e+02 | ▇▃▂▁▇ |
numeric | eco_Velocidad Media Viento km/h Temp 2020 | 0 | 1.0000000 | 3.736633e+00 | 1.074010e+00 | 1.976061e+00 | 3.360000e+00 | 3.390000e+00 | 4.580000e+00 | 5.420649e+00 | ▃▁▇▅▃ |
numeric | eco_Humedad Relativa Media % Temp 2020 | 0 | 1.0000000 | 5.023523e+01 | 7.963722e+00 | 3.287000e+01 | 4.948000e+01 | 5.084000e+01 | 5.246000e+01 | 6.051096e+01 | ▂▁▁▇▂ |
vines_dataset %>% group_by(Calidad) %>% summarise(total=n()) %>%
ggplot()+
geom_col(aes(x=Calidad,y=total,fill=Calidad))+
ggdark::dark_theme_bw()
res <- VIM::aggr(vines_dataset_numeric, combined = TRUE,
                 numbers = TRUE,
                 sortCombs = TRUE,
                 sortVars = TRUE,
                 labels = names(vines_dataset_numeric),
                 cex.axis = .4,
                 varheight = FALSE,
                 cex.numbers = 0.8,
                 cex.lab = 0.8,
                 prop = FALSE)
Variables sorted by number of missings: (table omitted in this rendering.)
vines_dataset_numeric_imputed <- VIM::kNN(vines_dataset_numeric)
vines_dataset_numeric <- vines_dataset_numeric_imputed %>% select(-ends_with("_imp"))
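A quick sanity check that the imputation left no missing values (a minimal sketch; the `n_missing` counts in the skim table above should now be zero):

```r
# Verify that VIM::kNN removed every NA from the numeric block.
stopifnot(sum(is.na(vines_dataset_numeric)) == 0)
# Per-column counts, all of which should be zero.
colSums(is.na(vines_dataset_numeric))
```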
library(d3heatmap)
vines_dataset_numeric_cor_matrix<-cor(vines_dataset_numeric ,method="spearman")
d3heatmap(vines_dataset_numeric_cor_matrix ,colors = "Blues",cexRow = 0.8, cexCol = 0.8)
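The reduced dataset built below drops a vector of highly correlated variable names, `highlyCorrelated_var`, which is never defined in this section. A plausible reconstruction, assuming it was derived from the Spearman matrix above via `caret::findCorrelation` (the 0.9 cutoff is an assumption):

```r
# Hypothetical reconstruction: variables whose pairwise Spearman
# correlation exceeds 0.9, flagged for removal by caret.
highlyCorrelated_idx <- caret::findCorrelation(vines_dataset_numeric_cor_matrix, cutoff = 0.9)
highlyCorrelated_var <- colnames(vines_dataset_numeric_cor_matrix)[highlyCorrelated_idx]
highlyCorrelated_var
```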
library(FactoMineR)
#vines_dataset_numeric %>% select(-highlyCorrelated_var) %>% tibble::add_column(Calidad=vines_dataset$Calidad)
res_pca <- PCA(vines_dataset_numeric %>%
                 tibble::add_column(Calidad = vines_dataset$Calidad),
               scale.unit = TRUE,
               ncp = 6,
               graph = FALSE,
               quali.sup = 52) # column index of Calidad
plot(res_pca,choix="ind",habillage=52)
par(mfcol=c(1,2))
plot(res_pca,choix="var",habillage="none",invisible = "ind") # para las variables
plotellipses(res_pca, invisible="ind",xlim=c(-6,6),ylim=c(-6,6))
One hundred bootstrap resamples of the dataset are generated to evaluate the feature selection algorithms.
vines_dataset_numeric_reduced <- vines_dataset_numeric %>%
  select(-all_of(highlyCorrelated_var)) %>%
  #select(names(var_importance[1:11])) %>%
  tibble::add_column(Calidad = as.factor(vines_dataset$Calidad))
resamples <- rsample::bootstraps(vines_dataset_numeric_reduced, strata = Calidad, times = 100)
num_of_feat<-20
vines_dataset_bootstrap<-rsample::analysis(resamples$splits[[sample(1:100,1)]])
val_dataset <- vines_dataset_bootstrap %>% group_by(Calidad) %>% sample_n(2) %>% ungroup()
train_dataset <-dplyr::setdiff(vines_dataset_numeric_reduced,val_dataset)
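This bootstrap/validation split is repeated verbatim before each feature-selection method below; a small helper could wrap it (a sketch under the same assumptions; `make_split` is hypothetical and not used by the original code):

```r
# Hypothetical helper for the recurring split: draw one bootstrap
# resample, hold out 2 plants per quality class as validation, and
# train on the remaining rows of the reduced dataset.
make_split <- function(resamples, full_data, n_val = 2) {
  idx  <- sample(seq_along(resamples$splits), 1)
  boot <- rsample::analysis(resamples$splits[[idx]])
  val  <- boot %>% dplyr::group_by(Calidad) %>% dplyr::sample_n(n_val) %>% dplyr::ungroup()
  list(train = dplyr::setdiff(full_data, val), val = val)
}
```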
var_importance_boruta <- Boruta::Boruta(Calidad ~ ., data = train_dataset)
var_importance_boruta <- Boruta::attStats(var_importance_boruta) %>% filter(decision == 'Confirmed') %>% select(meanImp) %>% arrange(desc(meanImp)) %>% top_n(num_of_feat)
Selecting by meanImp
var_importance_boruta <- var_importance_boruta %>% tibble::rownames_to_column('variable')
var_importance_boruta_final <- data.frame(variable = stringr::str_replace_all(var_importance_boruta$variable, pattern = "`", replacement = ""))
var_importance_boruta_final
var_importance <- c()
for (i in 1:100){
  vines_dataset_bootstrap <- rsample::analysis(resamples$splits[[i]])
  val_dataset <- vines_dataset_bootstrap %>% group_by(Calidad) %>% sample_n(2) %>% ungroup()
  train_dataset <- dplyr::setdiff(vines_dataset_numeric_reduced, val_dataset)
  tree <- rpart::rpart(Calidad ~ .,
                       data = train_dataset,
                       control = rpart::rpart.control(minsplit = 10))
  var_importance <- c(var_importance, tree$variable.importance[1:num_of_feat])
  #rpart.plot(tree, type = 1,
  #           extra = 101, box.palette = "GnBu",
  #           branch.lty = 3, shadow.col = "gray", nn = TRUE)
}
var_importance_cart_final <- data.frame(variable = names(var_importance), value = var_importance) %>% group_by(variable) %>% summarise(n = n()) %>% arrange(desc(n)) %>% top_n(num_of_feat)
Selecting by n
var_importance_cart_final %>% as.data.frame()
vines_dataset_bootstrap<-rsample::analysis(resamples$splits[[sample(1:100,1)]])
val_dataset <- vines_dataset_bootstrap %>% group_by(Calidad) %>% sample_n(2) %>% ungroup()
train_dataset <-dplyr::setdiff(vines_dataset_numeric_reduced,val_dataset)
rf_names <- names(train_dataset) %>%
  stringr::str_replace_all(pattern = " ", replacement = "_") %>%
  stringr::str_replace_all(pattern = "[/()%°>]+", replacement = "_")
train_dataset_rf <- train_dataset
names(train_dataset_rf) <- rf_names
rf<-randomForest::randomForest(Calidad ~ .,data=train_dataset_rf,importance=TRUE)
var_importance_rf_final <- randomForest::importance(rf) %>% as.data.frame() %>% arrange(desc(MeanDecreaseGini)) %>% tibble::rownames_to_column('variable') %>% top_n(num_of_feat, MeanDecreaseGini) %>% select(variable)
Selecting by MeanDecreaseGini
var_importance_rf_final <- data.frame(variable = names(train_dataset)[names(train_dataset_rf) %in% var_importance_rf_final$variable]) # map sanitized names back to the originals
var_importance_rf_final
library(glmnet)
vines_dataset_bootstrap<-rsample::analysis(resamples$splits[[sample(1:100,1)]])
val_dataset <- vines_dataset_bootstrap %>% group_by(Calidad) %>% sample_n(2) %>% ungroup()
train_dataset <-dplyr::setdiff(vines_dataset_numeric_reduced,val_dataset)
glmfit<-cv.glmnet(x = train_dataset %>% select(-Calidad) %>% as.matrix(),
y = train_dataset$Calidad %>% as.factor(),
family = "multinomial")
Warning (repeated for each cross-validation fold): one multinomial or binomial class has fewer than 8 observations; dangerous ground
#coef(glmfit,s = 'lambda.min')
'%ni%' <- Negate('%in%')
coefs <- coef(glmfit, s = 'lambda.min', exact = TRUE) # named 'coefs' to avoid masking base::c
var_importance_glmnet <- purrr::map(coefs, function(x) {
  inds <- which(x != 0)
  variables <- row.names(x)[inds]
  variables[variables %ni% '(Intercept)']
})
#do.call(rbind,var_importance_glmnet) %>% as.data.frame()
var_importance_glmnet_final<-var_importance_glmnet %>% unlist() %>% unique() %>% as.data.frame()
names(var_importance_glmnet_final)<-"variable"
var_importance_glmnet_final
var_importance_glmnet
$A0
[1] "uva_ANT(mg/g hollejo)" "planta_Nro de Racimos" "eco_GDA historico"
$A1
[1] "uva_C hollejo" "uva_TAN(mg/g hollejo)" "planta_Nro de Racimos"
[4] "planta_Rendimiento" "planta_Long Cordon" "uva_pH mosto"
[7] "uva_Acidez Mosto" "planta_Aspecto" "planta_ndvi"
[10] "suelo_Vol. Sedimentacion (%)" "eco_Días con Heladas Temp 2020"
$A2
[1] "uva_FT(mg/g baya)" "uva_TAN(mg/g hollejo)" "uva_Acti antirr hollejo"
[4] "planta_Peso de poda" "planta_Long Cordon" "planta_Ravaz"
[7] "planta_Bayas/racimo" "planta_ndvi" "eco_Horas de Frío Temp 2020"
[10] "eco_Días con Heladas Temp 2020"
$A3
[1] "uva_TAN(mg/g hollejo)" "planta_Peso de baya"
[3] "planta_Bayas/racimo" "planta_Aspecto"
[5] "planta_ndvi" "suelo_Vol. Sedimentacion (%)"
[7] "eco_Radiacíon W/m2 Temp 2020" "eco_Velocidad Media Viento km/h Temp 2020"
$A5
[1] "uva_h hollejo" "uva_ANT(mg/g hollejo)" "planta_Rendimiento"
[4] "planta_Peso de poda" "ubi_Altura s.n.m." "suelo_Indice fondo Valle"
[7] "suelo_Vol. Sedimentacion (%)" "eco_Radiacíon W/m2 Temp 2020"
selected_variables<-var_importance_boruta_final
selected_variables
vines_dataset_numeric_reduced %>% select(all_of(selected_variables$variable)) %>% reshape2::melt() %>%
  ggplot()+
  facet_wrap(~variable, scales = 'free', ncol = 10)+
  geom_boxplot(aes(x = variable, y = value, fill = variable), color = 'gray')+
  ggdark::dark_theme_bw()+
  theme(legend.position = "none",
        axis.title.x = element_blank(),
        axis.text.x = element_blank(), # x labels are hidden, so no angle setting is needed
        axis.ticks.x = element_blank(),
        strip.text = element_text(size = 6))
No id variables; using all as measure variables
calidad_index <- vines_dataset_numeric_reduced %>%
  select(all_of(selected_variables$variable), Calidad)
calidad_index <- which(colnames(calidad_index) == "Calidad")
library(FactoMineR)
res_pca <- PCA(vines_dataset_numeric_reduced %>%
                 select(all_of(selected_variables$variable), Calidad),
               scale.unit = TRUE,
               ncp = 6,
               graph = FALSE,
               quali.sup = calidad_index) # column index of Calidad
plot(res_pca,choix="ind",habillage=calidad_index)
#par(mfcol=c(1,2))
plot(res_pca,choix="var",habillage="none",invisible = "ind") # para las variables
plotellipses(res_pca, invisible="ind",xlim=c(-6,6),ylim=c(-6,6))
vines_dataset_bootstrap<-rsample::analysis(resamples$splits[[sample(1:100,1)]])
val_dataset <- vines_dataset_bootstrap %>% group_by(Calidad) %>% sample_n(2) %>% ungroup()
train_dataset <-dplyr::setdiff(vines_dataset_numeric_reduced,val_dataset)
tree <- rpart::rpart(Calidad ~ .,
                     data = train_dataset %>%
                       select(all_of(selected_variables$variable), Calidad),
                     control = rpart::rpart.control(minsplit = 5))
rpart.plot::rpart.plot(tree, type = 1,
                       extra = 101, box.palette = "GnBu",
                       branch.lty = 3, shadow.col = "gray", nn = TRUE)
predictions<-predict(tree,val_dataset,type = 'class')
caret::confusionMatrix(val_dataset$Calidad %>% as.factor(),predictions)
Confusion Matrix and Statistics
Reference
Prediction A0 A1 A2 A3 A5
A0 2 0 0 0 0
A1 0 2 0 0 0
A2 0 0 2 0 0
A3 0 0 0 2 0
A5 0 0 0 0 2
Overall Statistics
Accuracy : 1
95% CI : (0.6915, 1)
No Information Rate : 0.2
P-Value [Acc > NIR] : 1.024e-07
Kappa : 1
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: A0 Class: A1 Class: A2 Class: A3 Class: A5
Sensitivity 1.0 1.0 1.0 1.0 1.0
Specificity 1.0 1.0 1.0 1.0 1.0
Pos Pred Value 1.0 1.0 1.0 1.0 1.0
Neg Pred Value 1.0 1.0 1.0 1.0 1.0
Prevalence 0.2 0.2 0.2 0.2 0.2
Detection Rate 0.2 0.2 0.2 0.2 0.2
Detection Prevalence 0.2 0.2 0.2 0.2 0.2
Balanced Accuracy 1.0 1.0 1.0 1.0 1.0
#printcp(tree)
tree$variable.importance %>% as.data.frame()
library(caret)
library(doMC)
registerDoMC(cores = 4)
ctrl_fast <- trainControl(
method = "repeatedcv",
repeats = 3,
number = 10,
returnResamp = 'final',
savePredictions = 'final',
verboseIter = F,
classProbs = TRUE,
allowParallel = T
)
#rf_grid <- expand.grid(.mtry = c(5))
cartFit <- caret::train(
  x = vines_dataset_numeric_reduced %>%
    select(all_of(selected_variables$variable)) %>% na.omit(),
  y = vines_dataset_numeric_reduced %>%
    select(Calidad) %>% unlist() %>% as.factor(),
  method = "rpart",
  tuneLength = 10,
  #tuneGrid = rf_grid,
  trControl = ctrl_fast
)
cartFit$results %>%
ggplot(aes(x = cp, y = Accuracy)) +
geom_point(color = 'red') +
geom_errorbar(
aes(ymin = Accuracy - AccuracySD, ymax = Accuracy + AccuracySD),
width = .02,
color = 'yellow'
) +
ggdark::dark_theme_bw() +
labs(title="CART: Mean and Standard deviation after hyper-parameter (cp) tuning")+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
cartFit
CART
77 samples
20 predictors
5 classes: 'A0', 'A1', 'A2', 'A3', 'A5'
No pre-processing
Resampling: Cross-Validated (10 fold, repeated 3 times)
Summary of sample sizes: 70, 70, 70, 70, 68, 70, ...
Resampling results across tuning parameters:
cp Accuracy Kappa
0.00000000 0.7126323 0.63882743
0.03434343 0.6941799 0.61294049
0.06868687 0.6500000 0.55062754
0.10303030 0.6419974 0.53926611
0.13737374 0.5918651 0.46255796
0.17171717 0.5008598 0.33608628
0.20606061 0.4876323 0.31820216
0.24040404 0.4263889 0.21813021
0.27474747 0.3949074 0.16209794
0.30909091 0.3172619 0.05389871
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.
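To see which predictors drive the tuned tree, caret's `varImp` can be applied to the fit (a minimal sketch):

```r
# Scaled (0-100) variable importance of the final cp = 0 CART model.
caret::varImp(cartFit)
plot(caret::varImp(cartFit), top = 10)
```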
set.seed(21052025)
cart_data <-
learning_curve_dat(dat = vines_dataset_numeric_reduced %>%
select(selected_variables$variable,Calidad),
outcome = "Calidad",
#test_prop = 0.6,
proportion = seq(0.2,1,0.1),
## `train` arguments
method = "rpart",
metric = "Accuracy",
trControl = ctrl_fast,
verbose = F)
Warning (repeated): There were missing values in resampled performance measures.
ggplot(cart_data, aes(x = Training_Size, y = Accuracy, color = Data)) +
geom_smooth(method = "loess", span = .8) +
ggdark::dark_theme_bw()+
labs(title="CART: Learning curves on training and resampled datasets")
#rf_grid <- expand.grid(.mtry = c(5))
rfFit <- caret::train(
  x = vines_dataset_numeric_reduced %>%
    select(all_of(selected_variables$variable)) %>% na.omit(),
  y = vines_dataset_numeric_reduced %>%
    select(Calidad) %>% unlist() %>% as.factor(),
  method = "rf",
  tuneLength = 10,
  #tuneGrid = rf_grid,
  trControl = ctrl_fast
  #ntree = 200
)
rfFit$results %>%
ggplot(aes(x = mtry, y = Accuracy)) +
geom_point(color = 'red') +
geom_errorbar(
aes(ymin = Accuracy - AccuracySD, ymax = Accuracy + AccuracySD),
width = .02,
color = 'yellow'
) +
ggdark::dark_theme_bw() +
labs(title="Random Forest: Mean and Standard deviation after hyper-parameter (mtry) tuning")+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
rfFit
Random Forest
77 samples
20 predictors
5 classes: 'A0', 'A1', 'A2', 'A3', 'A5'
No pre-processing
Resampling: Cross-Validated (10 fold, repeated 3 times)
Summary of sample sizes: 68, 69, 70, 69, 70, 70, ...
Resampling results across tuning parameters:
mtry Accuracy Kappa
2 0.9726190 0.9646631
4 0.9726190 0.9646631
6 0.9630952 0.9522164
8 0.9541667 0.9404679
10 0.9583333 0.9457442
12 0.9494048 0.9341616
14 0.9589286 0.9466083
16 0.9494048 0.9339957
18 0.9541667 0.9403020
20 0.9494048 0.9339957
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 2.
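Because `ctrl_fast` saves the final hold-out predictions, a cross-validated confusion matrix for the tuned forest is available without refitting (a minimal sketch):

```r
# Confusion matrix averaged over the 10-fold x 3-repeat resamples
# (uses the predictions stored via savePredictions = 'final').
caret::confusionMatrix(rfFit)
```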
set.seed(21052025)
rf_data <-
learning_curve_dat(dat = vines_dataset_numeric_reduced %>%
select(selected_variables$variable,Calidad),
outcome = "Calidad",
#test_prop = 0.6,
proportion = seq(0.2,1,0.1),
## `train` arguments
method = "rf",
metric = "Accuracy",
trControl = ctrl_fast,
verbose = F)
Warning (repeated): There were missing values in resampled performance measures.
ggplot(rf_data, aes(x = Training_Size, y = Accuracy, color = Data)) +
geom_smooth(method = "loess", span = .8) +
ggdark::dark_theme_bw()+
labs(title="Random Forests: Learning curves on training and resampled datasets")
glm_results %>% ggplot(aes(x = parameters, y = Accuracy)) +
geom_point(color = 'red') +
geom_errorbar(
aes(ymin = Accuracy - AccuracySD, ymax = Accuracy + AccuracySD),
width = 0.02,
color = 'yellow'
) +
ggdark::dark_theme_bw() +
labs(title="Elastic Net: Mean and Standard deviation after hyper-parameter (mtry) tuning")+
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
theme(axis.text=element_text(size=6))
glmFit
glmnet
77 samples
20 predictors
5 classes: 'A0', 'A1', 'A2', 'A3', 'A5'
No pre-processing
Resampling: Cross-Validated (10 fold, repeated 3 times)
Summary of sample sizes: 69, 69, 69, 69, 70, 69, ...
Resampling results across tuning parameters:
alpha lambda Accuracy Kappa
0.1 0.0001221811 0.9520503 0.9378532
0.1 0.0002822542 0.9520503 0.9378532
0.1 0.0006520439 0.9520503 0.9378532
0.1 0.0015063059 0.9520503 0.9378532
0.1 0.0034797622 0.9478836 0.9326244
0.1 0.0080387021 0.9389550 0.9214128
0.1 0.0185704450 0.9389550 0.9214128
0.1 0.0429001375 0.9160053 0.8914522
0.1 0.0991048843 0.8819444 0.8472580
0.1 0.2289451423 0.8437169 0.7960484
0.2 0.0001221811 0.9520503 0.9378532
0.2 0.0002822542 0.9520503 0.9378532
0.2 0.0006520439 0.9520503 0.9378532
0.2 0.0015063059 0.9520503 0.9378532
0.2 0.0034797622 0.9431217 0.9266415
0.2 0.0080387021 0.9389550 0.9214128
0.2 0.0185704450 0.9333995 0.9142065
0.2 0.0429001375 0.9160053 0.8914522
0.2 0.0991048843 0.8736111 0.8362648
0.2 0.2289451423 0.8232143 0.7678246
0.3 0.0001221811 0.9603836 0.9490925
0.3 0.0002822542 0.9603836 0.9490925
0.3 0.0006520439 0.9562169 0.9436503
0.3 0.0015063059 0.9520503 0.9378532
0.3 0.0034797622 0.9431217 0.9266415
0.3 0.0080387021 0.9389550 0.9216139
0.3 0.0185704450 0.9389550 0.9216139
0.3 0.0429001375 0.9160053 0.8914522
0.3 0.0991048843 0.8694444 0.8309315
0.3 0.2289451423 0.8008598 0.7354319
0.4 0.0001221811 0.9659392 0.9564999
0.4 0.0002822542 0.9603836 0.9490925
0.4 0.0006520439 0.9603836 0.9490925
0.4 0.0015063059 0.9603836 0.9490925
0.4 0.0034797622 0.9431217 0.9268426
0.4 0.0080387021 0.9389550 0.9216139
0.4 0.0185704450 0.9389550 0.9216139
0.4 0.0429001375 0.9029101 0.8744275
0.4 0.0991048843 0.8652778 0.8246074
0.4 0.2289451423 0.7445767 0.6602432
0.5 0.0001221811 0.9659392 0.9564999
0.5 0.0002822542 0.9659392 0.9564999
0.5 0.0006520439 0.9659392 0.9564999
0.5 0.0015063059 0.9603836 0.9490925
0.5 0.0034797622 0.9514550 0.9380819
0.5 0.0080387021 0.9389550 0.9216139
0.5 0.0185704450 0.9389550 0.9216139
0.5 0.0429001375 0.8939815 0.8630067
0.5 0.0991048843 0.8568122 0.8133011
0.5 0.2289451423 0.6128307 0.4747423
0.6 0.0001221811 0.9659392 0.9564999
0.6 0.0002822542 0.9659392 0.9564999
0.6 0.0006520439 0.9659392 0.9564999
0.6 0.0015063059 0.9659392 0.9567010
0.6 0.0034797622 0.9570106 0.9454893
0.6 0.0080387021 0.9472884 0.9328532
0.6 0.0185704450 0.9389550 0.9216139
0.6 0.0429001375 0.8939815 0.8630067
0.6 0.0991048843 0.8441799 0.7960461
0.6 0.2289451423 0.5425265 0.3780085
0.7 0.0001221811 0.9659392 0.9564999
0.7 0.0002822542 0.9659392 0.9564999
0.7 0.0006520439 0.9659392 0.9564999
0.7 0.0015063059 0.9659392 0.9567010
0.7 0.0034797622 0.9570106 0.9454893
0.7 0.0080387021 0.9528439 0.9402606
0.7 0.0185704450 0.9389550 0.9216139
0.7 0.0429001375 0.8902778 0.8579247
0.7 0.0991048843 0.8460317 0.7983793
0.7 0.2289451423 0.5170635 0.3432009
0.8 0.0001221811 0.9659392 0.9564999
0.8 0.0002822542 0.9659392 0.9564999
0.8 0.0006520439 0.9659392 0.9567010
0.8 0.0015063059 0.9659392 0.9567010
0.8 0.0034797622 0.9570106 0.9454893
0.8 0.0080387021 0.9528439 0.9402606
0.8 0.0185704450 0.9431217 0.9270561
0.8 0.0429001375 0.8819444 0.8472580
0.8 0.0991048843 0.8376984 0.7877634
0.8 0.2289451423 0.5072090 0.3250596
0.9 0.0001221811 0.9659392 0.9564999
0.9 0.0002822542 0.9659392 0.9567010
0.9 0.0006520439 0.9659392 0.9567010
0.9 0.0015063059 0.9617725 0.9514722
0.9 0.0034797622 0.9570106 0.9454893
0.9 0.0080387021 0.9570106 0.9454893
0.9 0.0185704450 0.9528439 0.9402606
0.9 0.0429001375 0.8736111 0.8354917
0.9 0.0991048843 0.8555556 0.8118500
0.9 0.2289451423 0.4982804 0.3128374
1.0 0.0001221811 0.9576058 0.9460343
1.0 0.0002822542 0.9617725 0.9513676
1.0 0.0006520439 0.9576058 0.9460343
1.0 0.0015063059 0.9576058 0.9460343
1.0 0.0034797622 0.9528439 0.9401560
1.0 0.0080387021 0.9486772 0.9348227
1.0 0.0185704450 0.9383598 0.9217486
1.0 0.0429001375 0.8750000 0.8389257
1.0 0.0991048843 0.8472222 0.8013882
1.0 0.2289451423 0.4423280 0.2364550
Accuracy was used to select the optimal model using the largest value.
The final values used for the model were alpha = 0.6 and lambda = 0.001506306.
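To recover the winning hyper-parameter pair and its sparse coefficient set (a minimal sketch, assuming the `glmFit` object above):

```r
# Best alpha/lambda combination selected by accuracy.
glmFit$bestTune
# Non-zero coefficients of the final multinomial model at that lambda,
# returned as one sparse matrix per quality class.
coef(glmFit$finalModel, s = glmFit$bestTune$lambda)
```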