Cleaning the RStudio environment
rm(list = ls()) # clean global environment
cat("\014") # clean the console
Default working directory
# CHANGE TO THE CORRESPONDING WORKING DIRECTORY
knitr::opts_knit$set(root.dir = '/home/harpo/Dropbox/ongoing-work/git-repos/inta-vine-quality/')
#knitr::opts_knit$set(root.dir = '/home/rodralez/hostdir/jobs/demand-planning/')
#vines_dataset<-readr::read_csv("datasets/vine_train_newnames.csv", col_types =cols())
#vines_dataset
library(readr) # needed for cols() and the unqualified col_*() helpers below
vines_dataset <- readr::read_csv("rawdata/2022/vine_raw.csv", col_types = cols(
Temporada = col_character(),
ID = col_double(),
ubi_Finca = col_character(),
ubi_x = col_double(),
ubi_y = col_double(),
`ubi_Altura s.n.m.` = col_double(),
ubi_Zona = col_character(),
ubi_Distrito = col_character(),
`uva_L hollejo` = col_double(),
uva_a_hollejo = col_double(),
uva_b_hollejo = col_double(),
uva_h_hollejo = col_double(),
uva_C_hollejo = col_double(),
`uva_FT(mg/ghollejo)` = col_double(),
`uva_FT(mg/gbaya)` = col_double(),
`uva_FT(mg/baya)` = col_double(),
`uva_TAN(mg/ghollejo)` = col_double(),
`uva_TAN(mg/gbaya)` = col_double(),
`uva_TAN(mg/baya)` = col_double(),
`uva_ANT(mg/ghollejo)` = col_double(),
`uva_ANT(mg/gbaya)` = col_double(),
`uva_ANT(mg/baya)` = col_double(),
uva_Acti_antirr_hollejo = col_double(),
`Uva_Semillas/baya` = col_double(),
`uva_Peso semillas/baya` = col_double(),
uva_Brix = col_double(),
uva_pH_mosto = col_double(),
`uva_Acidez Mosto` = col_double(),
`uva_ NPA_Enz(mg/L)` = col_double(),
`uva_ Potasio_Enz(mg/L)` = col_double(),
`uva_ NPA_Foss(mg/L)` = col_double(),
`uva_ Potasio_Foss(mg/L)` = col_double(),
planta_Peso_racimo = col_double(),
planta_Peso_de_baya = col_double(),
`planta_Bayas/racimo` = col_double(),
planta_Ravaz = col_double(),
`planta_Long Cordon` = col_double(),
`planta_Nro de Racimos/m` = col_double(),
`planta_ Peso de poda/m` = col_double(),
`planta_Dist. hileras` = col_double(),
`planta_Dist. plantas` = col_double(),
`planta_Plantas/ha` = col_double(),
`planta_Rnto/ha` = col_double(),
planta_ndvi = col_double(),
suelo_Aspecto = col_double(),
`suelo_Vol. Sedimentacion (%)` = col_double(),
suelo_Textura = col_character(),
`suelo_Indice fondo Valle` = col_double(),
suelo_Pendiente = col_double(),
`eco_Estacion datos clima` = col_character(),
`eco_GDA historico` = col_double(),
`eco_Temp. Mínima °C` = col_double(),
`eco_Temp. Media °C` = col_double(),
`eco_Temp. Máxima °C` = col_double(),
`eco_Amplitud Térmica °C` = col_double(),
`eco_Precipitaciones (mm) Temp 2020` = col_double(),
`eco_Grados Días Acum. Temp` = col_double(),
`eco_Evapotranspiración mm Temp` = col_double(),
`eco_Días con Temp.>35°C Temp` = col_double(),
`eco_Radiacíon W/m2 Temp` = col_double(),
`eco_Velocidad Media Viento km/h Temp` = col_double(),
`eco_I. de Fresco Nocturno Temp` = col_double(),
`eco_Temp. Media Marzo °C Temp` = col_double(),
`eco_Ampl. Térmica Marzo °C Temp` = col_double(),
`eco_Horas de Frío Temp` = col_double(),
Calidad = col_character()
)
)
#spec(vines_dataset)
#vines_dataset <- vines_dataset %>% filter(Temporada == 2020)
vines_dataset %>% group_by(Temporada) %>% count()
Training dataset dimensions: 211 rows × 56 columns
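As a sanity check, those dimensions can be confirmed directly (a minimal sketch; dim() reports rows first, then columns):
dim(vines_dataset) # c(rows, columns)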
#removed_numeric_var<-c("ubi_x","ubi_y","ID")
removed_numeric_var<-c("ID","ubi_x","ubi_y")
vines_dataset <- vines_dataset %>% select(-all_of(removed_numeric_var))
missing_vals <- vines_dataset %>% group_by(Temporada) %>% select_if(function(x) any(is.na(x))) %>%
summarise(across(everything(), ~ sum(is.na(.)))) %>% reshape2::melt() %>% filter(Temporada == "2020" & value == 87) %>% select(variable)
Using Temporada as id variables
missing_vals <- rbind(missing_vals, "eco_Horas de Frío Temp") %>% unlist() %>% unname()
vines_dataset<-vines_dataset %>% select(-all_of(missing_vals))
set.seed(7)
test_dataset <- vines_dataset %>% sample_frac(0.2)
train_dataset <-dplyr::setdiff(vines_dataset, test_dataset)
vines_dataset<-train_dataset
vines_dataset %>% group_by(Temporada) %>% count()
vines_dataset_factors<-vines_dataset %>% select_if(~class(.) == 'character')
names(vines_dataset_factors) %>% as.data.frame()
skimr::skim(vines_dataset_factors %>% select(-Calidad))# %>% knitr::kable() %>% kable_styling(font_size = 9)
── Data Summary ────────────────────────
Values
Name vines_dataset_factors %>%...
Number of rows 211
Number of columns 6
_______________________
Column type frequency:
character 6
________________________
Group variables None
vines_dataset_numeric<-vines_dataset %>% select_if(~class(.) == 'numeric')
names(vines_dataset_numeric) %>% as.data.frame()
skimr::skim(vines_dataset_numeric) %>% arrange(desc(n_missing)) #%>% knitr::kable() %>% kable_styling(font_size = 9)
── Data Summary ────────────────────────
Values
Name vines_dataset_numeric
Number of rows 211
Number of columns 49
_______________________
Column type frequency:
numeric 49
________________________
Group variables None
vines_dataset %>% group_by(Calidad) %>% summarise(total=n()) %>%
ggplot()+
geom_col(aes(x=Calidad,y=total,fill=Calidad))+
ggdark::dark_theme_bw()
res<-VIM::aggr(vines_dataset_numeric,
combined = FALSE ,
numbers = TRUE,
sortCombs = TRUE,
sortVars = TRUE,
labels=names(vines_dataset_numeric),
cex.axis=.4,
varheight = FALSE,
cex.numbers=0.8,
cex.lab=0.8,
prop = TRUE)
Variables sorted by number of missings:
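The per-variable table printed by VIM::aggr is not reproduced above; it can be recovered from the returned object. A minimal sketch, assuming the missings component (a data frame with Variable and Count columns) of the res object created in the previous chunk:
# Recover the per-variable missing-value counts from the aggr result
res$missings %>% dplyr::arrange(dplyr::desc(Count)) %>% head(10)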
vines_dataset_numeric_imputed <- VIM::kNN(vines_dataset_numeric) # VIM is not attached, so kNN() needs the namespace prefix
vines_dataset_numeric<-vines_dataset_numeric_imputed %>% select(-ends_with("_imp"))
#vines_dataset_numeric %>% select(-NAME)
vines_dataset_numeric_wcalnum <- vines_dataset_numeric %>%
tibble::add_column(Calidad = as.factor(vines_dataset$Calidad)) %>%
tibble::add_column(ubi_Zona = as.factor(vines_dataset$ubi_Zona)) %>%
mutate(Calidad = as.numeric(Calidad),
ubi_Zona = as.numeric(ubi_Zona))
library(d3heatmap)
vines_dataset_numeric_cor_matrix<-cor(vines_dataset_numeric_wcalnum ,method="spearman")
vines_dataset_numeric_cor_matrix %>% as.data.frame() %>% tibble::rownames_to_column("class") %>%
filter(class=="Calidad") %>%
reshape2::melt() %>% select(variable, value) %>%
arrange(desc(abs(value))) %>%
ggplot()+
geom_col(aes(x=variable,y=value,fill=variable))+
ggdark::dark_theme_bw()
Using class as id variables
d3heatmap(vines_dataset_numeric_cor_matrix ,colors = "Blues",cexRow = 0.8, cexCol = 0.8)
library(d3heatmap)
vines_dataset_numeric_cor_matrix<-cor(vines_dataset_numeric ,method="spearman")
d3heatmap(vines_dataset_numeric_cor_matrix ,colors = "Blues",cexRow = 0.8, cexCol = 0.8)
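The feature-selection step below drops a vector of highly correlated variables, highlyCorrelated_var, which is not defined in this section. A minimal sketch of how it could be derived from the correlation matrix just computed, assuming caret::findCorrelation with a hypothetical 0.9 cutoff:
# Hypothetical reconstruction of highlyCorrelated_var; the 0.9 cutoff is an assumption
highlyCorrelated_var <- colnames(vines_dataset_numeric_cor_matrix)[
  caret::findCorrelation(vines_dataset_numeric_cor_matrix, cutoff = 0.9)
]
highlyCorrelated_var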
library(FactoMineR)
#vines_dataset_numeric %>% select(-highlyCorrelated_var) %>% tibble::add_column(Calidad=vines_dataset$Calidad)
res_pca <- PCA(vines_dataset_numeric %>%
                 tibble::add_column(Calidad = vines_dataset$Calidad),
               scale.unit = TRUE,
               ncp = 6,
               graph = F,
               quali.sup = 50 # column index of Calidad
)
plot(res_pca,choix="ind",habillage=50)
par(mfcol=c(1,2))
plot(res_pca,choix="var",habillage="none",invisible = "ind") # para las variables
plotellipses(res_pca, invisible="ind",xlim=c(-10,10),ylim=c(-6,6))
#vines_dataset %>% mutate()
#vines_dataset$Calidad %>% as.numeric(as.factor(vines_dataset$Calidad))
One hundred bootstrap resamples are generated for evaluating the feature selection algorithms
vines_dataset_numeric_reduced<-vines_dataset_numeric %>%
select(-all_of(highlyCorrelated_var)) %>%
#select(names(var_importance[1:11])) %>%
tibble::add_column(Calidad=as.factor(vines_dataset$Calidad))
resamples<-rsample::bootstraps(vines_dataset_numeric_reduced,strata= Calidad,times = 100 )
num_of_feat<-5
vines_dataset_bootstrap<-rsample::analysis(resamples$splits[[sample(1:100,1)]])
val_dataset <- vines_dataset_bootstrap %>% group_by(Calidad) %>% sample_n(2) %>% ungroup()
train_dataset <-dplyr::setdiff(vines_dataset_numeric_reduced,val_dataset)
library(Boruta) # Boruta() and attStats() are called unqualified
var_importance_boruta <- Boruta(Calidad ~ ., data = train_dataset)
var_importance_boruta <- attStats(var_importance_boruta) %>% filter(decision == 'Confirmed') %>% select(meanImp) %>% arrange(desc(meanImp)) %>% top_n(num_of_feat)
Selecting by meanImp
var_importance_boruta <- var_importance_boruta %>% tibble::rownames_to_column('variable')
# strip the backticks Boruta keeps around non-syntactic names
var_importance_boruta_final <- data.frame(variable = stringr::str_replace_all(var_importance_boruta$variable, "`", ""))
var_importance_boruta_final
var_importance<-c()
for (i in 1:100){
vines_dataset_bootstrap<-rsample::analysis(resamples$splits[[i]])
val_dataset <- vines_dataset_bootstrap %>% group_by(Calidad) %>% sample_n(2) %>% ungroup()
train_dataset <-dplyr::setdiff(vines_dataset_numeric_reduced,val_dataset)
tree <- rpart::rpart(Calidad ~ .,
                     data = train_dataset,
                     control = rpart::rpart.control(minsplit = 10)) # rpart is not attached, so rpart.control() needs the prefix
var_importance<- c(var_importance,tree$variable.importance[1:num_of_feat])
#rpart.plot(tree,type=1,
# extra=101, box.palette="GnBu",
# branch.lty=3, shadow.col="gray", nn=TRUE
# )
}
var_importance_cart_final<-data.frame(variable=names(var_importance),value=var_importance) %>% group_by(variable) %>% summarise(n=n()) %>% arrange(desc(n)) %>% top_n(20)
Selecting by n
var_importance_cart_final %>% as.data.frame()
vines_dataset_bootstrap<-rsample::analysis(resamples$splits[[sample(1:100,1)]])
val_dataset <- vines_dataset_bootstrap %>% group_by(Calidad) %>% sample_n(2) %>% ungroup()
train_dataset <-dplyr::setdiff(vines_dataset_numeric_reduced,val_dataset)
rf_names <- names(train_dataset) %>%
  stringr::str_replace_all(pattern = " ", replacement = "_") %>%
  stringr::str_replace_all(pattern = "[/()%°>]+", replacement = "_") # sanitize names for randomForest
train_dataset_rf<-train_dataset
names(train_dataset_rf)<-rf_names
rf<-randomForest::randomForest(Calidad ~ .,data=train_dataset_rf,importance=TRUE)
var_importance_rf_final <- randomForest::importance(rf) %>% as.data.frame() %>% arrange(desc(MeanDecreaseGini)) %>% tibble::rownames_to_column('variable') %>% top_n(num_of_feat) %>% select(variable)
Selecting by MeanDecreaseGini
var_importance_rf_final<- data.frame(variable=names(train_dataset[,which(names(train_dataset_rf) %in% var_importance_rf_final$variable)]) )
var_importance_rf_final
library(glmnet)
vines_dataset_bootstrap<-rsample::analysis(resamples$splits[[sample(1:100,1)]])
val_dataset <- vines_dataset_bootstrap %>% group_by(Calidad) %>% sample_n(2) %>% ungroup()
train_dataset <-dplyr::setdiff(vines_dataset_numeric_reduced,val_dataset)
glmfit<-cv.glmnet(x = train_dataset %>% select(-Calidad) %>% as.matrix(),
y = train_dataset$Calidad %>% as.factor(),
family = "multinomial")
#coef(glmfit,s = 'lambda.min')
'%ni%' <- Negate('%in%')
glm_coefs <- coef(glmfit, s = 'lambda.min', exact = TRUE) # one sparse coefficient matrix per class; named to avoid masking base::c
var_importance_glmnet <- purrr::map(glm_coefs, function(x) {
inds <- which(x != 0)
variables <- row.names(x)[inds]
variables[variables %ni% '(Intercept)'] # drop the intercept term
})
#do.call(rbind,var_importance_glmnet) %>% as.data.frame()
var_importance_glmnet_final<-var_importance_glmnet %>% unlist() %>% unique() %>% as.data.frame()
names(var_importance_glmnet_final)<-"variable"
var_importance_glmnet_final
var_importance_glmnet
$A0
[1] "uva_L hollejo" "uva_a_hollejo" "uva_ANT(mg/gbaya)"
[4] "uva_Acti_antirr_hollejo" "uva_Acidez Mosto" "planta_Long Cordon"
[7] "planta_Nro de Racimos/m" "planta_ Peso de poda/m" "planta_Dist. hileras"
[10] "planta_Rnto/ha" "suelo_Aspecto" "suelo_Vol. Sedimentacion (%)"
[13] "eco_GDA historico" "eco_Temp. Mínima °C" "eco_Evapotranspiración mm Temp"
[16] "eco_Velocidad Media Viento km/h Temp" "eco_Ampl. Térmica Marzo °C Temp"
$A1
[1] "ubi_Altura s.n.m." "uva_a_hollejo" "uva_b_hollejo" "uva_FT(mg/gbaya)"
[5] "uva_FT(mg/baya)" "uva_Brix" "uva_Acidez Mosto" "planta_Long Cordon"
[9] "planta_Dist. hileras" "planta_Plantas/ha" "planta_ndvi" "suelo_Vol. Sedimentacion (%)"
[13] "suelo_Indice fondo Valle" "eco_Precipitaciones (mm) Temp 2020" "eco_Evapotranspiración mm Temp" "eco_I. de Fresco Nocturno Temp"
[17] "eco_Ampl. Térmica Marzo °C Temp"
$A2
[1] "ubi_Altura s.n.m." "uva_L hollejo" "uva_h_hollejo" "uva_FT(mg/gbaya)"
[5] "uva_FT(mg/baya)" "uva_ANT(mg/gbaya)" "uva_Acti_antirr_hollejo" "uva_Brix"
[9] "uva_pH_mosto" "uva_Acidez Mosto" "planta_Peso_racimo" "planta_Long Cordon"
[13] "planta_ Peso de poda/m" "planta_Plantas/ha" "suelo_Aspecto" "eco_Temp. Máxima °C"
[17] "eco_Precipitaciones (mm) Temp 2020" "eco_I. de Fresco Nocturno Temp"
$A3
[1] "uva_L hollejo" "uva_a_hollejo" "uva_TAN(mg/baya)"
[4] "uva_ANT(mg/gbaya)" "uva_pH_mosto" "uva_Acidez Mosto"
[7] "planta_Bayas/racimo" "planta_Ravaz" "planta_Nro de Racimos/m"
[10] "planta_Dist. hileras" "planta_Dist. plantas" "planta_Rnto/ha"
[13] "planta_ndvi" "suelo_Aspecto" "suelo_Vol. Sedimentacion (%)"
[16] "suelo_Indice fondo Valle" "eco_GDA historico" "eco_Temp. Mínima °C"
[19] "eco_Temp. Media °C" "eco_Temp. Máxima °C" "eco_Velocidad Media Viento km/h Temp"
[22] "eco_I. de Fresco Nocturno Temp"
$A5
[1] "ubi_Altura s.n.m." "uva_a_hollejo" "uva_TAN(mg/baya)"
[4] "uva_ANT(mg/gbaya)" "uva_Acti_antirr_hollejo" "uva_Acidez Mosto"
[7] "planta_Peso_racimo" "planta_ Peso de poda/m" "planta_Dist. hileras"
[10] "planta_Dist. plantas" "planta_ndvi" "suelo_Vol. Sedimentacion (%)"
[13] "suelo_Indice fondo Valle" "eco_Temp. Mínima °C" "eco_Temp. Máxima °C"
[16] "eco_Radiacíon W/m2 Temp" "eco_Velocidad Media Viento km/h Temp"
$B
[1] "ubi_Altura s.n.m." "planta_Nro de Racimos/m" "planta_Plantas/ha" "planta_Rnto/ha" "suelo_Vol. Sedimentacion (%)"
[6] "suelo_Indice fondo Valle" "eco_Radiacíon W/m2 Temp"
$C
[1] "uva_pH_mosto" "planta_Long Cordon" "planta_Nro de Racimos/m" "planta_ Peso de poda/m"
[5] "planta_Dist. hileras" "planta_Rnto/ha" "suelo_Aspecto" "suelo_Vol. Sedimentacion (%)"
[9] "suelo_Pendiente" "eco_GDA historico" "eco_Días con Temp.>35°C Temp"
selected_variables<-var_importance_boruta_final
#selected_variables<-data.frame(variable=c("planta_Peso_racimo",
#"uva_Acidez Mosto",
#"uva_TAN(mg/baya)",
#"planta_Bayas/racimo",
#"eco_Amplitud Térmica °C"))
vines_dataset_numeric_reduced %>% select(selected_variables$variable) %>% reshape2::melt() %>%
ggplot()+
facet_wrap(~variable,scales = 'free',ncol = 10)+
geom_boxplot(aes(x=variable,y=value,fill=variable),color='gray')+
ggdark::dark_theme_bw()+
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
theme(legend.position="none",
axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())+
theme( strip.text = element_text(size = 6))
No id variables; using all as measure variables
calidad_index<-vines_dataset_numeric_reduced %>%
select(selected_variables$variable,Calidad)
calidad_index<- which(colnames(calidad_index)=="Calidad")
library(FactoMineR)
res_pca <- PCA(vines_dataset_numeric_reduced %>%
                 select(selected_variables$variable, Calidad),
               scale.unit = TRUE,
               ncp = 6,
               graph = F,
               quali.sup = calidad_index # column index of Calidad
)
plot(res_pca,choix="ind",habillage=calidad_index)
#par(mfcol=c(1,2))
plot(res_pca,choix="var",habillage="none",invisible = "ind") # para las variables
plotellipses(res_pca, invisible="ind",xlim=c(-6,6),ylim=c(-6,6))
vines_dataset_bootstrap<-rsample::analysis(resamples$splits[[sample(1:100,1)]])
val_dataset <- vines_dataset_bootstrap %>% group_by(Calidad) %>% sample_n(2) %>% ungroup()
train_dataset <-dplyr::setdiff(vines_dataset_numeric_reduced,val_dataset)
tree <- rpart::rpart(Calidad ~ .,
                     data = train_dataset %>%
                       select(selected_variables$variable, Calidad),
                     control = rpart::rpart.control(minsplit = 5))
library(rpart.plot) # rpart.plot() is called unqualified
rpart.plot(tree, type = 1,
           extra = 101, box.palette = "GnBu",
           branch.lty = 3, shadow.col = "gray", nn = TRUE
)
predictions <- predict(tree, val_dataset, type = 'class')
# Note: with positional arguments, data = truth and reference = predictions here,
# so the "Prediction" rows in the printout below are in fact the true labels
caret::confusionMatrix(val_dataset$Calidad %>% as.factor(), predictions)
Confusion Matrix and Statistics
Reference
Prediction A0 A1 A2 A3 A5 B C
A0 2 0 0 0 0 0 0
A1 0 2 0 0 0 0 0
A2 0 0 2 0 0 0 0
A3 1 0 0 1 0 0 0
A5 0 0 0 0 2 0 0
B 0 0 0 0 0 2 0
C 0 0 0 0 0 0 2
Overall Statistics
Accuracy : 0.9286
95% CI : (0.6613, 0.9982)
No Information Rate : 0.2143
P-Value [Acc > NIR] : 2.253e-08
Kappa : 0.9167
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: A0 Class: A1 Class: A2 Class: A3 Class: A5 Class: B Class: C
Sensitivity 0.6667 1.0000 1.0000 1.00000 1.0000 1.0000 1.0000
Specificity 1.0000 1.0000 1.0000 0.92308 1.0000 1.0000 1.0000
Pos Pred Value 1.0000 1.0000 1.0000 0.50000 1.0000 1.0000 1.0000
Neg Pred Value 0.9167 1.0000 1.0000 1.00000 1.0000 1.0000 1.0000
Prevalence 0.2143 0.1429 0.1429 0.07143 0.1429 0.1429 0.1429
Detection Rate 0.1429 0.1429 0.1429 0.07143 0.1429 0.1429 0.1429
Detection Prevalence 0.1429 0.1429 0.1429 0.14286 0.1429 0.1429 0.1429
Balanced Accuracy 0.8333 1.0000 1.0000 0.96154 1.0000 1.0000 1.0000
#printcp(tree)
tree$variable.importance %>% as.data.frame()
library(caret)
library(doMC)
registerDoMC(cores = 4)
ctrl_fast <- trainControl(
method = "repeatedcv",
repeats = 3,
number = 10,
returnResamp = 'final',
savePredictions = 'final',
verboseIter = F,
classProbs = TRUE,
allowParallel = T
)
#rf_grid <- expand.grid(.mtry = c(5))
cartFit <- caret::train(
x = vines_dataset_numeric_reduced %>%
select(selected_variables$variable) %>% na.omit(),
y = vines_dataset_numeric_reduced %>%
select(Calidad) %>% unlist() %>% as.factor(),
method = "rpart",
tuneLength=10,
#tuneGrid = rf_grid,
#verbose = 2,
trControl = ctrl_fast,
#ntree = 200
)
cartFit$results %>%
ggplot(aes(x = cp, y = Accuracy)) +
geom_point(color = 'red') +
geom_errorbar(
aes(ymin = Accuracy - AccuracySD, ymax = Accuracy + AccuracySD),
width = .02,
color = 'yellow'
) +
ggdark::dark_theme_bw() +
labs(title="CART: Mean and Standard deviation after hyper-parameter (cp) tuning")+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
cartFit
CART
211 samples
5 predictor
7 classes: 'A0', 'A1', 'A2', 'A3', 'A5', 'B', 'C'
No pre-processing
Resampling: Cross-Validated (10 fold, repeated 3 times)
Summary of sample sizes: 191, 191, 189, 191, 190, 191, ...
Resampling results across tuning parameters:
cp Accuracy Kappa
0.00000000 0.7713507 0.72035640
0.02150538 0.7473188 0.69145307
0.04301075 0.7282070 0.66772317
0.06451613 0.6674427 0.59189092
0.08602151 0.5755858 0.47388470
0.10752688 0.5475735 0.43632317
0.12903226 0.4458962 0.28586830
0.15053763 0.3708944 0.16949186
0.17204301 0.3521282 0.14149779
0.19354839 0.3063336 0.07099667
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.
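The winning complexity parameter can also be read programmatically rather than from the printout (a small sketch; caret stores it in bestTune):
cartFit$bestTune # data frame with the selected cp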
cart_data <-
learning_curve_dat(dat = vines_dataset_numeric_reduced %>%
select(selected_variables$variable,Calidad),
outcome = "Calidad",
#test_prop = 0.6,
proportion = seq(0.3,1,0.1),
## `train` arguments
method = "rpart",
metric = "Accuracy",
trControl = ctrl_fast,
verbose = F)
ggplot(cart_data, aes(x = Training_Size, y = Accuracy, color = Data)) +
geom_smooth(method = loess, span = .8) +
ggdark::dark_theme_bw()+
labs(title="CART: Learning curves on training and resampled datasets")
#rf_grid <- expand.grid(.mtry = c(5))
rfFit <- caret::train(
x = vines_dataset_numeric_reduced %>%
select(selected_variables$variable) %>% na.omit(),
y = vines_dataset_numeric_reduced %>%
select(Calidad) %>% unlist() %>% as.factor(),
method = "rf",
tuneLength=10,
#tuneGrid = rf_grid,
#verbose = 2,
trControl = ctrl_fast,
#ntree = 200
)
note: only 4 unique complexity parameters in default grid. Truncating the grid to 4 .
rfFit$results %>%
ggplot(aes(x = mtry, y = Accuracy)) +
geom_point(color = 'red') +
geom_errorbar(
aes(ymin = Accuracy - AccuracySD, ymax = Accuracy + AccuracySD),
width = .02,
color = 'yellow'
) +
ggdark::dark_theme_bw() +
labs(title="Random Forest: Mean and Standard deviation after hyper-parameter (mtry) tuning")+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
rfFit
Random Forest
211 samples
5 predictor
7 classes: 'A0', 'A1', 'A2', 'A3', 'A5', 'B', 'C'
No pre-processing
Resampling: Cross-Validated (10 fold, repeated 3 times)
Summary of sample sizes: 190, 190, 190, 189, 189, 191, ...
Resampling results across tuning parameters:
mtry Accuracy Kappa
2 0.9452368 0.9333587
3 0.9375754 0.9240269
4 0.9346109 0.9203579
5 0.9316600 0.9166846
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 2.
set.seed(21052025)
rf_data <-
learning_curve_dat(dat = vines_dataset_numeric_reduced %>%
select(selected_variables$variable,Calidad),
outcome = "Calidad",
#test_prop = 0.6,
proportion = seq(0.3,1,0.1),
## `train` arguments
method = "ranger",
metric = "Accuracy",
trControl = ctrl_fast,
verbose = F)
ggplot(rf_data, aes(x = Training_Size, y = Accuracy, color = Data)) +
geom_smooth(method = loess, span = .8) +
ggdark::dark_theme_bw()+
labs(title="Random Forests: Learning curves on training and resampled datasets")
test_dataset_numeric<-test_dataset %>% select_if(~class(.) == 'numeric')
test_dataset_numeric_imputed <- VIM::kNN(test_dataset_numeric) # impute the test set with the same kNN routine (namespace prefix needed)
test_dataset_numeric<-test_dataset_numeric_imputed %>% select(-ends_with("_imp"))
preds<-predict(rfFit,test_dataset_numeric %>%
select(selected_variables$variable))
caret::confusionMatrix(data = as.factor(preds),
reference = as.factor(test_dataset$Calidad)
)
Confusion Matrix and Statistics
Reference
Prediction A0 A1 A2 A3 A5 B C
A0 8 0 1 0 0 0 0
A1 0 9 0 0 0 0 0
A2 0 0 9 0 0 0 0
A3 0 0 0 11 0 0 0
A5 0 0 0 1 2 0 0
B 0 0 0 0 0 7 0
C 0 0 0 0 0 0 5
Overall Statistics
Accuracy : 0.9623
95% CI : (0.8702, 0.9954)
No Information Rate : 0.2264
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.955
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: A0 Class: A1 Class: A2 Class: A3 Class: A5 Class: B Class: C
Sensitivity 1.0000 1.0000 0.9000 0.9167 1.00000 1.0000 1.00000
Specificity 0.9778 1.0000 1.0000 1.0000 0.98039 1.0000 1.00000
Pos Pred Value 0.8889 1.0000 1.0000 1.0000 0.66667 1.0000 1.00000
Neg Pred Value 1.0000 1.0000 0.9773 0.9762 1.00000 1.0000 1.00000
Prevalence 0.1509 0.1698 0.1887 0.2264 0.03774 0.1321 0.09434
Detection Rate 0.1509 0.1698 0.1698 0.2075 0.03774 0.1321 0.09434
Detection Prevalence 0.1698 0.1698 0.1698 0.2075 0.05660 0.1321 0.09434
Balanced Accuracy 0.9889 1.0000 0.9500 0.9583 0.99020 1.0000 1.00000
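For downstream reporting, the same statistics can be pulled from the confusionMatrix object instead of being parsed from the printout (a minimal sketch):
# Keep the confusion matrix object and extract selected metrics
cm <- caret::confusionMatrix(data = as.factor(preds),
                             reference = as.factor(test_dataset$Calidad))
cm$overall["Accuracy"]
cm$byClass[, c("Sensitivity", "Specificity", "Balanced Accuracy")]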