#install.packages("readxl")
#install.packages("dplyr")
#install.packages("ranger")

library(readxl)
## Warning: package 'readxl' was built under R version 4.2.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ranger)
## Warning: package 'ranger' was built under R version 4.2.3
# Data: read the workbook, normalise the column names and store the strain
# identifier (cepa) as a factor.

df <- read_excel("1_4cepas.xlsx") %>%
  rename(
    wavelength = Wavelength,
    reflectance = `Reflectance (%)`,
    archivo = Archivo,
    cepa = CEPA
  ) %>%
  mutate(cepa = as.factor(cepa))

# Collapse the CEPA categories into two groups: the three listed strains
# become "Grupo1"; every other value (a missing cepa included, since %in%
# returns FALSE for NA) becomes "Grupo2".

df <- df %>%
  mutate(
    cepa_grupo = as.factor(
      if_else(cepa %in% c("ROS006", "ROS010", "ROS011"), "Grupo1", "Grupo2")
    )
  )

# Select the variables used by the model: the response (reflectance) and the
# two predictors (wavelength, cepa_grupo).

df_model <- df %>%
  select(reflectance, wavelength, cepa_grupo)

# Subsample to reduce the training size.
# slice_sample() replaces the superseded sample_n(). Unlike sample_n(), it
# does not abort when fewer than 40000 rows are available: without
# replacement it caps the sample at the number of rows present.

set.seed(182)
df_model <- df_model %>% slice_sample(n = 40000)

# Train the model with ranger: a regression random forest that predicts
# reflectance from every other column of df_model.
#   num.trees = 1000       -> forest of 1000 trees
#   mtry = 2               -> two candidate variables per split
#   importance             -> store impurity-based variable importance
#   respect.unordered.factors = "order"
#                          -> factor handling option; see the ranger
#                             documentation for the exact ordering rule
# NOTE(review): no seed is passed to ranger() itself; reproducibility
# presumably relies on an earlier set.seed() call -- confirm.
# The call is kept exactly as written because print(rf_model) echoes it.

rf_model <- ranger(
  reflectance ~ .,
  data = df_model,
  num.trees = 1000,             
  mtry = 2,
  importance = "impurity",
  respect.unordered.factors = "order"
)

# Show the fitted-model summary (includes the OOB MSE and OOB R squared).
print(rf_model)
## Ranger result
## 
## Call:
##  ranger(reflectance ~ ., data = df_model, num.trees = 1000, mtry = 2,      importance = "impurity", respect.unordered.factors = "order") 
## 
## Type:                             Regression 
## Number of trees:                  1000 
## Sample size:                      40000 
## Number of independent variables:  2 
## Mtry:                             2 
## Target node size:                 5 
## Variable importance mode:         impurity 
## Splitrule:                        variance 
## OOB prediction error (MSE):       45.57202 
## R squared (OOB):                  0.6662542
# Variable importance: pull the per-variable importance vector stored in the
# fitted forest and display it.


importance_values <- rf_model[["variable.importance"]]
print(importance_values)
## wavelength cepa_grupo 
## 3986445.13   23205.01
# Prediction

# These predictions are made on the same rows the forest was trained on, so
# the error derived from them is an in-sample (optimistic) figure, not an
# estimate of performance on new data.
predicciones <- predict(rf_model, data = df_model)$predictions

# Training (in-sample) RMSE -- labelled as such so it is not mistaken for the
# model's generalisation error.
rmse <- sqrt(mean((predicciones - df_model$reflectance)^2))
cat("RMSE en entrenamiento (in-sample):", rmse, "\n")
## RMSE en entrenamiento (in-sample): 6.248295

# Out-of-bag RMSE: the square root of the "OOB prediction error (MSE)" that
# print(rf_model) reports. Each row is scored only by trees that did not see
# it, so this is the figure to quote as the model's error. For the run
# recorded in this file it is sqrt(45.57202), i.e. about 6.75.
rmse_oob <- sqrt(rf_model$prediction.error)
cat("RMSE OOB:", rmse_oob, "\n")
print(rf_model)
## Ranger result
## 
## Call:
##  ranger(reflectance ~ ., data = df_model, num.trees = 1000, mtry = 2,      importance = "impurity", respect.unordered.factors = "order") 
## 
## Type:                             Regression 
## Number of trees:                  1000 
## Sample size:                      40000 
## Number of independent variables:  2 
## Mtry:                             2 
## Target node size:                 5 
## Variable importance mode:         impurity 
## Splitrule:                        variance 
## OOB prediction error (MSE):       45.57202 
## R squared (OOB):                  0.6662542