Librerias y configuracion

library(terra)
library(geodata)
library(tidyverse)
library(ranger)
library(sf)

valle_ext <- ext(-77.0, -75.5, 3.0, 4.5)

# carpeta fija para no repetir descargas
cache_dir <- "C:/Users/CTIRADO/Documents/soilgrids_cache"
dir.create(cache_dir, showWarnings = FALSE)

1. Descarga de propiedades de suelo (SoilGrids 2.0)

Descargo SOC, pH y arcilla para la profundidad 0-5cm desde SoilGrids 2.0 y recorto al Valle del Cauca. Los archivos se guardan en cache y no se vuelven a descargar en ejecuciones futuras.

soc_r  <- geodata::soil_world(var = "soc",   depth = 5, stat = "mean", path = cache_dir)
ph_r   <- geodata::soil_world(var = "phh2o", depth = 5, stat = "mean", path = cache_dir)
clay_r <- geodata::soil_world(var = "clay",  depth = 5, stat = "mean", path = cache_dir)

soc_r  <- crop(soc_r,  valle_ext)
ph_r   <- crop(ph_r,   valle_ext)
clay_r <- crop(clay_r, valle_ext)

# soilgrids guarda los valores como enteros escalados, hay que convertir
soc_r  <- soc_r  / 10
ph_r   <- ph_r   / 10
clay_r <- clay_r / 10
par(mfrow = c(1, 3))
plot(soc_r,  main = "SOC 0-5cm (g/kg)",  col = hcl.colors(20, "YlOrBr", rev = TRUE))
plot(ph_r,   main = "pH 0-5cm",           col = hcl.colors(20, "RdYlGn"))
plot(clay_r, main = "Arcilla 0-5cm (%)",  col = hcl.colors(20, "Blues"))

par(mfrow = c(1, 1))

2. Covariables de terreno (factor r del SCORPAN)

Genero pendiente, aspecto y TPI a partir del DEM SRTM, remuestreados a la misma resolucion que SoilGrids.

dem        <- geodata::elevation_30s("COL", path = cache_dir)
dem_valle  <- crop(dem, valle_ext)
dem_resamp <- resample(dem_valle, soc_r, method = "bilinear")

pendiente <- terrain(dem_resamp, v = "slope",  unit = "degrees")
aspecto   <- terrain(dem_resamp, v = "aspect", unit = "degrees")
tpi       <- terrain(dem_resamp, v = "TPI")

covariables <- c(dem_resamp, pendiente, aspecto, tpi)
names(covariables) <- c("elevacion", "pendiente", "aspecto", "tpi")
plot(covariables)

3. Tabla de modelado

Convierto el stack raster a un dataframe donde cada fila es un pixel.

stack_completo <- c(soc_r, ph_r, clay_r, covariables)
names(stack_completo)[1:3] <- c("soc", "ph", "clay")

df_modelo <- as.data.frame(stack_completo, na.rm = TRUE)

# tabla con coordenadas para spatial CV
df_coords <- as.data.frame(stack_completo, xy = TRUE, na.rm = TRUE)
names(df_coords)[1:2] <- c("x", "y")
names(df_coords)[3:9] <- c("soc", "ph", "clay", "elevacion", "pendiente", "aspecto", "tpi")

cat("Pixeles disponibles para modelar:", nrow(df_modelo), "\n")
## Pixeles disponibles para modelar: 31525
head(df_modelo)
##       soc   ph clay elevacion pendiente  aspecto     tpi
## 182 15.45 0.49  1.9        15 3.0842864 294.0698 -53.500
## 183 16.70 0.48  2.4       118 1.5562053 275.7314  36.000
## 184 16.71 0.49  2.6        93 1.9297750 119.9498   4.250
## 185 17.14 0.49  2.5        54 1.5044799 137.6123 -22.375
## 186 18.06 0.48  2.5        41 0.7719392 200.5544 -27.625
## 187 18.33 0.48  2.5        71 0.3187626 181.3921   6.375

4. Modelo Random Forest

set.seed(42)
idx_train <- sample(nrow(df_modelo), size = 0.8 * nrow(df_modelo))
train <- df_modelo[idx_train, ]
test  <- df_modelo[-idx_train, ]

modelo_rf <- ranger(
  formula    = soc ~ elevacion + pendiente + aspecto + tpi + ph + clay,
  data       = train,
  num.trees  = 300,
  importance = "impurity"
)

print(modelo_rf)
## Ranger result
## 
## Call:
##  ranger(formula = soc ~ elevacion + pendiente + aspecto + tpi +      ph + clay, data = train, num.trees = 300, importance = "impurity") 
## 
## Type:                             Regression 
## Number of trees:                  300 
## Sample size:                      25220 
## Number of independent variables:  6 
## Mtry:                             2 
## Target node size:                 5 
## Variable importance mode:         impurity 
## Splitrule:                        variance 
## OOB prediction error (MSE):       1.09941 
## R squared (OOB):                  0.9247901
imp <- sort(importance(modelo_rf), decreasing = TRUE)
barplot(imp, las = 2, main = "importancia de covariables",
        col = hcl.colors(6, "Teal"))

5. Validacion simple 80/20

pred_test <- predict(modelo_rf, data = test)$predictions
rmse <- sqrt(mean((test$soc - pred_test)^2))
r2   <- cor(test$soc, pred_test)^2
mae  <- mean(abs(test$soc - pred_test))
bias <- mean(pred_test - test$soc)

cat("--- validacion 80/20 ---\n")
## --- validacion 80/20 ---
cat("RMSE:", round(rmse, 2), "g/kg\n")
## RMSE: 1.01 g/kg
cat("MAE: ", round(mae,  2), "g/kg\n")
## MAE:  0.68 g/kg
cat("R2:  ", round(r2,   3), "\n")
## R2:   0.93
cat("Bias:", round(bias, 3), "g/kg\n")
## Bias: 0.007 g/kg
plot(test$soc, pred_test,
     xlab = "SOC observado (g/kg)",
     ylab = "SOC predicho (g/kg)",
     main = "Observado vs Predicho - test set",
     pch  = 16, cex = 0.5, col = rgb(0, 0, 0, 0.3))
abline(0, 1, col = "red", lwd = 2)

6. Spatial Cross-Validation

El k-fold normal sobreestima el desempeño porque pixeles vecinos pueden caer en train y test al mismo tiempo. El spatial CV evalua bloques geograficos completos para una estimacion mas honesta.

# bloques espaciales 4x4 sobre el area
df_coords$bloque <- as.integer(cut(df_coords$x, breaks = 4)) * 10 +
                    as.integer(cut(df_coords$y, breaks = 4))

bloques        <- unique(df_coords$bloque)
resultados_scv <- data.frame()

for (b in bloques) {
  train_scv <- df_coords[df_coords$bloque != b, ]
  test_scv  <- df_coords[df_coords$bloque == b, ]

  modelo_b <- ranger(
    formula   = soc ~ elevacion + pendiente + aspecto + tpi + ph + clay,
    data      = train_scv,
    num.trees = 300,
    seed      = 42
  )

  pred_b <- predict(modelo_b, data = test_scv)$predictions

  resultados_scv <- rbind(resultados_scv, data.frame(
    bloque = b,
    rmse   = sqrt(mean((test_scv$soc - pred_b)^2)),
    r2     = cor(test_scv$soc, pred_b)^2,
    mae    = mean(abs(test_scv$soc - pred_b)),
    bias   = mean(pred_b - test_scv$soc)
  ))
}

cat("--- spatial cross-validation ---\n")
## --- spatial cross-validation ---
print(resultados_scv)
##    bloque      rmse        r2       mae         bias
## 1      14 1.2287378 0.3222571 0.9692466 -0.800484197
## 2      24 2.3374496 0.6756078 1.5555110 -0.936360957
## 3      34 0.7276557 0.7852143 0.5065611 -0.184483946
## 4      44 0.7392940 0.9094468 0.5754492 -0.005304113
## 5      13 1.7958358 0.7978191 1.2732503 -0.222541733
## 6      23 0.7339576 0.9046206 0.5549086  0.087301302
## 7      33 0.7437838 0.9044947 0.5629619  0.128158253
## 8      43 0.8491705 0.7840411 0.6776386  0.332360309
## 9      12 2.1122843 0.6518611 1.4567047  0.953744348
## 10     22 0.7285615 0.7971689 0.5185649 -0.063719775
## 11     32 0.6971597 0.9252485 0.5427115 -0.131197432
## 12     42 0.8964392 0.9230265 0.6792578  0.331115604
## 13     11 1.5690081 0.4606713 1.0509429  0.335290603
## 14     21 0.6854869 0.7468119 0.5046350  0.306652683
## 15     31 0.8299753 0.8750360 0.6518532 -0.045756508
## 16     41 0.7903048 0.8812410 0.6187739 -0.122938202
cat("\nRMSE promedio:", round(mean(resultados_scv$rmse), 2), "g/kg\n")
## 
## RMSE promedio: 1.09 g/kg
cat("R2 promedio:  ", round(mean(resultados_scv$r2),   3), "\n")
## R2 promedio:   0.772
par(mfrow = c(1, 2))
barplot(resultados_scv$rmse,
        names.arg = resultados_scv$bloque,
        main = "RMSE por bloque espacial",
        ylab = "RMSE (g/kg)",
        col  = hcl.colors(16, "YlOrRd"))
abline(h = mean(resultados_scv$rmse), col = "red", lwd = 2, lty = 2)

barplot(resultados_scv$r2,
        names.arg = resultados_scv$bloque,
        main = "R2 por bloque espacial",
        ylab = "R2",
        col  = hcl.colors(16, "Teal"),
        ylim = c(0, 1))
abline(h = mean(resultados_scv$r2), col = "red", lwd = 2, lty = 2)

par(mfrow = c(1, 1))

7. Mapa de prediccion

stack_pred <- c(covariables, ph_r, clay_r)
names(stack_pred)[5:6] <- c("ph", "clay")

mapa_soc <- predict(stack_pred, modelo_rf, type = "response")

plot(mapa_soc,
     main = "SOC predicho 0-5cm (g/kg) - Valle del Cauca",
     col  = hcl.colors(20, "YlOrBr", rev = TRUE))

writeRaster(mapa_soc, "SOC_predicho_Valle_Cauca.tif", overwrite = TRUE)