library(terra)
library(geodata)
library(tidyverse)
library(ranger)
library(sf)
valle_ext <- ext(-77.0, -75.5, 3.0, 4.5)
# carpeta fija para no repetir descargas
cache_dir <- "C:/Users/CTIRADO/Documents/soilgrids_cache"
dir.create(cache_dir, showWarnings = FALSE)
Descargo SOC, pH y arcilla para la profundidad 0-5cm desde SoilGrids 2.0 y recorto al Valle del Cauca. Los archivos se guardan en cache y no se vuelven a descargar en ejecuciones futuras.
soc_r <- geodata::soil_world(var = "soc", depth = 5, stat = "mean", path = cache_dir)
ph_r <- geodata::soil_world(var = "phh2o", depth = 5, stat = "mean", path = cache_dir)
clay_r <- geodata::soil_world(var = "clay", depth = 5, stat = "mean", path = cache_dir)
soc_r <- crop(soc_r, valle_ext)
ph_r <- crop(ph_r, valle_ext)
clay_r <- crop(clay_r, valle_ext)
# soilgrids guarda los valores como enteros escalados, hay que convertir
soc_r <- soc_r / 10
ph_r <- ph_r / 10
clay_r <- clay_r / 10
par(mfrow = c(1, 3))
plot(soc_r, main = "SOC 0-5cm (g/kg)", col = hcl.colors(20, "YlOrBr", rev = TRUE))
plot(ph_r, main = "pH 0-5cm", col = hcl.colors(20, "RdYlGn"))
plot(clay_r, main = "Arcilla 0-5cm (%)", col = hcl.colors(20, "Blues"))
par(mfrow = c(1, 1))
Genero pendiente, aspecto y TPI a partir del DEM SRTM, remuestreados a la misma resolucion que SoilGrids.
dem <- geodata::elevation_30s("COL", path = cache_dir)
dem_valle <- crop(dem, valle_ext)
dem_resamp <- resample(dem_valle, soc_r, method = "bilinear")
pendiente <- terrain(dem_resamp, v = "slope", unit = "degrees")
aspecto <- terrain(dem_resamp, v = "aspect", unit = "degrees")
tpi <- terrain(dem_resamp, v = "TPI")
covariables <- c(dem_resamp, pendiente, aspecto, tpi)
names(covariables) <- c("elevacion", "pendiente", "aspecto", "tpi")
plot(covariables)
Convierto el stack raster a un dataframe donde cada fila es un pixel.
stack_completo <- c(soc_r, ph_r, clay_r, covariables)
names(stack_completo)[1:3] <- c("soc", "ph", "clay")
df_modelo <- as.data.frame(stack_completo, na.rm = TRUE)
# tabla con coordenadas para spatial CV
df_coords <- as.data.frame(stack_completo, xy = TRUE, na.rm = TRUE)
names(df_coords)[1:2] <- c("x", "y")
names(df_coords)[3:9] <- c("soc", "ph", "clay", "elevacion", "pendiente", "aspecto", "tpi")
cat("Pixeles disponibles para modelar:", nrow(df_modelo), "\n")
## Pixeles disponibles para modelar: 31525
head(df_modelo)
## soc ph clay elevacion pendiente aspecto tpi
## 182 15.45 0.49 1.9 15 3.0842864 294.0698 -53.500
## 183 16.70 0.48 2.4 118 1.5562053 275.7314 36.000
## 184 16.71 0.49 2.6 93 1.9297750 119.9498 4.250
## 185 17.14 0.49 2.5 54 1.5044799 137.6123 -22.375
## 186 18.06 0.48 2.5 41 0.7719392 200.5544 -27.625
## 187 18.33 0.48 2.5 71 0.3187626 181.3921 6.375
set.seed(42)
idx_train <- sample(nrow(df_modelo), size = 0.8 * nrow(df_modelo))
train <- df_modelo[idx_train, ]
test <- df_modelo[-idx_train, ]
modelo_rf <- ranger(
formula = soc ~ elevacion + pendiente + aspecto + tpi + ph + clay,
data = train,
num.trees = 300,
importance = "impurity"
)
print(modelo_rf)
## Ranger result
##
## Call:
## ranger(formula = soc ~ elevacion + pendiente + aspecto + tpi + ph + clay, data = train, num.trees = 300, importance = "impurity")
##
## Type: Regression
## Number of trees: 300
## Sample size: 25220
## Number of independent variables: 6
## Mtry: 2
## Target node size: 5
## Variable importance mode: impurity
## Splitrule: variance
## OOB prediction error (MSE): 1.09941
## R squared (OOB): 0.9247901
imp <- sort(importance(modelo_rf), decreasing = TRUE)
barplot(imp, las = 2, main = "importancia de covariables",
col = hcl.colors(6, "Teal"))
pred_test <- predict(modelo_rf, data = test)$predictions
rmse <- sqrt(mean((test$soc - pred_test)^2))
r2 <- cor(test$soc, pred_test)^2
mae <- mean(abs(test$soc - pred_test))
bias <- mean(pred_test - test$soc)
cat("--- validacion 80/20 ---\n")
## --- validacion 80/20 ---
cat("RMSE:", round(rmse, 2), "g/kg\n")
## RMSE: 1.01 g/kg
cat("MAE: ", round(mae, 2), "g/kg\n")
## MAE: 0.68 g/kg
cat("R2: ", round(r2, 3), "\n")
## R2: 0.93
cat("Bias:", round(bias, 3), "g/kg\n")
## Bias: 0.007 g/kg
plot(test$soc, pred_test,
xlab = "SOC observado (g/kg)",
ylab = "SOC predicho (g/kg)",
main = "Observado vs Predicho - test set",
pch = 16, cex = 0.5, col = rgb(0, 0, 0, 0.3))
abline(0, 1, col = "red", lwd = 2)
El k-fold normal sobreestima el desempeño porque pixeles vecinos pueden caer en train y test al mismo tiempo. El spatial CV evalua bloques geograficos completos para una estimacion mas honesta.
# bloques espaciales 4x4 sobre el area
df_coords$bloque <- as.integer(cut(df_coords$x, breaks = 4)) * 10 +
as.integer(cut(df_coords$y, breaks = 4))
bloques <- unique(df_coords$bloque)
resultados_scv <- data.frame()
for (b in bloques) {
train_scv <- df_coords[df_coords$bloque != b, ]
test_scv <- df_coords[df_coords$bloque == b, ]
modelo_b <- ranger(
formula = soc ~ elevacion + pendiente + aspecto + tpi + ph + clay,
data = train_scv,
num.trees = 300,
seed = 42
)
pred_b <- predict(modelo_b, data = test_scv)$predictions
resultados_scv <- rbind(resultados_scv, data.frame(
bloque = b,
rmse = sqrt(mean((test_scv$soc - pred_b)^2)),
r2 = cor(test_scv$soc, pred_b)^2,
mae = mean(abs(test_scv$soc - pred_b)),
bias = mean(pred_b - test_scv$soc)
))
}
cat("--- spatial cross-validation ---\n")
## --- spatial cross-validation ---
print(resultados_scv)
## bloque rmse r2 mae bias
## 1 14 1.2287378 0.3222571 0.9692466 -0.800484197
## 2 24 2.3374496 0.6756078 1.5555110 -0.936360957
## 3 34 0.7276557 0.7852143 0.5065611 -0.184483946
## 4 44 0.7392940 0.9094468 0.5754492 -0.005304113
## 5 13 1.7958358 0.7978191 1.2732503 -0.222541733
## 6 23 0.7339576 0.9046206 0.5549086 0.087301302
## 7 33 0.7437838 0.9044947 0.5629619 0.128158253
## 8 43 0.8491705 0.7840411 0.6776386 0.332360309
## 9 12 2.1122843 0.6518611 1.4567047 0.953744348
## 10 22 0.7285615 0.7971689 0.5185649 -0.063719775
## 11 32 0.6971597 0.9252485 0.5427115 -0.131197432
## 12 42 0.8964392 0.9230265 0.6792578 0.331115604
## 13 11 1.5690081 0.4606713 1.0509429 0.335290603
## 14 21 0.6854869 0.7468119 0.5046350 0.306652683
## 15 31 0.8299753 0.8750360 0.6518532 -0.045756508
## 16 41 0.7903048 0.8812410 0.6187739 -0.122938202
cat("\nRMSE promedio:", round(mean(resultados_scv$rmse), 2), "g/kg\n")
##
## RMSE promedio: 1.09 g/kg
cat("R2 promedio: ", round(mean(resultados_scv$r2), 3), "\n")
## R2 promedio: 0.772
par(mfrow = c(1, 2))
barplot(resultados_scv$rmse,
names.arg = resultados_scv$bloque,
main = "RMSE por bloque espacial",
ylab = "RMSE (g/kg)",
col = hcl.colors(16, "YlOrRd"))
abline(h = mean(resultados_scv$rmse), col = "red", lwd = 2, lty = 2)
barplot(resultados_scv$r2,
names.arg = resultados_scv$bloque,
main = "R2 por bloque espacial",
ylab = "R2",
col = hcl.colors(16, "Teal"),
ylim = c(0, 1))
abline(h = mean(resultados_scv$r2), col = "red", lwd = 2, lty = 2)
par(mfrow = c(1, 1))
stack_pred <- c(covariables, ph_r, clay_r)
names(stack_pred)[5:6] <- c("ph", "clay")
mapa_soc <- predict(stack_pred, modelo_rf, type = "response")
plot(mapa_soc,
main = "SOC predicho 0-5cm (g/kg) - Valle del Cauca",
col = hcl.colors(20, "YlOrBr", rev = TRUE))
writeRaster(mapa_soc, "SOC_predicho_Valle_Cauca.tif", overwrite = TRUE)