Introducción

En este ejercicio se busca obtener un modelo por interpolación, que permita estimar el número de muertes por caidas de rayos en Colombia entre los años 2012 y 2014.

Datos

Para el ejercicio se cuenta con 3 capas de información:

library(raster)
## Loading required package: sp
library(sf)
## Linking to GEOS 3.6.1, GDAL 2.2.3, proj.4 4.9.3
library(rgdal)
## rgdal: version: 1.3-4, (SVN revision 766)
##  Geospatial Data Abstraction Library extensions to R successfully loaded
##  Loaded GDAL runtime: GDAL 2.2.3, released 2017/11/20
##  Path to GDAL shared files: C:/Users/jorod/OneDrive/Documentos/R/win-library/3.5/rgdal/gdal
##  GDAL binary built with GEOS: TRUE 
##  Loaded PROJ.4 runtime: Rel. 4.9.3, 15 August 2016, [PJ_VERSION: 493]
##  Path to PROJ.4 shared files: C:/Users/jorod/OneDrive/Documentos/R/win-library/3.5/rgdal/proj
##  Linking to sp version: 1.3-1
library(tmap)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# GeoTIFF file list
fl <- list.files("C:/Users/jorod/OneDrive/Documentos/PRA/lightning/rst_ex", pattern = ".tif$", full.names = TRUE)

# Create the raster stack
rst <- stack(fl)
#rst[rst==0] <-NA

# Change the layer names to coincide with table data
names(rst) <- c("ELEV", "POB", "DENS")
plot(rst)

También se cuenta con información del número de muertes reportadas para los años 2012 a 2014, la altura aproximada de la ocurrencia de las muertes, las coordenadas y la densidad de caida de rayos de la zona:

climDataPT <- read.csv("C:/Users/jorod/OneDrive/Documentos/PRA/lightning/csv/fatalidades.csv")

knitr::kable(head(climDataPT, n=10))
CODMPIO MUNICIPIO NUM_FAT LON LAT ELEV DENS POB
73226 Cunday 1 -74.687 3.985 733 3.733470 9065
76616 Riofro 1 -76.371 4.118 1224 17.693472 23210
50006 Acacias 1 -73.734 4.008 501 22.937889 50715
94001 Puerto Inirida 1 -68.457 3.316 129 20.662602 27711
76111 Buga 1 -76.098 3.859 2708 2.525778 132320
50318 Guamal 1 -73.939 3.922 1427 21.661762 8571
73168 Chaparral 2 -75.590 3.744 1206 21.183340 38094
25839 Ubal 2 -73.477 4.803 2160 21.999664 15462
66170 Dos Quebradas 2 -75.670 4.839 1450 15.105362 191909
25269 Facatativa 2 -74.336 4.837 2731 16.356615 102355

En el siguiente mapa se muestran los puntos reportados de ocurrancia de muertes por caida de rayos sobre el mapa de densidad de caida de rayos. También se muestra el histograma de las muertes ocurridas en cada punto reportado.

proj4Str <- "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"

statPoints <- SpatialPointsDataFrame(coords      = climDataPT[,c("LON","LAT")], 
                                     data        = climDataPT,
                                     proj4string = CRS(proj4Str))
par(mfrow=c(1,2),mar=c(5,6,3,2))

plot(rst[["DENS"]], main="Densidad de caida de rayos\n y muertes reportadas",
     xlab = "Longitud", ylab="Latitud")
plot(statPoints, add=TRUE)

hist(climDataPT$NUM_FAT, xlab= "Muertes", ylab="Frecuencia", main="Muertes 2012-2014")

## Método

Primero vamos a realizar un gráfico de correlación que nos permita deducir cuales son las variables que tienen mayor relación con la ocurrencia de las muertes.

library(corrplot)
## corrplot 0.84 loaded
corMat <- cor(climDataPT[,3:ncol(climDataPT)])

corrplot.mixed(corMat, number.cex=0.8, tl.cex = 0.9, tl.col = "black", 
               outline=FALSE, mar=c(0,0,2,2), upper="square", bg=NA)

La gráfica de correlación muestra que la densidad de población es la factor que más influye en las muertes reportadas.

A continuación se crean funciones auxiliares para crear las k iteraciones de entrenamiento y prueba, dividir los datos y obtener los residuales. Para este ejercicio se realizarán 10 iteraciones (k=10).

# Generate the K-fold train--test splits
# x are the row indices
# Outputs a list with test (or train) indices
kfoldSplit <- function(x, k=10, train=TRUE){
  x <- sample(x, size = length(x), replace = FALSE)
  out <- suppressWarnings(split(x, factor(1:k)))
  if(train) out <- lapply(out, FUN = function(x, len) (1:len)[-x], len=length(unlist(out)))
  return(out)
}

# Regression residuals from RF object
resid.RF <- function(x) return(x$y - x$predicted)

A continuación se inician algunas variables, evalData contendrá todos los valores de RMSE.

set.seed(12345)

k <- 10

kfolds <- kfoldSplit(1:nrow(climDataPT), k = 10, train = TRUE)

evalData <- matrix(NA, nrow=k, ncol=7, 
                   dimnames = list(1:k, c("OK","RF","GLM","GAM","RF_OK","GLM_OK","GAM_OK")))

Se usará un bloque de código, dentro del bucle “for” para cada algoritmo de regresión probado. Los residuales del entrenamiento se interpolan a través de kriging y luego los residuales de prueba se agregan a los resultados de regresión de prueba para su evaluación.

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(mgcv)
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:raster':
## 
##     getData
## This is mgcv 1.8-24. For overview type 'help("mgcv-package")'.
library(gstat)


for(i in 1:k){
  
  cat("K-fold...",i,"of",k,"....\n")
  
  # TRAIN indices as integer
  idx <- kfolds[[i]]
  
  # TRAIN indices as a boolean vector
  idxBool <- (1:nrow(climDataPT)) %in% idx
  
  # Observed test data for the target variable
  obs.test <- climDataPT[!idxBool, "NUM_FAT"]
  
  
  
  ## ----------------------------------------------------------------------------- ##
  ## Ordinary Kriging ----
  ## ----------------------------------------------------------------------------- ##
    
  # Make variogram
  formMod <- NUM_FAT ~ 1
  mod <- vgm(model  = "Exp", psill  = 3, range  = 100, nugget = 0.5)
  variog <- variogram(formMod, statPoints[idxBool, ])
  
  # Variogram fitting by Ordinary Least Sqaure
  variogFitOLS<-fit.variogram(variog, model = mod,  fit.method = 6)
  #plot(variog, variogFitOLS, main="OLS Model")
    
  # kriging predictions
  OK <- krige(formula = formMod ,
              locations = statPoints[idxBool, ], 
              model = variogFitOLS,
              newdata = statPoints[!idxBool, ],
              debug.level = 0)
  
  ok.pred.test <- OK@data$var1.pred
  evalData[i,"OK"] <- sqrt(mean((ok.pred.test - obs.test)^2))
  
  
  
  ## ----------------------------------------------------------------------------- ##
  ## RF calibration ----
  ## ----------------------------------------------------------------------------- ##
  
  RF <- randomForest(y = climDataPT[idx, "NUM_FAT"], 
                     x = climDataPT[idx, c("POB", "DENS", "ELEV")],
                     ntree = 500,
                     mtry = 2)
  
  rf.pred.test <- predict(RF, newdata = climDataPT[-idx,], type="response")
  evalData[i,"RF"] <- sqrt(mean((rf.pred.test - obs.test)^2))
  
  # Ordinary Kriging of Random Forest residuals
  #
  statPointsTMP <- statPoints[idxBool, ]
  statPointsTMP@data <- cbind(statPointsTMP@data, residRF = resid.RF(RF))
  
  formMod <- residRF ~ 1
  mod <- vgm(model  = "Exp", psill  = 0.6, range  = 10, nugget = 0.01)
  variog <- variogram(formMod, statPointsTMP)
  
  # Variogram fitting by Ordinary Least Sqaure
  variogFitOLS<-fit.variogram(variog, model = mod,  fit.method = 6)
  #plot(variog, variogFitOLS, main="OLS Model")
    
  # kriging predictions
  RF.OK <- krige(formula = formMod ,
              locations = statPointsTMP, 
              model = variogFitOLS,
              newdata = statPoints[!idxBool, ],
              debug.level = 0)
  
  rf.ok.pred.test <- rf.pred.test + RF.OK@data$var1.pred
  evalData[i,"RF_OK"] <- sqrt(mean((rf.ok.pred.test - obs.test)^2))
  
  
  
  ## ----------------------------------------------------------------------------- ##
  ## GLM calibration ----
  ## ----------------------------------------------------------------------------- ##

  GLM <- glm(formula = NUM_FAT ~ POB + DENS + ELEV, data = climDataPT[idx, ])
  
  glm.pred.test <- predict(GLM, newdata = climDataPT[-idx,], type="response")
  evalData[i,"GLM"] <- sqrt(mean((glm.pred.test - obs.test)^2))
  
  # Ordinary Kriging of GLM residuals
  #
  statPointsTMP <- statPoints[idxBool, ]
  statPointsTMP@data <- cbind(statPointsTMP@data, residGLM = resid(GLM))
  
  formMod <- residGLM ~ 1
  mod <- vgm(model  = "Exp", psill  = 0.4, range  = 10, nugget = 0.01)
  variog <- variogram(formMod, statPointsTMP)
  
  # Variogram fitting by Ordinary Least Sqaure
  variogFitOLS<-fit.variogram(variog, model = mod,  fit.method = 6)
  #plot(variog, variogFitOLS, main="OLS Model")
    
  # kriging predictions
  GLM.OK <- krige(formula = formMod ,
              locations = statPointsTMP, 
              model = variogFitOLS,
              newdata = statPoints[!idxBool, ],
              debug.level = 0)
  
  glm.ok.pred.test <- glm.pred.test + GLM.OK@data$var1.pred
  evalData[i,"GLM_OK"] <- sqrt(mean((glm.ok.pred.test - obs.test)^2))
  
  
  
  ## ----------------------------------------------------------------------------- ##
  ## GAM calibration ----
  ## ----------------------------------------------------------------------------- ##
  
  GAM <- gam(formula = NUM_FAT ~ s(POB) + s(DENS) + s(ELEV), data = climDataPT[idx, ])
  
  gam.pred.test <- predict(GAM, newdata = climDataPT[-idx,], type="response")
  evalData[i,"GAM"] <- sqrt(mean((gam.pred.test - obs.test)^2))
 
  # Ordinary Kriging of GAM residuals
  #
  statPointsTMP <- statPoints[idxBool, ]
  statPointsTMP@data <- cbind(statPointsTMP@data, residGAM = resid(GAM))
  
  formMod <- residGAM ~ 1
  mod <- vgm(model  = "Exp", psill  = 0.3, range  = 10, nugget = 0.01)
  variog <- variogram(formMod, statPointsTMP)
  
  # Variogram fitting by Ordinary Least Sqaure
  variogFitOLS<-fit.variogram(variog, model = mod,  fit.method = 6)
  #plot(variog, variogFitOLS, main="OLS Model")
    
  # kriging predictions
  GAM.OK <- krige(formula = formMod ,
              locations = statPointsTMP, 
              model = variogFitOLS,
              newdata = statPoints[!idxBool, ],
              debug.level = 0)
  
  gam.ok.pred.test <- gam.pred.test + GAM.OK@data$var1.pred
  evalData[i,"GAM_OK"] <- sqrt(mean((gam.ok.pred.test - obs.test)^2))
  
}
## K-fold... 1 of 10 ....
## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?

## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
## K-fold... 2 of 10 ....
## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?

## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
## K-fold... 3 of 10 ....
## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?

## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
## K-fold... 4 of 10 ....
## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?

## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?

## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
## K-fold... 5 of 10 ....
## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?

## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?

## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
## K-fold... 6 of 10 ....
## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?

## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?

## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
## K-fold... 7 of 10 ....
## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?

## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
## K-fold... 8 of 10 ....
## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?

## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?

## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
## K-fold... 9 of 10 ....
## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?

## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?

## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
## K-fold... 10 of 10 ....
## K-fold... 1 of 10 ....
## K-fold... 2 of 10 ....
## K-fold... 3 of 10 ....
## K-fold... 4 of 10 ....
## K-fold... 5 of 10 ....
## K-fold... 6 of 10 ....
## K-fold... 7 of 10 ....
## K-fold... 8 of 10 ....
## K-fold... 9 of 10 ....
## K-fold... 10 of 10 ....

Ahora se pueden ver los valores de desviación estándar para cada iteración.

round(apply(evalData,2,FUN = function(x,...) c(mean(x,...),sd(x,...))),3)
##         OK    RF   GLM   GAM RF_OK GLM_OK GAM_OK
## [1,] 1.674 1.669 1.761 1.768  1.61  1.688  1.705
## [2,] 0.582 0.532 0.732 0.768  0.50  0.694  0.740
##         OK    RF   GLM   GAM RF_OK GLM_OK GAM_OK
## [1,] 1.674 1.669 1.761 1.768  1.61  1.688  1.705
## [2,] 0.582 0.532 0.732 0.768  0.50  0.694  0.740

Los resultados de esta tabla muestran similitud entre los métodos utilizados aunque el método basado en Random forest tiene un mejor desempeño con una desviación estándar de 1.61.

Hecho esto, se procede a predecir los valores de la capa de número de muertes para todo el territorio de Colombia.

# TRAIN indices as integer
idx <- kfolds[[i]]
  
# TRAIN indices as a boolean vector
idxBool <- (1:nrow(climDataPT)) %in% idx
  
# Observed test data for the target variable
obs.test <- climDataPT[!idxBool, "NUM_FAT"]
rstPixDF <- as(rst[[1]], "SpatialPixelsDataFrame")

#rstPixDF

Random Forest

RF <- randomForest(y = climDataPT[idx, "NUM_FAT"], 
                    x = climDataPT[idx, c("POB", "DENS", "ELEV")],
                    ntree = 500,
                    mtry = 2)
  
rstPredRF <- predict(rst, RF, type="response")

A continuación se muestran los variogramas de los dos mejores métodos.

# Ordinary Kriging of Random Forest residuals
#
statPointsTMPRF <- statPoints[idxBool, ]
crs(statPointsTMPRF) <- crs(rstPixDF)
statPointsTMPRF@data <- cbind(statPointsTMPRF@data, residRF = resid.RF(RF))
  
formModRF <- residRF ~ 1
modRF <- vgm(model  = "Exp", psill  = 0.6, range  = 10, nugget = 0.01)
variogRF <- variogram(formModRF, statPointsTMPRF)
  
# Variogram fitting by Ordinary Least Sqaure
variogFitOLSRF<-fit.variogram(variogRF, model = modRF,  fit.method = 6)
plot(variogRF, variogFitOLSRF, main="RF Model")

GLM <- glm(formula = NUM_FAT ~ POB + DENS + ELEV, data = climDataPT[idx, ])
rstPredGLM <- predict(rst, GLM, type="response")

statPointsTMPGLM <- statPoints[idxBool, ]
crs(statPointsTMPGLM) <- crs(rstPixDF)
statPointsTMPGLM@data <- cbind(statPointsTMPGLM@data, residGLM = resid(GLM))
  
formModGLM <- residGLM ~ 1
modGLM <- vgm(model  = "Exp", psill  = 0.4, range  = 10, nugget = 0.01)
variogGLM <- variogram(formModGLM, statPointsTMPGLM)
  
# Variogram fitting by Ordinary Least Sqaure
variogFitOLSGLM<-fit.variogram(variogGLM, model = modGLM,  fit.method = 6)
plot(variogGLM, variogFitOLSGLM, main="Semi-variogram of GLM residuals")

GAM <- gam(formula = NUM_FAT ~ s(POB) + s(DENS) + s(ELEV), data = climDataPT)
  
rstPredGAM <- predict(rst, GAM, type="response")

# Ordinary Kriging of Random Forest residuals
  #
# Create a temporary SpatialPointsDF object to store GAM residuals
statPointsTMPGAM <- statPoints
crs(statPointsTMPGAM) <- crs(rstPixDF)
statPointsTMPGAM@data <- cbind(statPointsTMPGAM@data, residGAM = resid(GAM))

# Define the kriging parameters and fit the variogram using OLS
formModGAM <- residGAM ~ 1
modGAM <- vgm(model  = "Exp", psill  = 0.15, range  = 10, nugget = 0.01)
variogGAM <- variogram(formModGAM, statPointsTMPGAM)
variogFitOLSGAM <- fit.variogram(variogGAM, model = modGAM,  fit.method = 6)
## Warning in fit.variogram(variogGAM, model = modGAM, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
# Plot the results
plot(variogGAM, variogFitOLSGAM, main="Semi-variogram of GAM residuals")

Finalmente se muestran los resultados de 3 métodos utilizados.

residKrigMapRF <- krige(formula = formModRF,
                      locations = statPointsTMPRF, 
                      model = variogFitOLSRF,
                      newdata = rstPixDF)
## [using ordinary kriging]
residKrigRstLayerRF <- as(residKrigMapRF, "RasterLayer")

gamKrigMapRF <- rstPredRF + residKrigRstLayerRF

plot(gamKrigMapRF, main="Muertes por rayos 2012-2014\n(RF regression-kriging RF)",
     xlab="Longitude", ylab="Latitude", cex.main=0.8, cex.axis=0.7, cex=0.8)

residKrigMapGLM <- krige(formula = formModGLM,
                      locations = statPointsTMPGLM, 
                      model = variogFitOLSGLM,
                      newdata = rstPixDF)
## [using ordinary kriging]
residKrigRstLayerGLM <- as(residKrigMapGLM, "RasterLayer")

gamKrigMapGLM <- rstPredGLM + residKrigRstLayerGLM

plot(gamKrigMapGLM, main="Muertes por rayos 2012-2014\n(RF regression-kriging GLM)",
     xlab="Longitude", ylab="Latitude", cex.main=0.8, cex.axis=0.7, cex=0.8)

residKrigMap <- krige(formula = formModGAM,
                      locations = statPointsTMPGAM, 
                      model = variogFitOLSGAM,
                      newdata = rstPixDF)
## [using ordinary kriging]
residKrigRstLayer <- as(residKrigMap, "RasterLayer")

gamKrigMap <- rstPredGAM + residKrigRstLayer

plot(gamKrigMap, main="Muertes por rayos 2012-2014\n(RF regression-kriging)",
     xlab="Longitude", ylab="Latitude", cex.main=0.8, cex.axis=0.7, cex=0.8)

Discusión

De los resultados se puede concluir que el método que mejor se ajusta es RF_Ok.