En este ejercicio se busca obtener un modelo por interpolación, que permita estimar el número de muertes por caidas de rayos en Colombia entre los años 2012 y 2014.
Para el ejercicio se cuenta con 3 capas de información:
library(raster)
## Loading required package: sp
library(sf)
## Linking to GEOS 3.6.1, GDAL 2.2.3, proj.4 4.9.3
library(rgdal)
## rgdal: version: 1.3-4, (SVN revision 766)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 2.2.3, released 2017/11/20
## Path to GDAL shared files: C:/Users/jorod/OneDrive/Documentos/R/win-library/3.5/rgdal/gdal
## GDAL binary built with GEOS: TRUE
## Loaded PROJ.4 runtime: Rel. 4.9.3, 15 August 2016, [PJ_VERSION: 493]
## Path to PROJ.4 shared files: C:/Users/jorod/OneDrive/Documentos/R/win-library/3.5/rgdal/proj
## Linking to sp version: 1.3-1
library(tmap)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# GeoTIFF file list
fl <- list.files("C:/Users/jorod/OneDrive/Documentos/PRA/lightning/rst_ex", pattern = ".tif$", full.names = TRUE)
# Create the raster stack
rst <- stack(fl)
#rst[rst==0] <-NA
# Change the layer names to coincide with table data
names(rst) <- c("ELEV", "POB", "DENS")
plot(rst)
También se cuenta con información del número de muertes reportadas para los años 2012 a 2014, la altura aproximada de la ocurrencia de las muertes, las coordenadas y la densidad de caida de rayos de la zona:
climDataPT <- read.csv("C:/Users/jorod/OneDrive/Documentos/PRA/lightning/csv/fatalidades.csv")
knitr::kable(head(climDataPT, n=10))
| CODMPIO | MUNICIPIO | NUM_FAT | LON | LAT | ELEV | DENS | POB |
|---|---|---|---|---|---|---|---|
| 73226 | Cunday | 1 | -74.687 | 3.985 | 733 | 3.733470 | 9065 |
| 76616 | Riofro | 1 | -76.371 | 4.118 | 1224 | 17.693472 | 23210 |
| 50006 | Acacias | 1 | -73.734 | 4.008 | 501 | 22.937889 | 50715 |
| 94001 | Puerto Inirida | 1 | -68.457 | 3.316 | 129 | 20.662602 | 27711 |
| 76111 | Buga | 1 | -76.098 | 3.859 | 2708 | 2.525778 | 132320 |
| 50318 | Guamal | 1 | -73.939 | 3.922 | 1427 | 21.661762 | 8571 |
| 73168 | Chaparral | 2 | -75.590 | 3.744 | 1206 | 21.183340 | 38094 |
| 25839 | Ubal | 2 | -73.477 | 4.803 | 2160 | 21.999664 | 15462 |
| 66170 | Dos Quebradas | 2 | -75.670 | 4.839 | 1450 | 15.105362 | 191909 |
| 25269 | Facatativa | 2 | -74.336 | 4.837 | 2731 | 16.356615 | 102355 |
En el siguiente mapa se muestran los puntos reportados de ocurrancia de muertes por caida de rayos sobre el mapa de densidad de caida de rayos. También se muestra el histograma de las muertes ocurridas en cada punto reportado.
proj4Str <- "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
statPoints <- SpatialPointsDataFrame(coords = climDataPT[,c("LON","LAT")],
data = climDataPT,
proj4string = CRS(proj4Str))
par(mfrow=c(1,2),mar=c(5,6,3,2))
plot(rst[["DENS"]], main="Densidad de caida de rayos\n y muertes reportadas",
xlab = "Longitud", ylab="Latitud")
plot(statPoints, add=TRUE)
hist(climDataPT$NUM_FAT, xlab= "Muertes", ylab="Frecuencia", main="Muertes 2012-2014")
## Método
Primero vamos a realizar un gráfico de correlación que nos permita deducir cuales son las variables que tienen mayor relación con la ocurrencia de las muertes.
library(corrplot)
## corrplot 0.84 loaded
corMat <- cor(climDataPT[,3:ncol(climDataPT)])
corrplot.mixed(corMat, number.cex=0.8, tl.cex = 0.9, tl.col = "black",
outline=FALSE, mar=c(0,0,2,2), upper="square", bg=NA)
La gráfica de correlación muestra que la densidad de población es la factor que más influye en las muertes reportadas.
A continuación se crean funciones auxiliares para crear las k iteraciones de entrenamiento y prueba, dividir los datos y obtener los residuales. Para este ejercicio se realizarán 10 iteraciones (k=10).
# Generate the K-fold train--test splits
# x are the row indices
# Outputs a list with test (or train) indices
kfoldSplit <- function(x, k=10, train=TRUE){
x <- sample(x, size = length(x), replace = FALSE)
out <- suppressWarnings(split(x, factor(1:k)))
if(train) out <- lapply(out, FUN = function(x, len) (1:len)[-x], len=length(unlist(out)))
return(out)
}
# Regression residuals from RF object
resid.RF <- function(x) return(x$y - x$predicted)
A continuación se inician algunas variables, evalData contendrá todos los valores de RMSE.
set.seed(12345)
k <- 10
kfolds <- kfoldSplit(1:nrow(climDataPT), k = 10, train = TRUE)
evalData <- matrix(NA, nrow=k, ncol=7,
dimnames = list(1:k, c("OK","RF","GLM","GAM","RF_OK","GLM_OK","GAM_OK")))
Se usará un bloque de código, dentro del bucle “for” para cada algoritmo de regresión probado. Los residuales del entrenamiento se interpolan a través de kriging y luego los residuales de prueba se agregan a los resultados de regresión de prueba para su evaluación.
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(mgcv)
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:raster':
##
## getData
## This is mgcv 1.8-24. For overview type 'help("mgcv-package")'.
library(gstat)
for(i in 1:k){
cat("K-fold...",i,"of",k,"....\n")
# TRAIN indices as integer
idx <- kfolds[[i]]
# TRAIN indices as a boolean vector
idxBool <- (1:nrow(climDataPT)) %in% idx
# Observed test data for the target variable
obs.test <- climDataPT[!idxBool, "NUM_FAT"]
## ----------------------------------------------------------------------------- ##
## Ordinary Kriging ----
## ----------------------------------------------------------------------------- ##
# Make variogram
formMod <- NUM_FAT ~ 1
mod <- vgm(model = "Exp", psill = 3, range = 100, nugget = 0.5)
variog <- variogram(formMod, statPoints[idxBool, ])
# Variogram fitting by Ordinary Least Sqaure
variogFitOLS<-fit.variogram(variog, model = mod, fit.method = 6)
#plot(variog, variogFitOLS, main="OLS Model")
# kriging predictions
OK <- krige(formula = formMod ,
locations = statPoints[idxBool, ],
model = variogFitOLS,
newdata = statPoints[!idxBool, ],
debug.level = 0)
ok.pred.test <- OK@data$var1.pred
evalData[i,"OK"] <- sqrt(mean((ok.pred.test - obs.test)^2))
## ----------------------------------------------------------------------------- ##
## RF calibration ----
## ----------------------------------------------------------------------------- ##
RF <- randomForest(y = climDataPT[idx, "NUM_FAT"],
x = climDataPT[idx, c("POB", "DENS", "ELEV")],
ntree = 500,
mtry = 2)
rf.pred.test <- predict(RF, newdata = climDataPT[-idx,], type="response")
evalData[i,"RF"] <- sqrt(mean((rf.pred.test - obs.test)^2))
# Ordinary Kriging of Random Forest residuals
#
statPointsTMP <- statPoints[idxBool, ]
statPointsTMP@data <- cbind(statPointsTMP@data, residRF = resid.RF(RF))
formMod <- residRF ~ 1
mod <- vgm(model = "Exp", psill = 0.6, range = 10, nugget = 0.01)
variog <- variogram(formMod, statPointsTMP)
# Variogram fitting by Ordinary Least Sqaure
variogFitOLS<-fit.variogram(variog, model = mod, fit.method = 6)
#plot(variog, variogFitOLS, main="OLS Model")
# kriging predictions
RF.OK <- krige(formula = formMod ,
locations = statPointsTMP,
model = variogFitOLS,
newdata = statPoints[!idxBool, ],
debug.level = 0)
rf.ok.pred.test <- rf.pred.test + RF.OK@data$var1.pred
evalData[i,"RF_OK"] <- sqrt(mean((rf.ok.pred.test - obs.test)^2))
## ----------------------------------------------------------------------------- ##
## GLM calibration ----
## ----------------------------------------------------------------------------- ##
GLM <- glm(formula = NUM_FAT ~ POB + DENS + ELEV, data = climDataPT[idx, ])
glm.pred.test <- predict(GLM, newdata = climDataPT[-idx,], type="response")
evalData[i,"GLM"] <- sqrt(mean((glm.pred.test - obs.test)^2))
# Ordinary Kriging of GLM residuals
#
statPointsTMP <- statPoints[idxBool, ]
statPointsTMP@data <- cbind(statPointsTMP@data, residGLM = resid(GLM))
formMod <- residGLM ~ 1
mod <- vgm(model = "Exp", psill = 0.4, range = 10, nugget = 0.01)
variog <- variogram(formMod, statPointsTMP)
# Variogram fitting by Ordinary Least Sqaure
variogFitOLS<-fit.variogram(variog, model = mod, fit.method = 6)
#plot(variog, variogFitOLS, main="OLS Model")
# kriging predictions
GLM.OK <- krige(formula = formMod ,
locations = statPointsTMP,
model = variogFitOLS,
newdata = statPoints[!idxBool, ],
debug.level = 0)
glm.ok.pred.test <- glm.pred.test + GLM.OK@data$var1.pred
evalData[i,"GLM_OK"] <- sqrt(mean((glm.ok.pred.test - obs.test)^2))
## ----------------------------------------------------------------------------- ##
## GAM calibration ----
## ----------------------------------------------------------------------------- ##
GAM <- gam(formula = NUM_FAT ~ s(POB) + s(DENS) + s(ELEV), data = climDataPT[idx, ])
gam.pred.test <- predict(GAM, newdata = climDataPT[-idx,], type="response")
evalData[i,"GAM"] <- sqrt(mean((gam.pred.test - obs.test)^2))
# Ordinary Kriging of GAM residuals
#
statPointsTMP <- statPoints[idxBool, ]
statPointsTMP@data <- cbind(statPointsTMP@data, residGAM = resid(GAM))
formMod <- residGAM ~ 1
mod <- vgm(model = "Exp", psill = 0.3, range = 10, nugget = 0.01)
variog <- variogram(formMod, statPointsTMP)
# Variogram fitting by Ordinary Least Sqaure
variogFitOLS<-fit.variogram(variog, model = mod, fit.method = 6)
#plot(variog, variogFitOLS, main="OLS Model")
# kriging predictions
GAM.OK <- krige(formula = formMod ,
locations = statPointsTMP,
model = variogFitOLS,
newdata = statPoints[!idxBool, ],
debug.level = 0)
gam.ok.pred.test <- gam.pred.test + GAM.OK@data$var1.pred
evalData[i,"GAM_OK"] <- sqrt(mean((gam.ok.pred.test - obs.test)^2))
}
## K-fold... 1 of 10 ....
## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
## K-fold... 2 of 10 ....
## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
## K-fold... 3 of 10 ....
## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
## K-fold... 4 of 10 ....
## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
## K-fold... 5 of 10 ....
## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
## K-fold... 6 of 10 ....
## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
## K-fold... 7 of 10 ....
## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
## K-fold... 8 of 10 ....
## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
## K-fold... 9 of 10 ....
## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
## Warning in fit.variogram(variog, model = mod, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
## K-fold... 10 of 10 ....
## K-fold... 1 of 10 ....
## K-fold... 2 of 10 ....
## K-fold... 3 of 10 ....
## K-fold... 4 of 10 ....
## K-fold... 5 of 10 ....
## K-fold... 6 of 10 ....
## K-fold... 7 of 10 ....
## K-fold... 8 of 10 ....
## K-fold... 9 of 10 ....
## K-fold... 10 of 10 ....
Ahora se pueden ver los valores de desviación estándar para cada iteración.
round(apply(evalData,2,FUN = function(x,...) c(mean(x,...),sd(x,...))),3)
## OK RF GLM GAM RF_OK GLM_OK GAM_OK
## [1,] 1.674 1.669 1.761 1.768 1.61 1.688 1.705
## [2,] 0.582 0.532 0.732 0.768 0.50 0.694 0.740
## OK RF GLM GAM RF_OK GLM_OK GAM_OK
## [1,] 1.674 1.669 1.761 1.768 1.61 1.688 1.705
## [2,] 0.582 0.532 0.732 0.768 0.50 0.694 0.740
Los resultados de esta tabla muestran similitud entre los métodos utilizados aunque el método basado en Random forest tiene un mejor desempeño con una desviación estándar de 1.61.
Hecho esto, se procede a predecir los valores de la capa de número de muertes para todo el territorio de Colombia.
# TRAIN indices as integer
idx <- kfolds[[i]]
# TRAIN indices as a boolean vector
idxBool <- (1:nrow(climDataPT)) %in% idx
# Observed test data for the target variable
obs.test <- climDataPT[!idxBool, "NUM_FAT"]
rstPixDF <- as(rst[[1]], "SpatialPixelsDataFrame")
#rstPixDF
RF <- randomForest(y = climDataPT[idx, "NUM_FAT"],
x = climDataPT[idx, c("POB", "DENS", "ELEV")],
ntree = 500,
mtry = 2)
rstPredRF <- predict(rst, RF, type="response")
A continuación se muestran los variogramas de los dos mejores métodos.
# Ordinary Kriging of Random Forest residuals
#
statPointsTMPRF <- statPoints[idxBool, ]
crs(statPointsTMPRF) <- crs(rstPixDF)
statPointsTMPRF@data <- cbind(statPointsTMPRF@data, residRF = resid.RF(RF))
formModRF <- residRF ~ 1
modRF <- vgm(model = "Exp", psill = 0.6, range = 10, nugget = 0.01)
variogRF <- variogram(formModRF, statPointsTMPRF)
# Variogram fitting by Ordinary Least Sqaure
variogFitOLSRF<-fit.variogram(variogRF, model = modRF, fit.method = 6)
plot(variogRF, variogFitOLSRF, main="RF Model")
GLM <- glm(formula = NUM_FAT ~ POB + DENS + ELEV, data = climDataPT[idx, ])
rstPredGLM <- predict(rst, GLM, type="response")
statPointsTMPGLM <- statPoints[idxBool, ]
crs(statPointsTMPGLM) <- crs(rstPixDF)
statPointsTMPGLM@data <- cbind(statPointsTMPGLM@data, residGLM = resid(GLM))
formModGLM <- residGLM ~ 1
modGLM <- vgm(model = "Exp", psill = 0.4, range = 10, nugget = 0.01)
variogGLM <- variogram(formModGLM, statPointsTMPGLM)
# Variogram fitting by Ordinary Least Sqaure
variogFitOLSGLM<-fit.variogram(variogGLM, model = modGLM, fit.method = 6)
plot(variogGLM, variogFitOLSGLM, main="Semi-variogram of GLM residuals")
GAM <- gam(formula = NUM_FAT ~ s(POB) + s(DENS) + s(ELEV), data = climDataPT)
rstPredGAM <- predict(rst, GAM, type="response")
# Ordinary Kriging of Random Forest residuals
#
# Create a temporary SpatialPointsDF object to store GAM residuals
statPointsTMPGAM <- statPoints
crs(statPointsTMPGAM) <- crs(rstPixDF)
statPointsTMPGAM@data <- cbind(statPointsTMPGAM@data, residGAM = resid(GAM))
# Define the kriging parameters and fit the variogram using OLS
formModGAM <- residGAM ~ 1
modGAM <- vgm(model = "Exp", psill = 0.15, range = 10, nugget = 0.01)
variogGAM <- variogram(formModGAM, statPointsTMPGAM)
variogFitOLSGAM <- fit.variogram(variogGAM, model = modGAM, fit.method = 6)
## Warning in fit.variogram(variogGAM, model = modGAM, fit.method = 6): No
## convergence after 200 iterations: try different initial values?
# Plot the results
plot(variogGAM, variogFitOLSGAM, main="Semi-variogram of GAM residuals")
Finalmente se muestran los resultados de 3 métodos utilizados.
residKrigMapRF <- krige(formula = formModRF,
locations = statPointsTMPRF,
model = variogFitOLSRF,
newdata = rstPixDF)
## [using ordinary kriging]
residKrigRstLayerRF <- as(residKrigMapRF, "RasterLayer")
gamKrigMapRF <- rstPredRF + residKrigRstLayerRF
plot(gamKrigMapRF, main="Muertes por rayos 2012-2014\n(RF regression-kriging RF)",
xlab="Longitude", ylab="Latitude", cex.main=0.8, cex.axis=0.7, cex=0.8)
residKrigMapGLM <- krige(formula = formModGLM,
locations = statPointsTMPGLM,
model = variogFitOLSGLM,
newdata = rstPixDF)
## [using ordinary kriging]
residKrigRstLayerGLM <- as(residKrigMapGLM, "RasterLayer")
gamKrigMapGLM <- rstPredGLM + residKrigRstLayerGLM
plot(gamKrigMapGLM, main="Muertes por rayos 2012-2014\n(RF regression-kriging GLM)",
xlab="Longitude", ylab="Latitude", cex.main=0.8, cex.axis=0.7, cex=0.8)
residKrigMap <- krige(formula = formModGAM,
locations = statPointsTMPGAM,
model = variogFitOLSGAM,
newdata = rstPixDF)
## [using ordinary kriging]
residKrigRstLayer <- as(residKrigMap, "RasterLayer")
gamKrigMap <- rstPredGAM + residKrigRstLayer
plot(gamKrigMap, main="Muertes por rayos 2012-2014\n(RF regression-kriging)",
xlab="Longitude", ylab="Latitude", cex.main=0.8, cex.axis=0.7, cex=0.8)
De los resultados se puede concluir que el método que mejor se ajusta es RF_Ok.