There are two objectives for this function, first to reduce the memory usage (RAM), when transforming a rasterStack into a dataframe, on the other hand we want not to reduce the speed or if possibly to increase the speed of the function.
First lets load the packages we will need:
# For spaital manipulation
library(raster)
# For benchmarking speed and memory
library(bench)
# To parallelize operations
library(doParallel)
# To parallelize operations
library(foreach)
# For combination and looping
library(purrr)
# Data wranggling
library(dplyr)
library(data.table)
The main way to reduce the RAM usage is instead of processing one big raster is to transform it into tiled rasters, for that I developed the following function that you can include in you code:
SplitRas <- function(Raster, ppside, nclus = 1) {
TempRast <- paste0(getwd(), "/Temp")
h <- ceiling(ncol(Raster)/ppside)
v <- ceiling(nrow(Raster)/ppside)
agg <- aggregate(Raster, fact = c(h, v))
agg[] <- 1:ncell(agg)
agg_poly <- rasterToPolygons(agg)
names(agg_poly) <- "polis"
r_list <- list()
if (nclus == 1) {
for (i in 1:ncell(agg)) {
dir.create(TempRast)
rasterOptions(tmpdir = TempRast)
e1 <- extent(agg_poly[agg_poly$polis == i, ])
r_list[[i]] <- crop(Raster, e1)
if ((freq(r_list[[i]], value = NA)/ncell(r_list[[i]])) !=
1) {
writeRaster(r_list[[i]], filename = paste("SplitRas",
i, sep = ""), format = "GTiff", datatype = "FLT4S",
overwrite = TRUE)
}
unlink(TempRast, recursive = T, force = T)
}
} else if (nclus != 1) {
cl <- parallel::makeCluster(nclus)
doParallel::registerDoParallel(cl)
r_list <- foreach(i = 1:ncell(agg), .packages = c("raster")) %dopar%
{
dir.create(TempRast)
rasterOptions(tmpdir = TempRast)
e1 <- extent(agg_poly[agg_poly$polis == i, ])
Temp <- crop(Raster, e1)
if ((raster::freq(Temp, value = NA)/ncell(Temp)) !=
1) {
writeRaster(Temp, filename = paste("SplitRas",
i, sep = ""), format = "GTiff", datatype = "FLT4S",
overwrite = TRUE)
}
unlink(TempRast, recursive = T, force = T)
Temp
}
parallel::stopCluster(cl)
}
}
This function has 3 arguments:
At the end of this function you will have ppside*ppside
number of tiles, saved in your working directory all called SplitRasN.tif
where N is the number of the tile. Just as an example we will use the bioclimatic variables available in the raster package:
Bios <- getData("worldclim", var = "bio", res = 10)
Now I will just plot mean temperature as seen in figure 1.1, but we can see that we can divide this in diferent number of tiles as shown in figure 1.2, this will generate different performances in speed and or ram usage. As we will see afterwards
Figure 1.1: Mean temperature
Figure 1.2: Mean temerature shown with different number of tiles in red
so first we will use SplitRas
function to get the 16 tiles using 4 cores:
SplitRas(Raster = Bios, ppside = 4, nclus = 4)
This will return the following files: SplitRas1.tif, SplitRas10.tif, SplitRas11.tif, SplitRas12.tif, SplitRas13.tif, SplitRas14.tif, SplitRas15.tif, SplitRas16.tif, SplitRas2.tif, SplitRas3.tif, SplitRas4.tif, SplitRas5.tif, SplitRas6.tif, SplitRas7.tif, SplitRas8.tif, SplitRas9.tif
In order to get this tiles into one dataframe with all the non-NA cells we need a list of the tiles, which we get with:
Files <- list.files(pattern = "SplitRas", full.names = T)
Which we can use then in the following function:
SplitsToDataFrame <- function(Splits, ncores = 1) {
TempRast <- paste0(getwd(), "/Temp")
if (ncores == 1) {
Temps <- list()
for (i in 1:length(Splits)) {
dir.create(TempRast)
rasterOptions(tmpdir = TempRast)
Temp <- stack(Splits[i])
Temp <- as.data.frame(Temp, row.names = NULL, col.names = NULL,
xy = TRUE)
colnames(Temp)[3:ncol(Temp)] <- paste0("Var", 1:length(3:ncol(Temp)))
Temps[[i]] <- Temp[complete.cases(Temp), ]
gc()
unlink(TempRast, recursive = T, force = T)
message(i)
}
Temps <- data.table::rbindlist(Temps)
} else if (ncores > 1) {
cl <- parallel::makeCluster(ncores)
doParallel::registerDoParallel(cl)
Temps <- foreach(i = 1:length(Splits), .packages = c("raster",
"data.table")) %dopar% {
dir.create(TempRast)
rasterOptions(tmpdir = TempRast)
Temp <- stack(Splits[i])
Temp <- as.data.frame(Temp, row.names = NULL, col.names = NULL,
xy = TRUE)
colnames(Temp)[3:ncol(Temp)] <- paste0("Var", 1:length(3:ncol(Temp)))
gc()
unlink(TempRast, recursive = T, force = T)
Temp[complete.cases(Temp), ]
}
Temps <- data.table::rbindlist(Temps)
parallel::stopCluster(cl)
}
return(Temps)
}
Where Splits
is a character vectors with the paths to the tiles, and ncores
is the number of cores used for parallelization. This will result in the Data frame with the non empty cells.
DF <- SplitsToDataFrame(Splits = Files, ncores = 4)
The first 20 rows of results can be seen in table 1.1
x | y | Var1 | Var2 | Var3 | Var4 | Var5 | Var6 | Var7 | Var8 | Var9 | Var10 | Var11 | Var12 | Var13 | Var14 | Var15 | Var16 | Var17 | Var18 | Var19 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
-90.75 | 81.92 | -209 | 64 | 13 | 15866 | 62 | -428 | 490 | -16 | -384 | 15 | -384 | 70 | 15 | 2 | 78 | 40 | 6 | 35 | 6 |
-90.58 | 81.92 | -210 | 63 | 12 | 15891 | 62 | -428 | 490 | -16 | -385 | 15 | -385 | 70 | 15 | 2 | 78 | 40 | 6 | 35 | 6 |
-90.42 | 81.92 | -210 | 63 | 12 | 15899 | 61 | -429 | 490 | -17 | -385 | 14 | -385 | 69 | 14 | 2 | 75 | 39 | 6 | 34 | 6 |
-90.25 | 81.92 | -212 | 62 | 12 | 15896 | 59 | -430 | 489 | -19 | -387 | 12 | -387 | 73 | 15 | 2 | 67 | 40 | 6 | 36 | 6 |
-90.08 | 81.92 | -212 | 61 | 12 | 15911 | 59 | -429 | 488 | -19 | -387 | 13 | -387 | 74 | 15 | 2 | 68 | 41 | 6 | 36 | 6 |
-91.75 | 81.75 | -212 | 63 | 13 | 15646 | 55 | -429 | 484 | -21 | -385 | 9 | -385 | 76 | 16 | 2 | 72 | 43 | 6 | 38 | 6 |
-91.58 | 81.75 | -215 | 62 | 12 | 15639 | 52 | -431 | 483 | -24 | -388 | 6 | -388 | 80 | 17 | 2 | 72 | 44 | 6 | 39 | 6 |
-91.42 | 81.75 | -218 | 60 | 12 | 15605 | 48 | -434 | 482 | -28 | -391 | 2 | -391 | 85 | 17 | 3 | 64 | 46 | 9 | 40 | 9 |
-91.25 | 81.75 | -217 | 60 | 12 | 15659 | 50 | -433 | 483 | -26 | -390 | 4 | -390 | 84 | 17 | 3 | 63 | 45 | 9 | 40 | 9 |
-91.08 | 81.75 | -215 | 61 | 12 | 15706 | 52 | -432 | 484 | -24 | -389 | 6 | -389 | 78 | 16 | 2 | 71 | 43 | 6 | 38 | 6 |
-90.92 | 81.75 | -220 | 60 | 12 | 15692 | 47 | -435 | 482 | -29 | -393 | 2 | -393 | 87 | 17 | 3 | 64 | 46 | 9 | 41 | 9 |
-90.75 | 81.75 | -219 | 59 | 12 | 15716 | 48 | -435 | 483 | -28 | -393 | 3 | -393 | 86 | 17 | 3 | 64 | 46 | 9 | 41 | 9 |
-90.58 | 81.75 | -218 | 60 | 12 | 15759 | 50 | -434 | 484 | -27 | -392 | 4 | -392 | 85 | 17 | 3 | 64 | 46 | 9 | 40 | 9 |
-90.42 | 81.75 | -217 | 60 | 12 | 15785 | 52 | -433 | 485 | -25 | -391 | 6 | -391 | 83 | 16 | 3 | 69 | 44 | 9 | 39 | 9 |
-90.25 | 81.75 | -217 | 60 | 12 | 15818 | 52 | -433 | 485 | -25 | -392 | 6 | -392 | 85 | 16 | 3 | 62 | 45 | 9 | 39 | 9 |
-90.08 | 81.75 | -219 | 59 | 12 | 15842 | 50 | -435 | 485 | -26 | -394 | 5 | -394 | 88 | 17 | 3 | 63 | 46 | 9 | 41 | 9 |
-91.92 | 81.58 | -209 | 64 | 13 | 15649 | 59 | -427 | 486 | -18 | -382 | 12 | -382 | 72 | 16 | 2 | 71 | 42 | 6 | 36 | 6 |
-91.75 | 81.58 | -214 | 62 | 12 | 15582 | 52 | -430 | 482 | -24 | -387 | 6 | -387 | 78 | 17 | 2 | 76 | 45 | 6 | 39 | 6 |
-91.58 | 81.58 | -221 | 59 | 12 | 15518 | 44 | -435 | 479 | -31 | -393 | -2 | -393 | 90 | 18 | 3 | 69 | 49 | 9 | 43 | 9 |
-91.42 | 81.58 | -227 | 57 | 11 | 15481 | 37 | -439 | 476 | -38 | -398 | -8 | -398 | 101 | 20 | 3 | 66 | 54 | 10 | 48 | 10 |
If you need to change the number of tiles, before doing so, it is recommended that you do:
file.remove(Files)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE