1.1. INTRODUCTION


Libraries:

suppressPackageStartupMessages({
library(dplyr)
library(ggplot2)
library(RColorBrewer)
library(terra)
library(sf)
library(raster)
library(caret)
library(randomForest)
library(quantregForest)
library(viridis)
})
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'sf' was built under R version 4.3.3
## Warning: package 'caret' was built under R version 4.3.3
## Warning: package 'randomForest' was built under R version 4.3.3
## Warning: package 'quantregForest' was built under R version 4.3.3

Preparing pathways:

root <- "C:/Users/hudso/OneDrive/Documents/02. WUR/10. THESIS"
rproject_path <- file.path(root, '06 r_project')
proc_covar_path <- file.path(root, '05 data/02 processed_rasters/covariates')
proc_cat_path <- file.path(root, '05 data/02 processed_rasters/categorical_preparation')
proc_samples_path <- file.path(root, '05 data/03 processed_samples')
rfe_path <- file.path(rproject_path, 'rfe')
feat_imp_path <- file.path(rproject_path, 'feature_importance')
models_path <- file.path(rproject_path, 'models')
setwd(rproject_path)

Loading RFE variables:

rfe_feat <- read.csv(file.path(rfe_path, "60_optimal_features_CI.csv"))

Loading the prepared categorical covariates:

files <- list.files(path = proc_cat_path, full.names = TRUE)
names_vector_1 <- c()
rasters_vector_1 <- c()

path01 <- unlist(strsplit(files[1], split = "-"))[1]
path03 <- unlist(strsplit(files[1], split = "-"))[3]

list_to_load <- names(rfe_feat)

for (i in 1:length(list_to_load)) {
  
  name <- list_to_load[i]
  
  parts <- unlist(strsplit(name, split = "_"))
  last_part <- parts[length(parts)]
  
  if (last_part == "cat"){
    
    name_cat <- name
    
    new_folder_path <- file.path(proc_cat_path, name_cat)
    
    new_files <- list.files(path = new_folder_path, full.names = TRUE)
    
    for (j in new_files){
      
      name_parts <- unlist(strsplit(j, split = "/"))
      name_last_part <- name_parts[length(name_parts)]
      new_name_cat <- unlist(strsplit(name_last_part, split = "\\."))[1]
      
      names_vector_1 <- c(names_vector_1, new_name_cat)
      
      raster_j <- rast(j)

      names(raster_j) <- new_name_cat
      crs(raster_j) <- "EPSG:3035"
      rasters_vector_1 <- c(rasters_vector_1, raster_j)
      
    }
  }
}

names(rasters_vector_1) <- names_vector_1

raster_stack_1 <- rast(rasters_vector_1)

Stacking categorical raster with the continuous variables:

files <- list.files(path = proc_covar_path, full.names = TRUE)
names_vector_2 <- c()
rasters_vector_2 <- c()

path01 <- unlist(strsplit(files[1], split = "-"))[1]
path03 <- unlist(strsplit(files[1], split = "-"))[3]


for (i in 1:length(files)) {
  name <- unlist(strsplit(files[i], split = "-"))[2]
  
  parts <- unlist(strsplit(name, split = "_"))
  last_part <- parts[length(parts)]
  
  if (name %in% list_to_load & last_part != "cat"){ 
    
    raster_i <- rast(files[i])
    names(raster_i) <- name
    crs(raster_i) <- "EPSG:3035"
    rasters_vector_2 <- c(rasters_vector_2, raster_i)
  }
}

names_vector <- c(rasters_vector_1, rasters_vector_2)

raster_stack <- rast(names_vector)


1.2. RETRAINING THE MODEL WITH THE DUMMY VARIABLES


Loading samples:

nbis <- read.csv(file.path(proc_samples_path, "01_nbis_allrows_basic_cols_v01.csv"), header = T)

nbis$x_coord <- nbis$x_3035
nbis$y_coord <- nbis$y_3035

nbis_sf<- st_as_sf(nbis, coords = c("x_coord", "y_coord"), crs = 3035)
new_nbis_sf <- nbis_sf["ci"]

Extracting covariates (continuous and one hot encoded categorical ones):

nbi_covariates <- extract(raster_stack, new_nbis_sf)
nbi_covariates <- subset(nbi_covariates, select = -c(ID))
nbi_covariates <- cbind(nbis["ci"], nbi_covariates)

nrow(nbi_covariates)
## [1] 1327
nbi_covariates <- na.omit(nbi_covariates)
nrow(nbi_covariates)
## [1] 997

Verifying the histogram:

hist(nbi_covariates$ci, breaks=100)

Removing part of the values 100:

idx_110 <- sample(which(nbi_covariates$ci==100), 110)

predictors <- nbi_covariates[-idx_110, setdiff(names(nbi_covariates), c("ci"))]

response <- nbi_covariates[-idx_110,"ci"]

hist(response, breaks=100)


1.3. RF MODEL - MEAN PREDICTION


Training the model with dummy variables:

set.seed(12345)
rf_CI_alldata_onehotenc_cat <- randomForest(x = predictors, 
                        y = response, 
                        importance = TRUE,
                        mtry = 50, 
                        nodesize = 5, 
                        maxnodes = 95, 
                        ntree = 300)

Selecting the same optimal covariates as used in the model above:

raster_stack_opt <- subset(raster_stack, names(predictors))

Map prediction (it may take hours):

#RF_pred_CI <- predict(object = raster_stack_opt, 
#                        model  = rf_CI_alldata_onehotenc_cat,
#                        na.rm = TRUE)

Plot:

RF_pred_CI <- rast("C:/Users/hudso/OneDrive/Documents/02. WUR/10. THESIS/05 data/05 generated/CI/RF_mean_CI_v02.tif")

plot(
  RF_pred_CI,
  col = viridis(100, option = "inferno"),
  main = "RF Prediction Channel Index (CI)",
  xlab = "Longitude",
  ylab = "Latitude"
)

1.4. UNCERTAINTY QUANTIFICATION (QRF)

Train the Quantile Random Forest model:

QRF_model_CI <- quantregForest(x = predictors, 
                            y = response, 
                            mtry = 50, 
                            nodesize = 5, 
                            maxnodes = 95, 
                            ntree = 300)

Quantiles predictions:

## Define the total number of rows and chunks
#total_rows <- nrow(raster_stack_opt)
#num_chunks <- 1000
#rows_per_chunk <- ceiling(total_rows / num_chunks)
#
## Get the resolution of the raster (to calculate chunk extents)
#res_y <- res(raster_stack_opt)[2]  # Vertical resolution
#
## Initialize lists to store results
#pred_95_list <- list()
#pred_05_list <- list()
#
## Initialize a progress bar
#pb <- txtProgressBar(min = 0, max = num_chunks, style = 3)
#
# Loop through chunks
#for (chunk_index in seq_len(num_chunks)) {
#  # Calculate the extent for the current chunk
#  start_row <- (chunk_index - 1) * rows_per_chunk + 1
#  ymin <- ymin(raster_stack_opt) + (start_row - 1) * res_y
#  ymax <- min(ymin + rows_per_chunk * res_y, ymax(raster_stack_opt))
#  chunk_extent <- ext(xmin(raster_stack_opt), xmax(raster_stack_opt), ymin, ymax)
#  
#  # Crop the raster to the current chunk
#  raster_chunk <- crop(raster_stack_opt, chunk_extent)
#  
#  # Skip if raster_chunk is empty
#  if (is.null(raster_chunk) || all(is.na(values(raster_chunk)))) {
#    next
#  }
#  
#  # Convert the raster chunk to a data frame
#  chunk_df <- as.data.frame(raster_chunk, xy = TRUE, na.rm = TRUE)
#  
#  # Skip if the data frame is empty
#  if (nrow(chunk_df) == 0) {
#    next
#  }
#  
#  # Separate coordinates and predictor values
#  coords <- chunk_df[, c("x", "y")]
#  predictors <- chunk_df[, -c(1, 2)]
#  
#  # Ensure predictors are non-empty
#  if (ncol(predictors) == 0) {
#    next
#  }
#  
#  # Predict quantiles using QRF
#  pred_95 <- predict(QRF_model_CI, predictors, what = 0.95)  
#  pred_05 <- predict(QRF_model_CI, predictors, what = 0.05)
#  
#  # Store predictions with coordinates
#  pred_95_list[[chunk_index]] <- cbind(coords, pred_95)
#  pred_05_list[[chunk_index]] <- cbind(coords, pred_05)
#  
#  # Update the progress bar
#  setTxtProgressBar(pb, chunk_index)
#}
#
#
## Close the progress bar
#close(pb)

Saving quantiles 0.05 and 0.95. Calculating PI90:

#pred_95_df <- do.call(rbind, pred_95_list)
#raster_pred_95 <- rast(pred_95_df, type = "xyz", crs = "EPSG:3035")
#writeRaster(raster_pred_95, "QRF_pred_quantile_95_ci_v02.tif", overwrite = TRUE)
#
#pred_05_df <- do.call(rbind, pred_05_list)
#raster_pred_05 <- rast(pred_05_df, type = "xyz", crs = "EPSG:3035")
#writeRaster(raster_pred_05, "QRF_pred_quantile_05_ci_v02.tif", overwrite = TRUE)
#
## Combine the two rasters into a multi-band raster
#pred_raster <- c(raster_pred_95, raster_pred_05)
#names(pred_raster) <- c("pred_95", "pred_05")
#
#pred_raster$pi90 <- pred_raster$pred_95 - pred_raster$pred_05
#writeRaster(pred_raster$pi90, "QRF_pred_interval_90_ci_v02.tif", overwrite = TRUE)
pred_raster_pi90 <- rast("C:/Users/hudso/OneDrive/Documents/02. WUR/10. THESIS/05 data/05 generated/CI/QRF_pred_interval_90_CI_v02.tif")

plot(
  pred_raster_pi90,
  col = viridis(100, option = "inferno"),
  main = "QRF PI90 (CI)",
  xlab = "Longitude",
  ylab = "Latitude"
)