Libraries:
suppressPackageStartupMessages({
library(dplyr)
library(ggplot2)
library(RColorBrewer)
library(terra)
library(sf)
library(raster)
library(caret)
library(randomForest)
library(quantregForest)
library(viridis)
})
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'sf' was built under R version 4.3.3
## Warning: package 'caret' was built under R version 4.3.3
## Warning: package 'randomForest' was built under R version 4.3.3
## Warning: package 'quantregForest' was built under R version 4.3.3
Preparing pathways:
root <- "C:/Users/hudso/OneDrive/Documents/02. WUR/10. THESIS"
rproject_path <- file.path(root, '06 r_project')
proc_covar_path <- file.path(root, '05 data/02 processed_rasters/covariates')
proc_cat_path <- file.path(root, '05 data/02 processed_rasters/categorical_preparation')
proc_samples_path <- file.path(root, '05 data/03 processed_samples')
rfe_path <- file.path(rproject_path, 'rfe')
feat_imp_path <- file.path(rproject_path, 'feature_importance')
models_path <- file.path(rproject_path, 'models')
setwd(rproject_path)
Loading RFE variables:
rfe_feat <- read.csv(file.path(rfe_path, "60_optimal_features_CI.csv"))
Loading the prepared categorical covariates:
files <- list.files(path = proc_cat_path, full.names = TRUE)
names_vector_1 <- c()
rasters_vector_1 <- c()
path01 <- unlist(strsplit(files[1], split = "-"))[1]
path03 <- unlist(strsplit(files[1], split = "-"))[3]
list_to_load <- names(rfe_feat)
for (i in 1:length(list_to_load)) {
name <- list_to_load[i]
parts <- unlist(strsplit(name, split = "_"))
last_part <- parts[length(parts)]
if (last_part == "cat"){
name_cat <- name
new_folder_path <- file.path(proc_cat_path, name_cat)
new_files <- list.files(path = new_folder_path, full.names = TRUE)
for (j in new_files){
name_parts <- unlist(strsplit(j, split = "/"))
name_last_part <- name_parts[length(name_parts)]
new_name_cat <- unlist(strsplit(name_last_part, split = "\\."))[1]
names_vector_1 <- c(names_vector_1, new_name_cat)
raster_j <- rast(j)
names(raster_j) <- new_name_cat
crs(raster_j) <- "EPSG:3035"
rasters_vector_1 <- c(rasters_vector_1, raster_j)
}
}
}
names(rasters_vector_1) <- names_vector_1
raster_stack_1 <- rast(rasters_vector_1)
Stacking categorical raster with the continuous variables:
files <- list.files(path = proc_covar_path, full.names = TRUE)
names_vector_2 <- c()
rasters_vector_2 <- c()
path01 <- unlist(strsplit(files[1], split = "-"))[1]
path03 <- unlist(strsplit(files[1], split = "-"))[3]
for (i in 1:length(files)) {
name <- unlist(strsplit(files[i], split = "-"))[2]
parts <- unlist(strsplit(name, split = "_"))
last_part <- parts[length(parts)]
if (name %in% list_to_load & last_part != "cat"){
raster_i <- rast(files[i])
names(raster_i) <- name
crs(raster_i) <- "EPSG:3035"
rasters_vector_2 <- c(rasters_vector_2, raster_i)
}
}
names_vector <- c(rasters_vector_1, rasters_vector_2)
raster_stack <- rast(names_vector)
Loading samples:
nbis <- read.csv(file.path(proc_samples_path, "01_nbis_allrows_basic_cols_v01.csv"), header = T)
nbis$x_coord <- nbis$x_3035
nbis$y_coord <- nbis$y_3035
nbis_sf<- st_as_sf(nbis, coords = c("x_coord", "y_coord"), crs = 3035)
new_nbis_sf <- nbis_sf["ci"]
Extracting covariates (continuous and one hot encoded categorical ones):
nbi_covariates <- extract(raster_stack, new_nbis_sf)
nbi_covariates <- subset(nbi_covariates, select = -c(ID))
nbi_covariates <- cbind(nbis["ci"], nbi_covariates)
nrow(nbi_covariates)
## [1] 1327
nbi_covariates <- na.omit(nbi_covariates)
nrow(nbi_covariates)
## [1] 997
Verifying the histogram:
hist(nbi_covariates$ci, breaks=100)
Removing part of the values 100:
idx_110 <- sample(which(nbi_covariates$ci==100), 110)
predictors <- nbi_covariates[-idx_110, setdiff(names(nbi_covariates), c("ci"))]
response <- nbi_covariates[-idx_110,"ci"]
hist(response, breaks=100)
Training the model with dummy variables:
set.seed(12345)
rf_CI_alldata_onehotenc_cat <- randomForest(x = predictors,
y = response,
importance = TRUE,
mtry = 50,
nodesize = 5,
maxnodes = 95,
ntree = 300)
Selecting the same optimal covariates as used in the model above:
raster_stack_opt <- subset(raster_stack, names(predictors))
Map prediction (it may take hours):
#RF_pred_CI <- predict(object = raster_stack_opt,
# model = rf_CI_alldata_onehotenc_cat,
# na.rm = TRUE)
Plot:
RF_pred_CI <- rast("C:/Users/hudso/OneDrive/Documents/02. WUR/10. THESIS/05 data/05 generated/CI/RF_mean_CI_v02.tif")
plot(
RF_pred_CI,
col = viridis(100, option = "inferno"),
main = "RF Prediction Channel Index (CI)",
xlab = "Longitude",
ylab = "Latitude"
)
Train the Quantile Random Forest model:
QRF_model_CI <- quantregForest(x = predictors,
y = response,
mtry = 50,
nodesize = 5,
maxnodes = 95,
ntree = 300)
Quantiles predictions:
## Define the total number of rows and chunks
#total_rows <- nrow(raster_stack_opt)
#num_chunks <- 1000
#rows_per_chunk <- ceiling(total_rows / num_chunks)
#
## Get the resolution of the raster (to calculate chunk extents)
#res_y <- res(raster_stack_opt)[2] # Vertical resolution
#
## Initialize lists to store results
#pred_95_list <- list()
#pred_05_list <- list()
#
## Initialize a progress bar
#pb <- txtProgressBar(min = 0, max = num_chunks, style = 3)
#
# Loop through chunks
#for (chunk_index in seq_len(num_chunks)) {
# # Calculate the extent for the current chunk
# start_row <- (chunk_index - 1) * rows_per_chunk + 1
# ymin <- ymin(raster_stack_opt) + (start_row - 1) * res_y
# ymax <- min(ymin + rows_per_chunk * res_y, ymax(raster_stack_opt))
# chunk_extent <- ext(xmin(raster_stack_opt), xmax(raster_stack_opt), ymin, ymax)
#
# # Crop the raster to the current chunk
# raster_chunk <- crop(raster_stack_opt, chunk_extent)
#
# # Skip if raster_chunk is empty
# if (is.null(raster_chunk) || all(is.na(values(raster_chunk)))) {
# next
# }
#
# # Convert the raster chunk to a data frame
# chunk_df <- as.data.frame(raster_chunk, xy = TRUE, na.rm = TRUE)
#
# # Skip if the data frame is empty
# if (nrow(chunk_df) == 0) {
# next
# }
#
# # Separate coordinates and predictor values
# coords <- chunk_df[, c("x", "y")]
# predictors <- chunk_df[, -c(1, 2)]
#
# # Ensure predictors are non-empty
# if (ncol(predictors) == 0) {
# next
# }
#
# # Predict quantiles using QRF
# pred_95 <- predict(QRF_model_CI, predictors, what = 0.95)
# pred_05 <- predict(QRF_model_CI, predictors, what = 0.05)
#
# # Store predictions with coordinates
# pred_95_list[[chunk_index]] <- cbind(coords, pred_95)
# pred_05_list[[chunk_index]] <- cbind(coords, pred_05)
#
# # Update the progress bar
# setTxtProgressBar(pb, chunk_index)
#}
#
#
## Close the progress bar
#close(pb)
Saving quantiles 0.05 and 0.95. Calculating PI90:
#pred_95_df <- do.call(rbind, pred_95_list)
#raster_pred_95 <- rast(pred_95_df, type = "xyz", crs = "EPSG:3035")
#writeRaster(raster_pred_95, "QRF_pred_quantile_95_ci_v02.tif", overwrite = TRUE)
#
#pred_05_df <- do.call(rbind, pred_05_list)
#raster_pred_05 <- rast(pred_05_df, type = "xyz", crs = "EPSG:3035")
#writeRaster(raster_pred_05, "QRF_pred_quantile_05_ci_v02.tif", overwrite = TRUE)
#
## Combine the two rasters into a multi-band raster
#pred_raster <- c(raster_pred_95, raster_pred_05)
#names(pred_raster) <- c("pred_95", "pred_05")
#
#pred_raster$pi90 <- pred_raster$pred_95 - pred_raster$pred_05
#writeRaster(pred_raster$pi90, "QRF_pred_interval_90_ci_v02.tif", overwrite = TRUE)
pred_raster_pi90 <- rast("C:/Users/hudso/OneDrive/Documents/02. WUR/10. THESIS/05 data/05 generated/CI/QRF_pred_interval_90_CI_v02.tif")
plot(
pred_raster_pi90,
col = viridis(100, option = "inferno"),
main = "QRF PI90 (CI)",
xlab = "Longitude",
ylab = "Latitude"
)