1. Preprocessing samples

1.1. Introduction

Libraries:

suppressPackageStartupMessages({
library(dplyr)
library(terra)
library(raster)
library(sf)
library(ggplot2)
})

## Warning: package 'sf' was built under R version 4.3.3

## Warning: package 'ggplot2' was built under R version 4.3.3

Preparing pathways:

root <- "C:/Users/hudso/OneDrive/Documents/02. WUR/10. THESIS"
rproject_path <- file.path(root, '06 r_project')
raw_samples_path <- file.path(root, '04 data/01 raw/samples_vRAW')
proc_samples_path <- file.path(root, '04 data/03 processed_samples')
boun_path <- file.path(root, '04 data/02 processed_rasters/boundaries')

Loading raw samples:

nbis_raw <- read.csv(file.path(raw_samples_path, "nbis_vRAW.csv"), header = T)

Subsetting the essential information and renaming columns:

nbis <- nbis_raw[, c("latitude", "longitude", "year", "season", "land_use", 
                     "Structure.Index..MEAN.", "Enrichment.Index..MEAN.", "Channel.Index..MEAN.")]

names(nbis) <- c("lat_4326", "long_4326", "year", "season", "landuse", "si", "ei", "ci")

head(nbis)

##   lat_4326 long_4326 year season           landuse    si    ei    ci
## 1 46.25719  7.022394 2016 Summer natural grassland 72.73 43.24 37.50
## 2 46.26847  7.032504 2016 Summer natural grassland 64.49 54.22 55.56
## 3 46.27363  7.048331 2016 Summer natural grassland 88.99 88.79  3.16
## 4 46.28558  7.095852 2016 Summer natural grassland 76.26 50.91 85.71
## 5 46.28509  7.124600 2016 Summer natural grassland 55.47 53.17 58.21
## 6 46.29094  7.112580 2016 Summer natural grassland 39.84 54.52 46.96

1.2. Removing samples in the same location, but too high standard deviation

Calculating the standard deviation of SI in samples with exactly the same latitude:

for (i in 1:nrow(nbis)){
  
  lat <- nbis$lat_4326[i]
  
  df_same_lat <- subset(nbis, nbis$lat_4326==lat)
  
  if (nrow(df_same_lat)>1) {
    nbis$std_si[i] = sd(df_same_lat$si)
    } else {
      nbis$std_si[i] = NA
    }
}
nrow(nbis) # 2936 samples

## [1] 2936

Removing samples with standard deviation too high (arbitrary choice = 11):

nbis <- nbis[nbis$std_si <= 11 | is.na(nbis$std_si), ]
nrow(nbis) # 1742 samples

## [1] 1742

1.3. Aggregating samples

Creating a layer containing rows (band 1) and columns (band 2) to be used to extract this information for each sample point.

nrows <- 15965
ncols <- 13282

Creating a template raster:

r <- raster(nrows=15965, ncols=13282, res=c(250,250), xmn=2635500, xmx=5956000, ymn=1425000, ymx=5416250, crs = 3035) # rows layer
c <- raster(nrows=15965, ncols=13282, res=c(250,250), xmn=2635500, xmx=5956000, ymn=1425000, ymx=5416250, crs = 3035) # columns layer

values(r) <- rep(1:nrows, each=ncols) # giving the values of rows
values(c) <- rep(1:ncols, times=nrows) # giving the values of cols

Stacking both rasters:

raster_stack <- stack(r, c)
names(raster_stack) <- c("rows", "cols")

Loading boundary and sample data:

boundary_sf <- st_read(file.path(boun_path, "geoBoundariesCGAZ_Europe_Dissolved_v04.shp"))

## Reading layer `geoBoundariesCGAZ_Europe_Dissolved_v04' from data source 
##   `C:\Users\hudso\OneDrive\Documents\02. WUR\10. THESIS\04 data\02 processed_rasters\boundaries\geoBoundariesCGAZ_Europe_Dissolved_v04.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 1 feature and 8 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 2635657 ymin: 1425026 xmax: 5955764 ymax: 5416023
## Projected CRS: ETRS89-extended / LAEA Europe

Removing samples with empty latitudes/longitudes:

idx_del <- which(is.na(nbis$lat_4326))
nbis <- nbis[-idx_del,]
nrow(nbis) # 1740 samples

## [1] 1740

Making a copy of coordinates to be transformed into geometry:

nbis$latitude <- nbis$lat_4326
nbis$longitude <- nbis$long_4326

Convert to sf object (EPSG:4326 is the CRS for lat/long WGS84):

nbis_4326 <- st_as_sf(nbis, coords = c("longitude", "latitude"), crs = 4326)

Transform to EPSG:3035 (ETRS89 / LAEA Europe):

nbis_sf <- st_transform(nbis_4326, crs = 3035)

Registering also coordinates in CRS 3035:

nbis_sf$x_3035 <- st_coordinates(nbis_sf)[, 1]  # X coordinates (longitude)
nbis_sf$y_3035 <- st_coordinates(nbis_sf)[, 2]  # Y coordinates (latitude)

Using the function st_intersection to restrict the samples only in the study area:

# Original data (before intersection)
nbis_sf_original <- nbis_sf

# Perform the intersection
nbis_sf <- st_intersection(nbis_sf, boundary_sf)

## Warning: attribute variables are assumed to be spatially constant throughout
## all geometries

# Create the plot
plot_combined <- ggplot() +
  # Plot nbis_sf before intersection in gray
  geom_sf(data = nbis_sf_original, color = "gray50", alpha = 0.6) +
  # Plot nbis_sf after intersection in red
  geom_sf(data = nbis_sf, color = "red", alpha = 0.6) +
  # Plot the boundary
  geom_sf(data = boundary_sf, fill = NA, color = "gray80", size = 0.8) +
  labs(title = "NBIS Before (Gray) and After (Red) Intersection",
       fill = "Legend") +
  theme_minimal()

# Display the plot
print(plot_combined)

nrow(nbis_sf) # 1656 samples

## [1] 1656

Extracting information of rows and columns from the stacked raster created:

nbis_rows_cols <- extract(raster_stack, nbis_sf)
nbis_df <- st_drop_geometry(nbis_sf)
nbis_df <- nbis_df[,c("lat_4326", "long_4326", "x_3035", "y_3035", "year", "season", "landuse", "si", "ei", "ci")]
nbis_rows_cols <- cbind(nbis_df, nbis_rows_cols)

Creating a pixel_group which is a code for pixels sharing an unique combination of row and col:

df <- nbis_rows_cols %>%
  mutate(pixel_group = as.integer(factor(paste(rows, cols))))

Calculating average for each NBI for samples in the same pixel_group:

df <- df %>%
  group_by(pixel_group) %>%  
  mutate(avg_si = mean(si, na.rm = TRUE)) %>%  
  mutate(avg_ei = mean(ei, na.rm = TRUE)) %>%  
  mutate(avg_ci = mean(ci, na.rm = TRUE)) %>%  
  ungroup()

Calculating sd for each NBI for samples in the same pixel_group:

df <- df %>%
  group_by(pixel_group) %>%  
  mutate(sd_si = sd(si, na.rm = TRUE)) %>%  
  mutate(sd_ei = sd(ei, na.rm = TRUE)) %>%  
  mutate(sd_ci = sd(ci, na.rm = TRUE)) %>%  
  ungroup()

Counting repetitions. Pixel_group seen first (or just once) will be assigned 1

df <- df %>%
  group_by(pixel_group) %>%  
  mutate(count_pxgroup = n()) %>%  # Count the number of rows in each group and store it in 'count_same'
  ungroup()

Selecting pixels not repeated (i.e., count_pxgroup = 1). The new column agg_pixel receives a pixel_group for not repeated (or at least seen first than the others repetitions). The remaining pixel groups receive NA.

df <- df %>%
  group_by(pixel_group) %>%  
  mutate(agg_pixel = ifelse(row_number() == 1, pixel_group, NA)) %>%  
  ungroup() 

df <- data.frame(df)

Removing NA values from agg_pixel (i.e., samples repeated in the same pixel):

df_non_na  <- df[!is.na(df$agg_pixel),]
df_non_na  <- df_non_na[!is.na(df_non_na$avg_si),]

nrow(df_non_na) # 1327 samples

## [1] 1327

Subsetting the data frame for only relevant information for the next steps and renaming them:

final_df <- df_non_na[,c("lat_4326", "long_4326", "x_3035", "y_3035", "year", "season", "landuse", "avg_si", "avg_ei", "avg_ci")]

names(final_df) <- c("lat_4326", "long_4326", "x_3035", "y_3035", "year_cat", "season_cat", "landuse_cat", "si", "ei", "ci")

Creating sample_id:

final_df$sample_id <- 1:nrow(final_df)

Moving sample_id to the first column:

final_df <- final_df[,c("sample_id", "lat_4326", "long_4326", "x_3035", "y_3035", "year_cat", "season_cat", "landuse_cat", "si", "ei", "ci")]

#write.csv(final_df, file.path(proc_samples_path, "01_nbis_allrows_basic_cols_v01.csv"), row.names=FALSE)

1. Preprocessing samples

Hudson Passos

2024-12-29

1.1. Introduction

1.2. Removing samples in the same location, but too high standard deviation

1.3. Aggregating samples