Preprocessing RShiny

Author

Group6 - Claude Widmer

Preprocessing For RShiny App

Before integrating the data into the RShiny application, we perform preprocessing after the initial exploratory data analysis (EDA). To enhance performance, all necessary calculations and data transformations are conducted at this stage. The processed data is then exported as .geojson or .csv files, ready for seamless import into the RShiny app.

library(dplyr)
library(sf)
library(tmap)
library(tidyr)
library(dbscan)
library(sf)


# Row identifiers (column X of the CSV) that are excluded from the analysis
remove_ids <- c(48325, 15381, 15382, 15383, 45363, 7949)

# Load the raw fixes and turn them into a cleaned, season-tagged point layer:
#   1. drop rows without coordinates or timestamp
#   2. drop the excluded rows
#   3. parse the timestamp as UTC
#   4. convert to sf points (WGS84) and order each bird's fixes chronologically
#   5. derive the season from the month
#   6. keep only the columns that are exported
bird_data <- read.csv("00_data/NuCra_Davos_all_data_2025-02-07_V2.csv") %>%
  drop_na(longitude, latitude, datetime) %>%
  filter(!X %in% remove_ids) %>%
  mutate(
    datetime = as.POSIXct(datetime, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
  ) %>%
  st_as_sf(coords = c("longitude", "latitude"), crs = 4326) %>%
  arrange(id, datetime) %>%
  mutate(
    season = case_when(
      between(month, 2, 6) ~ "Breeding Time",
      between(month, 7, 10) ~ "Harvesting Time",
      month %in% c(11, 12, 1) ~ "Winter",
      TRUE ~ NA_character_
    )
  ) %>%
  select(
    datetime, timediff, altitude, id, steplength,
    weight, wing_length, bill_depth, bill_length, tarsus_length,
    stage.at.capture, geometry, month, season
  )

# Create one line geometry per bird and month from the ordered points
tracks_by_bird_month <- group_by(bird_data, id, month)
bird_tracks <- summarise(
  tracks_by_bird_month,
  geometry = st_cast(st_combine(geometry), "LINESTRING"),
  .groups = "drop"
)

# Count the distinct bird ids that have at least one track
num_groups <- length(unique(bird_tracks$id))
print(paste("Number of unique bird groups:", num_groups))

[1] "Number of unique bird groups: 115"

# Save processed data: export the track lines as GeoJSON.
# delete_dsn = TRUE removes an existing file at this path before writing.
st_write(bird_tracks, "02_preprocessing_export/bird_tracks.geojson", delete_dsn = TRUE)

Deleting source `02_preprocessing_export/bird_tracks.geojson' using driver `GeoJSON'
Writing layer `bird_tracks' to data source 
  `02_preprocessing_export/bird_tracks.geojson' using driver `GeoJSON'
Writing 385 features with 2 fields and geometry type Line String.

Preprocess Cluster Analysis (DBScan)

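For each bird, we apply DBScan clustering with the minimum points (minPts) parameter set to 15, once over all of the bird's points and once per season. The eps parameter is calculated for each run as the mean of the k-nearest-neighbour (kNN) distances with k = minPts / 2, so the neighbourhood radius adapts to the density of the data being clustered. Birds (or bird-season subsets) with fewer than minPts points are left unclustered.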

# DBSCAN clustering per bird (all seasons) and per bird and season.
#
# Label encoding written to cluster_all / cluster_season:
#   NA      = subset had fewer than minPts points, no clustering was run
#   1       = DBSCAN noise
#   2, 3, … = clusters (DBSCAN label + 1)
# The previous code assigned as.factor(db$cluster) into a non-factor column,
# which stores the factor's level codes. Those equal "label + 1" whenever noise
# is present, but collapse to the raw labels when a run has no noise, so code 1
# could mean either noise or a real cluster. Adding 1 explicitly keeps the
# exported values identical in the common case and makes 1 always mean noise.

# Initialize cluster columns
bird_data$cluster_season <- NA_integer_
bird_data$cluster_all <- NA_integer_

# Set parameters
minPts <- 15
# k for the kNN distance used to derive eps.
# NOTE(review): the original passed minPts / 2 = 7.5; this is presumably
# truncated to 7 inside dbscan::kNNdist — confirm. Made explicit here.
knn_k <- as.integer(minPts %/% 2)
unique_ids <- unique(bird_data$id)
season_names <- c("Breeding Time", "Harvesting Time", "Winter")

# Run DBSCAN on a coordinate matrix with eps = mean kNN distance.
# Returns the encoded labels (see above), or NULL when there are fewer than
# min_pts points.
dbscan_labels <- function(coords, min_pts, k) {
  if (nrow(coords) < min_pts) {
    return(NULL)
  }
  eps <- mean(dbscan::kNNdist(coords, k = k), na.rm = TRUE)
  dbscan::dbscan(coords, eps = eps, minPts = min_pts)$cluster + 1L
}

# Extract the coordinates once instead of re-filtering the sf object per bird
all_coords <- st_coordinates(bird_data)

for (bird_id in unique_ids) {
  bird_rows <- which(bird_data$id == bird_id)

  # --- DBScan for All Seasons ---
  labels_all <- dbscan_labels(all_coords[bird_rows, , drop = FALSE], minPts, knn_k)
  if (!is.null(labels_all)) {
    bird_data$cluster_all[bird_rows] <- labels_all
  }

  # --- DBScan by Season ---
  for (season_name in season_names) {
    # %in% never yields NA, so rows with a missing season are simply skipped
    season_rows <- bird_rows[bird_data$season[bird_rows] %in% season_name]
    labels_season <- dbscan_labels(
      all_coords[season_rows, , drop = FALSE], minPts, knn_k
    )
    if (!is.null(labels_season)) {
      bird_data$cluster_season[season_rows] <- labels_season
    }
  }
}

# Save the processed data with new cluster columns
st_write(bird_data, "02_preprocessing_export/bird_data.geojson", delete_dsn = TRUE)

Deleting source `02_preprocessing_export/bird_data.geojson' using driver `GeoJSON'
Writing layer `bird_data' to data source 
  `02_preprocessing_export/bird_data.geojson' using driver `GeoJSON'
Writing 49757 features with 15 fields and geometry type Point.

Visualize

To verify the effectiveness of the preprocessing and clustering, we create visualizations that illustrate the clustering results. This step ensures that the DBScan algorithm has accurately identified clusters based on the calculated eps values.

# Visualize Clusters for Specific ID
# Bird id whose points are plotted; printed so the rendered report shows which id was used
target_id <- '.458'
print(target_id)

[1] ".458"

# Interactive map of one bird's points, coloured by their all-season cluster.
# Uses the tmap v4 interface: the v3 arguments (col / palette / title and
# tm_layout(title = )) are deprecated in v4, and `title` in tm_dots() is not
# recognised there, which dropped the legend title.
bird_data_filtered <- bird_data[bird_data$id == target_id, ]

if (nrow(bird_data_filtered) > 0) {
  tmap_mode("view")
  tm_shape(bird_data_filtered) +
    tm_dots(
      fill = "cluster_all",
      fill.scale = tm_scale(values = "brewer.set1"),
      fill.legend = tm_legend(title = "Cluster ID")
    ) +
    tm_title(paste("Clusters for ID", target_id))
} else {
  print("No data available for the specified ID")
}

ℹ tmap mode set to "view".


── tmap v3 code detected ───────────────────────────────────────────────────────

[v3->v4] `tm_tm_dots()`: migrate the argument(s) related to the scale of the
visual variable `fill` namely 'palette' (rename to 'values') to fill.scale =
tm_scale(<HERE>).
[v3->v4] `tm_dots()`: use 'fill' for the fill color of polygons/symbols
(instead of 'col'), and 'col' for the outlines (instead of 'border.col').
[tm_dots()] Argument `title` unknown.
[v3->v4] `tm_layout()`: use `tm_title()` instead of `tm_layout(title = )`
[cols4all] color palettes: use palettes from the R package cols4all. Run
`cols4all::c4a_gui()` to explore them. The old palette name "Set1" is named
"brewer.set1"

Prepare Data for Export of DB-Clusters

DBScan Algorithm

In this step, we structure the clusters for export. For each bird, a convex-hull polygon is computed for every DBScan cluster, both per season and for all seasons combined. All polygons are then written to a single .geojson file, with the bird ID, cluster ID and season stored as attributes, facilitating organized data handling and subsequent analysis.

# Load Data: read the clustered point layer back from the GeoJSON export
bird_data <- st_read("02_preprocessing_export/bird_data.geojson")

Reading layer `bird_data' from data source 
  `C:\Users\claud\Documents\Geographie-Studium-10. Semester\GEO880\Project\GEO880_Project\Project\02_preprocessing_export\bird_data.geojson' 
  using driver `GeoJSON'
Simple feature collection with 49757 features and 15 fields
Geometry type: POINT
Dimension:     XY
Bounding box:  xmin: 9.34394 ymin: 46.42356 xmax: 10.47204 ymax: 46.95712
Geodetic CRS:  WGS 84

# Build one convex-hull polygon per bird, season and DBSCAN cluster and export
# them as a single GeoJSON layer with the columns id, cluster_id and season.

# Check and set CRS to WGS84 (EPSG:4326).
# isTRUE() keeps the check from failing when the layer reports no EPSG code
# (the comparison is then NA); in that case the layer is transformed as well.
target_crs <- 4326
if (!isTRUE(st_crs(bird_data)$epsg == target_crs)) {
  bird_data <- st_transform(bird_data, crs = target_crs)
}

season_names <- c("Breeding Time", "Harvesting Time", "Winter", "All Seasons")

# Cluster code that marks DBSCAN noise in the exported point layer.
# NOTE(review): the original code excluded min(clusters); in the export the
# smallest code of every clustered subset is 1, so this is the same value.
# Confirm that the clustering step always writes noise as 1.
noise_label <- 1

# Subsets inherit the layer's CRS, so no per-subset CRS check is needed
point_geoms <- st_geometry(bird_data)

# Collect results in lists and combine once after the loops
hull_geoms <- list()
hull_ids <- list()
hull_clusters <- list()
hull_seasons <- list()

# Iterate through each bird
for (id_name in unique(bird_data$id)) {
  bird_rows <- which(bird_data$id == id_name)

  # Iterate through each season and "All Seasons"
  for (season_name in season_names) {
    if (season_name == "All Seasons") {
      rows <- bird_rows
      cluster_column <- "cluster_all"
    } else {
      # %in% never yields NA, so rows with a missing season are not selected
      rows <- bird_rows[bird_data$season[bird_rows] %in% season_name]
      cluster_column <- "cluster_season"
    }

    # Cluster codes of the selected points, without unclustered (NA) and noise
    labels <- bird_data[[cluster_column]][rows]
    clusters <- setdiff(unique(labels[!is.na(labels)]), noise_label)

    # Process each cluster
    for (cluster_id in clusters) {
      member_rows <- rows[which(labels == cluster_id)]

      # Ensure sufficient points for convex hull
      if (length(member_rows) > 3) {
        slot <- length(hull_geoms) + 1L
        hull_geoms[[slot]] <- st_convex_hull(st_union(point_geoms[member_rows]))
        hull_ids[[slot]] <- id_name
        hull_clusters[[slot]] <- cluster_id
        hull_seasons[[slot]] <- season_name
      }
    }
  }
}

# Combine the collected pieces; starting from an empty sfc keeps the result a
# valid geometry column even when no hull was produced
convex_hulls_polygons <- do.call(c, c(list(st_sfc(crs = target_crs)), hull_geoms))
convex_hulls_ids <- unlist(hull_ids)
convex_hulls_clusters <- unlist(hull_clusters)
convex_hulls_seasons <- unlist(hull_seasons)

# Create sf object with 3 columns: id, cluster_id, season
convex_hulls_sf <- st_as_sf(
  data.frame(
    id = convex_hulls_ids,
    cluster_id = convex_hulls_clusters,
    season = convex_hulls_seasons,
    geometry = convex_hulls_polygons
  ),
  crs = target_crs
)

# Save as GeoJSON
st_write(convex_hulls_sf, "02_preprocessing_export/DB_Scan_polygons.geojson", delete_dsn = TRUE)

Deleting source `02_preprocessing_export/DB_Scan_polygons.geojson' using driver `GeoJSON'
Writing layer `DB_Scan_polygons' to data source 
  `02_preprocessing_export/DB_Scan_polygons.geojson' using driver `GeoJSON'
Writing 593 features with 3 fields and geometry type Polygon.

Calculate Overlap Matrix (All Seasons)

The overlap matrix is calculated to identify intersecting DBScan clusters between birds. The function calculate_overlap_matrix() checks, for every pair of birds, whether any of their cluster polygons intersect, and generates a symmetric matrix that indicates overlap with binary values (1 = overlap, 0 = no overlap). The resulting matrix is exported as a .csv file for further analysis.

#' Compute a binary overlap matrix between the birds' cluster polygons
#'
#' @param DB_Scan_data An sf object with an `id` column (bird identifier) and
#'   one polygon geometry per row.
#' @return A symmetric numeric matrix with one row and one column per unique
#'   `id` (used as dimnames). Entry [a, b] is 1 if any polygon of bird a
#'   intersects any polygon of bird b, otherwise 0. The diagonal is always 0.
calculate_overlap_matrix <- function(DB_Scan_data) {
  # Extract the unique bird ids
  bird_ids <- unique(DB_Scan_data$id)
  n <- length(bird_ids)

  # Start from an all-zero matrix
  overlap_matrix <- matrix(0, nrow = n, ncol = n, dimnames = list(bird_ids, bird_ids))

  # Fewer than two birds means there is no pair to compare; the original
  # 1:(n - 1) loop would have iterated over invalid indices here
  if (n < 2) {
    return(overlap_matrix)
  }

  # Matrix row/column of the bird that owns each polygon
  bird_index <- match(DB_Scan_data$id, bird_ids)

  # One predicate call for all polygon pairs: hits[[k]] holds the row numbers
  # of the polygons that intersect polygon k. Only the existence of an
  # intersection matters, so no intersection geometries are constructed.
  hits <- st_intersects(DB_Scan_data)

  for (k in seq_along(hits)) {
    partners <- bird_index[hits[[k]]]
    if (length(partners) > 0) {
      overlap_matrix[bird_index[k], partners] <- 1
    }
  }

  # A bird is not reported as overlapping with itself
  diag(overlap_matrix) <- 0
  overlap_matrix
}

# Example call: build the overlap matrix from the convex-hull polygons and export it as CSV
overlap_matrix <- calculate_overlap_matrix(convex_hulls_sf)
write.csv(overlap_matrix, '02_preprocessing_export/DB_Scan_Matrix.csv')

Function to find related birds

The find_related_birds() function identifies birds with overlapping DBScan clusters based on the overlap matrix. By inputting a specific bird ID, the function extracts all bird IDs with intersecting clusters (indicated by a value of 1 in the matrix). This provides a quick way to find potential interactions or shared areas among birds.

#' List the birds whose clusters overlap with those of a given bird
#'
#' @param matrix_data Overlap matrix whose row and column names are bird ids
#'   and whose entries are 1 where two birds overlap.
#' @param bird_id Id of the bird to look up; must be one of the row names.
#' @return Character vector with the column names that hold a 1 in the row of
#'   `bird_id` (empty if there is none).
find_related_birds <- function(matrix_data, bird_id) {
  known_ids <- rownames(matrix_data)
  if (!(bird_id %in% known_ids)) {
    stop("Die angegebene Vogel-ID existiert nicht in der Matrix.")
  }

  # Row of the requested bird, addressed by name
  overlap_flags <- matrix_data[as.character(bird_id), ]

  # Keep the columns flagged with 1; missing entries never count as a match
  is_related <- !is.na(overlap_flags) & overlap_flags == 1
  colnames(matrix_data)[is_related]
}

# Example: list every bird whose cluster polygons overlap with those of bird "7934"
find_related_birds(overlap_matrix, "7934")

 [1] ".458" "0500" "0504" "0519" "0633" "19"   "20"   "21"   "22"   "28"  
[11] "29"   "30"   "31"   "32"   "33"   "37"   "5450" "5999" "6000" "6447"
[21] "6453" "6454" "6456" "6457" "6461" "6462" "6463" "6464" "6465" "6466"
[31] "6467" "6468" "6524" "6526" "6527" "6528" "7314" "7315" "7316" "7319"
[41] "7321" "7323" "7326" "7329" "7330" "7331" "7332" "7932" "7933" "7935"
[51] "7939" "7942" "7944" "7948" "7949" "85"   "89"