Exercise_4

Author

dominik erni

Published

March 12, 2025

Input Segmentation

library("readr")
library("sf")
library("dplyr")
library("ggplot2")

We will demonstrate implementing this method on the wild boar “Sabi”, restricting ourselves to a couple of tracking days. Your task will be to understand this implementation and apply it to your own movement data.

Open a RStudio Project for this week. Next, copy the wild boar data you downloaded last week (wildschwein_BE_2056.csv) to your project folder. If you cannot find this dataset on your computer, you can re-download it from moodle. Transform the data into an sf object, filter for the wild boar Sabi and a datetime between “2015-07-01” and “2015-07-03”.

wildschwein <- read_delim("data/wildschwein_BE_2056.csv", ",")

# Careful! What Timezone is assumed?
sabi <- wildschwein |>
    st_as_sf(coords = c("E", "N"), crs = 2056, remove = FALSE) |>
    filter(
      TierName == "Sabi", 
      DatetimeUTC >= "2015-07-01", 
      DatetimeUTC < "2015-07-03"
      )
# Plot paths
ggplot(sabi) +
  geom_sf() +
  geom_path(aes(E, N))

Temporal window and distance within

In the above dataset, the sampling interval is 15 minutes. If we take a temporal window of 60 minutes, that would mean including 4 fixes.

# Function:
distance_by_element <- function(later, now) {
  as.numeric(
    st_distance(later, now, by_element = TRUE)
  )
}

We can use the function distance_by_element from week 2 in combination with lead() and lag() to calculate the Euclidean distance. For example, to create the necessary offset of n-2, we use lag(x, 2).

# Definition the distances between:
sabi <- sabi |>
    mutate(
        nMinus2 = distance_by_element(lag(geometry, 2), geometry),  # distance to pos -30 minutes
        nMinus1 = distance_by_element(lag(geometry, 1), geometry),  # distance to pos -15 minutes
        nPlus1  = distance_by_element(geometry, lead(geometry, 1)), # distance to pos +15 mintues
        nPlus2  = distance_by_element(geometry, lead(geometry, 2))  # distance to pos +30 minutes
    )

Mean distance

Now we want to calculate the mean distance of nMinus2, nMinus1, nPlus1, nPlus2 for each row. Since we want the mean value per Row, we have to explicitly specify this before mutate() with the function rowwise(). To remove this rowwise-grouping, we end the operation with ungroup().

sabi <- sabi |>
    rowwise() |> # für jeden Punkt 
    mutate(
        stepMean = mean(c(nMinus2, nMinus1, nPlus1, nPlus2))
    ) |>
    ungroup()

Remove “static points” (wo das tier sich nicht bewegt)

We can now determine if an animal is moving or not by specifying a threshold distance on stepMean. In our example, we use the mean value as a threshold: Positions with distances below this value are considered static.

sabi <- sabi |>
    mutate(static = stepMean < mean(stepMean, na.rm = TRUE))

sabi_filter <- sabi |>
    filter(!static)

sabi_filter |>
    ggplot(aes(E, N)) +
    geom_point(data = sabi, col = "red") + # rote Punkte wurden als "statisch" entfernt
    geom_path() +
    geom_point() +
    coord_fixed() +
    theme(legend.position = "bottom")

Exercise A

library("readr")
library("sf")
library("dplyr")
library("ggplot2")
# Function:
distance_by_element <- function(later, now) {
  as.numeric(
    st_distance(later, now, by_element = TRUE)
  )
}

Data informations: This GPS trajectory dataset was collected in (Microsoft Research Asia) Geolife project by 182 users in a period of over five years (from April 2007 to August 2012). For the following analysis the trajectory of one person at 2008-10-23 is used.

# Pakete laden
library(sf)
library(tmap)

# Datei einlesen, erste 6 Zeilen überspringen
file_path <- "data/Trajectory/20081023025304.plt"
gps_data <- read.csv(file_path, skip = 6, header = FALSE)

# Spaltennamen setzen
colnames(gps_data) <- c("Latitude", "Longitude", "Unused", "Altitude", "Timestamp", "Date", "Time")

# Unnötige Spalte entfernen
gps_data <- gps_data[, c("Latitude", "Longitude", "Altitude", "Timestamp", "Date", "Time")]

# In sf-Objekt umwandeln (WICHTIG: Longitude zuerst!)
gps_data_sf <- st_as_sf(gps_data, coords = c("Longitude", "Latitude"), crs = 4326, remove = FALSE)

# tmap interaktive Karte anzeigen
tmap_mode("view")  # Interaktive Ansicht aktivieren
tm_shape(gps_data_sf) +
  tm_dots(col = "red")   # Punkte als Referenz anzeigen
ggplot(gps_data_sf) +
  geom_sf() +
  geom_path(aes(Longitude, Latitude)) +
  theme_classic()

Calculate distances

# Different windows:
gps_data_sf <- gps_data_sf |>
    mutate(
        nMinus2 = distance_by_element(lag(geometry, 2), geometry),  
        nMinus1 = distance_by_element(lag(geometry, 1), geometry),  
        nPlus1  = distance_by_element(geometry, lead(geometry, 1)), 
        nPlus2  = distance_by_element(geometry, lead(geometry, 2))  
    )

# Mean of Point-Distances:
gps_data_sf <- gps_data_sf |>
    rowwise() |> # für jeden Punkt 
    mutate(
        stepMean = mean(c(nMinus2, nMinus1, nPlus1, nPlus2))
    ) |>
    ungroup()

Specify and apply threshold d

summary(gps_data_sf$stepMean)
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
   0.2127    7.9860   17.0496   23.9094   24.2946 1403.7360         4 
ggplot(gps_data_sf, aes(x = stepMean)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "black", alpha = 0.7) +
  labs(title = "Histogram of stepMean Values", x = "stepMean", y = "Frequency")

ggplot(gps_data_sf, aes(y = stepMean)) +
  geom_boxplot(fill = "lightblue", color = "darkblue") +
  labs(title = "Boxplot of stepMean Values", y = "stepMean")

Remove “static points” (wo nicht bewegt)

gps_data_sf <- gps_data_sf |>
    mutate(static = stepMean < quantile(stepMean, 0.50, na.rm = TRUE))

gps_data_filter <- gps_data_sf |>
    filter(!static)

gps_data_filter |>
    ggplot(aes(Longitude, Latitude)) +
    geom_point(data = gps_data_sf, col = "red") + # rote Punkte wurden als "statisch" entfernt
    geom_path() +
    geom_point() +
    coord_fixed() +
    theme(legend.position = "bottom")

Segment-based analysis

# Function
rle_id <- function(vec) {
    x <- rle(vec)$lengths
    as.factor(rep(seq_along(x), times = x))
}
# assign unique IDs to subtrajectories
gps_data_sf <- gps_data_sf |>
    mutate(segment_id = rle_id(static))

Visualize the moving segments by colourizing them by segment_ID:

ggplot(gps_data_sf) +
  geom_sf() +
  geom_path(aes(Longitude, Latitude, color = factor(segment_id))) +
  theme_classic() +
  scale_color_viridis_d() +  # Optional: Uses a colorblind-friendly palette
  theme(legend.position = "none")  # Removes the legend

Then use segment_ID as a grouping variable to determine the segments duration and remove short segments (e.g. segments with a duration < 5 Minutes):

# Convert Timestamp to POSIXct using the correct origin date
gps_data_sf <- gps_data_sf |> 
  mutate(Timestamp = as.POSIXct(Timestamp * 86400, origin = "1899-12-30", tz = "UTC"))

# Compute segment durations and filter out short segments
gps_data_filter <- gps_data_sf  |> 
  group_by(segment_id) |>
  mutate(segment_duration = difftime(max(Timestamp), min(Timestamp), units = "mins")) |>
  ungroup() |>
  filter(segment_duration >= 5)  # Keep only segments with duration >= 5 minutes

# Plot the filtered data
ggplot(gps_data_filter) +
  geom_sf() +
  geom_path(aes(Longitude, Latitude, color = factor(segment_id))) +
  theme_classic() +
  scale_color_viridis_d() +  # Optional: Improves color mapping
  theme(legend.position = "none")  # Removes legend

Exercise B: Similarity

Similarity measures

We will now calculate similarties between trajectories using a new dataset pedestrian.csv (available on moodle). Download an import this dataset as a data.frame or tibble. It it a set of six different but similar trajectories from pedestrians walking on a path. For this task, explore the trajectories first and get an idea on how the pedestrians moved.

# Import data:
pedestrian <- read_delim("data/pedestrian.csv")
# create an sf object:
pedestrian <- st_as_sf(pedestrian, coords = c("E", "N"), crs = 2056, remove = F)
str(pedestrian)
sf [289 × 5] (S3: sf/spec_tbl_df/tbl_df/tbl/data.frame)
 $ TrajID     : num [1:289] 1 1 1 1 1 1 1 1 1 1 ...
 $ E          : num [1:289] 2571414 2571396 2571373 2571347 2571336 ...
 $ N          : num [1:289] 1205804 1205791 1205770 1205753 1205744 ...
 $ DatetimeUTC: POSIXct[1:289], format: "2015-03-01 12:01:00" "2015-03-01 12:02:00" ...
 $ geometry   :sfc_POINT of length 289; first list element:  'XY' num [1:2] 2571414 1205804
 - attr(*, "spec")=
  .. cols(
  ..   TrajID = col_double(),
  ..   E = col_double(),
  ..   N = col_double(),
  ..   DatetimeUTC = col_datetime(format = "")
  .. )
 - attr(*, "problems")=<externalptr> 
 - attr(*, "sf_column")= chr "geometry"
 - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA
  ..- attr(*, "names")= chr [1:4] "TrajID" "E" "N" "DatetimeUTC"
# plot:

# ?facet_wrap

ggplot() +
  # Grey background points for the entire data (repeated in every facet)
  geom_point(data = transform(pedestrian, TrajID = NULL), aes(E, N), color = "grey70", alpha = 0.3) +  
  # Highlight paths for each specific TrajID
  geom_path(data = pedestrian, aes(E, N, color = factor(TrajID)), size = 1) +
  # Highlight points for each TrajID
  geom_point(data = pedestrian, aes(E, N, color = factor(TrajID)), size = 2) +  
  theme_classic() +
  scale_color_viridis_d() +  # Colorblind-friendly color scale
  theme(legend.position = "none") +  # Removes the legend
  facet_wrap(~TrajID)

Calculate similarity

library(SimilarityMeasures)
library(sf)
library(dplyr)
library(ggplot2)
library(readr)

pedestrian <- read_delim("data/pedestrian.csv")

# Convert trajectories to matrices based on unique TrajID
trajectory_matrices <- pedestrian %>%
  split(.$TrajID) %>%
  lapply(function(x) as.matrix(x[, c("E", "N")]))
# Compute similarity of Trajectory 1 with others (2 to 6)
similarity_results <- lapply(2:6, function(i) {
  trajectory1 <- trajectory_matrices[[1]]  # Reference trajectory
  trajectory2 <- trajectory_matrices[[i]]  # Comparison trajectory
  
  # Compute similarity measures
  dtw_dist <- DTW(trajectory1, trajectory2)  # Dynamic Time Warping
  edit_dist <- EditDist(trajectory1, trajectory2)  # Edit Distance
  frechet_dist <- Frechet(trajectory1, trajectory2)  # Frechet Distance
  lcss_dist <- LCSS(trajectory1, trajectory2, pointSpacing = 1, pointDistance = 5, errorMarg = 2)
  # Longest Common Subsequence
  
  return(data.frame(
    Trajectory = i,
    DTW = dtw_dist,
    EditDist = edit_dist,
    Frechet = frechet_dist,
    LCSS = lcss_dist
  ))
})

# Combine results into one dataframe
similarity_df <- bind_rows(similarity_results)
# Convert to long format for ggplot
similarity_long <- similarity_df %>%
  tidyr::pivot_longer(cols = -Trajectory, names_to = "Measure", values_to = "Value")
trajectory_colors <- c("2" = "#253B80",  # Dunkelblau
                       "3" = "#1F78B4",  # Blau
                       "4" = "#33A02C",  # Grün
                       "5" = "#B2DF8A",  # Hellgrün
                       "6" = "#FDBF00")  # Gelb

# Create the ggplot visualization
ggplot(similarity_long, aes(x = as.factor(Trajectory), y = Value, fill = as.factor(Trajectory))) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = trajectory_colors) +
  facet_wrap(~Measure, scales = "free") +
  labs(
    title = "Computed similarities using different measures\nbetween trajectory 1 to all other trajectories",
    x = "Comparison trajectory",
    y = "Value"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16),
    axis.text.x = element_text(size = 12),
    strip.text = element_text(size = 12)
  ) +
  theme(legend.position = "none")