WEEK 4 EXERCISES

Author

Dimitri Chryssolouris

1 Input: Segmentation

library("readr")
library("sf")
library("dplyr")
library(ggplot2)
library(tidyverse)
library("SimilarityMeasures")
library("patchwork")


wildschwein <- read_delim("Data/wildschwein_BE_2056.csv", ",")


# Careful! What Timezone is assumed?
sabi <- wildschwein |>
    st_as_sf(coords = c("E", "N"), crs = 2056, remove = FALSE) |>
    filter(
      TierName == "Sabi", 
      DatetimeUTC >= "2015-07-01", 
      DatetimeUTC < "2015-07-03"
      )

ggplot(sabi) +
  geom_sf() +
  geom_path(aes(E, N)) +
  theme_minimal()

1.1 Step a): Specify a temporal wondow

#define the function


distance_by_element <- function(later, now){
  as.numeric(
    st_distance(later, now, by_element = TRUE)
  )
}

# Ensure your data is sorted by time
sabi <- sabi |>
  arrange(DatetimeUTC) |>
  mutate(
    # Distance from pos[n-2] to pos[n]
    dist_n_minus_2 = distance_by_element(lag(geometry, 2), geometry),
    # Distance from pos[n-1] to pos[n]
    dist_n_minus_1 = distance_by_element(lag(geometry, 1), geometry),
    # Distance from pos[n] to pos[n+1]
    dist_n_plus_1 = distance_by_element(geometry, lead(geometry, 1)),
    # Distance from pos[n] to pos[n+2]
    dist_n_plus_2  = distance_by_element(geometry, lead(geometry, 2))
  )

sabi <- sabi |>
    rowwise() |>
    mutate(
        stepMean = mean(c(dist_n_minus_2, dist_n_minus_1, dist_n_plus_1, dist_n_plus_2)) #with na.rm = TRUE the beginning and End would stay
    )|>
    ungroup()

sabi

Simple feature collection with 192 features and 11 fields
Geometry type: POINT
Dimension:     XY
Bounding box:  xmin: 2569724 ymin: 1204916 xmax: 2570927 ymax: 1205957
Projected CRS: CH1903+ / LV95
# A tibble: 192 × 12
   TierID TierName CollarID DatetimeUTC                E        N
   <chr>  <chr>       <dbl> <dttm>                 <dbl>    <dbl>
 1 002A   Sabi        12275 2015-06-30 22:00:13 2569972. 1205366.
 2 002A   Sabi        12275 2015-06-30 22:16:06 2569975. 1205637.
 3 002A   Sabi        12275 2015-06-30 22:30:19 2570266. 1205857.
 4 002A   Sabi        12275 2015-06-30 22:45:13 2570208. 1205913.
 5 002A   Sabi        12275 2015-06-30 23:00:10 2570247. 1205731.
 6 002A   Sabi        12275 2015-06-30 23:15:17 2570512. 1205279.
 7 002A   Sabi        12275 2015-06-30 23:30:38 2570684. 1205103.
 8 002A   Sabi        12275 2015-06-30 23:45:16 2570526. 1205051.
 9 002A   Sabi        12275 2015-07-01 00:00:10 2570532. 1205044.
10 002A   Sabi        12275 2015-07-01 00:15:14 2570530. 1205059.
# ℹ 182 more rows
# ℹ 6 more variables: geometry <POINT [m]>, dist_n_minus_2 <dbl>,
#   dist_n_minus_1 <dbl>, dist_n_plus_1 <dbl>, dist_n_plus_2 <dbl>,
#   stepMean <dbl>

1.2 Step c): Remove “static points”

We can now determine if an animal is moving or not by specifying a threshold distance on stepMean. In our example, we use the mean value as a threshold: Positions with distances below this value are considered static.

sabi <- sabi |>
    mutate(static = stepMean < mean(stepMean, na.rm = TRUE))

sabi_filter <- sabi |>
    filter(!static)

sabi_filter |>
    ggplot(aes(E, N)) +
    geom_point(data = sabi, col = "red") +
    geom_path() +
    geom_point() +
    coord_fixed() +
    theme(legend.position = "bottom")

2 Exercise A: Segmentation

With the skills from Input: Segmentation you can now implement the segmentation algorithm described in Laube and Purves (2011) to either your own movement data or to a different wild boar using different sampling intervals.

2.1 Task 1: Calculate distances

Now, you can Step a): Specify a temporal window v and Step b): Measure the distance to every point within v, which you had used with sabi, on your own movement data or to a different wild boar using different sampling intervals.

unique(wildschwein$TierName)

[1] "Sabi" "Rosa" "Ruth"

now <- wildschwein$DatetimeUTC
later <- lead(now)

difftime_secs <- function(later, now){
    as.numeric(difftime(later, now))
}

wildschwein <- wildschwein |> 
  group_by(TierID) |> 
  mutate(
    timelag = difftime_secs(lead(DatetimeUTC), DatetimeUTC)
    )

wildschwein |>
  filter(TierName == "Rosa") |>
  summarise(start_time = min(DatetimeUTC, na.rm = TRUE))

# A tibble: 1 × 2
  TierID start_time         
  <chr>  <dttm>             
1 016A   2014-11-07 07:45:44

rosa <- wildschwein |>
    st_as_sf(coords = c("E", "N"), crs = 2056, remove = FALSE) |>
    filter(
      TierName == "Rosa", 
      DatetimeUTC >= "2014-11-07", 
      DatetimeUTC < "2014-11-10"
      )

ggplot(rosa) +
  geom_sf() +
  geom_path(aes(E, N)) +
  theme_minimal()

rosa <- rosa |>
  arrange(DatetimeUTC) |>
  mutate(
    # Distance from pos[n-2] to pos[n]
    dist_n_minus_2 = distance_by_element(lag(geometry, 2), geometry),
    # Distance from pos[n-1] to pos[n]
    dist_n_minus_1 = distance_by_element(lag(geometry, 1), geometry),
    # Distance from pos[n] to pos[n+1]
    dist_n_plus_1 = distance_by_element(geometry, lead(geometry, 1)),
    # Distance from pos[n] to pos[n+2]
    dist_n_plus_2  = distance_by_element(geometry, lead(geometry, 2))
  )



rosa <- rosa |>
    rowwise() |>
    mutate(
        stepMean = mean(c(dist_n_minus_2, dist_n_minus_1, dist_n_plus_1, dist_n_plus_2)) #with na.rm = TRUE the beginning and End would stay
    )|>
    ungroup()

rosa

Simple feature collection with 253 features and 12 fields
Geometry type: POINT
Dimension:     XY
Bounding box:  xmin: 2569889 ymin: 1204537 xmax: 2570395 ymax: 1205037
Projected CRS: CH1903+ / LV95
# A tibble: 253 × 13
   TierID TierName CollarID DatetimeUTC                E        N timelag
   <chr>  <chr>       <dbl> <dttm>                 <dbl>    <dbl>   <dbl>
 1 016A   Rosa        13972 2014-11-07 07:45:44 2570385. 1204812.     866
 2 016A   Rosa        13972 2014-11-07 08:00:10 2570389. 1204821.     908
 3 016A   Rosa        13972 2014-11-07 08:15:18 2570375. 1204787.     905
 4 016A   Rosa        13972 2014-11-07 08:30:23 2570387. 1204828.     945
 5 016A   Rosa        13972 2014-11-07 08:46:08 2570387. 1204853.     843
 6 016A   Rosa        13972 2014-11-07 09:00:11 2570387. 1204857.     904
 7 016A   Rosa        13972 2014-11-07 09:15:15 2570392. 1204821.     896
 8 016A   Rosa        13972 2014-11-07 09:30:11 2570395. 1204818.     905
 9 016A   Rosa        13972 2014-11-07 09:45:16 2570391. 1204838.     928
10 016A   Rosa        13972 2014-11-07 10:00:44 2569917. 1205031.     869
# ℹ 243 more rows
# ℹ 6 more variables: geometry <POINT [m]>, dist_n_minus_2 <dbl>,
#   dist_n_minus_1 <dbl>, dist_n_plus_1 <dbl>, dist_n_plus_2 <dbl>,
#   stepMean <dbl>

2.2 Task 2: Specify and apply threshold d

After calculating the Euclidean distances to positions within the temporal window v in task 1, you can explore these values (we stored them in the column stepMean) using summary statistics (histograms, boxplot, summary()): This way we can define a reasonable threshold value to differentiate between stops and moves. There is no “correct” way of doing this, specifying a threshold always depends on data as well as the question that needs to be answered. In this exercise, use the mean of all stepMean values.

Store the new information (boolean to differentiate between stops (TRUE) and moves (FALSE)) in a new column named static.

# Compute the threshold from the stepMean values
threshold <- mean(rosa$stepMean, na.rm = TRUE)


# Create the 'static' column: TRUE when stepMean is below the threshold (stop), FALSE otherwise (move)
rosa <- rosa |>
  mutate(static = stepMean < threshold)



# Summary statistics
summary(rosa$stepMean)

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
  0.7326   2.3506   3.5855  11.5894   6.6988 265.2851        4

# Histogram of stepMean values
hist(rosa$stepMean, 
     breaks = 30, 
     main = "Histogram of stepMean Values", 
     xlab = "stepMean", 
     col = "lightblue", 
     border = "black")

# Boxplot of stepMean values
boxplot(rosa$stepMean, 
        main = "Boxplot of stepMean Values", 
        ylab = "stepMean", 
        col = "lightgreen")

threshold = mean = 11.5

2.3 Task 3: Visualize segmented trajectories

Now visualize the segmented trajectory spatially. Just like last week, you can use ggplot with geom_path(), geom_point() and coord_equal(). Assign colour = static within aes() to distinguish between segments with “movement” and without.

# Extract coordinates into columns
rosa_coords <- cbind(rosa, st_coordinates(rosa))

ggplot(rosa_coords) +
  geom_path(aes(x = X, y = Y, color = static), size = 1) +
  geom_point(aes(x = X, y = Y, color = static), size = 2) +
  coord_equal() +
  labs(title = "Segmented Trajectory of Rosa",
       x = "Easting", y = "Northing",
       color = "Static") +
  theme_minimal()

Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

# or 
rosa_filter <- rosa |>
    filter(!static)


rosa_filter |>
    ggplot(aes(E, N)) +
    geom_point(data = rosa, col = "red") +
    geom_path() +
    geom_point() +
    coord_fixed() +
    theme(legend.position = "bottom")

2.4 Task 4: Segment-based analysis

In applying Laube and Purves (2011), we’ve come as far as step b) in Figure 16.1. In order to complete the last steps (c and d), we need a unique ID for each segment that we can use as a grouping variable. The following function does just that (it assigns unique IDs based on the column static which you created in Task 2). You will learn about functions next week. For now, just copy the following code chunk into your script and run it.

rle_id <- function(vec) {
    x <- rle(vec)$lengths
    as.factor(rep(seq_along(x), times = x))
}


rosa <- rosa |>
    mutate(segment_id = rle_id(static))

# Calculate duration for each segment in minutes
segment_info <- rosa |>
  group_by(segment_id) |>
  summarise(
    start_time = min(DatetimeUTC),
    end_time = max(DatetimeUTC),
    duration = as.numeric(difftime(end_time, start_time, units = "mins"))
  )

# Remove segments with duration < 5 minutes
valid_segments <- segment_info |>
  filter(duration >= 5)

# Keep only observations in valid segments
rosa_valid <- rosa |>
  filter(segment_id %in% valid_segments$segment_id)


# Extract coordinates for plotting
rosa_coords <- cbind(rosa_valid, st_coordinates(rosa_valid))

# Create the plot with path and point layers
ggplot(rosa_coords) +
  geom_path(aes(x = X, y = Y, color = segment_id), size = 1) +
  geom_point(aes(x = X, y = Y, color = segment_id), size = 2) +
  coord_equal() +
  labs(title = "Segmented Trajectory by Segment ID",
       x = "Easting", y = "Northing",
       color = "Segment ID") +
  theme_minimal()

3 Exercise B: Similarity

library(“SimilarityMeasures”) added in the first code chunk of Week 4 exercise. SimilarityMeasures is a package that provides a collection of similarity measures for time series data. The package is available on CRAN and can be installed using install.packages(“SimilarityMeasures”). ## Task 1

3.0.1 Read df

ped <- read_delim("Data/pedestrian.csv")

Rows: 289 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
dbl  (3): TrajID, E, N
dttm (1): DatetimeUTC

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

ped <- ped |> 
  st_as_sf(coords = c("E", "N"), crs = 2056, remove = FALSE) |> 
  mutate(TrajID = as.factor(TrajID))

3.0.2 Visualize Trajectories

ggplot(ped, aes(x = E, y = N, colour = TrajID)) +
  geom_point() +
  geom_path(aes(E,N)) +
  facet_wrap(~ TrajID) + # Small multiples by 'group'
  labs(title = "Visual comparison of the 6 trajectories",
       x = "E", y = "N")

3.1 Task 2: Calculate similarity

3.1.1 Split trajcetories into matrices

tra1 <- ped |> 
  filter(TrajID == 1) |> 
  select(c(E,N)) |> 
  st_drop_geometry() |> 
  as.matrix()

tra2 <- ped |> 
  filter(TrajID == 2) |> 
  select(c(E,N)) |> 
  st_drop_geometry() |> 
  as.matrix()

tra3 <- ped |> 
  filter(TrajID == 3) |> 
  select(c(E,N)) |> 
  st_drop_geometry() |> 
  as.matrix()

tra4 <- ped |> 
  filter(TrajID == 4) |> 
  select(c(E,N)) |> 
  st_drop_geometry() |> 
  as.matrix()

tra5 <- ped |> 
  filter(TrajID == 5) |> 
  select(c(E,N)) |> 
  st_drop_geometry() |> 
  as.matrix()

tra6 <- ped |> 
  filter(TrajID == 6) |> 
  select(c(E,N)) |> 
  st_drop_geometry() |> 
  as.matrix()

3.1.2 With function

ped_as_fUnction <- function(data, trajnr) {
  tra <- data |> 
  filter(TrajID == trajnr) |> 
  select(c(E,N)) |> 
  st_drop_geometry() |> 
  as.matrix()
}

tra1 <- ped_as_fUnction(ped,1)
tra2 <- ped_as_fUnction(ped,2)
tra3 <- ped_as_fUnction(ped,3)
tra4 <- ped_as_fUnction(ped,4)
tra5 <- ped_as_fUnction(ped,5)
tra6 <- ped_as_fUnction(ped,6)

3.2 Comparisons

Similarity Functions

help(package="SimilarityMeasures")

3.2.1 DTW

The dynamic time warping algorithm (DTW) calculates the smallest warp path for the two trajectories. This is the DTW version discussed by Berndt and Clifford (1994). Several variations of calculating the warping cost exist. In this version, the warping path is equal to the sum of the distances at each point along the path.

dtw12 <- DTW(tra1, tra2, pointSpacing=-1)
dtw13 <- DTW(tra1, tra3, pointSpacing=-1)
dtw14 <- DTW(tra1, tra4, pointSpacing=-1)
dtw15 <- DTW(tra1, tra5, pointSpacing=-1)
dtw16 <- DTW(tra1, tra6, pointSpacing=-1)

comp_dtw <- c("2", "3", "4", "5", "6")
dtw <- c(dtw12, dtw13, dtw14, dtw15, dtw16)

dtw_comp <- tibble(comp_dtw, dtw)

ggplot(dtw_comp, aes(x = comp_dtw, y = dtw, fill = factor(comp_dtw))) +
  geom_col() + 
  ggtitle("DTW") +
  ylab("") +
  xlab("Comparison") +
  theme_minimal() +  # Optional for better aesthetics
  theme(
    legend.position = "none",  # Remove legend
    plot.title = element_text(hjust = 0.5)  # Center title
  )

plot_dtw <- ggplot(dtw_comp, aes(x = comp_dtw, y = dtw, fill = factor(comp_dtw))) +
  geom_col() + 
  ggtitle("DTW") +
  ylab("") +
  xlab("Comparison") +
  theme_minimal() +  # Optional for better aesthetics
  theme(
    legend.position = "none",  # Remove legend
    plot.title = element_text(hjust = 0.5)  # Center title
  )

3.2.2 EditDist

The edit distance algorithm calculates the minimum number of edits required to allow the two trajectories to be considered equivalent. This function uses Edit Distance on Real sequence (EDR) as described by Chen, Özsu, and Oria (2005).

ED12 <- EditDist(tra1, tra2, pointDistance=20)
ED13 <- EditDist(tra1, tra3, pointDistance=20)
ED14 <- EditDist(tra1, tra4, pointDistance=20)
ED15 <- EditDist(tra1, tra5, pointDistance=20)
ED16 <- EditDist(tra1, tra6, pointDistance=20)

comp_ED <- c("2", "3", "4", "5", "6")
ED <- c(ED12, ED13, ED14, ED15, ED16)

ED_comp <- tibble(comp_ED, ED)

ggplot(ED_comp, aes(x = comp_ED, y = ED, fill = factor(comp_ED))) +
  geom_col() + 
  ggtitle("EditDist") +
  ylab("") +
  xlab("Comparison") +
  theme_minimal() +  # Optional for better aesthetics
  theme(
    legend.position = "none",  # Remove legend
    plot.title = element_text(hjust = 0.5)  # Center title
  )

plot_ED <- ggplot(ED_comp, aes(x = comp_ED, y = ED, fill = factor(comp_ED))) +
  geom_col() + 
  ggtitle("EditDist") +
  ylab("") +
  xlab("Comparison") +
  theme_minimal() +  # Optional for better aesthetics
  theme(
    legend.position = "none",  # Remove legend
    plot.title = element_text(hjust = 0.5)  # Center title
  )

3.2.3 Frechet

This algorithm calculates the Frechet distance. The Frechet metric (or distance) is generally described in the following way: A man is walking a dog on a leash, the man walks on one curve while the dog walks on the other (Alt and Godau 1995). The dog and the man are able to vary their speeds, or even stop, but not go backwards. The Frechet metric is the minimum leash length required to complete the traversal of both curves.

F12 <- Frechet(tra1, tra2, testLeash=-1)
F13 <- Frechet(tra1, tra3, testLeash=-1)
F14 <- Frechet(tra1, tra4, testLeash=-1)
F15 <- Frechet(tra1, tra5, testLeash=-1)
F16 <- Frechet(tra1, tra6, testLeash=-1)

comp_F <- c("2", "3", "4", "5", "6")
F <- c(F12, F13, F14, F15, F16)

F_comp <- tibble(comp_F, F)

ggplot(F_comp, aes(x = comp_F, y = F, fill = factor(comp_F))) +
  geom_col() +
  ggtitle("Frechet") +
  ylab("") +
  xlab("Comparison") +
  theme_minimal() +  # Optional for better aesthetics
  theme(
    legend.position = "none",  # Remove legend
    plot.title = element_text(hjust = 0.5)  # Center title
  )

plot_F <- ggplot(F_comp, aes(x = comp_F, y = F, fill = factor(comp_F))) +
  geom_col() +
  ggtitle("Frechet") +
  ylab("") +
  xlab("Comparison") +
  theme_minimal() +  # Optional for better aesthetics
  theme(
    legend.position = "none",  # Remove legend
    plot.title = element_text(hjust = 0.5)  # Center title
  )

3.2.4 LCSS

The LCSS algorithm calculates the largest number of equivalent points between the two trajectories when traversed in a monotone way. Possible translations are calculated in each dimension and used to provide the maximum LCSS. The accuracy of the algorithm can be varied to provide faster or slower calculations (Vlachos, Kollios, and Gunopulos 2002).

LCSS12 <- LCSS(tra1, tra2, pointSpacing=-1, pointDistance=20, 
     errorMarg=5, returnTrans=FALSE)
LCSS13 <- LCSS(tra1, tra3, pointSpacing=-1, pointDistance=20, 
     errorMarg=5, returnTrans=FALSE)
LCSS14 <- LCSS(tra1, tra4, pointSpacing=-1, pointDistance=20, 
     errorMarg=5, returnTrans=FALSE)
LCSS15 <- LCSS(tra1, tra5, pointSpacing=-1, pointDistance=20, 
     errorMarg=5, returnTrans=FALSE)
LCSS16 <- LCSS(tra1, tra6, pointSpacing=-1, pointDistance=20, 
     errorMarg=5, returnTrans=FALSE)

comp_LCSS <- c("2", "3", "4", "5", "6")
LCSS <- c(LCSS12, LCSS13, LCSS14, LCSS15, LCSS16)

LCSS_comp <- tibble(comp_LCSS, LCSS)

ggplot(LCSS_comp, aes(x = comp_LCSS, y = LCSS, fill = factor(comp_LCSS))) +
  geom_col() +
  ggtitle("LCSS") +
  ylab("") +
  xlab("Comparison") +
  theme_minimal() +  # Optional for better aesthetics
  theme(
    legend.position = "none",  # Remove legend
    plot.title = element_text(hjust = 0.5)  # Center title
  )

plot_LCSS <- ggplot(LCSS_comp, aes(x = comp_LCSS, y = LCSS, fill = factor(comp_LCSS))) +
  geom_col() +
  ggtitle("LCSS") +
  ylab("") +
  xlab("Comparison") +
  theme_minimal() +  # Optional for better aesthetics
  theme(
    legend.position = "none",  # Remove legend
    plot.title = element_text(hjust = 0.5)  # Center title
  )

3.3 Visualize results

plot_list <- list(
  plot_dtw,
  plot_ED,
  plot_F,
  plot_LCSS
)

combined_plot <- (plot_list[[1]] + plot_list[[2]] + 
                  plot_list[[3]] + plot_list[[4]]) + 
  plot_layout(ncol = 2) &  # Arrange plots in 2 columns
  labs(x = "Comparison", y = "Value")  # Shared axis labels

combined_plot + plot_annotation(title = "Compute Similarities")

References

Alt, Helmut, and Michael Godau. 1995. “Computing the Fréchet Distance Between Two Polygonal Curves.” International Journal of Computational Geometry & Applications 5 (01n02): 75–91.

Berndt, Donald J, and James Clifford. 1994. “Using Dynamic Time Warping to Find Patterns in Time Series.” In Proceedings of the 3rd International Conference on Knowledge Discovery and Data Mining, 359–70.

Chen, Lei, M Tamer Özsu, and Vincent Oria. 2005. “Robust and Fast Similarity Search for Moving Object Trajectories.” In Proceedings of the 2005 ACM SIGMOD International Conference on Management of Data, 491–502.

Vlachos, Michail, George Kollios, and Dimitrios Gunopulos. 2002. “Discovering Similar Multidimensional Trajectories.” In Proceedings 18th International Conference on Data Engineering, 673–84. IEEE.