Behavioral Clustering of Eagle Flight Patterns

Introduction

  • Animal movement data is being collected rapidly
  • Ecologists need a better way to analytically research this data
  • Within mass data volume (2 million rows) we need a way to analyze these differences

Data

Data collected on eagles using high-frequency GPS

One row = what the eagle is doing at a given point in time

Key variables = KPH, Sn, AGL, |Angle|, Vertical rate, and |VR|

# A tibble: 6 × 15
  Animal_ID TimeDiff segment_id segment_length LocalTime           Latitude Longitude       X        Y   KPH    Sn   AGL VerticalRate abs_angle absVR
      <int>    <int>      <dbl>          <int> <dttm>                 <dbl>     <dbl>   <dbl>    <dbl> <dbl> <dbl> <dbl>        <dbl>     <dbl> <dbl>
1       105        5          1             10 2019-06-25 16:44:10     41.6     -92.9 509745. 4609596.  0.07  0.52  9.55        -0.99      0.45  0.99
2       105        6          1             10 2019-06-25 16:44:16     41.6     -92.9 509746. 4609597.  0.07  0.17  9.55         0         0.21  0   
3       105        5          1             10 2019-06-25 16:44:21     41.6     -92.9 509747. 4609597.  0.2   0.14  9.55         0         0.91  0   
4       105        6          1             10 2019-06-25 16:44:27     41.6     -92.9 509747. 4609599.  0.16  0.22  9.55         0         3.14  0   
5       105        5          1             10 2019-06-25 16:44:32     41.6     -92.9 509747. 4609598.  0.11  0.08 11.6          0.39      2.69  0.39
6       105        6          1             10 2019-06-25 16:44:38     41.6     -92.9 509746. 4609599.  0.07  0.22 11.6          0         0.53  0   

What we are Interested in

  1. Using our key variables, can we distinguish in-flight from perching points?
  1. Of the in-flight points:
  • Can flight behaviors be identified?
  • What are characteristics of these behaviors?
  • What are visual examples to demonstrate these behaviors?

Variable Distributions

KPH & Sn: large frequency at 0, suggesting perched points

AGL & |Angle|: larger values suggest sharp turning points or flying above trees

Perching vs. In-Flight

Perching (cluster 1) - low speed and distance between points

In-flight (cluster 2) - high speed, distance between points, and VR change

Choosing the Amount of Clusters

First, we need to see what the optimal number of clusters is to analyze variation.

Is 4 Clusters Optimal?

We can wee the Silhouette and Elbow method both suggest 4 clusters is optimal.

Identifying Distinct In-Flight Behaviors

Perched

Gliding

Ascending

Flapping

Characteristics of These Behaviors

Gliding = high AGL & KPH, low absolute turning angle

Ascending = high absolute turning angle

Flapping = low AGL and KPH

Visualizing Flight Behaviors

Sampling Method

Any questions?

Appendix

Variable Distributions

library(tidyverse)
load("C:/Users/fx3734qp/Downloads/DSCI 415/Project2/project_2/eagle_data.Rdata")
eagle_data <- as_tibble(eagle_data)

# Keep only the focal variables (raw, no transforms)
numeric_only <- eagle_data %>%
  select(Sn, KPH, AGL, abs_angle, VerticalRate, absVR) %>%
  drop_na() %>%
  filter_all(all_vars(is.finite(.)))

par(mfrow = c(2, 3))

# histogram plots based on frequency for each variable 
hist(numeric_only$Sn, breaks = 50, main = "Sn",
     xlab = "Sn", col = "lightblue")
hist(numeric_only$KPH, breaks = 50, main = "KPH",
     xlab = "KPH", col = "lightgreen")
hist(numeric_only$AGL, breaks = 50, main = "AGL",
     xlab = "AGL", col = "pink")
hist(numeric_only$abs_angle, breaks = 50, main = "|Angle|",
     xlab = "|Angle|", col = "purple")
hist(numeric_only$VerticalRate, breaks = 20, main = "VR",
     xlab = "Vertical Rate", col = "orange")
hist(numeric_only$absVR, breaks = 20, main = "|VR|",
     xlab = "|Vertical Rate|", col = "red")

par(mfrow = c(1, 1))

Appendix

K-means 2 cluster plot

library(tidyverse)
library(ggplot2)
load("C:/Users/fx3734qp/Downloads/DSCI 415/Project2/project_2/eagle_data.Rdata")
eagle_data <- as_tibble(eagle_data)

# transforming the data with square root
numeric_only <- eagle_data %>%
  mutate(
    KPH_sq       = sqrt(pmax(KPH, 0)),
    Sn_sq        = sqrt(pmax(Sn, 0)),
    AGL_sq       = sqrt(pmax(AGL, 0)),
    abs_angle_sq = sqrt(pmax(abs_angle, 0)),
    absVR_sq     = sqrt(pmax(absVR, 0)),
    VerticalRate = VerticalRate
  ) %>%
  # filtering to only needed columns
  select(KPH_sq, Sn_sq, AGL_sq, abs_angle_sq, absVR_sq, VerticalRate) %>%
  drop_na() %>%
  filter_all(all_vars(is.finite(.)))

# scale the data and sample
numeric_scaled <- scale(numeric_only)
set.seed(42)
numeric_scaled <- numeric_scaled[sample(1:nrow(numeric_scaled), 20000), ]

# kmeans cluster using 2 centers
kmeans_clusters <- kmeans(numeric_scaled, centers = 2, iter.max = 20, nstart = 25)

# calculates PCA on scaled data
pca_res <- prcomp(numeric_scaled, scale. = FALSE) 

# merge PCA scares with actual data
pca_scores <- as_tibble(pca_res$x[, 1:2]) %>%
  mutate(cluster = factor(kmeans_clusters$cluster))

# add in PCA explained on the x and y axis
pca_var <- round(100 * pca_res$sdev^2 / sum(pca_res$sdev^2), 1)
xlab <- paste0("PC1 (", pca_var[1], "%)")
ylab <- paste0("PC2 (", pca_var[2], "%)")

# create loadings for all 6 variables
loadings <- as_tibble(pca_res$rotation[, 1:2], rownames = "variable") %>%
  mutate(across(c(PC1, PC2), ~ .x * 5)) %>%
  mutate(variable = recode(variable,
                           KPH_sq       = "KPH",
                           Sn_sq        = "Sn",
                           AGL_sq       = "AGL",
                           abs_angle_sq = "|Angle|",
                           absVR_sq     = "|VR|",
                           VerticalRate = "VR"))

# assign cluster colors
cluster_colors <- c("1" = "pink", "2" = "lightgreen")

p <- ggplot(pca_scores, aes(x = PC1, y = PC2, color = cluster)) +
  geom_point(alpha = 0.6) +
  geom_segment(data = loadings,
               aes(x = 0, y = 0, xend = PC1, yend = PC2),
               arrow = arrow(length = unit(0.2, "cm")),
               color = "black") +
  geom_text(data = loadings,
            aes(x = PC1, y = PC2, label = variable),
            color = "black", size = 4, vjust = -0.5) +
  scale_color_manual(values = cluster_colors) +
  labs(title = "",
       x = xlab, y = ylab, color = "Cluster") +
  theme_classic(base_size = 16)

print(p)

Appendix

Clusters 2-7

library(tidyverse)
library(ggplot2)
library(gridExtra)

# again, loading in the data, transforming the data, selecting the correct columns, scaling, and taking a sample
load("C:/Users/fx3734qp/Downloads/DSCI 415/Project2/project_2/eagle_data.Rdata")
eagle_data <- as_tibble(eagle_data)
numeric_only <- eagle_data %>%
  mutate(
    KPH_sq       = sqrt(pmax(KPH, 0)),
    Sn_sq        = sqrt(pmax(Sn, 0)),
    AGL_sq       = sqrt(pmax(AGL, 0)),
    abs_angle_sq = sqrt(pmax(abs_angle, 0)),
    absVR_sq     = sqrt(pmax(absVR, 0)),
    VerticalRate = VerticalRate
  ) %>%
  select(KPH_sq, Sn_sq, AGL_sq, abs_angle_sq, absVR_sq, VerticalRate) %>%
  drop_na() %>%
  filter_all(all_vars(is.finite(.)))
numeric_scaled <- scale(numeric_only)
set.seed(42)
numeric_scaled <- numeric_scaled[sample(1:nrow(numeric_scaled), 2000), ]

# calucalte PCA on the scaled data
pca_res <- prcomp(numeric_scaled, scale. = FALSE)  

pca_res$x[,1] <- -pca_res$x[,1]
pca_res$rotation[,1] <- -pca_res$rotation[,1]

# creating a scatterplot with PCA's on x and y axis
pca_scores_base <- as_tibble(pca_res$x[, 1:2])

# assigning cluster colors
cluster_colors <- c("1" = "pink", "2" = "lightgreen", "3" = "lightblue",
                    "4" = "violet", "5" = "orange", "6" = "brown", "7" = "gray")

# making the plot function
make_plot <- function(k) {
  set.seed(42)
  # need k amount of centers as each plot will be different
  km <- kmeans(numeric_scaled, centers = k, iter.max = 20, nstart = 25)
  # PCA scores on x and y axis
  pca_scores <- pca_scores_base %>%
    mutate(cluster = factor(km$cluster))
  
  ggplot(pca_scores, aes(x = PC1, y = PC2, color = cluster)) +
    geom_point(alpha = 0.6) +
    scale_color_manual(values = cluster_colors) +
    labs(title = paste(k, "clusters"),
         x = "PC1", y = "PC2") +
    theme_classic(base_size = 14) +
    theme(legend.position = "none")   # no legend
}

# creates the plot 6 times with clusters 2-7 using list apply
plots <- lapply(2:7, make_plot)
grid.arrange(grobs = plots, ncol = 3)

Appendix

Determining K

library(tidyverse)
library(cluster)
library(factoextra)
library(patchwork)  
# taking a sample of the data to run smoother
set.seed(1)
eagle_sample <- eagle_data %>%
  sample_n(2000)

# selecting only the movement variables and scaling
movement_vars <- eagle_sample %>%
  select(KPH, Sn, AGL, abs_angle, VerticalRate, absVR)
movement_scaled <- scale(movement_vars)

# agglomerative clustering using the ward method
hc_ward <- agnes(movement_scaled, method = "ward")

# elbow plot with a k max of 10
p1 <- fviz_nbclust(movement_scaled, FUN = hcut, method = "wss",
                   hc_method = "ward", k.max = 10) +
  labs(title = "Elbow Method (WSS)") 

# silhouette width plot with a k max of 10
p2 <- fviz_nbclust(movement_scaled, FUN = hcut, method = "silhouette",
                   hc_method = "ward", k.max = 10) +
  labs(title = "Silhouette Method")
p1 + p2

Appendix

4 clusters with loadings

library(tidyverse)
library(ggplot2)

# again, loading in the data, transforming the data, selecting the correct columns, scaling, and taking a sample
load("C:/Users/fx3734qp/Downloads/DSCI 415/Project2/project_2/eagle_data.Rdata")
eagle_data <- as_tibble(eagle_data)
numeric_only <- eagle_data %>%
  mutate(
    KPH_sq       = sqrt(pmax(KPH, 0)),
    Sn_sq        = sqrt(pmax(Sn, 0)),
    AGL_sq       = sqrt(pmax(AGL, 0)),
    abs_angle_sq = sqrt(pmax(abs_angle, 0)),
    absVR_sq     = sqrt(pmax(absVR, 0)),
    VerticalRate = VerticalRate
  ) %>%
  select(KPH_sq, Sn_sq, AGL_sq, abs_angle_sq, absVR_sq, VerticalRate) %>%
  drop_na() %>%
  filter_all(all_vars(is.finite(.)))
numeric_scaled <- scale(numeric_only)
set.seed(42)
numeric_scaled <- numeric_scaled[sample(1:nrow(numeric_scaled), 20000), ]

# using kmeans clustering create 4 clusters
kmeans_clusters <- kmeans(numeric_scaled, centers = 4, iter.max = 20, nstart = 25)

# calucalte PCA on the scaled data
pca_res <- prcomp(numeric_scaled, scale. = FALSE)

# creating a scatterplot with PCA's on x and y axis
pca_scores <- as_tibble(pca_res$x[, 1:2]) %>%
  # changing cluster 1-4 names to their behavior
  mutate(cluster = factor(kmeans_clusters$cluster,
                          levels = c(1,2,3,4),
                          labels = c("Perched","Gliding","Ascending","Flapping")))

# add in variance explained on the PC axis
pca_var <- round(100 * pca_res$sdev^2 / sum(pca_res$sdev^2), 1)
xlab <- paste0("PC1 (", pca_var[1], "%)")
ylab <- paste0("PC2 (", pca_var[2], "%)")

# insert laodings into graph
loadings <- as_tibble(pca_res$rotation[, 1:2], rownames = "variable") %>%
  mutate(across(c(PC1, PC2), ~ .x * 5)) %>%
  mutate(variable = recode(variable,
                           KPH_sq       = "KPH",
                           Sn_sq        = "Sn",
                           AGL_sq       = "AGL",
                           abs_angle_sq = "|Angle|",
                           absVR_sq     = "|VR|",
                           VerticalRate = "VR"))

cluster_colors <- c("Perched" = "orange",
                    "Gliding" = "green",
                    "Ascending" = "blue",
                    "Flapping" = "purple")

p <- ggplot(pca_scores, aes(x = PC1, y = PC2, color = cluster)) +
  geom_point(alpha = 0.6) +
  geom_segment(data = loadings,
               aes(x = 0, y = 0, xend = PC1, yend = PC2),
               arrow = arrow(length = unit(0.2, "cm")),
               color = "black") +
  geom_text(data = loadings,
            aes(x = PC1, y = PC2, label = variable),
            color = "black", size = 4, vjust = -0.5) +
  scale_color_manual(values = cluster_colors) +
  labs(title = "",
       x = xlab, y = ylab, color = NULL) +  
  theme_classic(base_size = 16)

print(p)

Appendix

Boxplot of in-flight variables

library(tidyverse)
library(ggplot2)

#again, load in the data and select only needed variabels and sample
load("C:/Users/fx3734qp/Downloads/DSCI 415/Project2/project_2/eagle_data.Rdata")
eagle_data <- as_tibble(eagle_data)
numeric_only <- eagle_data %>%
  select(KPH, Sn, AGL, abs_angle, absVR, VerticalRate) %>%
  drop_na() %>%
  filter_all(all_vars(is.finite(.)))
set.seed(42)
numeric_only <- sample_n(numeric_only, 50000)

# kmeans cluster on 4 clusters in order to separate each behavior when plotted
km <- kmeans(numeric_only, centers = 4, iter.max = 20, nstart = 25)

# attach cluster labels
boxplot_data <- numeric_only %>%
  mutate(kmeans_clusters = factor(km$cluster))

# remove cluster 1 (as it is not in-flight) and relabel remaining clusters 
boxplot_data <- boxplot_data %>%
  filter(kmeans_clusters != 1) %>%
  mutate(kmeans_clusters = droplevels(kmeans_clusters)) %>%
  mutate(kmeans_clusters = fct_recode(kmeans_clusters,
                                      "Gliding"   = "2",
                                      "Ascending" = "3",
                                      "Flapping"  = "4"))
cluster_colors <- c("Gliding"   = "green",
                    "Ascending" = "orange",
                    "Flapping"  = "purple")

# boxplot with 2 rows and 3 columns
par(mfrow = c(2,3)) 

# each individual boxplot with specified variable and its frequency
boxplot(KPH ~ kmeans_clusters, data = boxplot_data,
        main = "KPH", xlab = "", ylab = "KPH",
        col = cluster_colors, cex.axis = 0.8)
boxplot(Sn ~ kmeans_clusters, data = boxplot_data,
        main = "Sn", xlab = "", ylab = "Sn",
        col = cluster_colors, cex.axis = 0.8)
boxplot(AGL ~ kmeans_clusters, data = boxplot_data,
        main = "AGL", xlab = "", ylab = "AGL",
        col = cluster_colors, cex.axis = 0.8)
boxplot(abs_angle ~ kmeans_clusters, data = boxplot_data,
        main = "|Angle|", xlab = "", ylab = "|Angle|",
        col = cluster_colors, cex.axis = 0.8)
boxplot(absVR ~ kmeans_clusters, data = boxplot_data,
        main = "|VR|", xlab = "", ylab = "|VR|",
        col = cluster_colors, cex.axis = 0.8)
boxplot(VerticalRate ~ kmeans_clusters, data = boxplot_data,
        main = "VerticalRate", xlab = "", ylab = "VerticalRate",
        col = cluster_colors, cex.axis = 0.8)

# reset layout
par(mfrow = c(1,1)) 

Appendix

Visualizing Flight Behaviors

library(dplyr)
library(ggplot2)
library(patchwork)
 # keeps only variables I want to analyze
vars <- c("KPH", "Sn", "AGL", "VerticalRate")

df <- eagle_data %>%
drop_na(all_of(vars))
 
# scale and cluster by k means
X <- scale(df %>% select(all_of(vars)))
 
set.seed(123)
km <- kmeans(X, centers = 4)

df$cluster <- factor(km$cluster)

perching_cluster <- df %>%
group_by(cluster) %>%
summarise(mean_speed = mean(KPH, na.rm = TRUE)) %>%
arrange(mean_speed) %>%
slice(1) %>% pull(cluster)
# filters to one bird
bird_id <- 107
 
flight_segment <- df %>%
  # Filter the data to include only the tracking points
  filter(Animal_ID == bird_id, cluster != perching_cluster) %>%
  # Group the remaining data by the distinct flight segments
  group_by(segment_id) %>%
  # Calculate the range of speed for each segment by subtracting the minimum KPH from the maximum KPH
  summarise(kph_range = max(KPH) - min(KPH), .groups="drop") %>%
  # Sort the resulting segments in descending order based
  arrange(desc(kph_range)) %>%
  # Select only the first row, which represents the segment with the highest speed range
  slice(1) %>%
  pull(segment_id)
 
flight_segment   
 
bird_df <- df %>%
filter(Animal_ID == bird_id) %>%
filter(segment_id == flight_segment) %>% 
#  Filter out the perching points again, leaving only the three flight clusters
filter(cluster != perching_cluster) %>% 
arrange(LocalTime) %>%
# Create a new column 'seconds' which represents the cumulative time elapsed from the start of the selected flight segment
mutate(seconds = cumsum(TimeDiff))
 
remaining_clusters <- sort(unique(bird_df$cluster))
 
custom_colors <- setNames(
c("orange", "purple", "green"),
remaining_clusters)
# Plot KPH
p_kph <- ggplot(bird_df, aes(x = seconds, y = KPH, color = cluster)) +
geom_point(size = 1.2) +
scale_color_manual(values = custom_colors) +
theme_minimal(base_size = 14) +
labs(title = paste(""),
x = "Seconds",
y = "Speed" )
# Plot AGL
p_agl <- ggplot(bird_df, aes(x = seconds, y = AGL, color = cluster)) +
geom_point(size = 1.2) +
scale_color_manual(values = custom_colors) +
theme_minimal(base_size = 14) +
labs(title = paste(""),
x = "Seconds",
y = "Height Above the Ground (m)")
 
final_plot <- p_kph / p_agl
final_plot