Classifying Eagle Flight Behaviors Using K-Means Clustering

Kate Martin

Background

  • Using eagle data: Bergen et al., 2021

    • Behavior classification helps ecologists understand movement + habitat use
    • Six movement variables: KPH, Sn, AGL, abs_angle, VerticalRate, absVR
    • Goal: Use PCA + clustering to classify flight modes

Research Questions

1. Can movement variables distinguish in-flight vs. perching?


2. Among in-flight points:


a. Are there distinct flight behaviors?


b. What characterizes each behavior?


c. What do flight segments look like for each behavior type?

Data and Processing

Data: GPS + accelerometer-derived movement metrics from an adult eagle

Variables: - KPH – instantaneous speed
- Sn – horizontal speed
- AGL – altitude
- abs_angle – direction change
- VerticalRate – vertical movement
- absVR – absolute vertical rate

Randomly sampled 20,000 points for PCA & clustering.

Distribution of Key Movement Variables

What These Distributions Show

  • Each variable differs dramatically in scale and shape.

  • Reinforcing the need for scaling before PCA and clustering.

Checking Samples Before Clustering

  • Used 20,000-point random samples for PCA and clustering.

    • Generated multiple independent samples to avoid flukes.

    • PCA structure was stable across samples (same variance patterns, same PC1–PC2 shape).

WSS (Elbow) Method

Elbow Plot
  • Used elbow method to compare WSS for K = 2–10.
  • The curve flattens at K = 4.
  • After K = 4, improvements were minimal.
  • Captures major behavior patterns
    • Without overfitting

Cluster Comparisons (K = 2 to 7)

  • Compared cluster structures for K = 2–7.
  • Higher K adds detail but gives diminishing returns.
  • K = 4 provides the clearest and most interpretable separation.

PCA Biplot Interpretation (K = 4)

  • Ascending (blue cluster)
    • strong movement upwards.
  • Gliding (teal cluster):
    • stable movement, low turning.
  • Flapping (lavender cluster):
    • sits between gliding and ascending.
  • Perching (orange cluster):
    • low altitude change, low speed.

Summary & Flight Segment Examples

Appendix (1)

# loading data
library(tidyverse)
library(cluster)
library(factoextra)
library(ggplot2)
library(ggpubr)
library(gridExtra)
library(RColorBrewer)

load(params$datafile)

# Clean working dataset 
df <- eagle_data %>%
  select(
    KPH,
    Sn,
    AGL,
    abs_angle,
    VerticalRate,
    absVR
  )

# pastel palette
pastel <- c("#AEC6CF", "#FFB347", "#C3B1E1", "#B5EAD7", "#FFDAC1", "#E2F0CB", "#FF9AA2")

Appendix (2)

# distribution plot
df <- eagle_data %>%
  select(
    KPH,
    Sn,
    AGL,
    abs_angle,
    VerticalRate,
    absVR
  )

plot_hist <- function(var){
  ggplot(df, aes(x = .data[[var]])) +
    geom_histogram(fill = pastel[7], color = "white", bins = 60) +
    theme_minimal(base_size = 14) +
    ggtitle(var) +
    theme(
      axis.title.y = element_blank(),
      axis.text.y  = element_blank(),
      axis.ticks.y = element_blank()
    )
}

grid.arrange(
  plot_hist("KPH"),
  plot_hist("Sn"),
  plot_hist("AGL"),
  plot_hist("abs_angle"),
  plot_hist("VerticalRate"),
  plot_hist("absVR"),
  ncol = 3
)

Appendix (3)

df <- eagle_data %>%
  select(
    KPH,
    Sn,
    AGL,
    abs_angle,
    VerticalRate,
    absVR
  ) %>%
  drop_na()
library(ggplot2)
library(ggfortify)

# checking samples plot

set.seed(111)

# 1. PCA ON THE FULL DATASET ONCE
pc_all <- prcomp(df, scale = TRUE)   # df has KPH, Sn, AGL, abs_angle, VerticalRate, absVR
scores_all <- as.data.frame(pc_all$x)  # PC scores for every point

# 2. FOUR RANDOM SUBSETS OF THE PCA SCORES
idx_list <- replicate(4, sample(nrow(scores_all), 8000), simplify = FALSE)

plot_pca_sample <- function(idx, title) {
  ggplot(scores_all[idx, ], aes(PC1, PC2)) +
    geom_point(alpha = 0.25, color = "#7BAFD4", size = 0.6) +
    coord_equal() +                              # keep aspect ratio 1:1
    theme_minimal(base_size = 14) +
    ggtitle(title) +
    # optional: make axes look like the slide
    scale_x_continuous(name = "PC1") +
    scale_y_continuous(name = "PC2")
}

grid.arrange(
  plot_pca_sample(idx_list[[1]], "Sample 1"),
  plot_pca_sample(idx_list[[2]], "Sample 2"),
  plot_pca_sample(idx_list[[3]], "Sample 3"),
  plot_pca_sample(idx_list[[4]], "Sample 4"),
  ncol = 2
)

Appendix (4)

df_clean <- eagle_data %>%
  select(KPH, Sn, AGL, abs_angle, VerticalRate, absVR)
df_cc <- df_clean
set.seed(123)
idx_cc <- sample(nrow(df_cc), 20000)
wss <- sapply(2:10, function(k) {
  kmeans(df_cc[idx_cc, ], centers = k, nstart = 10)$tot.withinss
})

elbow_df <- data.frame(
  k = 2:10,
  wss = wss
)

library(ggplot2)

ggplot(elbow_df, aes(x = k, y = wss)) +
  geom_line(size = 1.2, color = "#97C5E8") +
  geom_point(size = 3, color = "#97C5E8") +
  scale_x_continuous(breaks = 2:10) +
  scale_y_continuous(breaks = seq(20000, 70000, by = 10000)) +
  labs(
    x = "Number of Clusters (K)",
    y = "Total Within-Cluster Sum of Squares",
    title = "Elbow Method for Choosing Optimal K"
  ) +
  theme_minimal(base_size = 16)

Appendix (5)

## Slide: Cluster Comparisons Across K = 2–7

set.seed(123)
data_for_pca <- if (exists("df_clust")) df_clust else df
pc_all    <- prcomp(data_for_pca, scale. = TRUE)
scores_all <- as.data.frame(pc_all$x)   

idx_cc    <- sample(nrow(scores_all), 20000)
scores_cc <- scores_all[idx_cc, ]     

# 4. Pastel palette (yours)
pastel2 <- c(
  "#AEC6CF", "#FFB347", "#C3B1E1", "#B5EAD7",
  "#FFDAC1", "#E2F0CB", "#FF9AA2"
)

xlim_vals <- c(-2.5, 7.5)
ylim_vals <- c(-2.5, 4)

plot_k <- function(k) {
  # k-means on FIRST 6 PCs 
  km <- kmeans(scores_cc[, 1:6], centers = k, nstart = 20)

  ggplot(scores_cc, aes(PC1, PC2, colour = factor(km$cluster))) +
    geom_point(alpha = 0.35, size = 0.6) +
    scale_color_manual(values = pastel2) +
    coord_cartesian(xlim = xlim_vals, ylim = ylim_vals) +
    coord_equal() +
    theme_minimal(base_size = 13) +
    ggtitle(paste("K =", k)) +
    guides(colour = "none")
}

grid.arrange(
  plot_k(2), plot_k(3), plot_k(4),
  plot_k(5), plot_k(6), plot_k(7),
  ncol = 3
)

Appendix (6)

df_clean <- eagle_data %>%
  select(
    KPH,
    Sn,
    AGL,
    abs_angle,      
    VerticalRate,   
    absVR           
  )
df_cc <- df_clean   
set.seed(123)

# 1. PCA on the 20k subset (this already works in your cluster comparison code)
pc_cc <- prcomp(df_cc[idx_cc, ], scale = TRUE)

# 2. PCA scores for these same 20k rows
scores_cc <- as.data.frame(pc_cc$x[, 1:2])

# 3. K-means clustering on PCs 1–6
km4_cc <- kmeans(pc_cc$x[, 1:6], centers = 4, nstart = 20)

Appendix (7)

# PCA BIPLOT FOR K = 4 

library(ggplot2)
library(dplyr)

vars_use <- c("KPH", "Sn", "AGL", "abs_angle", "VerticalRate", "absVR")
L_use <- pc_cc$rotation[vars_use, 1:2]

arrow_scale <- 5
L_plot <- as.data.frame(L_use * arrow_scale)

L_plot$var <- c("abs_angle", "VerticalRate", "absVR", "AGL", "KPH", "Sn")
biplot_df <- data.frame(
  PC1 = pc_cc$x[,1],
  PC2 = pc_cc$x[,2],
  cluster = factor(km4_cc$cluster)
)

# 6. Final BIPLOT
ggplot(biplot_df, aes(PC1, PC2, color = cluster)) +
  geom_point(alpha = 0.25, size = 0.8) +
  scale_color_manual(values = pastel2, name = "Cluster") +

  # Arrows
  geom_segment(
    data = L_plot,
    aes(x = 0, y = 0, xend = PC1, yend = PC2),
    inherit.aes = FALSE,          # << FIXED
    arrow = arrow(length = unit(0.25, "cm")),
    linewidth = 1,
    color = "black"
  ) +

  # Arrow labels
  geom_text(
    data = L_plot,
    aes(x = PC1, y = PC2, label = var),
    inherit.aes = FALSE,          # << FIXED
    size = 5,
    hjust = -0.2, 
    vjust = -0.4
  ) +

  coord_cartesian(xlim = c(-2.5, 7.5), ylim = c(-2.5, 4)) +
  theme_minimal(base_size = 15) +
  ggtitle("Distinct Flight Behaviors (K = 4)")

Appendix (8)

library(dplyr)
library(ggplot2)
library(patchwork)
 
vars <- c("KPH", "Sn", "AGL", "VerticalRate")
 
df <- eagle_data %>%
  drop_na(all_of(vars))
 
X <- scale(df %>% select(all_of(vars)))
 
set.seed(123)
km <- kmeans(X, centers = 4)
 
df$cluster <- factor(km$cluster)
 
 
perching_cluster <- df %>%
  group_by(cluster) %>%
  summarise(mean_speed = mean(KPH, na.rm = TRUE)) %>%
  arrange(mean_speed) %>%
  slice(1) %>% pull(cluster)
 
bird_id <- 107
 
 
flight_segment <- df %>%
  filter(Animal_ID == bird_id, cluster != perching_cluster) %>%
  count(segment_id) %>%
  arrange(desc(n)) %>%
  slice(1) %>%
  pull(segment_id)
 
flight_segment  
[1] 4511
bird_df <- df %>%
  filter(Animal_ID == bird_id) %>%
  filter(segment_id == flight_segment) %>% 
  filter(cluster != perching_cluster) %>% 
  arrange(LocalTime) %>%
  mutate(seconds = cumsum(TimeDiff))
 
 
remaining_clusters <- sort(unique(bird_df$cluster))
 
custom_colors <- setNames(
  c("#AEC6CF", "#C3B1E1", "#B5EAD7"),
  remaining_clusters
)
 
# Plot KPH
p_kph <- ggplot(bird_df, aes(x = seconds, y = KPH, color = cluster)) +
  geom_line(alpha = 0.5) +
  geom_point(size = 1.2) +
  scale_color_manual(values = custom_colors) +
  theme_minimal(base_size = 14) +
  labs(
    title = paste("Speed (KPH), Bird", bird_id,
                  "Segment", flight_segment),
    x = "Seconds",
    y = "KPH"
  )
 
# Plot AGL
p_agl <- ggplot(bird_df, aes(x = seconds, y = AGL, color = cluster)) +
  geom_line(alpha = 0.5) +
  geom_point(size = 1.2) +
  scale_color_manual(values = custom_colors) +
  theme_minimal(base_size = 14) +
  labs(
    title = paste("Altitude (AGL), Bird", bird_id,
                  "Segment", flight_segment),
    x = "Seconds",
    y = "Altitude (m)"
  )
 
 
final_plot <- p_kph / p_agl
final_plot