| Behavior | N | Speed (KPH) | Altitude (m) | Vertical Rate | Turn Angle |
|---|---|---|---|---|---|
| 1 | 157786 | 44.40 | 315.28 | -1.31 | 0.24 |
| 2 | 92108 | 76.24 | 525.47 | -1.24 | 0.14 |
| 3 | 188232 | 56.38 | 157.40 | -0.22 | 0.22 |
| 4 | 58936 | 54.18 | 429.82 | 0.50 | 1.25 |
| 5 | 76724 | 61.73 | 394.90 | -3.46 | 0.20 |
| Behavior | N | Speed (KPH) | Altitude (m) | Vertical Rate | Turn Angle |
|---|---|---|---|---|---|
| 1 | 157786 | 44.40 | 315.28 | -1.31 | 0.24 |
| 2 | 92108 | 76.24 | 525.47 | -1.24 | 0.14 |
| 3 | 188232 | 56.38 | 157.40 | -0.22 | 0.22 |
| 4 | 58936 | 54.18 | 429.82 | 0.50 | 1.25 |
| 5 | 76724 | 61.73 | 394.90 | -3.46 | 0.20 |
Bergen, S., Huso, M.M., Duerr, A.E., Braham, M.A., Katzner, T.E., Schmuecker, S., & Miller, T.A. (2022). Classifying behavior from short-interval biologging data: An example with GPS tracking of birds. Ecology and Evolution, 12, e8395.
# Load all required libraries
library(tidyverse) # Data manipulation and visualization
library(cluster) # Clustering algorithms and silhouette analysis
library(factoextra) # PCA and cluster visualization
library(gridExtra) # Arrange multiple plots
options(width = 10000)
load('eagle_data.Rdata')
# Convert to tibble for easier manipulation
eagle_data <- eagle_data %>% as_tibble()
# Inspect the data structure
head(eagle_data)
str(eagle_data)
summary(eagle_data)# Check for missing values in key movement variables
# This helps us understand data quality before filtering
eagle_data %>%
select(KPH, Sn, AGL, abs_angle, VerticalRate, absVR) %>%
summarise(across(everything(), ~sum(is.na(.))))
# Remove observations with missing values in any of the key movement variables
# Complete cases are necessary for clustering algorithms
eagle_clean <- eagle_data %>%
filter(!is.na(KPH), !is.na(Sn), !is.na(AGL),
!is.na(abs_angle), !is.na(VerticalRate), !is.na(absVR))
# Filter to segments with at least 10 consecutive time points
eagle_clean <- eagle_clean %>%
filter(segment_length >= 10)
#Square-root transform to account for skewness
eagle_clean <- eagle_clean %>%
mutate(
KPH_trans = sqrt(KPH),
Sn_trans = sqrt(Sn),
AGL_trans = sqrt(AGL),
abs_angle_trans = sqrt(abs_angle),
absVR_trans = sqrt(absVR),
VerticalRate_trans = VerticalRate
)
# Verify cleaning results
cat("Original observations:", nrow(eagle_data), "\n")
cat("Clean observations:", nrow(eagle_clean), "\n")
cat("Unique eagles:", n_distinct(eagle_clean$Animal_ID), "\n")
cat("Unique segments:", n_distinct(eagle_clean$segment_id), "\n")# Summary statistics of key movement variables
# Provides overview of central tendency and spread
eagle_clean %>%
select(KPH, Sn, AGL, abs_angle, VerticalRate, absVR) %>%
summary()
# Calculate standard deviations
eagle_clean %>%
select(KPH, Sn, AGL, abs_angle, VerticalRate, absVR) %>%
summarise(across(everything(), ~sd(., na.rm = TRUE)))# Correlation matrix of movement variables
cor_matrix <- eagle_clean %>%
select(KPH, Sn, AGL, abs_angle, VerticalRate, absVR) %>%
cor(use = "complete.obs")
print(round(cor_matrix, 2))
# Note the high correlation between KPH and Sn
# Both measure speed, so correlation is expected
cat("\nCorrelation between KPH and Sn:", round(cor_matrix["KPH", "Sn"], 3), "\n")# Visualize distributions of each variable
# Instantaneous speed (KPH)
p1 <- ggplot(eagle_clean, aes(x = KPH)) +
geom_histogram(bins = 50, fill = "steelblue", alpha = 0.7) +
labs(title = "Distribution of KPH", x = "KPH", y = "Count") +
theme_minimal()
# Speed between points (Sn)
p2 <- ggplot(eagle_clean, aes(x = Sn)) +
geom_histogram(bins = 50, fill = "steelblue", alpha = 0.7) +
labs(title = "Distribution of Sn", x = "Sn (m/s)", y = "Count") +
theme_minimal()
# Altitude above ground level
p3 <- ggplot(eagle_clean, aes(x = AGL)) +
geom_histogram(bins = 50, fill = "steelblue", alpha = 0.7) +
labs(title = "Distribution of AGL", x = "AGL (m)", y = "Count") +
theme_minimal()
# Absolute turn angle
p4 <- ggplot(eagle_clean, aes(x = abs_angle)) +
geom_histogram(bins = 50, fill = "steelblue", alpha = 0.7) +
labs(title = "Distribution of Absolute Angle", x = "Absolute Angle", y = "Count") +
theme_minimal()
# Vertical rate (rate of altitude change)
p5 <- ggplot(eagle_clean, aes(x = VerticalRate)) +
geom_histogram(bins = 50, fill = "steelblue", alpha = 0.7) +
labs(title = "Distribution of Vertical Rate", x = "Vertical Rate", y = "Count") +
theme_minimal()
# Absolute vertical rate
p6 <- ggplot(eagle_clean, aes(x = absVR)) +
geom_histogram(bins = 50, fill = "steelblue", alpha = 0.7) +
labs(title = "Distribution of Absolute VR", x = "Absolute VR", y = "Count") +
theme_minimal()
# Display all histograms in grid layout
grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 3)# Create pairs plot to visualize relationships between variables
# Sample data for easier computations
pairs_data <- eagle_clean %>%
select(KPH, Sn, AGL, abs_angle, VerticalRate, absVR) %>%
slice_sample(n = 5000)
# Create pairwise scatterplots
pairs(pairs_data, pch = 16, col = rgb(0, 0, 1, 0.1),
main = "Pairwise Scatterplots of Movement Variables")# Select movement features for classification
movement_vars <- eagle_clean %>%
select(KPH, Sn, AGL, abs_angle, VerticalRate, absVR)
# scale the variables
# Centers each variable at mean=0 and scales to sd=1
movement_scaled <- scale(movement_vars)
# Verify standardization
colMeans(movement_scaled) # Should be approximately 0
apply(movement_scaled, 2, sd) # Should be approximately 1# Determine optimal number of clusters using elbow method
# Test k from 1 to 10 clusters
set.seed(123) # Set seed for reproducibility
wss <- numeric(10)
for (k in 1:10) {
# Fit k-means with k clusters
# nstart=10 tries multiple random starting points
kmeans_model <- kmeans(movement_scaled, centers = k, nstart = 10)
# Store total within-cluster sum of squares
wss[k] <- kmeans_model$tot.withinss
}
# Plot elbow curve
plot(1:10, wss, type = "b", pch = 19,
xlab = "Number of Clusters",
ylab = "Total Within-Cluster Sum of Squares",
main = "Elbow Method for Optimal k")# Compute silhouette scores for different k values
# Silhouette measures how similar points are to their own cluster
# vs. other clusters (ranges from -1 to 1, higher is better)
# Use sample for speed - silhouette is computationally expensive
set.seed(123)
sample_size <- 2000
sample_idx <- sample(1:nrow(movement_scaled), sample_size)
movement_scaled_sample <- movement_scaled[sample_idx, ]
# Calculate silhouette scores for k=2 to k=10
sil_scores <- numeric(9)
for (k in 2:10) {
kmeans_model <- kmeans(movement_scaled_sample, centers = k, nstart = 10)
# Calculate silhouette for each point
sil <- silhouette(kmeans_model$cluster, dist(movement_scaled_sample))
# Store average silhouette width
sil_scores[k-1] <- mean(sil[, 3])
}
# Plot silhouette scores
plot(2:10, sil_scores, type = "b", pch = 19,
xlab = "Number of Clusters",
ylab = "Average Silhouette Width",
main = "Silhouette Analysis for Optimal k")
# Report optimal k based on silhouette
# Higher average silhouette width indicates better clustering
cat("Silhouette scores suggest k =", which.max(sil_scores) + 1, "is optimal\n")# Fit final K-means model with k=2 (flight vs. perching)
# Based on both elbow and silhouette analysis
set.seed(123)
kmeans_flight_perch <- kmeans(movement_scaled, centers = 2, nstart = 25)
# Add cluster assignments to data
eagle_clean <- eagle_clean %>%
mutate(flight_status = kmeans_flight_perch$cluster)
# Examine cluster characteristics to identify which is flight vs. perching
#calculate mean values for each cluster
cluster_summary <- eagle_clean %>%
group_by(flight_status) %>%
summarise(
n = n(),
mean_KPH = mean(KPH),
mean_Sn = mean(Sn),
mean_AGL = mean(AGL),
mean_abs_angle = mean(abs_angle),
mean_VR = mean(VerticalRate),
mean_absVR = mean(absVR)
)
print(cluster_summary)# Recode clusters based on speed characteristics
# Higher speed cluster = flight, lower speed = perching
if (cluster_summary$mean_KPH[1] > cluster_summary$mean_KPH[2]) {
eagle_clean <- eagle_clean %>%
mutate(flight_status = ifelse(flight_status == 1, "flight", "perching"))
} else {
eagle_clean <- eagle_clean %>%
mutate(flight_status = ifelse(flight_status == 2, "flight", "perching"))
}
# verify classification
classification_table <- table(eagle_clean$flight_status)
print(classification_table)
# Report percentages
cat("\nPercentage in flight:",
round(100 * classification_table["flight"] / sum(classification_table), 1), "%\n")
cat("Percentage perching:",
round(100 * classification_table["perching"] / sum(classification_table), 1), "%\n")# Create comparison boxplots for flight vs. perching
# Visualizes differences in movement variables between states
# Speed (KPH) comparison
p1 <- ggplot(eagle_clean, aes(x = flight_status, y = KPH, fill = flight_status)) +
geom_boxplot() +
scale_fill_manual(values = c("flight" = "skyblue", "perching" = "coral")) +
labs(title = "KPH by Status", x = "", y = "KPH") +
theme_minimal() +
theme(legend.position = "none")
# Horizontal speed (Sn) comparison
p2 <- ggplot(eagle_clean, aes(x = flight_status, y = Sn, fill = flight_status)) +
geom_boxplot() +
scale_fill_manual(values = c("flight" = "skyblue", "perching" = "coral")) +
labs(title = "Sn by Status", x = "", y = "Sn (m/s)") +
theme_minimal() +
theme(legend.position = "none")
# Altitude (AGL) comparison
p3 <- ggplot(eagle_clean, aes(x = flight_status, y = AGL, fill = flight_status)) +
geom_boxplot() +
scale_fill_manual(values = c("flight" = "skyblue", "perching" = "coral")) +
labs(title = "AGL by Status", x = "", y = "AGL (m)") +
theme_minimal() +
theme(legend.position = "none")
# Turn angle comparison
p4 <- ggplot(eagle_clean, aes(x = flight_status, y = abs_angle, fill = flight_status)) +
geom_boxplot() +
scale_fill_manual(values = c("flight" = "skyblue", "perching" = "coral")) +
labs(title = "Absolute Angle by Status", x = "", y = "Absolute Angle") +
theme_minimal() +
theme(legend.position = "none")
# Display all boxplots in 2x2 grid
grid.arrange(p1, p2, p3, p4, ncol = 2)# Prepare data for behavior clustering
flight_vars <- eagle_flight %>%
select(KPH, Sn, AGL, abs_angle, VerticalRate, absVR)
# Standardize the variables
# Important: standardize within flight data, not using original scale
flight_scaled <- scale(flight_vars)
# Verify standardization
colMeans(flight_scaled)
apply(flight_scaled, 2, sd)# Elbow method for flight behaviors
set.seed(456) # Different seed for flight analysis
wss_flight <- numeric(10)
for (k in 1:10) {
kmeans_model <- kmeans(flight_scaled, centers = k, nstart = 10)
wss_flight[k] <- kmeans_model$tot.withinss
}
# Plot elbow curve
plot(1:10, wss_flight, type = "b", pch = 19,
xlab = "Number of Clusters",
ylab = "Total Within-Cluster Sum of Squares",
main = "Elbow Method for Flight Behaviors")
# Silhouette analysis on sample
set.seed(456)
sample_size_flight <- min(2000, nrow(flight_scaled))
sample_idx_flight <- sample(1:nrow(flight_scaled), sample_size_flight)
flight_scaled_sample <- flight_scaled[sample_idx_flight, ]
sil_scores_flight <- numeric(9)
for (k in 2:10) {
kmeans_model <- kmeans(flight_scaled_sample, centers = k, nstart = 10)
sil <- silhouette(kmeans_model$cluster, dist(flight_scaled_sample))
sil_scores_flight[k-1] <- mean(sil[, 3])
}
# Plot silhouette scores
plot(2:10, sil_scores_flight, type = "b", pch = 19,
xlab = "Number of Clusters",
ylab = "Average Silhouette Width",
main = "Silhouette Analysis for Flight Behaviors")
# Report optimal k
optimal_k <- which.max(sil_scores_flight) + 1
cat("Silhouette scores suggest k =", optimal_k, "flight behaviors\n")# Fit K-means model with optimal k
# Using nstart=25 for more robust results
set.seed(456)
kmeans_flight_behaviors <- kmeans(flight_scaled, centers = optimal_k, nstart = 25)
# Add cluster assignments to flight data
eagle_flight <- eagle_flight %>%
mutate(behavior_type = kmeans_flight_behaviors$cluster)
# Display cluster sizes
cat("Cluster sizes:\n")
print(table(eagle_flight$behavior_type))
# Check for very small or very large clusters
# Imbalanced clusters may need further investigation
prop.table(table(eagle_flight$behavior_type))# Calculate comprehensive summary statistics by behavior type
behavior_summary <- eagle_flight %>%
group_by(behavior_type) %>%
summarise(
n = n(),
mean_KPH = mean(KPH),
sd_KPH = sd(KPH),
mean_Sn = mean(Sn),
sd_Sn = sd(Sn),
mean_AGL = mean(AGL),
sd_AGL = sd(AGL),
mean_abs_angle = mean(abs_angle),
sd_abs_angle = sd(abs_angle),
mean_VR = mean(VerticalRate),
sd_VR = sd(VerticalRate),
mean_absVR = mean(absVR),
sd_absVR = sd(absVR)
) %>%
arrange(behavior_type)
print(behavior_summary)
# Export
write.csv(behavior_summary, "behavior_summary.csv", row.names = FALSE)# Create detailed boxplots comparing all behaviors across key variables
# Speed comparison
p1 <- ggplot(eagle_flight, aes(x = as.factor(behavior_type),
y = KPH,
fill = as.factor(behavior_type))) +
geom_boxplot() +
labs(title = "Speed (KPH) by Behavior Type",
x = "Behavior Type",
y = "KPH") +
theme_minimal() +
theme(legend.position = "none")
# Altitude comparison
p2 <- ggplot(eagle_flight, aes(x = as.factor(behavior_type),
y = AGL,
fill = as.factor(behavior_type))) +
geom_boxplot() +
labs(title = "Altitude (AGL) by Behavior Type",
x = "Behavior Type",
y = "AGL (m)") +
theme_minimal() +
theme(legend.position = "none")
# Turn angle comparison
p3 <- ggplot(eagle_flight, aes(x = as.factor(behavior_type),
y = abs_angle,
fill = as.factor(behavior_type))) +
geom_boxplot() +
labs(title = "Turn Angle by Behavior Type",
x = "Behavior Type",
y = "Absolute Angle") +
theme_minimal() +
theme(legend.position = "none")
# Vertical rate comparison
p4 <- ggplot(eagle_flight, aes(x = as.factor(behavior_type),
y = VerticalRate,
fill = as.factor(behavior_type))) +
geom_boxplot() +
labs(title = "Vertical Rate by Behavior Type",
x = "Behavior Type",
y = "Vertical Rate") +
theme_minimal() +
theme(legend.position = "none")
# Display all plots
grid.arrange(p1, p2, p3, p4, ncol = 2)# Perform PCA on standardized flight data
# PCA reduces dimensionality while preserving variance
pca_flight <- prcomp(flight_scaled, scale. = FALSE)
# Examine variance explained by each component
summary(pca_flight)
# Create data frame for plotting
pca_df <- data.frame(
PC1 = pca_flight$x[, 1],
PC2 = pca_flight$x[, 2],
behavior = as.factor(eagle_flight$behavior_type)
)
# Plot first two principal components
# Colors represent different behavior types
ggplot(pca_df, aes(x = PC1, y = PC2, color = behavior)) +
geom_point(alpha = 0.4, size = 1) +
stat_ellipse(level = 0.68, size = 1) +
labs(title = "Flight Behavior Clusters (PCA Visualization)",
x = paste0("PC1 (", round(100 * summary(pca_flight)$importance[2, 1], 1), "%)"),
y = paste0("PC2 (", round(100 * summary(pca_flight)$importance[2, 2], 1), "%)"),
color = "Behavior") +
theme_minimal()
# Examine variable loadings
# Shows which variables contribute most to each PC
print("PC1 loadings:")
print(sort(pca_flight$rotation[, 1], decreasing = TRUE))
print("\nPC2 loadings:")
print(sort(pca_flight$rotation[, 2], decreasing = TRUE))# Select example segments from each behavior type
# Choose segments with sufficient length for visualization
set.seed(789)
example_segments <- eagle_flight %>%
group_by(behavior_type) %>%
filter(n() >= 20) %>% # Only behaviors with enough points
slice_sample(n = 1) %>% # One segment per behavior
pull(segment_id)
# Extract data for example segments
example_data <- eagle_flight %>%
filter(segment_id %in% example_segments[1:min(4, length(example_segments))])
# Create trajectory plots
if(nrow(example_data) > 0) {
# Spatial trajectories
p1 <- ggplot(example_data, aes(x = Longitude, y = Latitude,
color = as.factor(behavior_type))) +
geom_path(linewidth = 1) +
geom_point(alpha = 0.5, size = 2) +
facet_wrap(~behavior_type, scales = "free") +
labs(title = "Example Flight Trajectories by Behavior Type",
x = "Longitude", y = "Latitude",
color = "Behavior") +
theme_minimal() +
theme(legend.position = "bottom")
print(p1)
# Altitude profiles
p2 <- ggplot(example_data, aes(x = seq_along(Animal_ID), y = AGL,
color = as.factor(behavior_type))) +
geom_line(linewidth = 1) +
facet_wrap(~behavior_type, scales = "free") +
labs(title = "Altitude Profiles by Behavior Type",
x = "Time Point", y = "AGL (m)",
color = "Behavior") +
theme_minimal() +
theme(legend.position = "bottom")
print(p2)
}# Calculate within-cluster sum of squares for final model
cat("Total within-cluster SS:", kmeans_flight_behaviors$tot.withinss, "\n")
cat("Between-cluster SS:", kmeans_flight_behaviors$betweenss, "\n")
cat("Total SS:", kmeans_flight_behaviors$totss, "\n")
# Calculate proportion of variance explained
var_explained <- kmeans_flight_behaviors$betweenss / kmeans_flight_behaviors$totss
cat("\nProportion of variance explained:", round(var_explained, 3), "\n")
# Silhouette analysis for final model
sil_final <- silhouette(kmeans_flight_behaviors$cluster,
dist(flight_scaled[sample(1:nrow(flight_scaled),
min(2000, nrow(flight_scaled))), ]))
cat("Average silhouette width:", round(mean(sil_final[, 3]), 3), "\n")
# Plot silhouette
plot(sil_final, main = "Silhouette Plot for Final Clustering")