#reading in the data
#| attr-output: "style='font-size:0.3em'"
library(tidyverse)
library(factoextra)
library(patchwork)
library(cluster)
library(patchwork)
options(width=10000)
load('eagle_data.Rdata')
(eagle_data
%>% as_tibble()
) %>% head()Exam_2
Appendix
#df for just clustering variables
flight_vars <- eagle_data %>%
select(KPH:absVR)#need a sqrt transform for all but vertical rate
hist(flight_vars$KPH)
hist(flight_vars$Sn)
hist(flight_vars$AGL)
hist(flight_vars$VerticalRate)
hist(flight_vars$abs_angle)
hist(flight_vars$absVR)#scaled clustering variables, sqrt variables to resolve skewing problem
scaled_vars <- flight_vars %>%
mutate_at(vars(AGL, Sn, absVR,abs_angle), sqrt) %>%
scale()#investigating k2-7 to determine appropriate number of clusters
set.seed(234)
k2 <- kmeans(scaled_vars, centers = 2, nstart = 10, iter.max = 30)
set.seed(234)
k3 <- kmeans(scaled_vars, centers = 3, nstart = 10, iter.max = 30)
set.seed(234)
k4 <- kmeans(scaled_vars, centers = 4, nstart = 10, iter.max = 30)
set.seed(234)
k5 <- kmeans(scaled_vars, centers = 5, nstart = 10, iter.max = 30)
set.seed(234)
k6 <- kmeans(scaled_vars, centers = 6, nstart = 10, iter.max = 30)
set.seed(234)
k7 <- kmeans(scaled_vars,centers=7,nstart=10,iter.max = 30)#plotting to help choose K for clustering (for k-means)
wss_df = data.frame(k = 2:6,
wss = c(k2$tot.withinss,
k3$tot.withinss,
k4$tot.withinss,
k5$tot.withinss,
k6$tot.withinss))
with(wss_df, plot(k,wss,type='b'))
#not very helpful, will check silhouette disatnces#Finding average silhouette width - 4 looks to be the best option
B <- 100
avg.sils <- matrix(NA,B,6)
set.seed(12)
for(b in 1:B){
ids <- sample(1:nrow(scaled_vars),1000,replace=TRUE)
distmat <- dist(scaled_vars[ids,])
clusters <- list(k2$cluster[ids],
k3$cluster[ids],
k4$cluster[ids],
k5$cluster[ids],
k6$cluster[ids]
)
for(i in 1:5) {
sil <- silhouette(clusters[[i]],distmat)
avg.sils[b,i] <- mean(sil[,3])
}
}
silhouette_plot <- plot(2:7, avg.sils[1,],type='b',ylim=c(0.2, 0.35),col='maroon', ylab='Average silhouette width',xlab = 'K')
for(i in 2:nrow(avg.sils)) lines(2:7, avg.sils[i,],type='b',col='maroon')#now testing out how each cluster number looks on a plot
#first, add the clusters to the df
eagle_data <- eagle_data %>%
mutate(k2 = k2$cluster,
k3 = k3$cluster,
k4 = k4$cluster,
k5 = k5$cluster,
k6 = k6$cluster,
k7 = k7$cluster)#second, need to do a PCA to be able to plot
pca <- prcomp(scaled_vars)
#add pc1 and pc2 to the df
eagle_data <- eagle_data %>%
mutate(pc1 = pca$x[,1],pc2 = pca$x[,2])#now for the plot
#K=4
plot_k4 <- (ggplot(data = eagle_data) +
geom_point(
aes(x = pc1, y = pc2, col = factor(k4))
,shape='.',alpha = 0.15) +
guides(color='none') +
xlim(c(-3.7,6)) + ylim(c(-3,6)) +
xlab('PC1') + ylab('PC2')+
ggtitle('K=4') +
scale_color_manual(
values = c("red","cyan","green","orange","blue","purple")
)
)
plot_k4#will generate all plots for clusters 2-6 to examine them side by side using patchwork
library(ggplot2)
library(patchwork)
plots <- list()
for (k in 2:7) {
k_var <- paste0("k", k)
p <- ggplot(eagle_data) +
geom_point(aes(x = pc1, y = pc2, col = factor(.data[[k_var]])), shape='.',alpha = 0.15) +
xlim(c(-3.5,6)) + ylim(c(-3,6)) +
guides(color='none') +
xlab('PC1') + ylab('PC2') +
ggtitle(paste0("K=", k)) +
scale_color_manual(values = c("red","cyan","green","orange","blue","purple","black"))+
theme(
plot.title = element_text(size = 8),
axis.title = element_text(size = 5),
axis.text = element_text(size = 4)
)
plots[[k - 1]] <- p # store in list (index 1→k2, 2→k3, etc.)
}
grid_k_clusters <- wrap_plots(plots, nrow = 2, ncol = 3)#ensure that 2 PCs are enough to accuratly represent this data:
fviz_eig(pca)
#3 does explain a decent bit more variability, but 2 still explains over 65%#first, will perform SVD to find the loadings
eagle_svd <- svd(scaled_vars)
loadings <- eagle_svd$v
rownames(loadings) <- colnames(flight_vars)
loadings[,1:2] #PC1 and PC2 loadings for each variable#will generate bar charts showing these:
load_df <- loadings[, 1:2] %>%
as.data.frame() %>%
mutate(Variable = rownames(.))
ggplot(load_df, aes(x = Variable, y = V1)) +
geom_col() +
theme_classic() +
xlab("Variable") +
ylab("Loading Value") +
ggtitle("PC1 Loadings")#PC2 loadings
ggplot(load_df, aes(x = Variable, y = V2)) +
geom_col() +
theme_classic() +
xlab("Variable") +
ylab("Loading Value") +
ggtitle("PC2 Loadings")# next, PC overall variability
summary(pca)
#pc1 and 2 explain 67% of the variability#now that we have the vectors, we can classify our clusters
eagle_data <- eagle_data %>%
mutate(K4 = recode(k4,
`1` = "Perching",
`2` = "Ascending",
`3` = "Soaring",
`4` = "Flapping/Low Flying"))# ready to add vector arrows to K = 4 plot:
library(ggrepel)
arrow_scale <- 4
k4_plot_with_vectors <- (ggplot(data = eagle_data) +
geom_point(
aes(x = pc1, y = pc2, color = K4)
,shape='.',alpha = 0.15) +
xlim(c(-3.7,6)) + ylim(c(-3,6)) +
xlab('PC1') + ylab('PC2')+
ggtitle('Clusters with PC component vectors') +
scale_color_manual(
values = c(
"Perching" = "red",
"Ascending" = "cyan",
"Soaring" = "green",
"Flapping/Low Flying" = "orange"
),name = "Cluster", guide = guide_legend(
override.aes = list(shape = 16, alpha = 1, size = 4)
)
) + geom_segment(
data = load_df,
aes(x = 0, y = 0,
xend = V1 * arrow_scale,
yend = V2 * arrow_scale),
arrow = arrow(length = unit(0.25, "cm")),
color = "black",
size = 0.7
) +
geom_text_repel(
data = load_df,
aes(x = V1 * arrow_scale,
y = V2 * arrow_scale,
label = Variable),
size = 4,
max.overlaps = Inf,
box.padding = 0.4,
point.padding = 0.3,
min.segment.length = 0,
segment.color = "grey"
)
)
k4_plot_with_vectors#next, want to make some visuals to help answer 2c:
#first, will generate some boxplots for each variable within each cluster
#will only do four, |vertical rate| and Sn are a bit redundant with some of the other variables here
b1 <- ggplot(eagle_data, aes(x = K4, y = VerticalRate,fill=K4)) +
geom_boxplot(outlier.shape = NA) + # hides extreme points
labs(
y = "Vertical Rate") +
ylim(c(-4,4))+
scale_color_manual(
values = c(
"Perching" = "red",
"Ascending" = "cyan",
"Soaring" = "green",
"Flapping/Low Flying" = "orange"
)) +
theme(
plot.title = element_blank(),
axis.title.x = element_blank(),
axis.text.x = element_blank()
)
b2 <- ggplot(eagle_data, aes(x = K4, y = abs_angle,fill=K4)) +
geom_boxplot(outlier.shape = NA) + # hides extreme points
labs(
y = "|Turn Angle| (Radians)") +
# ylim(c(-1,5))+
scale_color_manual(
values = c(
"Perching" = "red",
"Ascending" = "cyan",
"Soaring" = "green",
"Flapping/Low Flying" = "orange"
)) +
theme(
plot.title = element_blank(),
axis.title.x = element_blank(),
axis.text.x = element_blank()
)
b3 <- ggplot(eagle_data, aes(x = K4, y = KPH,fill=K4)) +
geom_boxplot(outlier.shape = NA) + # hides extreme points
labs(
y = "Kilometers/Hour") +
ylim(c(-1,101))+
scale_color_manual(
values = c(
"Perching" = "red",
"Ascending" = "cyan",
"Soaring" = "green",
"Flapping/Low Flying" = "orange"
)) +
theme(
plot.title = element_blank(),
axis.title.x = element_blank(),
axis.text.x = element_blank()
)
b4 <- ggplot(eagle_data, aes(x = K4, y = AGL,fill=K4)) +
geom_boxplot(outlier.shape = NA) + # hides extreme points
labs(
y = "Above Ground Level") +
ylim(c(-1,1250))+
scale_color_manual(
values = c(
"Perching" = "red",
"Ascending" = "cyan",
"Soaring" = "green",
"Flapping/Low Flying" = "orange"
)) +
theme(
plot.title = element_blank(),
axis.title.x = element_blank(),
axis.text.x = element_blank()
)
combined_plot <- (b1 + b2) / (b3 + b4) +
plot_layout(guides = "collect") &
theme(legend.position = "bottom")
# Add a global title
boxplots <- (combined_plot + plot_annotation(title = "Summary of Vertical Rate, Angle, Speed, and Altitude by Cluster"))
boxplots#last, want to investigate a flight for an eagle to get a sense of how well the clusters actually work
#to do that, need flight segments to choose from
segment_lengths <- eagle_data %>%
group_by(segment_id) %>%
summarize(len = n()) %>%
filter(len <= 1000) %>%
arrange(-len)
head(segment_lengths,100)
#use flight segment 3644, length of 2104
#flight segment 82513, len of 823segment_to_plot <- eagle_data %>%
filter(segment_id==7471)
flight_2 <- ggplot(data = segment_to_plot,aes(x = X, y = Y)) +
geom_point(aes(col = K4),size=.6) +
scale_color_manual( values = c(
"Perching" = "red",
"Ascending" = "cyan",
"Soaring" = "green",
"Flapping/Low Flying" = "orange"
),name = "Cluster", guide = guide_legend(
override.aes = list(shape = 16, alpha = 1, size = 4))
)+
labs(title = "Flight 2") +
theme(
plot.title = element_blank(),
axis.title.x = element_blank(),
# axis.text.x = element_blank()
) + theme_minimal()segment_to_plot_2 <- eagle_data %>%
filter(segment_id==123335)
flight_1 <- ggplot(data = segment_to_plot_2,aes(x = X, y = Y)) +
geom_point(aes(col = K4),size=.8) +
scale_color_manual( values = c(
"Perching" = "red",
"Ascending" = "cyan",
"Soaring" = "green",
"Flapping/Low Flying" = "orange"
),name = "Cluster", guide = guide_legend(
override.aes = list(shape = 16, alpha = 1, size = 4))
)+
labs(title = "Flight 1") +
theme(
axis.title.x = element_blank(),
axis.text.x = element_blank()
) +
theme_minimal()
combined_flight_plot <- ((flight_1) + (flight_2)) +
plot_layout(guides = "collect") &
theme(legend.position = "bottom")
combined_flight_plot#a few more visuals to highlight differences between different types of flights...
segment_to_plot <- segment_to_plot %>%
mutate(
Minutes = (LocalTime$hour * 60 +
LocalTime$min +
LocalTime$sec / 60
)-840)
segment_to_plot_2 <- segment_to_plot_2 %>%
mutate(
Minutes = (LocalTime$hour * 60 +
LocalTime$min +
LocalTime$sec / 60
)-840)
#add minutes to the data frame
#now generate plots, want to examine KPH and AGL
AGL_1 <- ggplot(data = segment_to_plot,aes(x = Minutes, y = AGL)) +
geom_point(aes(col = K4),size=.8) +
scale_color_manual( values = c(
"Perching" = "red",
"Ascending" = "cyan",
"Soaring" = "green",
"Flapping/Low Flying" = "orange"
),name = "Cluster", guide = guide_legend(
override.aes = list(shape = 16, alpha = 1, size = 4))
)+
labs(title = "Flight 1") +
ylim(c(-1,1300))+
theme(
axis.title.x = element_blank(),
axis.text.x = element_blank()
) +
theme_minimal()
AGL_2 <- ggplot(data = segment_to_plot_2,aes(x = Minutes, y = AGL)) +
geom_point(aes(col = K4),size=.8) +
scale_color_manual( values = c(
"Perching" = "red",
"Ascending" = "cyan",
"Soaring" = "green",
"Flapping/Low Flying" = "orange"
),name = "Cluster", guide = guide_legend(
override.aes = list(shape = 16, alpha = 1, size = 4))
)+
labs(title = "Flight 2") +
ylim(c(-1,1300))+
theme(
axis.title.x = element_blank(),
axis.text.x = element_blank()
) +
theme_minimal()
combined_AGL_plot <- ((AGL_1) + (AGL_2)) +
plot_layout(guides = "collect") &
theme(legend.position = "bottom")
combined_AGL_plot