Exam_2

Appendix

#reading in the data
#| attr-output: "style='font-size:0.3em'"
library(tidyverse)
library(factoextra)
library(patchwork)
library(cluster)
library(patchwork)
options(width=10000)
load('eagle_data.Rdata')
(eagle_data
  %>% as_tibble() 
  ) %>% head()
#df for just clustering variables
flight_vars <- eagle_data %>% 
  select(KPH:absVR)
#need a sqrt transform for all but vertical rate
hist(flight_vars$KPH)  
hist(flight_vars$Sn)
hist(flight_vars$AGL)
hist(flight_vars$VerticalRate)
hist(flight_vars$abs_angle)
hist(flight_vars$absVR)
#scaled clustering variables, sqrt variables to resolve skewing problem
scaled_vars <- flight_vars %>% 
  mutate_at(vars(AGL, Sn, absVR,abs_angle), sqrt) %>% 
  scale()
#investigating k2-7 to determine appropriate number of clusters
set.seed(234) 
k2 <- kmeans(scaled_vars, centers = 2, nstart = 10, iter.max = 30)
set.seed(234) 
k3 <- kmeans(scaled_vars, centers = 3, nstart = 10, iter.max = 30)
set.seed(234) 
k4 <- kmeans(scaled_vars, centers = 4, nstart = 10, iter.max = 30)
set.seed(234) 
k5 <- kmeans(scaled_vars, centers = 5, nstart = 10, iter.max = 30)
set.seed(234) 
k6 <- kmeans(scaled_vars, centers = 6, nstart = 10, iter.max = 30)
set.seed(234) 
k7 <- kmeans(scaled_vars,centers=7,nstart=10,iter.max = 30)
#plotting to help choose K for clustering (for k-means)

wss_df = data.frame(k = 2:6, 
                    wss = c(k2$tot.withinss,
                            k3$tot.withinss,
                            k4$tot.withinss,
                            k5$tot.withinss,
                            k6$tot.withinss))
 with(wss_df, plot(k,wss,type='b'))


#not very helpful, will check silhouette disatnces
#Finding average silhouette width -  4 looks to be the best option

B <- 100
avg.sils <- matrix(NA,B,6)
set.seed(12)
for(b in 1:B){
  ids <- sample(1:nrow(scaled_vars),1000,replace=TRUE) 
  distmat <- dist(scaled_vars[ids,])
  clusters <- list(k2$cluster[ids],
                   k3$cluster[ids],
                   k4$cluster[ids],
                   k5$cluster[ids],
                   k6$cluster[ids]
                   )
  for(i in 1:5) {
    sil <- silhouette(clusters[[i]],distmat) 
    avg.sils[b,i] <- mean(sil[,3])
  }
}


silhouette_plot <- plot(2:7, avg.sils[1,],type='b',ylim=c(0.2, 0.35),col='maroon', ylab='Average silhouette width',xlab = 'K')
for(i in 2:nrow(avg.sils)) lines(2:7, avg.sils[i,],type='b',col='maroon')
#now testing out how each cluster number looks on a plot
#first, add the clusters to the df
eagle_data <- eagle_data %>%
    mutate(k2 = k2$cluster,
           k3 = k3$cluster,
           k4 = k4$cluster,
           k5 = k5$cluster,
           k6 = k6$cluster,
           k7 = k7$cluster)
#second, need to do a PCA to be able to plot
pca <- prcomp(scaled_vars)
#add pc1 and pc2 to the df
eagle_data <- eagle_data %>% 
  mutate(pc1 = pca$x[,1],pc2 = pca$x[,2])
#now for the plot
#K=4
plot_k4 <- (ggplot(data = eagle_data) + 
  geom_point(
    aes(x = pc1, y = pc2, col = factor(k4))
    ,shape='.',alpha = 0.15) +
    guides(color='none') +
  xlim(c(-3.7,6)) + ylim(c(-3,6)) + 
  xlab('PC1') + ylab('PC2')+ 
  ggtitle('K=4') +
  scale_color_manual(
    values = c("red","cyan","green","orange","blue","purple")
  )
)
plot_k4
#will generate all plots for clusters 2-6 to examine them side by side using patchwork
library(ggplot2)
library(patchwork)

plots <- list()

for (k in 2:7) {
  k_var <- paste0("k", k)
  
  p <- ggplot(eagle_data) +
    geom_point(aes(x = pc1, y = pc2, col = factor(.data[[k_var]])), shape='.',alpha = 0.15) +
    xlim(c(-3.5,6)) + ylim(c(-3,6)) +
    guides(color='none') +
    xlab('PC1') + ylab('PC2') +
    ggtitle(paste0("K=", k)) +
    scale_color_manual(values = c("red","cyan","green","orange","blue","purple","black"))+
       theme(
      plot.title = element_text(size = 8),  
      axis.title = element_text(size = 5),               
      axis.text = element_text(size = 4)                                
    )
  
  plots[[k - 1]] <- p   # store in list (index 1→k2, 2→k3, etc.)
}

grid_k_clusters <- wrap_plots(plots, nrow = 2, ncol = 3)
#ensure that 2 PCs are enough to accuratly represent this data:
fviz_eig(pca) 
#3 does explain a decent bit more variability, but 2 still explains over 65%
#first, will perform SVD to find the loadings
eagle_svd <- svd(scaled_vars)
loadings <- eagle_svd$v
rownames(loadings) <- colnames(flight_vars)
loadings[,1:2] #PC1 and PC2 loadings for each variable
#will generate bar charts showing these:
load_df <- loadings[, 1:2] %>%
  as.data.frame() %>%
  mutate(Variable = rownames(.)) 


ggplot(load_df, aes(x = Variable, y = V1)) +
  geom_col() +
  theme_classic() +
  xlab("Variable") +
  ylab("Loading Value") +
  ggtitle("PC1 Loadings")
#PC2 loadings
ggplot(load_df, aes(x = Variable, y = V2)) +
  geom_col() +
  theme_classic() +
  xlab("Variable") +
  ylab("Loading Value") +
  ggtitle("PC2 Loadings")
# next, PC overall variability
summary(pca)
#pc1 and 2 explain 67% of the variability
#now that we have the vectors, we can classify our clusters
eagle_data <- eagle_data %>%
  mutate(K4 = recode(k4,
                     `1` = "Perching",
                     `2` = "Ascending",
                     `3` = "Soaring",
                     `4` = "Flapping/Low Flying"))
# ready to add vector arrows to K = 4 plot:
library(ggrepel)
arrow_scale <- 4

k4_plot_with_vectors <- (ggplot(data = eagle_data) + 
  geom_point(
    aes(x = pc1, y = pc2, color = K4)
    ,shape='.',alpha = 0.15) +
  xlim(c(-3.7,6)) + ylim(c(-3,6)) + 
  xlab('PC1') + ylab('PC2')+ 
  ggtitle('Clusters with PC component vectors') +
 scale_color_manual(
  values = c(
    "Perching" = "red",
    "Ascending" = "cyan",
    "Soaring" = "green",
    "Flapping/Low Flying" = "orange"
  ),name = "Cluster", guide = guide_legend(
      override.aes = list(shape = 16, alpha = 1, size = 4)
    )
) + geom_segment(
    data = load_df,
    aes(x = 0, y = 0,
        xend = V1 * arrow_scale,
        yend = V2 * arrow_scale),
    arrow = arrow(length = unit(0.25, "cm")),
    color = "black",
    size = 0.7
  ) +
  geom_text_repel(
    data = load_df,
    aes(x = V1 * arrow_scale,
        y = V2 * arrow_scale,
        label = Variable),
    size = 4,
    max.overlaps = Inf,   
    box.padding = 0.4,
    point.padding = 0.3,
    min.segment.length = 0,
    segment.color = "grey"
  )
)
k4_plot_with_vectors
#next, want to make some visuals to help answer 2c:
#first, will generate some boxplots for each variable within each cluster
#will only do four, |vertical rate| and Sn are a bit redundant with some of the other variables here
b1 <- ggplot(eagle_data, aes(x = K4, y = VerticalRate,fill=K4)) +
  geom_boxplot(outlier.shape = NA) +  # hides extreme points
  labs(
       y = "Vertical Rate") +
   ylim(c(-4,4))+
   scale_color_manual(
  values = c(
    "Perching" = "red",
    "Ascending" = "cyan",
    "Soaring" = "green",
    "Flapping/Low Flying" = "orange"
  )) +
  theme(
    plot.title = element_blank(),
    axis.title.x = element_blank(),
    axis.text.x = element_blank()
  )

b2 <- ggplot(eagle_data, aes(x = K4, y = abs_angle,fill=K4)) +
  geom_boxplot(outlier.shape = NA) +  # hides extreme points
  labs(
       y = "|Turn Angle| (Radians)") +
  # ylim(c(-1,5))+
   scale_color_manual(
  values = c(
    "Perching" = "red",
    "Ascending" = "cyan",
    "Soaring" = "green",
    "Flapping/Low Flying" = "orange"
  )) +
  theme(
    plot.title = element_blank(),
    axis.title.x = element_blank(),
    axis.text.x = element_blank()
  )

b3 <- ggplot(eagle_data, aes(x = K4, y = KPH,fill=K4)) +
  geom_boxplot(outlier.shape = NA) +  # hides extreme points
  labs(
       y = "Kilometers/Hour") +
   ylim(c(-1,101))+
   scale_color_manual(
  values = c(
    "Perching" = "red",
    "Ascending" = "cyan",
    "Soaring" = "green",
    "Flapping/Low Flying" = "orange"
  )) +
  theme(
    plot.title = element_blank(),
    axis.title.x = element_blank(),
    axis.text.x = element_blank()
  )

b4 <- ggplot(eagle_data, aes(x = K4, y = AGL,fill=K4)) +
  geom_boxplot(outlier.shape = NA) +  # hides extreme points
  labs(  
       y = "Above Ground Level") +
   ylim(c(-1,1250))+
   scale_color_manual(
  values = c(
    "Perching" = "red",
    "Ascending" = "cyan",
    "Soaring" = "green",
    "Flapping/Low Flying" = "orange"
  )) +
  theme(
    plot.title = element_blank(),
    axis.title.x = element_blank(),
    axis.text.x = element_blank()
  )
combined_plot <- (b1 + b2) / (b3 + b4) +
  plot_layout(guides = "collect") & 
  theme(legend.position = "bottom")

# Add a global title
boxplots <- (combined_plot + plot_annotation(title = "Summary of Vertical Rate, Angle, Speed, and Altitude by Cluster"))
boxplots
#last, want to investigate a flight for an eagle to get a sense of how well the clusters actually work
#to do that, need flight segments to choose from
segment_lengths <- eagle_data %>% 
  group_by(segment_id) %>% 
  summarize(len = n()) %>% 
  filter(len <= 1000) %>%
  arrange(-len) 
head(segment_lengths,100)
#use flight segment 3644, length of 2104
#flight segment 82513, len of 823
segment_to_plot <- eagle_data %>% 
  filter(segment_id==7471)

flight_2 <- ggplot(data = segment_to_plot,aes(x = X, y = Y)) +
  geom_point(aes(col = K4),size=.6) + 
  scale_color_manual( values = c(
    "Perching" = "red",
    "Ascending" = "cyan",
    "Soaring" = "green",
    "Flapping/Low Flying" = "orange"
  ),name = "Cluster", guide = guide_legend(
      override.aes = list(shape = 16, alpha = 1, size = 4))
  )+
   labs(title = "Flight 2") + 
   theme(
    plot.title = element_blank(),
    axis.title.x = element_blank(),
   # axis.text.x = element_blank()
  ) + theme_minimal()
segment_to_plot_2 <- eagle_data %>% 
  filter(segment_id==123335)

flight_1 <- ggplot(data = segment_to_plot_2,aes(x = X, y = Y)) +
  geom_point(aes(col = K4),size=.8) + 
  scale_color_manual( values = c(
    "Perching" = "red",
    "Ascending" = "cyan",
    "Soaring" = "green",
    "Flapping/Low Flying" = "orange"
  ),name = "Cluster", guide = guide_legend(
      override.aes = list(shape = 16, alpha = 1, size = 4))
  )+
    labs(title = "Flight 1") + 
   theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank()
  ) +
  theme_minimal()

combined_flight_plot <- ((flight_1) + (flight_2)) +
  plot_layout(guides = "collect") & 
  theme(legend.position = "bottom")
combined_flight_plot
#a few more visuals to highlight differences between different types of flights...
segment_to_plot <- segment_to_plot %>%
  mutate(
    Minutes = (LocalTime$hour * 60 +
              LocalTime$min +
              LocalTime$sec / 60
  )-840)
segment_to_plot_2 <- segment_to_plot_2 %>%
  mutate(
    Minutes = (LocalTime$hour * 60 +
              LocalTime$min +
              LocalTime$sec / 60
  )-840)
#add minutes to the data frame
#now generate plots, want to examine KPH and AGL
AGL_1 <- ggplot(data = segment_to_plot,aes(x = Minutes, y = AGL)) +
  geom_point(aes(col = K4),size=.8) + 
  scale_color_manual( values = c(
    "Perching" = "red",
    "Ascending" = "cyan",
    "Soaring" = "green",
    "Flapping/Low Flying" = "orange"
  ),name = "Cluster", guide = guide_legend(
      override.aes = list(shape = 16, alpha = 1, size = 4))
  )+
    labs(title = "Flight 1") + 
   ylim(c(-1,1300))+
   theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank()
  ) +
  theme_minimal()

AGL_2 <- ggplot(data = segment_to_plot_2,aes(x = Minutes, y = AGL)) +
  geom_point(aes(col = K4),size=.8) + 
  scale_color_manual( values = c(
    "Perching" = "red",
    "Ascending" = "cyan",
    "Soaring" = "green",
    "Flapping/Low Flying" = "orange"
  ),name = "Cluster", guide = guide_legend(
      override.aes = list(shape = 16, alpha = 1, size = 4))
  )+
    labs(title = "Flight 2") + 
   ylim(c(-1,1300))+
   theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank()
  ) +
  theme_minimal()

combined_AGL_plot <- ((AGL_1) + (AGL_2)) +
  plot_layout(guides = "collect") & 
  theme(legend.position = "bottom")
combined_AGL_plot