| Cluster | Speed | Vertical Rate | Turning Angle | AGL | Behavior |
|---|---|---|---|---|---|
| 1 | 0 | 0 | Highly Variant | Near 0 | Perching |
| 2 | Medium | 0 | Lower | Near 0 | Maintaining |
| 3 | Medium | Positive | High | Positive | Climbing |
| 4 | High | Negative | Near 0 | Positive | Gliding |
# Packages Used
library(tidyverse) # For Data Cleaning and Visualizing
library(cluster) # For Clustering the Data
library(ggrepel) # For Lableing Variables on PCA Plots
library(patchwork) # For Putting Plots Together
#Loading Data
load('eagle_data.Rdata')
(
eagle_data
%>% mutate(KPH = sqrt(KPH),
Sn = sqrt(Sn),
AGL = sqrt(AGL),
abs_angle = sqrt(abs_angle),
absVR = sqrt(absVR))
) -> eagle_data # Square Root Transforming the variables
eagle_scaled <- (
eagle_data
%>% select(KPH:absVR) # Selecting the numeric variables for Clustering
%>% scale(scale = T, center = T) # Scaling the data to put all of the variables on the same scale
)
eagle_pca <- prcomp(eagle_scaled) # Performing PCA on the data.ggplot(data = eagle_data, aes(x = (KPH)**2)) + # Squaring KPH since we square-rooted it earlier
geom_density() + # Creating the Density Curve
xlab('KPH') + ylab('Density') +
ggtitle('Density of KPH') +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5, size = 18), # Adjusting the position and size of the title and axis labels
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text = element_text(size = 13),
# element_text(size = 16)
)
ggplot(data = eagle_data, aes(x = (KPH))) + # The Data frame is storing square root of KPH as KPH
geom_density() +
xlab('Square Root of KPH') + ylab('Density') +
theme_classic() +
ggtitle('Density of Square Root of KPH') +
theme(plot.title = element_text(hjust = 0.5, size = 18),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text = element_text(size = 13),
legend.title = element_text(size = 18),
legend.text = element_text(size = 14),
# element_text(size = 16)
)set.seed(6725)
kmean_2 <- kmeans(eagle_scaled, centers = 2, iter.max = 20, nstart = 10) # K-means for 2 clusters
set.seed(8770)
kmean_3 <- kmeans(eagle_scaled, centers = 3, iter.max = 20, nstart = 10) # K-means for 3 Clusters
set.seed(5255)
kmean_4 <- kmeans(eagle_scaled, centers = 4, iter.max = 20, nstart = 10) # K-means for 4 Clusters
set.seed(9925)
kmean_5 <- kmeans(eagle_scaled, centers = 5, iter.max = 20, nstart = 10) # K-means for 5 Clusters
set.seed(2770)
kmean_6 <- kmeans(eagle_scaled, centers = 6, iter.max = 20, nstart = 10) # K-means for 6 Clusters
set.seed(5403)
kmean_7 <- kmeans(eagle_scaled, centers = 7, iter.max = 20, nstart = 10) # K-means for 7 Clusters
# All seeds came from random.org. A random number was generated between 1 and 10000. This was done so that cluster definitions would not change each time I render this document
wss_df <- (
data.frame(clusters = c(2, 3, 4, 5, 6, 7),
wss = c(kmean_2$tot.withinss, kmean_3$tot.withinss,
kmean_4$tot.withinss, kmean_5$tot.withinss,
kmean_6$tot.withinss, kmean_7$tot.withinss
))
) # This data frame is stroing the WSS from each clustering solution. This data frame will be used to create the WSS plot latern = 100 # Number of bootstrap Samples
kmeans_sil <- data.frame(trial = c(0), clusters = c(0), avg_sil_width = c(0)) # Creating an Empty Data Frame to store the silhouette widths
(
eagle_data
%>% mutate(cluster_2 = kmean_2$cluster,
cluster_3 = kmean_3$cluster,
cluster_4 = kmean_4$cluster,
cluster_5 = kmean_5$cluster,
cluster_6 = kmean_6$cluster,
cluster_7 = kmean_7$cluster)
) -> eagle_bootstrap
# Each of the points was assigned to a cluster for each of the k-means solutions (k=2 - k = 7). This data frame above is assigning each point to its cluster for each solution. This is the data frame we will take bootstrap samples from.
for (i in 1:n) {
bootstrap_sample <- (
eagle_bootstrap
%>% sample_n(size = 10000, replace = FALSE) # Taking a bootstrap sample of size 10,000
)
dist <- (
bootstrap_sample
%>% select(KPH:absVR) # Selecting just the numeric variables
%>% scale(scale = T, center = T) # Scaling the variables
%>% dist() # Computing a Distance matrix for the scaled data
)
# These 6 lines of code calculate the silhouette width for each of the points for each solution.
sils_2 <- silhouette(bootstrap_sample$cluster_2, dist)
sils_3 <- silhouette(bootstrap_sample$cluster_3, dist)
sils_4 <- silhouette(bootstrap_sample$cluster_4, dist)
sils_5 <- silhouette(bootstrap_sample$cluster_5, dist)
sils_6 <- silhouette(bootstrap_sample$cluster_6, dist)
sils_7 <- silhouette(bootstrap_sample$cluster_7, dist)
# This data frame below saves the trial number, the number of clusters, and the average silhouette width.
df_of_rows <- data.frame(trial = c(i,i,i,i,i,i), clusters = c(2, 3, 4, 5, 6, 7),
avg_sil_width = c(mean(sils_2[,3]),
mean(sils_3[,3]),
mean(sils_4[,3]),
mean(sils_5[,3]),
mean(sils_6[,3]),
mean(sils_7[,3]))
)
kmeans_sil <- rbind(kmeans_sil, df_of_rows) # Adds the trial to the data frame that stores the silhouette widths
}
kmeans_sil <- (
kmeans_sil
%>% filter(clusters != 0) # Filters out the blank row that was used to setup the data frame.
)
sil_mean <- (
kmeans_sil
%>% group_by(clusters)
%>% summarize(avg = mean(avg_sil_width)) # Computes the average silhouette width across all trials to graph the overall average silhouette width later.
)ggplot(data = wss_df, aes(x = clusters, y = wss)) +
geom_point() + # Adds a point of the WSS for each cluster
geom_line() + # Connects the points with a line
xlab('Number of Clusters') + ylab('WSS') +
ggtitle('WSS by Cluster') +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5, size = 18), # Adjusts the size and position of the text
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text = element_text(size = 13),
)
ggplot() +
geom_line(data = kmeans_sil, aes(x = clusters, y = avg_sil_width, group = trial), alpha = 0.5) + # Creates one line for each bootstrap trial. This line is the average silhouette width for each cluster.
geom_line(data = sil_mean, aes(x = clusters, y = avg), col = 'firebrick2', linewidth = 2) + # Creates a big red line for the average silhouette width by cluster across all trials.
xlab('Number of Clusters') + ylab('Average Bootstrapped Silhouette Width') +
ggtitle('Average Bootstrapped Silhouette \nWidth by Cluster') +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5, size = 18), # Adjusts the size and position of the text
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text = element_text(size = 13),
legend.title = element_text(size = 18),
legend.text = element_text(size = 14),
# element_text(size = 16)
)(
eagle_data
%>% mutate(cluster = as.factor(kmean_4$cluster))
%>% cbind(eagle_pca$x[,1:2])
) -> eagles_points # Adds the Cluster assignments and the PC1 and PC2 values for each point into a data frame.
(
data.frame(eagle_pca$rotation[,1:2])
%>% rownames_to_column(var = 'var')
) -> arrow_df # Pulls the loadings for PC1 and PC2 and adds them to a data frame. The purpose is to create the arrows on the Biplot.
# PCA Biplot Code
ggplot() +
geom_point(data = eagles_points %>% filter(Animal_ID == 112), aes(x = PC1, y = PC2, color = cluster), shape = '.') + # Adds each point to the plot. Each point is color coded by its cluster.
geom_segment(data = arrow_df, aes(x = 0, y = 0, xend = PC1*5, yend = PC2*5), # Adds the arrows of the variables to this plot. The arrows on their own were too small, so they were scaled to be visible.
arrow = arrow(length = unit(0.2, "cm"))) +
geom_text_repel(data = arrow_df, aes(x = PC1*5, y = PC2*5, label = var)) + # Adds the labels of the variables to the plot.
xlab('PC1') + ylab('PC2') +
ggtitle('PCA Biplot of Eagle 112') +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5, size = 18), # Adjusts the size and position of the text.
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text = element_text(size = 13),
legend.position = "none" # Removes the legend
# element_text(size = 16)
)eagle_clusters <- ( # Puts the numeric variables back on their regular scales. This is done for visualization purposes.
eagle_data
%>% mutate(KPH = KPH**2,
Sn = Sn**2,
AGL = AGL**2,
abs_angle = abs_angle**2,
absVR = absVR**2)
%>% mutate(cluster = factor(kmean_4$cluster))
)
vr <- (
ggplot(data = eagle_clusters, aes(x =cluster, y = VerticalRate, fill = cluster)) +
geom_boxplot(outlier.shape = NA) + # Remove the outliers from the plot
xlab('Cluster') + ylab('Vertical Rate (m/s)') + ggtitle('Vertical rate by Cluster') +
ylim(-5, 5) + # Limits the y-axis to make the plots visible
geom_hline(yintercept = 0, linetype = 'dashed') +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5, size = 18), # Adjusts the position and size of the text
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text = element_text(size = 13),
legend.position = "none"
# element_text(size = 16)
)
)
angle <- (
ggplot(data = eagle_clusters, aes(x =cluster, y = abs_angle, fill = cluster)) +
geom_boxplot(outlier.shape = NA) + # Remove the outliers from the plot
xlab('Cluster') + ylab('Angle (Radians)') + ggtitle('Absolute Value of Turning \nAngle by Cluster') +
geom_hline(yintercept = 0, linetype = 'dashed') +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5, size = 18), # Adjusts the position and size of the text
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text = element_text(size = 13),
legend.position = "none"
# element_text(size = 16)
)
)
agl <- (
ggplot(data = eagle_clusters, aes(x =cluster, y = AGL, fill = cluster)) +
geom_boxplot(outlier.shape = NA) + # Remove the outliers from the plot
xlab('Cluster') + ylab('Distance (Meters)') + ggtitle('Distance Above Ground \nLevel by Cluster') +
geom_hline(yintercept = 0, linetype = 'dashed') +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5, size = 18), # Adjusts the position and size of the text
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text = element_text(size = 13),
legend.position = "none"
# element_text(size = 16)
)
)
kph <- (
ggplot(data = eagle_clusters, aes(x =cluster, y = KPH, fill = cluster)) +
geom_boxplot(outlier.shape = NA) + # Remove the outliers from the plot
xlab('Cluster') + ylab('KPH') + ggtitle('KPH by Cluster') +
geom_hline(yintercept = 0, linetype = 'dashed') +
theme_classic()+
theme(plot.title = element_text(hjust = 0.5, size = 18), # Adjusts the position and size of the text
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text = element_text(size = 13),
legend.position = "none"
# element_text(size = 16)
)
)
# This code uses the patchwork package to put the plots together into one plot
kph + angle +
vr + agl +
plot_layout(ncol = 2)ggplot(data = eagle_clusters %>% filter(segment_id == 60036), aes(x = (
as.numeric(LocalTime) - as.numeric((eagle_clusters %>% filter(segment_id == 60036) %>% summarize(min(as.numeric(LocalTime)))) # This code is setting the start of the segment to 0 seconds to make the numbers interpretable.
), y = AGL)) +
geom_point(aes(color = cluster)) +
xlab('Time (Seconds)') + ylab('Distance (Meters)') +
ggtitle('Distance Above Ground Over Time') +
theme_classic() +
labs(color = "Cluster") + # Setting the legend title to have a capital C.
theme(plot.title = element_text(hjust = 0.5, size = 18),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text = element_text(size = 13),
legend.title = element_text(size = 18),
legend.text = element_text(size = 14),
# element_text(size = 16)
)