prcomp()
Outputdata("USArrests")
pca_results <- prcomp(USArrests, center = TRUE, scale. = TRUE)
pve_direct <- (pca_results$sdev)^2 / sum((pca_results$sdev)^2)
pve_direct
## [1] 0.62006039 0.24744129 0.08914080 0.04335752
cumulative_pve_direct <- cumsum(pve_direct)
cumulative_pve_direct
## [1] 0.6200604 0.8675017 0.9566425 1.0000000
Interpretation:
pve_direct
shows the proportion of total variance
explained by each principal component.cumulative_pve_direct
accumulates the variance
explained as components are added.# Manually verifying the PVE calculation
manual_pve_check <- (pca_results$sdev)^2 / sum((pca_results$sdev)^2)
manual_pve_check
## [1] 0.62006039 0.24744129 0.08914080 0.04335752
Interpretation:
prcomp()
.hc_no_scale <- hclust(dist(USArrests), method = "complete")
plot(hc_no_scale, main = "Dendrogram without Scaling - Complete Linkage", xlab = "", sub = "", cex = 0.7)
Interpretation:
clusters_raw <- cutree(hc_no_scale, k = 3)
table(clusters_raw)
## clusters_raw
## 1 2 3
## 16 14 20
split(names(clusters_raw), clusters_raw)
## $`1`
## [1] "Alabama" "Alaska" "Arizona" "California"
## [5] "Delaware" "Florida" "Illinois" "Louisiana"
## [9] "Maryland" "Michigan" "Mississippi" "Nevada"
## [13] "New Mexico" "New York" "North Carolina" "South Carolina"
##
## $`2`
## [1] "Arkansas" "Colorado" "Georgia" "Massachusetts"
## [5] "Missouri" "New Jersey" "Oklahoma" "Oregon"
## [9] "Rhode Island" "Tennessee" "Texas" "Virginia"
## [13] "Washington" "Wyoming"
##
## $`3`
## [1] "Connecticut" "Hawaii" "Idaho" "Indiana"
## [5] "Iowa" "Kansas" "Kentucky" "Maine"
## [9] "Minnesota" "Montana" "Nebraska" "New Hampshire"
## [13] "North Dakota" "Ohio" "Pennsylvania" "South Dakota"
## [17] "Utah" "Vermont" "West Virginia" "Wisconsin"
Interpretation:
USArrests_scaled <- scale(USArrests)
hc_scaled <- hclust(dist(USArrests_scaled), method = "complete")
plot(hc_scaled, main = "Dendrogram with Scaling - Complete Linkage", xlab = "", sub = "", cex = 0.7)
clusters_scaled <- cutree(hc_scaled, k = 3)
table(clusters_scaled)
## clusters_scaled
## 1 2 3
## 8 11 31
Interpretation:
Interpretation:
set.seed(42)
n_samples <- 20
n_features <- 50
class_A <- matrix(rnorm(n_samples * n_features, mean = 0), nrow = n_samples)
class_B <- matrix(rnorm(n_samples * n_features, mean = 2.5), nrow = n_samples)
class_C <- matrix(rnorm(n_samples * n_features, mean = -2.5), nrow = n_samples)
X_data <- rbind(class_A, class_B, class_C)
true_groups <- factor(rep(1:3, each = n_samples))
Interpretation:
pca_sim_data <- prcomp(X_data, scale. = TRUE)
pca_df <- data.frame(PC1 = pca_sim_data$x[,1], PC2 = pca_sim_data$x[,2], TrueClass = true_groups)
ggplot(pca_df, aes(x = PC1, y = PC2, color = TrueClass)) +
geom_point(size = 3) +
labs(title = "PCA Projection of Simulated Data", x = "Principal Component 1", y = "Principal Component 2") +
theme_minimal()
Interpretation:
set.seed(42)
kmeans_result_3 <- kmeans(X_data, centers = 3, nstart = 25)
table(kmeans_result_3$cluster, true_groups)
## true_groups
## 1 2 3
## 1 0 0 20
## 2 0 20 0
## 3 20 0 0
Interpretation:
set.seed(42)
kmeans_result_2 <- kmeans(X_data, centers = 2, nstart = 25)
table(kmeans_result_2$cluster, true_groups)
## true_groups
## 1 2 3
## 1 20 0 20
## 2 0 20 0
Interpretation:
set.seed(42)
kmeans_result_4 <- kmeans(X_data, centers = 4, nstart = 25)
table(kmeans_result_4$cluster, true_groups)
## true_groups
## 1 2 3
## 1 20 0 0
## 2 0 0 11
## 3 0 20 0
## 4 0 0 9
Interpretation:
set.seed(42)
X_PC12 <- pca_sim_data$x[,1:2]
kmeans_pc12_3 <- kmeans(X_PC12, centers = 3, nstart = 25)
table(kmeans_pc12_3$cluster, true_groups)
## true_groups
## 1 2 3
## 1 0 0 20
## 2 0 20 0
## 3 20 0 0
Interpretation:
set.seed(42)
X_data_scaled <- scale(X_data)
kmeans_scaled_3 <- kmeans(X_data_scaled, centers = 3, nstart = 25)
table(kmeans_scaled_3$cluster, true_groups)
## true_groups
## 1 2 3
## 1 0 0 20
## 2 0 20 0
## 3 20 0 0
Interpretation: