library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(cluster)
#' Simulate engagement and performance scores for a cohort of students
#'
#' Both scores are drawn from independent normal distributions. The random
#' seed is fixed inside the function, so repeated calls with the same `n`
#' return identical data. Note that this also resets the RNG state of the
#' calling session.
#'
#' @param n Number of students to simulate; a single non-negative number.
#' @return A data frame with one row per student and the columns
#'   `student_id` (1, 2, ..., n), `student_engagement` (mean 50, sd 10) and
#'   `student_performance` (mean 60, sd 15).
simulate_student_features <- function(n = 100) {
  stopifnot(is.numeric(n), length(n) == 1L, !is.na(n), n >= 0)

  # Fixed seed keeps the simulated cohort reproducible across runs.
  set.seed(180823)

  # seq_len() gives an empty id vector for n = 0, whereas seq(1, 0) counts
  # down to c(1, 0) and makes data.frame() fail on mismatched lengths.
  student_ids <- seq_len(n)

  # Draw engagement before performance: the order of the two rnorm() calls
  # determines which random numbers each column receives.
  student_engagement <- rnorm(n, mean = 50, sd = 10)
  student_performance <- rnorm(n, mean = 60, sd = 15)

  data.frame(
    student_id = student_ids,
    student_engagement = student_engagement,
    student_performance = student_performance
  )
}
# Simulate the cohort once. The generator seeds the RNG itself, so calling it
# a second time with the same n would only recompute identical data.
student_features <- simulate_student_features(n = 100)

# Preview the first rows of the simulated data.
head(student_features)
## student_id student_engagement student_performance
## 1 1 40.64528 71.28236
## 2 2 53.66356 53.11158
## 3 3 54.65734 81.90933
## 4 4 39.89541 46.25021
## 5 5 34.69576 65.82668
## 6 6 42.57230 47.27764
# Standardize both features (mean 0, sd 1) so that neither dominates the
# principal components because of its scale.
scaled_data <- scale(
  student_features[, c("student_engagement", "student_performance")]
)

# Principal Component Analysis. `scaled_data` is already centered and scaled,
# so prcomp() is told not to standardize it a second time; the components and
# their variances are the same as with a repeated standardization.
pca_result <- prcomp(scaled_data, center = FALSE, scale. = FALSE)

# Variance explained by each principal component.
summary(pca_result)
## Importance of components:
## PC1 PC2
## Standard deviation 1.0428 0.9553
## Proportion of Variance 0.5437 0.4563
## Cumulative Proportion 0.5437 1.0000
# Keep the scores on the first two principal components as clustering input.
pca_data <- as.data.frame(pca_result[["x"]][, 1:2])

# K-means with three clusters. Seeding right before the call makes the random
# starting centers, and therefore the resulting labels, reproducible.
set.seed(10523)
kmeans_result <- kmeans(x = pca_data, centers = 3)

# Attach each student's k-means cluster label to the original data.
student_features$cluster <- kmeans_result[["cluster"]]
# Scatter plot of the original features, colored by k-means cluster.
ggplot(
  data = student_features,
  mapping = aes(
    x = student_engagement,
    y = student_performance,
    color = factor(cluster)
  )
) +
  geom_point() +
  labs(
    title = "KMeans Clustering of Students",
    x = "Student Engagement",
    y = "Student Performance"
  ) +
  theme_minimal()
# K-means cluster centers, expressed in principal-component coordinates
# (the clustering was fitted on the PCA scores, not on the raw features).
cluster_centers <- as.data.frame(kmeans_result[["centers"]])
cluster_centers
## PC1 PC2
## 1 -0.3367469 0.8659136
## 2 -0.5500968 -0.7812643
## 3 1.3952797 -0.2026606
# Agglomerative hierarchical clustering on the pairwise distances between the
# PCA scores, merging clusters with the "ward.D2" linkage.
hierarchical_result <- hclust(d = dist(pca_data), method = "ward.D2")

# Cut the dendrogram so that exactly three clusters remain.
cluster_assignments <- cutree(tree = hierarchical_result, k = 3)

# Attach each student's hierarchical cluster label to the original data.
student_features$cluster_hierarchical <- cluster_assignments
# Scatter plot of the original features, colored by hierarchical cluster.
ggplot(
  data = student_features,
  mapping = aes(
    x = student_engagement,
    y = student_performance,
    color = factor(cluster_hierarchical)
  )
) +
  geom_point() +
  labs(
    title = "Hierarchical Clustering of Students",
    x = "Student Engagement",
    y = "Student Performance"
  ) +
  theme_minimal()
# Number of students per hierarchical cluster.
# Both columns are derived from the same table so each label is guaranteed to
# sit next to its own count. Passing the table itself to data.frame() would
# expand it into two columns (label and Freq) with mangled names.
cluster_counts <- table(cluster_assignments)
hierarchical_clusters <- data.frame(
  Cluster = as.integer(names(cluster_counts)),
  Num_Students = as.vector(cluster_counts)
)
hierarchical_clusters
## Cluster Num_Students
## 1 1 39
## 2 2 35
## 3 3 26
# Profile each hierarchical cluster: mean engagement, mean performance and
# cluster size. With a single grouping variable the result is ungrouped
# either way; `.groups = "drop"` just states that explicitly.
student_features %>%
  group_by(cluster_hierarchical) %>%
  summarise(
    Avg_Engagement = mean(student_engagement),
    Avg_Performance = mean(student_performance),
    Num_Students = n(),
    .groups = "drop"
  )
## # A tibble: 3 × 4
## cluster_hierarchical Avg_Engagement Avg_Performance Num_Students
## <int> <dbl> <dbl> <int>
## 1 1 39.4 64.5 39
## 2 2 51.9 46.8 35
## 3 3 60.2 74.7 26
# Submit a report containing the following:
# Your report should include your code. Submit the published RPubs link to Blackboard.