library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
#' Simulate engagement and performance features for a cohort of students
#'
#' Engagement scores are drawn from a normal distribution with mean 50 and
#' standard deviation 10; performance scores from a normal distribution with
#' mean 60 and standard deviation 15. The two draws are independent.
#'
#' @param n Number of students to simulate. Must be a single non-negative
#'   number; `n = 0` returns a zero-row data frame.
#' @param seed Seed passed to `set.seed()` before sampling so that repeated
#'   calls return the same data. Use `NULL` to leave the RNG state untouched.
#' @return A data frame with `n` rows and the columns `student_id`,
#'   `student_engagement` and `student_performance`.
simulate_student_features <- function(n = 100, seed = 260923) {
  stopifnot(is.numeric(n), length(n) == 1L, !is.na(n), n >= 0)

  # Seeding resets the global RNG stream; callers that manage their own
  # stream can opt out with seed = NULL.
  if (!is.null(seed)) {
    set.seed(seed)
  }

  # seq_len() (unlike seq(1, n)) yields an empty vector for n = 0 instead of
  # c(1, 0). Engagement is sampled before performance, so the draws match the
  # order in which the columns are listed.
  data.frame(
    student_id = seq_len(n),
    student_engagement = rnorm(n, mean = 50, sd = 10),
    student_performance = rnorm(n, mean = 60, sd = 15)
  )
}
# Simulate the cohort and keep only the numeric features used for the PCA.
student_features <- simulate_student_features(n = 100)
student_data <- student_features %>%
  select(-student_id) # Exclude the "student_id" column

# Centre and scale both features so each contributes equally to the
# principal components.
student_pca <- student_data %>%
  prcomp(center = TRUE, scale. = TRUE)

# Scree plot of the variance explained by each component. With two
# standardised features the first component always explains at least 50% of
# the variance, so the y-axis must extend past 50 or its bar is clipped.
fviz_eig(student_pca, addlabels = TRUE, ylim = c(0, 100)) +
  labs(title = "Variance Explained by Principal Components")
# Determine the number of principal components to retain (e.g., 2 components)
# Number of leading principal components kept for clustering.
num_components <- 2

# Project the students onto the retained components. seq_len() with
# drop = FALSE keeps the result a matrix with named PC columns even when only
# one component is retained, so the data frame columns stay PC1, PC2, ...
student_pca_data <- as.data.frame(
  predict(student_pca, newdata = student_data)[
    , seq_len(num_components),
    drop = FALSE
  ]
)
glimpse(student_pca_data)
## Rows: 100
## Columns: 2
## $ PC1 <dbl> -0.44153737, 0.27373527, 1.93912437, -1.06354148, 1.01397062, 0.87…
## $ PC2 <dbl> -1.643380141, -0.083321127, -0.268457565, -1.059273032, 0.23352364…
# Load necessary libraries
library(ggplot2)
library(dplyr)
library(cluster)
# Define the range of possible cluster numbers (e.g., from 1 to 10)
k_values <- 1:10

# Total within-cluster sum of squares for each candidate number of clusters.
# vapply() returns one value per element of k_values (in order), so the
# result stays aligned with k_values even if the range does not start at 1,
# and the numeric(1) template guards the type of every entry.
wcss <- vapply(
  k_values,
  function(k) kmeans(student_pca_data, centers = k)$tot.withinss,
  numeric(1)
)
# Pair every candidate K with its within-cluster sum of squares
elbow_data <- data.frame(K = k_values, WCSS = wcss)

# Elbow curve: a line through the WCSS values with a marker at each K
ggplot(data = elbow_data, mapping = aes(x = K, y = WCSS)) +
  geom_line() +
  geom_point() +
  labs(
    title = "Elbow Method for Optimal Number of Clusters",
    x = "Number of Clusters (K)",
    y = "Within-Cluster Sum of Squares (WCSS)"
  )
# Fix the RNG so the random k-means initialisation is reproducible.
set.seed(123)

# Number of clusters used by BOTH methods; change it here so the k-means and
# hierarchical solutions always stay comparable.
num_clusters <- 4

# K-means on the principal component scores.
kmeans_clusters <- kmeans(student_pca_data, centers = num_clusters)

# Hierarchical clustering on Euclidean distances between the same scores,
# with the tree cut into the same number of groups.
hierarchical_clusters <- hclust(dist(student_pca_data))
hierarchical_clusters_cut <- cutree(hierarchical_clusters, k = num_clusters)
# Scatter plot of the students in PC space, coloured by k-means cluster.
# The explicit colour label replaces the default legend title, which would
# otherwise be the raw expression passed to aes().
ggplot(student_pca_data, aes(x = PC1, y = PC2)) +
  geom_point(aes(color = factor(kmeans_clusters$cluster)), size = 3) +
  labs(title = "KMeans Clustering", color = "Cluster") +
  theme_minimal()

# Same view, coloured by the groups obtained from cutting the dendrogram.
ggplot(student_pca_data, aes(x = PC1, y = PC2)) +
  geom_point(aes(color = factor(hierarchical_clusters_cut)), size = 3) +
  labs(title = "Hierarchical Clustering", color = "Cluster") +
  theme_minimal()
# Number of students assigned to each k-means cluster
kmeans_clusters[["size"]]
## [1] 30 17 21 32
# Cluster centres expressed in principal component coordinates
kmeans_clusters[["centers"]]
## PC1 PC2
## 1 0.06815923 1.08221768
## 2 0.48614297 -1.29915216
## 3 1.11639343 -0.05797355
## 4 -1.05479592 -0.28635934
# Attach the labels from both clustering methods to the original features
cluster_summary <- mutate(
  student_data,
  KMeans_Cluster = kmeans_clusters$cluster,
  Hierarchical_Cluster = hierarchical_clusters_cut
)

# Preview the first rows of the labelled data
cluster_summary %>% head()
## student_engagement student_performance KMeans_Cluster Hierarchical_Cluster
## 1 35.47855 50.52231 2 1
## 2 51.79512 58.88396 3 1
## 3 62.41012 40.56755 3 2
## 4 35.20679 62.46033 4 3
## 5 59.37552 54.69326 3 2
## 6 57.00109 54.09745 3 2