Load necessary libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(cluster)

Simulate student features

simulate_student_features <- function(n = 100) {
  set.seed(180823)  # fix the seed so the simulation is reproducible
  student_ids <- seq(1, n)
  # Draw engagement and performance scores from normal distributions
  student_engagement <- rnorm(n, mean = 50, sd = 10)
  student_performance <- rnorm(n, mean = 60, sd = 15)
  student_features <- data.frame(
    student_id = student_ids,
    student_engagement = student_engagement,
    student_performance = student_performance
  )
  return(student_features)
}
student_features <- simulate_student_features(n = 100)

Inspect the simulated data

head(student_features)
##   student_id student_engagement student_performance
## 1          1           40.64528            71.28236
## 2          2           53.66356            53.11158
## 3          3           54.65734            81.90933
## 4          4           39.89541            46.25021
## 5          5           34.69576            65.82668
## 6          6           42.57230            47.27764

Perform dimensionality reduction using PCA

# Standardize the features, excluding the student_id column
scaled_data <- scale(student_features[, c("student_engagement", "student_performance")])
# Perform Principal Component Analysis (the inputs are already standardized,
# so center and scale. are redundant here, but harmless)
pca_result <- prcomp(scaled_data, center = TRUE, scale. = TRUE)
summary(pca_result)
## Importance of components:
##                           PC1    PC2
## Standard deviation     1.0428 0.9553
## Proportion of Variance 0.5437 0.4563
## Cumulative Proportion  0.5437 1.0000
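
Beyond the variance summary, the loadings show how each feature contributes to each component; inspecting them is a small suggested addition for interpretation:

pca_result$rotation  # feature loadings for each principal component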

Plot variance explained by principal components
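
The proportions reported by summary(pca_result) can be plotted directly; a minimal sketch, assuming only the ggplot2 package already loaded:

var_explained <- pca_result$sdev^2 / sum(pca_result$sdev^2)  # proportion of variance per PC
ggplot(data.frame(PC = seq_along(var_explained), Variance = var_explained),
       aes(x = PC, y = Variance)) +
  geom_col() +
  labs(title = "Variance Explained by Principal Components",
       x = "Principal Component", y = "Proportion of Variance") +
  theme_minimal()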

Cluster the data using KMeans

pca_data <- as.data.frame(pca_result$x[, 1:2])  # keep the first two principal component scores
set.seed(10523)
kmeans_result <- kmeans(pca_data, centers = 3)  # partition the students into 3 clusters
student_features$cluster <- kmeans_result$cluster  # add the KMeans labels to the original data

ggplot(student_features, aes(x = student_engagement, y = student_performance, color = factor(cluster))) +
  geom_point() +
  labs(title = "KMeans Clustering of Students",
       x = "Student Engagement",
       y = "Student Performance",
       color = "Cluster") +
  theme_minimal()
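
The cluster package loaded earlier provides silhouette(); as a suggested diagnostic (not a required step), the average silhouette width gives one check on whether k = 3 is reasonable:

sil <- silhouette(kmeans_result$cluster, dist(pca_data))  # silhouette widths for the KMeans solution
mean(sil[, "sil_width"])  # values near 1 = well-separated clusters; near 0 = overlapping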

cluster_centers <- as.data.frame(kmeans_result$centers)  # centers are in PC coordinates
cluster_centers
##          PC1        PC2
## 1 -0.3367469  0.8659136
## 2 -0.5500968 -0.7812643
## 3  1.3952797 -0.2026606

Perform hierarchical clustering

hierarchical_result <- hclust(dist(pca_data), method = "ward.D2")  # Ward's method on the PCA scores
cluster_assignments <- cutree(hierarchical_result, k = 3)  # cut the dendrogram into 3 clusters
student_features$cluster_hierarchical <- cluster_assignments  # add the labels to the original data
ggplot(student_features, aes(x = student_engagement, y = student_performance, color = factor(cluster_hierarchical))) +
  geom_point() +
  labs(title = "Hierarchical Clustering of Students",
       x = "Student Engagement",
       y = "Student Performance",
       color = "Cluster") +
  theme_minimal()
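
A dendrogram makes the three-cluster cut visible; a minimal sketch using base graphics (plot.hclust and rect.hclust from stats):

plot(hierarchical_result, labels = FALSE, main = "Ward Dendrogram of Students")  # draw the dendrogram
rect.hclust(hierarchical_result, k = 3)  # outline the 3-cluster cut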

# Count the number of students in each hierarchical cluster
hierarchical_clusters <- data.frame(
  Cluster = sort(unique(cluster_assignments)),
  Num_Students = as.vector(table(cluster_assignments))
)

hierarchical_clusters
##   Cluster Num_Students
## 1       1           39
## 2       2           35
## 3       3           26
# Profile each hierarchical cluster on the original feature scales
student_features %>%
  group_by(cluster_hierarchical) %>%
  summarise(
    Avg_Engagement = mean(student_engagement),
    Avg_Performance = mean(student_performance),
    Num_Students = n()
  )
## # A tibble: 3 × 4
##   cluster_hierarchical Avg_Engagement Avg_Performance Num_Students
##                  <int>          <dbl>           <dbl>        <int>
## 1                    1           39.4            64.5           39
## 2                    2           51.9            46.8           35
## 3                    3           60.2            74.7           26
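
As an additional consistency check (a suggested step beyond the code above), a cross-tabulation shows how closely the KMeans and hierarchical assignments agree; cluster numbering is arbitrary, so agreement appears as one dominant count per row:

table(KMeans = student_features$cluster,
      Hierarchical = student_features$cluster_hierarchical)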

Tasks

  • Simulate the data.
  • Perform dimensionality reduction on the data using PCA.
  • Cluster the data using KMeans.
  • Interpret the results of your analysis.

Submission

Submit a report containing the following:

  • A brief description of your approach to dimensionality reduction and clustering.
  • The results of your analysis, including the number of clusters identified, the characteristics of each cluster, and any other insights you gained from the data.
  • A discussion of the implications of your findings for learning analytics.
  • Provide at least one scholarly reference.

Your report should include your code. Submit the published RPubs link to Blackboard.