Load necessary libraries

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(cluster)

Simulate student features

#' Simulate engagement and performance scores for a cohort of students
#'
#' Seeds the random number generator and then draws both scores from
#' independent normal distributions, so repeated calls with the same
#' arguments return identical data. Seeding resets the session-wide RNG
#' state as a side effect.
#'
#' @param n Number of students to simulate (0 yields an empty data frame).
#' @param seed Seed handed to `set.seed()`; the default keeps the value that
#'   was previously hard-coded.
#' @return A data frame with `n` rows and the columns `student_id`
#'   (1, ..., n), `student_engagement` (normal, mean 50, sd 10) and
#'   `student_performance` (normal, mean 60, sd 15).
simulate_student_features <- function(n = 100, seed = 180823) {
  set.seed(seed)
  # seq_len() is empty for n = 0, whereas seq(1, 0) counts down to c(1, 0)
  # and makes data.frame() fail on columns of different lengths.
  student_ids <- seq_len(n)
  # Engagement must be drawn before performance to keep the random stream
  # (and therefore the simulated values) unchanged.
  student_engagement <- rnorm(n, mean = 50, sd = 10)
  student_performance <- rnorm(n, mean = 60, sd = 15)
  data.frame(
    student_id = student_ids,
    student_engagement = student_engagement,
    student_performance = student_performance
  )
}

# Build the working data set: 100 simulated students. The generator seeds
# the RNG itself, so this call returns the same data on every run.
student_features <- simulate_student_features(n = 100)

Exclude the “student_id” variable

# student_features was already simulated above; the generator is seeded, so
# simulating again would only rebuild identical data. Preview the first rows
# (student_id is still a column at this point).
head(student_features)

##   student_id student_engagement student_performance
## 1          1           40.64528            71.28236
## 2          2           53.66356            53.11158
## 3          3           54.65734            81.90933
## 4          4           39.89541            46.25021
## 5          5           34.69576            65.82668
## 6          6           42.57230            47.27764

Perform dimensionality reduction using PCA

# Standardise the two numeric features (zero mean, unit variance);
# student_id is an identifier and is left out of the analysis.
scaled_data <- scale(
  student_features[c("student_engagement", "student_performance")]
)

# Principal component analysis on the standardised features. The data is
# already centred and scaled, so center/scale. here re-apply the same
# standardisation rather than changing the result.
pca_result <- prcomp(scaled_data, center = TRUE, scale. = TRUE)

# Standard deviation and share of variance for each component.
summary(pca_result)

## Importance of components:
##                           PC1    PC2
## Standard deviation     1.0428 0.9553
## Proportion of Variance 0.5437 0.4563
## Cumulative Proportion  0.5437 1.0000

Plot variance explained by principal components

# Keep the scores on the first two principal components as a data frame.
pca_data <- as.data.frame(pca_result$x[, 1:2])

# Seed the RNG first: kmeans() picks its starting centres at random, so the
# cluster labels are only reproducible with a fixed seed.
set.seed(10523)
# Partition the students into 3 clusters in principal-component space.
kmeans_result <- kmeans(pca_data, centers = 3)
# Attach each student's k-means cluster label to the original data.
student_features$cluster <- kmeans_result$cluster

# Scatter plot of the students in the original feature space, coloured by
# their k-means cluster (factor() makes the colour scale discrete).
ggplot(
  student_features,
  aes(
    x = student_engagement,
    y = student_performance,
    color = factor(cluster)
  )
) +
  geom_point() +
  labs(
    title = "KMeans Clustering of Students",
    x = "Student Engagement",
    y = "Student Performance"
  ) +
  theme_minimal()

# Centres of the k-means clusters. kmeans() was fitted on the principal
# component scores, so these coordinates are in PC space rather than in the
# original engagement/performance units.
cluster_centers <- as.data.frame(kmeans_result$centers)
cluster_centers

##          PC1        PC2
## 1 -0.3367469  0.8659136
## 2 -0.5500968 -0.7812643
## 3  1.3952797 -0.2026606

# Agglomerative hierarchical clustering of the principal component scores,
# using Ward's criterion ("ward.D2") on the distances returned by dist().
hierarchical_result <- hclust(d = dist(pca_data), method = "ward.D2")

# Cut the dendrogram so that exactly 3 clusters remain.
cluster_assignments <- cutree(tree = hierarchical_result, k = 3)

# Attach each student's hierarchical cluster label to the original data.
student_features$cluster_hierarchical <- cluster_assignments
# Scatter plot of the students in the original feature space, coloured by
# their hierarchical cluster (factor() makes the colour scale discrete).
ggplot(
  student_features,
  aes(
    x = student_engagement,
    y = student_performance,
    color = factor(cluster_hierarchical)
  )
) +
  geom_point() +
  labs(
    title = "Hierarchical Clustering of Students",
    x = "Student Engagement",
    y = "Student Performance"
  ) +
  theme_minimal()

# Count how many students fall into each hierarchical cluster.
# table() orders its categories, so the labels are sorted to line up with
# the counts. as.vector() drops the table class; passing the table itself
# makes data.frame() expand it into two columns (category and Freq).
hierarchical_clusters <- data.frame(
  Cluster = sort(unique(cluster_assignments)),
  Num_Students = as.vector(table(cluster_assignments))
)

hierarchical_clusters

##   Cluster Num_Students
## 1       1           39
## 2       2           35
## 3       3           26

# Profile each hierarchical cluster: mean engagement, mean performance and
# the number of students assigned to it.
student_features %>%
  group_by(cluster_hierarchical) %>%
  summarise(
    Avg_Engagement = mean(student_engagement),
    Avg_Performance = mean(student_performance),
    Num_Students = n()  # rows (students) in the current group
  )

## # A tibble: 3 × 4
##   cluster_hierarchical Avg_Engagement Avg_Performance Num_Students
##                  <int>          <dbl>           <dbl>        <int>
## 1                    1           39.4            64.5           39
## 2                    2           51.9            46.8           35
## 3                    3           60.2            74.7           26

Tasks

Simulate the data.
Perform dimensionality reduction on the data using PCA.
Cluster the data using KMeans.
Interpret the results of your analysis.

Submission

Submit a report containing the following:

A brief description of your approach to dimensionality reduction and clustering.
The results of your analysis, including the number of clusters identified, the characteristics of each cluster, and any other insights you gained from the data.
A discussion of the implications of your findings for learning analytics.
Provide at least one scholarly reference.

Your report should include your code. Submit the published RPubs link to Blackboard.

Lab 3 Case Study: Unsupervised Learning in Learning Analytics

[Enter your name]

2023-09-26

Load necessary libraries

Simulate student features

Exclude the “student_id” variable

Perform dimensionality reduction using PCA

Plot variance explained by principal components

Tasks

Submission