library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.1
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.1
library(cluster)
simulate_student_features <- function(n = 100) {
  # Set the random seed
  set.seed(100523)
  
  # Generate unique student IDs
  student_ids <- seq(1, n)

  # Simulate student engagement
  student_engagement <- rnorm(n, mean = 50, sd = 10)

  # Simulate student performance
  student_performance <- rnorm(n, mean = 60, sd = 15)

  # Combine the data into a data frame
  student_features <- data.frame(
    student_id = student_ids,
    student_engagement = student_engagement,
    student_performance = student_performance
  )

  # Return the data frame
  return(student_features)
}
student_features <- simulate_student_features(n = 100)
head(student_features)
##   student_id student_engagement student_performance
## 1          1           49.17973            45.99057
## 2          2           55.56813            45.07527
## 3          3           40.06329            69.69600
## 4          4           45.45881            66.96083
## 5          5           45.29457            34.25193
## 6          6           51.70637            41.25864
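
# Illustrative first look at the simulated features before any modeling
# (this plot is a sketch, not part of the original analysis)
ggplot(student_features, aes(x = student_engagement, y = student_performance)) +
  geom_point() +
  labs(title = "Simulated Student Features",
       x = "Student Engagement",
       y = "Student Performance") +
  theme_minimal()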
scaled_data <- scale(student_features[, c("student_engagement", "student_performance")])
# standardizing both features to mean 0 and standard deviation 1
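
# Sanity check (illustrative, not in the original analysis): each
# standardized column should have mean ~0 and standard deviation 1
round(colMeans(scaled_data), 10)
apply(scaled_data, 2, sd)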

pca_result <- prcomp(scaled_data, center = TRUE, scale. = TRUE)
# performing Principal Component Analysis (center/scale. are redundant
# here because the data is already standardized, but they are harmless)

summary(pca_result)
## Importance of components:
##                          PC1    PC2
## Standard deviation     1.036 0.9623
## Proportion of Variance 0.537 0.4630
## Cumulative Proportion  0.537 1.0000
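
# Inspect the loadings: how each standardized feature contributes to
# each component (pca_result$rotation is a standard part of prcomp output)
pca_result$rotation

# Scree plot of variance explained per component (illustrative)
screeplot(pca_result, type = "lines", main = "Variance Explained by Component")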
pca_data <- as.data.frame(pca_result$x[, 1:2])
# retaining both principal components for clustering (with only two
# input features, PCA yields exactly two components)
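
# Illustrative elbow check to support the choice of k: total within-cluster
# sum of squares for k = 1..8 (the loop and nstart = 10 are sketch choices,
# not part of the original analysis; k = 3 is used below)
set.seed(10523)
wss <- sapply(1:8, function(k) kmeans(pca_data, centers = k, nstart = 10)$tot.withinss)
plot(1:8, wss, type = "b",
     xlab = "Number of clusters k", ylab = "Total within-cluster SS")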
set.seed(10523)
kmeans_result <- kmeans(pca_data, centers = 3)
# the number of clusters is set to 3

student_features$cluster <- kmeans_result$cluster
# adding cluster labels to the original data

ggplot(student_features, aes(x = student_engagement, y = student_performance, color = factor(cluster))) +
  geom_point() +
  labs(title = "KMeans Clustering of Students",
       x = "Student Engagement",
       y = "Student Performance") +
  theme_minimal()

cluster_centers <- as.data.frame(kmeans_result$centers)
cluster_centers
##          PC1         PC2
## 1  0.4842112 -0.90197505
## 2  0.5883916  0.85412490
## 3 -1.2513700  0.05582517
student_features %>%
  group_by(cluster) %>%
  summarise(
    Avg_Engagement = mean(student_engagement),
    Avg_Performance = mean(student_performance),
    Num_Students = n()
  )
## # A tibble: 3 × 4
##   cluster Avg_Engagement Avg_Performance Num_Students
##     <int>          <dbl>           <dbl>        <int>
## 1       1           48.0            46.5           35
## 2       2           60.5            64.7           35
## 3       3           42.8            76.2           30
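
# Silhouette check for the k-means solution using the cluster package
# loaded above (illustrative sketch; widths near 1 indicate well-separated
# clusters, widths near 0 indicate overlap)
sil <- silhouette(kmeans_result$cluster, dist(pca_data))
summary(sil)
plot(sil, main = "Silhouette Plot for K-Means, k = 3")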
hierarchical_result <- hclust(dist(pca_data), method = "ward.D2")
# performing hierarchical clustering with Ward's method (ward.D2)
# on Euclidean distances in PC space

cluster_assignments <- cutree(hierarchical_result, k = 3)
# cutting the tree at k = 3 clusters
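
# Illustrative dendrogram view of the hierarchical solution; rect.hclust
# outlines the k = 3 cut used below
plot(hierarchical_result, labels = FALSE, hang = -1,
     main = "Ward.D2 Dendrogram of Students", xlab = "", sub = "")
rect.hclust(hierarchical_result, k = 3, border = "red")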

student_features$cluster_hierarchical <- cluster_assignments
# adding cluster labels to the original data

ggplot(student_features, aes(x = student_engagement, y = student_performance, color = factor(cluster_hierarchical))) +
  geom_point() +
  labs(title = "Hierarchical Clustering of Students",
       x = "Student Engagement",
       y = "Student Performance") +
  theme_minimal()

hierarchical_clusters <- as.data.frame(table(Cluster = cluster_assignments))
names(hierarchical_clusters)[2] <- "Num_Students"
# counting students per hierarchical cluster

hierarchical_clusters
##   Cluster Num_Students
## 1       1           49
## 2       2           23
## 3       3           28
student_features %>%
  group_by(cluster_hierarchical) %>%
  summarise(
    Avg_Engagement = mean(student_engagement),
    Avg_Performance = mean(student_performance),
    Num_Students = n()
  )
## # A tibble: 3 × 4
##   cluster_hierarchical Avg_Engagement Avg_Performance Num_Students
##                  <int>          <dbl>           <dbl>        <int>
## 1                    1           50.5            49.1           49
## 2                    2           40.9            74.8           23
## 3                    3           59.6            73.3           28
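
# Illustrative comparison of the two clusterings: cluster numbers are
# arbitrary, so agreement shows up as a few dominant cells in the table
table(kmeans = student_features$cluster,
      hierarchical = student_features$cluster_hierarchical)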

Introduction

Learning analytics is the use of data to understand and improve learning. Unsupervised learning is a type of machine learning that identifies patterns and relationships in data without the need for labels.

In this case study, you will use unsupervised learning to analyze learning data from a simulated school course. You will first apply dimensionality reduction to reduce the number of features in the data, then use clustering to identify groups of students with similar learning patterns.

Data

The data for this case study is generated with the simulation function defined above. The data contains the following features:

- Student ID: a unique identifier for each student
- Feature 1: a measure of student engagement
- Feature 2: a measure of student performance

Tasks

Submission

Submit a report of your analysis. Your report should include your code. Submit the published RPubs link to Blackboard.