Load necessary libraries

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(cluster)

Simulate student features

#' Simulate per-student engagement and performance features
#'
#' Draws `n` engagement scores from a normal distribution (mean 50, sd 10) and
#' `n` performance scores from a normal distribution (mean 60, sd 15), and
#' pairs them with sequential student ids.
#'
#' @param n Number of students to simulate; a single non-negative number.
#' @param seed Value passed to `set.seed()` before drawing. The default
#'   reproduces the originally hard-coded seed. Note that seeding resets the
#'   session-wide RNG state.
#' @return A data.frame with `n` rows and the columns `student_id`,
#'   `student_engagement` and `student_performance`.
simulate_student_features <- function(n = 100, seed = 260923) {
  stopifnot(is.numeric(n), length(n) == 1L, !is.na(n), n >= 0)

  set.seed(seed)

  # Draw engagement before performance so the random stream is consumed in the
  # same order as before and existing results stay reproducible.
  student_engagement <- rnorm(n, mean = 50, sd = 10)
  student_performance <- rnorm(n, mean = 60, sd = 15)

  data.frame(
    # seq_len() gives an empty id vector for n = 0, whereas seq(1, 0) counts
    # down to c(1, 0) and makes data.frame() fail on mismatched lengths.
    student_id = seq_len(n),
    student_engagement = student_engagement,
    student_performance = student_performance
  )
}

# Build the simulated cohort of 100 students that the rest of the analysis uses.
student_features <- simulate_student_features(100)

Exclude the “student_id” variable

# Keep only the numeric features: the id is a row label, not a measurement,
# so it is removed before PCA and clustering.
student_data <- select(student_features, -student_id)

Perform dimensionality reduction using PCA

# Principal component analysis on the features, with each column centered and
# scaled to unit variance first so both contribute on the same footing.
student_pca <- prcomp(student_data, center = TRUE, scale. = TRUE)

Plot variance explained by principal components

# Scree plot: percentage of variance explained by each principal component.
# With only two standardized inputs the first component always explains at
# least 50% of the variance, so the y-axis has to reach 100; a 0-50 range cuts
# off the tallest bar and its percentage label.
fviz_eig(student_pca, addlabels = TRUE, ylim = c(0, 100)) +
  labs(title = "Variance Explained by Principal Components")

# Number of principal components to retain for clustering (here: 2)

num_components <- 2

# Project the students onto the principal axes and keep the leading
# `num_components` score columns. seq_len() plus drop = FALSE keep the result a
# matrix with its "PC" column names even when only one component is retained;
# plain `[, 1:1]` would collapse to an unnamed vector.
student_pca_data <- as.data.frame(
  predict(student_pca, newdata = student_data)[
    ,
    seq_len(num_components),
    drop = FALSE
  ]
)
glimpse(student_pca_data)

## Rows: 100
## Columns: 2
## $ PC1 <dbl> -0.44153737, 0.27373527, 1.93912437, -1.06354148, 1.01397062, 0.87…
## $ PC2 <dbl> -1.643380141, -0.083321127, -0.268457565, -1.059273032, 0.23352364…

# Load necessary libraries
library(ggplot2)
library(dplyr)
library(cluster)

# Candidate numbers of clusters to evaluate for the elbow method
k_values <- 1:10

# k-means starts from random centres; seed the generator so the elbow curve is
# the same on every run.
set.seed(123)

# Total within-cluster sum of squares (WCSS) for each candidate k.
# vapply() returns a numeric vector aligned with k_values by position, rather
# than growing a vector indexed by the value of k. nstart = 25 keeps the best
# of several random starts so one poor local optimum does not distort the
# curve.
wcss <- vapply(
  k_values,
  function(k) {
    kmeans(student_pca_data, centers = k, nstart = 25)$tot.withinss
  },
  numeric(1)
)

# Pair each candidate cluster count with its WCSS for plotting
elbow_data <- data.frame(K = k_values, WCSS = wcss)

# Elbow curve: look for the K after which WCSS stops dropping sharply.
# K is a count, so the axis breaks are pinned to the tested values instead of
# the default continuous breaks, which include fractional ticks such as 2.5.
ggplot(elbow_data, aes(x = K, y = WCSS)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = elbow_data$K) +
  labs(
    title = "Elbow Method for Optimal Number of Clusters",
    x = "Number of Clusters (K)",
    y = "Within-Cluster Sum of Squares (WCSS)"
  )

Perform KMeans clustering

# Partition the principal-component scores into four groups with k-means.
# Seeding first pins the random initial centres so the labels are repeatable;
# change `centers` to try a different number of clusters.
set.seed(123)
kmeans_clusters <- student_pca_data %>%
  kmeans(centers = 4)

Hierarchical clustering

# Agglomerative clustering on the pairwise distances between students in
# principal-component space. The distance and linkage methods are the
# function defaults, written out here for clarity.
hierarchical_clusters <- hclust(
  dist(student_pca_data, method = "euclidean"),
  method = "complete"
)

# Cut the dendrogram into four groups; change `k` for a different number.
hierarchical_clusters_cut <- cutree(tree = hierarchical_clusters, k = 4)

Visualize clustering results

# Scatter of the students in principal-component space, coloured by their
# k-means assignment. The labels are wrapped in factor() so they get a discrete
# palette, and the legend is titled explicitly; otherwise it shows the raw
# expression `factor(kmeans_clusters$cluster)`.
ggplot(student_pca_data, aes(x = PC1, y = PC2)) +
  geom_point(aes(color = factor(kmeans_clusters$cluster)), size = 3) +
  labs(title = "KMeans Clustering", color = "Cluster") +
  theme_minimal()

# Scatter of the students in principal-component space, coloured by the group
# obtained from cutting the dendrogram. The legend is titled explicitly;
# otherwise it shows the raw expression `factor(hierarchical_clusters_cut)`.
ggplot(student_pca_data, aes(x = PC1, y = PC2)) +
  geom_point(aes(color = factor(hierarchical_clusters_cut)), size = 3) +
  labs(title = "Hierarchical Clustering", color = "Cluster") +
  theme_minimal()

# Number of students assigned to each k-means cluster
kmeans_clusters[["size"]]

## [1] 30 17 21 32

# Cluster centres, expressed in principal-component coordinates
kmeans_clusters[["centers"]]

##           PC1         PC2
## 1  0.06815923  1.08221768
## 2  0.48614297 -1.29915216
## 3  1.11639343 -0.05797355
## 4 -1.05479592 -0.28635934

Interpretation of clustering results

# Attach both sets of cluster labels to the original (unscaled) features so
# each student's assignments can be read next to their raw scores.
cluster_summary <- mutate(
  student_data,
  KMeans_Cluster = kmeans_clusters$cluster,
  Hierarchical_Cluster = hierarchical_clusters_cut
)

# Preview the first few students
head(cluster_summary)

##   student_engagement student_performance KMeans_Cluster Hierarchical_Cluster
## 1           35.47855            50.52231              2                    1
## 2           51.79512            58.88396              3                    1
## 3           62.41012            40.56755              3                    2
## 4           35.20679            62.46033              4                    3
## 5           59.37552            54.69326              3                    2
## 6           57.00109            54.09745              3                    2

Lab 3 Case Study: Unsupervised Learning in Learning Analytics

[Abhishek]

2023-10-05

Load necessary libraries

Simulate student features

Exclude the “student_id” variable

Perform dimensionality reduction using PCA

Plot variance explained by principal components

Perform KMeans clustering

Hierarchical clustering

Visualize clustering results

Interpretation of clustering results