Learning analytics is the use of data to understand and improve learning. Unsupervised learning is a type of machine learning that can be used to identify patterns and relationships in data without the need for labeled data.
In this case study, you will use unsupervised learning to analyze learning data from a simulated school course. You will use dimensionality reduction to reduce the number of features in the data, and then use clustering to identify groups of students with similar learning patterns.
The data for this case study is generated with the simulation function below. The data contains the following features:
Student ID: a unique identifier for each student
Feature 1: a measure of student engagement
Feature 2: a measure of student performance
This function takes the number of students to simulate as an input and returns a data frame with three columns: student_id, student_engagement, and student_performance. The student_engagement and student_performance features are drawn from normal distributions with means of 50 and 60 and standard deviations of 10 and 15, respectively.
simulate_student_features <- function(n = 100) {
  # Set the random seed
  set.seed(260923)
  # Generate unique student IDs
  student_ids <- seq(1, n)
  # Simulate student engagement
  student_engagement <- rnorm(n, mean = 50, sd = 10)
  # Simulate student performance
  student_performance <- rnorm(n, mean = 60, sd = 15)
  # Combine the data into a data frame
  student_features <- data.frame(
    student_id = student_ids,
    student_engagement = student_engagement,
    student_performance = student_performance
  )
  # Return the data frame
  return(student_features)
}
summary(simulate_student_features())
## student_id student_engagement student_performance
## Min. : 1.00 Min. :24.56 Min. :33.94
## 1st Qu.: 25.75 1st Qu.:43.56 1st Qu.:53.75
## Median : 50.50 Median :51.37 Median :64.45
## Mean : 50.50 Mean :50.43 Mean :62.42
## 3rd Qu.: 75.25 3rd Qu.:58.57 3rd Qu.:72.01
## Max. :100.00 Max. :73.08 Max. :97.40
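Note that set.seed() is called inside the function, so every call returns exactly the same simulated data. A quick optional check:
# Because the seed is fixed inside the function, repeated calls are identical
identical(simulate_student_features(), simulate_student_features())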
To use the simulate_student_features() function, we can simply pass the desired number of students to simulate as the argument:
student_features <- simulate_student_features(n = 100)
We can then use this data frame to perform unsupervised learning to identify groups of students with similar learning patterns. First, the features (excluding the student ID column) are standardized and reduced with principal component analysis (PCA):
# Standardize the features (dropping the student_id column)
standardized_data <- scale(student_features[, -1])
# Run PCA (centering/scaling again here is redundant but harmless)
pca_results <- prcomp(standardized_data, center = TRUE, scale. = TRUE)
summary(pca_results)
## Importance of components:
## PC1 PC2
## Standard deviation 1.0104 0.9895
## Proportion of Variance 0.5104 0.4896
## Cumulative Proportion 0.5104 1.0000
# Project the standardized data onto the first two principal components
projected_data <- predict(pca_results, newdata = standardized_data)[, 1:2]
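# Optional visual check: a base-graphics scree plot of the proportion of
# variance explained by each component, matching the summary() output above
var_explained <- pca_results$sdev^2 / sum(pca_results$sdev^2)
barplot(var_explained,
        names.arg = paste0("PC", seq_along(var_explained)),
        ylab = "Proportion of variance explained",
        main = "PCA scree plot")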
# Elbow method to determine the optimal number of clusters (k)
wcss_values <- numeric(10) # Initialize an empty vector to store WCSS values
for (k in 1:10) {
  kmeans_model <- kmeans(projected_data, centers = k)
  wcss_values[k] <- kmeans_model$tot.withinss
}
# Create a plot to visualize the elbow method
library(ggplot2)
elbow_data <- data.frame(K = 1:10, WCSS = wcss_values)
ggplot(elbow_data, aes(x = K, y = WCSS)) +
  geom_line() +
  geom_point() +
  labs(x = "Number of Clusters (K)", y = "Within-Cluster Sum of Squares (WCSS)") +
  ggtitle("Elbow Method for Optimal K")
# Choose K with a simple heuristic: take the first K whose WCSS drops by more
# than 5% relative to the previous K (here this selects K = 2)
optimal_k <- 1 # Initialize with a default value
for (i in 2:length(wcss_values)) {
  if ((wcss_values[i - 1] - wcss_values[i]) / wcss_values[i - 1] > 0.05) {
    optimal_k <- i
    break
  }
}
# Print the optimal number of clusters
cat("Optimal number of clusters (K) based on the elbow method:", optimal_k, "\n")
## Optimal number of clusters (K) based on the elbow method: 2
Based on the elbow method, the optimal number of clusters is 2.
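As an optional cross-check, the factoextra package (assuming it is installed) provides fviz_nbclust(), which computes and plots the same within-cluster sum of squares curve in one call:
# Assumes factoextra is installed; "wss" plots the total within-cluster
# sum of squares for k = 1..10, analogous to the elbow plot above
library(factoextra)
fviz_nbclust(projected_data, kmeans, method = "wss", k.max = 10)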
set.seed(12)
# Run k-means with the number of clusters chosen above (K = 2)
kmeans_result <- kmeans(projected_data, centers = 2)
# Add the cluster labels to the original data
student_features$cluster <- kmeans_result$cluster
library(ggplot2)
ggplot(student_features, aes(x = student_engagement, y = student_performance, color = factor(cluster))) +
  geom_point() +
  labs(title = "KMeans Clustering of Students",
       x = "Student Engagement",
       y = "Student Performance") +
  theme_minimal()
# Cluster centers from k-means (expressed in PCA space: PC1 and PC2)
cluster_centers <- as.data.frame(kmeans_result$centers)
cluster_centers
## PC1 PC2
## 1 0.6303755 0.3334006
## 2 -0.8705186 -0.4604103
library(dplyr)
student_features %>%
  group_by(cluster) %>%
  summarise(
    Avg_Engagement = mean(student_engagement),
    Avg_Performance = mean(student_performance),
    Num_Students = n()
  )
## # A tibble: 2 × 4
## cluster Avg_Engagement Avg_Performance Num_Students
## <int> <dbl> <dbl> <int>
## 1 1 57.3 59.5 58
## 2 2 40.9 66.5 42
# install.packages("factoextra")
# install.packages("fpc")
# Load necessary libraries
library(cluster)
library(fpc)
# Calculate silhouette scores
silhouette_scores <- silhouette(kmeans_result$cluster, dist(projected_data))
# Calculate the average silhouette score
average_silhouette_score <- mean(silhouette_scores[, "sil_width"])
# Print the average silhouette score
cat("Average Silhouette Score:", average_silhouette_score, "\n")
## Average Silhouette Score: 0.3332294
The silhouette score measures how well each observation fits within its assigned cluster relative to the nearest other cluster. Values range from -1 (poor clustering) to 1 (well-separated clusters), and a higher average indicates better clustering. The calculated score of about 0.333 suggests the two clusters are only moderately separated, so there is still room to improve cluster quality.
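For a closer look than the single average, the cluster package can report per-cluster average widths and draw a silhouette plot; a minimal sketch using the objects computed above:
# Average silhouette width within each cluster
summary(silhouette_scores)$clus.avg.widths
# Silhouette plot: one bar per student, grouped by cluster
plot(silhouette_scores, main = "Silhouette plot for k-means (k = 2)")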
# Perform hierarchical clustering (Ward's method on Euclidean distances)
hierarchical_result <- hclust(dist(projected_data), method = "ward.D2")
# Cut the tree into 2 clusters
cluster_assignments <- cutree(hierarchical_result, k = 2)
# Add the cluster labels to the original data
student_features$cluster_hierarchical <- cluster_assignments
ggplot(student_features, aes(x = student_engagement, y = student_performance, color = factor(cluster_hierarchical))) +
  geom_point() +
  labs(title = "Hierarchical Clustering of Students",
       x = "Student Engagement",
       y = "Student Performance") +
  theme_minimal()
A dendrogram of the hierarchical clustering is plotted below.
# Load necessary libraries
library(ggplot2)
# Perform hierarchical clustering (recomputing the same model as hierarchical_result above)
dist_matrix <- dist(projected_data) # Calculate pairwise distances
hclust_result <- hclust(dist_matrix, method = "ward.D2")
# Create a dendrogram to visualize the hierarchical clustering
dendrogram <- as.dendrogram(hclust_result)
# Plot the dendrogram
plot(dendrogram)
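# Optional: re-plot the tree as an hclust object and outline the k = 2 cut
# with rect.hclust() so the two branches chosen by cutree() are visible
plot(hclust_result, labels = FALSE, hang = -1,
     main = "Ward.D2 dendrogram with k = 2 cut")
rect.hclust(hclust_result, k = 2, border = "red")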
# Rough heuristic for the number of clusters: if a top-level branch of the
# dendrogram is itself tall (its sub-branches merge above the height
# threshold), treat the top split as meaningful and use that many clusters
optimal_k <- 1 # Initialize with a default value
for (i in 2:length(dendrogram)) {
  if (attr(dendrogram[[i]], "height") > 5) { # Adjust the height threshold as needed
    optimal_k <- i
    break
  }
}
# Print the optimal number of clusters
cat("Optimal number of clusters based on hierarchical clustering:", optimal_k, "\n")
## Optimal number of clusters based on hierarchical clustering: 2
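A more direct way to relate the height threshold to a number of clusters is to cut the tree at that height and count the resulting groups, a quick optional check with cutree():
# Number of clusters produced by cutting the tree at height 5
length(unique(cutree(hclust_result, h = 5)))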
# Number of students in each hierarchical cluster
hierarchical_clusters <- as.data.frame(table(Cluster = cluster_assignments))
names(hierarchical_clusters)[2] <- "Num_Students"
hierarchical_clusters
##   Cluster Num_Students
## 1       1           72
## 2       2           28
student_features %>%
  group_by(cluster_hierarchical) %>%
  summarise(
    Avg_Engagement = mean(student_engagement),
    Avg_Performance = mean(student_performance),
    Num_Students = n()
  )
## # A tibble: 2 × 4
## cluster_hierarchical Avg_Engagement Avg_Performance Num_Students
## <int> <dbl> <dbl> <int>
## 1 1 48.4 68.6 72
## 2 2 55.6 46.6 28
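One way to see how the hierarchical solution relates to the k-means solution is to cross-tabulate the two sets of labels (the numeric labels themselves are arbitrary, so agreement can show up on either diagonal):
# Contingency table of k-means vs hierarchical cluster assignments
table(KMeans = student_features$cluster,
      Hierarchical = student_features$cluster_hierarchical)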
# Load necessary libraries
library(cluster)
# Perform hierarchical clustering (same model as above)
dist_matrix <- dist(projected_data)
hclust_result <- hclust(dist_matrix, method = "ward.D2")
# Cut the tree into 2 clusters
cluster_assignments <- cutree(hclust_result, k = 2)
# Calculate the Dunn Index using cluster.stats() from the fpc package
# (install fpc first if it is not already available)
if (!requireNamespace("fpc", quietly = TRUE)) {
  install.packages("fpc")
}
library(fpc)
dunn_index <- cluster.stats(dist_matrix, cluster_assignments)$dunn
# Calculate the Silhouette Score
silhouette_score <- silhouette(cluster_assignments, dist_matrix)
# Print the Dunn Index and Silhouette Score
cat("Dunn Index:", dunn_index, "\n")
## Dunn Index: 0.02890451
cat("Average Silhouette Score:", mean(silhouette_score[, "sil_width"]), "\n")
## Average Silhouette Score: 0.2951394
The Dunn Index:
The Dunn Index (0.0289) is the ratio of the smallest inter-cluster distance to the largest intra-cluster distance (cluster diameter). Higher values indicate compact, well-separated clusters, whereas lower values indicate overlap or close proximity between clusters. The Dunn Index of 0.0289 obtained here therefore suggests that cluster separation could be improved and that the two clusters sit close together or overlap.
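For reference, the Dunn Index can also be computed directly from the distance matrix. The sketch below follows the usual definition (minimum between-cluster distance divided by maximum within-cluster diameter) and should agree closely with cluster.stats()$dunn:
# Minimal hand-rolled Dunn Index: min separation / max diameter
dunn_by_hand <- function(d, clusters) {
  d <- as.matrix(d)
  ks <- unique(clusters)
  # Largest within-cluster distance (cluster diameter)
  max_diam <- max(sapply(ks, function(k) {
    idx <- which(clusters == k)
    if (length(idx) < 2) 0 else max(d[idx, idx])
  }))
  # Smallest between-cluster distance
  min_sep <- min(sapply(ks, function(k) {
    idx <- which(clusters == k)
    min(d[idx, -idx, drop = FALSE])
  }))
  min_sep / max_diam
}
dunn_by_hand(dist_matrix, cluster_assignments)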
The silhouette score:
The Average Silhouette Score (0.2951) evaluates both the cohesion and the separation of the clusters. It ranges from -1 (poor clustering) to 1 (well-separated clusters), with higher values indicating stronger separation. A score of 0.2951, slightly lower than the 0.333 obtained for the k-means solution, indicates that the hierarchical clusters are distinguishable but only moderately separated, leaving room to improve cluster quality.