Main Function: Creating Balanced Groups
The core of the solution is the create_balanced_groups
function, which implements the algorithm for assigning students to
balanced groups.
#' Create balanced groups using a greedy algorithm approach
#'
#' @param data_file Path to CSV file containing student data
#' @param ideal_group_size Target size for each group (default = 5)
#' @return List containing group assignments and metrics
create_balanced_groups <- function(data_file, ideal_group_size = 5) {
# Read the data
students <- read.csv(data_file, stringsAsFactors = FALSE)
# Display column names for debugging
cat("Original column names:\n")
print(names(students))
# Find column indices by partial matching
id_col <- grep("Student.*ID", names(students), ignore.case = TRUE)[1]
nat_col <- grep("National", names(students), ignore.case = TRUE)[1]
culture_col <- grep("Cultural|Culture", names(students), ignore.case = TRUE)[1]
gender_col <- grep("Gender", names(students), ignore.case = TRUE)[1]
quant_col <- grep("Quantitative", names(students), ignore.case = TRUE)[1]
# Check if all required columns were found
if (any(is.na(c(id_col, nat_col, culture_col, gender_col, quant_col)))) {
stop("Could not identify all required columns in the data file")
}
# Get column names for easier reference
id_name <- names(students)[id_col]
nat_name <- names(students)[nat_col]
culture_name <- names(students)[culture_col]
gender_name <- names(students)[gender_col]
quant_name <- names(students)[quant_col]
cat("Using the following columns:\n")
cat("ID column:", id_name, "\n")
cat("Nationality column:", nat_name, "\n")
cat("Cultural background column:", culture_name, "\n")
cat("Gender column:", gender_name, "\n")
cat("Quantitative background column:", quant_name, "\n\n")
# Clean the data (remove problematic rows)
students <- students[!is.na(students[[id_name]]), ]
students <- students[students[[nat_name]] != "NA", ]
students <- students[students[[nat_name]] != "9 Groups for 5 people per group", ]
# Calculate the number of groups needed
num_students <- nrow(students)
num_groups <- ceiling(num_students / ideal_group_size)
cat("Total students after cleaning:", num_students, "\n")
cat("Number of groups needed:", num_groups, "\n\n")
# Initialize empty groups
groups <- list()
for (i in 1:num_groups) {
groups[[i]] <- list(
students = data.frame(),
male_count = 0,
female_count = 0,
avg_quant = 0
)
}
# Step 1: First handle international students by nationality
# Group international students by nationality
international_students <- students[students[[nat_name]] != "British", ]
international_students <- international_students[order(international_students[[nat_name]]), ]
# Create a list of students by nationality
unique_nationalities <- unique(international_students[[nat_name]])
nationality_groups <- list()
for (nat in unique_nationalities) {
nationality_groups[[nat]] <- international_students[international_students[[nat_name]] == nat, ]
}
# Assign international students of same nationality to the same groups
for (nat in names(nationality_groups)) {
nat_students <- nationality_groups[[nat]]
# Skip if only one student of this nationality
if (nrow(nat_students) <= 1) {
next
}
# Assign pairs to groups (try to balance group sizes)
for (i in 1:(floor(nrow(nat_students) / 2))) {
student1 <- nat_students[2*i-1, , drop = FALSE]
student2 <- nat_students[2*i, , drop = FALSE]
# Find the least full group
group_sizes <- sapply(groups, function(g) nrow(g$students))
target_group <- which.min(group_sizes)
# Add students to the group
groups[[target_group]]$students <- rbind(groups[[target_group]]$students,
student1, student2)
# Update group metrics
groups[[target_group]]$male_count <- sum(groups[[target_group]]$students[[gender_name]] == "Male")
groups[[target_group]]$female_count <- sum(groups[[target_group]]$students[[gender_name]] == "Female")
# Handle potential NA values in quantitative scores
quant_values <- as.numeric(groups[[target_group]]$students[[quant_name]])
groups[[target_group]]$avg_quant <- mean(quant_values, na.rm = TRUE)
}
}
# Step 2: Handle remaining international students by cultural background
# Identify students not yet assigned
assigned_ids <- unlist(lapply(groups, function(g) {
if (nrow(g$students) > 0) return(g$students[[id_name]])
else return(NULL)
}))
unassigned_intl <- international_students[!(international_students[[id_name]] %in% assigned_ids), ]
# Create a list of students by cultural background
unique_cultures <- unique(unassigned_intl[[culture_name]])
culture_groups <- list()
for (cult in unique_cultures) {
culture_groups[[cult]] <- unassigned_intl[unassigned_intl[[culture_name]] == cult, ]
}
# Assign by cultural background
for (cult in names(culture_groups)) {
cult_students <- culture_groups[[cult]]
# Skip if only one student of this cultural background
if (nrow(cult_students) <= 1) {
next
}
# Assign pairs to groups
for (i in 1:(floor(nrow(cult_students) / 2))) {
student1 <- cult_students[2*i-1, , drop = FALSE]
student2 <- cult_students[2*i, , drop = FALSE]
# Find the least full group
group_sizes <- sapply(groups, function(g) nrow(g$students))
target_group <- which.min(group_sizes)
# Add students to the group
groups[[target_group]]$students <- rbind(groups[[target_group]]$students,
student1, student2)
# Update group metrics
groups[[target_group]]$male_count <- sum(groups[[target_group]]$students[[gender_name]] == "Male")
groups[[target_group]]$female_count <- sum(groups[[target_group]]$students[[gender_name]] == "Female")
# Handle potential NA values in quantitative scores
quant_values <- as.numeric(groups[[target_group]]$students[[quant_name]])
groups[[target_group]]$avg_quant <- mean(quant_values, na.rm = TRUE)
}
}
# Step 3: Assign remaining students (both international and British)
# Update the list of assigned students
assigned_ids <- unlist(lapply(groups, function(g) {
if (nrow(g$students) > 0) return(g$students[[id_name]])
else return(NULL)
}))
unassigned <- students[!(students[[id_name]] %in% assigned_ids), ]
# Calculate current gender balance across groups
total_male <- sum(sapply(groups, function(g) g$male_count))
total_female <- sum(sapply(groups, function(g) g$female_count))
# Sort unassigned students to prioritize balanced assignments
# We'll sort by gender, then by quantitative background
unassigned <- unassigned[order(unassigned[[gender_name]], unassigned[[quant_name]]), ]
# Assign remaining students one by one
for (i in 1:nrow(unassigned)) {
student <- unassigned[i, , drop = FALSE]
# Evaluate each group for best fit
best_score <- -Inf
best_group <- 1
for (j in 1:num_groups) {
# Skip if group is already full
if (nrow(groups[[j]]$students) >= ceiling(num_students / num_groups)) {
next
}
# Calculate gender balance score
gender_score <- 0
if (student[[gender_name]] == "Male") {
gender_score <- -groups[[j]]$male_count
} else {
gender_score <- -groups[[j]]$female_count
}
# Calculate quantitative score (how close to group average)
# Handle potential NA values
student_quant <- as.numeric(student[[quant_name]])
if (is.na(student_quant)) student_quant <- mean(as.numeric(students[[quant_name]]), na.rm = TRUE)
quant_score <- -abs(student_quant - groups[[j]]$avg_quant)
# Calculate size score (prefer smaller groups)
size_score <- -nrow(groups[[j]]$students)
# Combine scores (weighted by importance)
total_score <- gender_score * 2 + quant_score + size_score * 3
if (total_score > best_score) {
best_score <- total_score
best_group <- j
}
}
# Assign student to best group
groups[[best_group]]$students <- rbind(groups[[best_group]]$students, student)
# Update group metrics
groups[[best_group]]$male_count <- sum(groups[[best_group]]$students[[gender_name]] == "Male")
groups[[best_group]]$female_count <- sum(groups[[best_group]]$students[[gender_name]] == "Female")
# Handle potential NA values in quantitative scores
quant_values <- as.numeric(groups[[best_group]]$students[[quant_name]])
groups[[best_group]]$avg_quant <- mean(quant_values, na.rm = TRUE)
}
# Prepare output
students$assigned_group <- NA
for (j in 1:num_groups) {
if (nrow(groups[[j]]$students) > 0) {
for (id in groups[[j]]$students[[id_name]]) {
students$assigned_group[students[[id_name]] == id] <- j
}
}
}
# Calculate group metrics for evaluation
group_metrics <- data.frame(
group = 1:num_groups,
size = sapply(groups, function(g) nrow(g$students)),
male_count = sapply(groups, function(g) g$male_count),
female_count = sapply(groups, function(g) g$female_count),
avg_quant = sapply(groups, function(g) g$avg_quant)
)
# Calculate gender percentages
group_metrics$male_pct <- round(group_metrics$male_count / group_metrics$size * 100, 1)
group_metrics$female_pct <- round(group_metrics$female_count / group_metrics$size * 100, 1)
# Calculate success metrics for nationality and cultural pairings
nat_pairs <- 0
cult_pairs <- 0
for (j in 1:num_groups) {
group_nat_counts <- table(groups[[j]]$students[[nat_name]])
group_cult_counts <- table(groups[[j]]$students[[culture_name]])
# Count nationality pairs (excluding British)
nat_pairs <- nat_pairs + sum(group_nat_counts[names(group_nat_counts) != "British"] >= 2)
# Count cultural background pairs (for international students)
non_british_cultures <- groups[[j]]$students[groups[[j]]$students[[nat_name]] != "British", culture_name]
if (length(non_british_cultures) > 0) {
cult_table <- table(non_british_cultures)
cult_pairs <- cult_pairs + sum(cult_table >= 2)
}
}
# Print summary of results
cat("\nGroup assignment summary:\n")
print(group_metrics)
cat("\nSuccessful nationality pairings (international students):", nat_pairs, "\n")
cat("Successful cultural background pairings (international students):", cult_pairs, "\n\n")
# Return the assignment and metrics
result <- list(
students = students,
group_metrics = group_metrics,
groups = groups,
success_metrics = list(
nationality_pairs = nat_pairs,
cultural_pairs = cult_pairs
)
)
return(result)
}Understanding the Algorithm
The create_balanced_groups function implements a
multi-step greedy algorithm:
- Data Loading and Preparation:
- Reads the student data from a CSV file
- Identifies relevant columns using partial string matching
- Cleans problematic data entries
- Calculates the number of groups needed based on ideal size
- International Student Pairing by Nationality:
- Identifies international students (non-British)
- Groups students of the same nationality
- Places pairs of students from the same nationality in the least-filled groups
- International Student Pairing by Cultural
Background:
- For remaining unassigned international students
- Groups them by cultural background
- Places pairs of students with the same cultural background together
- Optimized Assignment of Remaining Students:
- For all unassigned students (international and British)
- Uses a scoring system based on gender balance, quantitative background, and group size
- Places each student in the group that maximizes the overall balance
- Results Calculation:
- Computes metrics for each group: size, gender count/percentage, average quantitative score
- Counts successful nationality and cultural pairings
- Returns comprehensive results for evaluation and visualization
This approach ensures that international students with the same nationality or cultural background are placed together when possible, while still maintaining overall balance across all criteria.