Overview

Purpose:

Clean + decode coded survey/student fields into readable labels
Validate codes to prevent silent mapping issues
Export a Power BI–ready dataset
Run a lightweight EDA to summarize distributions + at-risk patterns

Data Preparation

0) Load Packages

These packages support fast, readable data wrangling and clean printing.

required_pkgs <- c("readr", "dplyr", "tibble", "tidyr", "stringr", "knitr")

to_install <- setdiff(required_pkgs, rownames(installed.packages()))
if (length(to_install) > 0) install.packages(to_install)

invisible(lapply(required_pkgs, library, character.only = TRUE))

1) File Paths

Keeping stable paths ensures Power BI refresh works consistently.

BASE_DIR <- "C:/Users/yudit/Downloads/UWG/2nd-Round/Activities/R_PowerBI"
INPUT_PATH  <- file.path(BASE_DIR, "DATA.csv")
OUTPUT_PATH <- file.path(BASE_DIR, "students_db_R.csv")

# Guardrail: fail early if input is missing
if (!file.exists(INPUT_PATH)) {
  stop(paste0(
    "Input file not found: ", INPUT_PATH,
    "\nMake sure DATA.csv is in: ", BASE_DIR
  ))
}

2) Load Data

Read the raw coded dataset exactly once; all transformations happen after.

df <- readr::read_csv(INPUT_PATH, show_col_types = FALSE)

3) Rename Columns

Raw columns use generic identifiers (“1”,“2”,“3”…). Renaming makes the dataset self-describing for analysis + BI modeling.

rename_map <- c(
  "STUDENT ID" = "student_id",
  "COURSE ID"  = "course_id",
  "GRADE"      = "output_grade_code",
  "1"  = "student_age_group_code",
  "2"  = "sex_code",
  "3"  = "high_school_type_code",
  "4"  = "scholarship_type_code",
  "5"  = "additional_work_code",
  "6"  = "art_sports_activity_code",
  "7"  = "has_partner_code",
  "8"  = "salary_group_code",
  "9"  = "transportation_code",
  "10" = "accommodation_code",
  "11" = "mother_education_code",
  "12" = "father_education_code",
  "13" = "siblings_group_code",
  "14" = "parental_status_code",
  "15" = "mother_occupation_code",
  "16" = "father_occupation_code",
  "17" = "weekly_study_hours_code",
  "18" = "reading_non_scientific_code",
  "19" = "reading_scientific_code",
  "20" = "attends_seminars_code",
  "21" = "project_impact_code",
  "22" = "class_attendance_code",
  "23" = "midterm1_preparation_code",
  "24" = "midterm2_preparation_code",
  "25" = "taking_notes_code",
  "26" = "listening_in_class_code",
  "27" = "discussion_improves_success_code",
  "28" = "flip_classroom_code",
  "29" = "last_semester_gpa_group_code",
  "30" = "expected_grad_gpa_group_code"
)

old_names <- names(df)
names(df) <- ifelse(old_names %in% names(rename_map), rename_map[old_names], old_names)

4) Decode: Code → Label Mappings

Codes are useful for storage, but labels are essential for communication, reporting, and stakeholder-facing insights.

maps <- list(
  student_age_group_code = c("1"="18-21", "2"="22-25", "3"="above 26"),
  sex_code = c("1"="female", "2"="male"),
  high_school_type_code = c("1"="private", "2"="state", "3"="other"),
  scholarship_type_code = c("1"="0%", "2"="25%", "3"="50%", "4"="75%", "5"="Full"),
  additional_work_code = c("1"="Yes", "2"="No"),
  art_sports_activity_code = c("1"="Yes", "2"="No"),
  has_partner_code = c("1"="Yes", "2"="No"),
  salary_group_code = c("1"="USD 135-200", "2"="USD 201-270", "3"="USD 271-340", "4"="USD 341-410", "5"="above 410"),
  transportation_code = c("1"="Bus", "2"="Private car/taxi", "3"="bicycle", "4"="Other"),
  accommodation_code = c("1"="rental", "2"="dormitory", "3"="with family", "4"="Other"),
  mother_education_code = c("1"="primary school", "2"="secondary school", "3"="high school", "4"="university", "5"="MSc.", "6"="Ph.D."),
  father_education_code = c("1"="primary school", "2"="secondary school", "3"="high school", "4"="university", "5"="MSc.", "6"="Ph.D."),
  siblings_group_code = c("1"="1", "2"="2", "3"="3", "4"="4", "5"="5 or above"),
  parental_status_code = c("1"="married", "2"="divorced", "3"="died - one of them or both"),
  mother_occupation_code = c("1"="retired", "2"="housewife", "3"="government officer", "4"="private sector employee", "5"="self-employment", "6"="other"),
  father_occupation_code = c("1"="retired", "2"="government officer", "3"="private sector employee", "4"="self-employment", "5"="other"),
  weekly_study_hours_code = c("1"="0 hours", "2"="<5 hours", "3"="6-10 hours", "4"="11-20 hours", "5"="more than 20 hours"),
  reading_non_scientific_code = c("1"="Never", "2"="Sometimes", "3"="Often"),
  reading_scientific_code = c("1"="Never", "2"="Sometimes", "3"="Often"),
  attends_seminars_code = c("1"="Yes", "2"="No"),
  project_impact_code = c("1"="positive", "2"="negative", "3"="neutral"),
  class_attendance_code = c("1"="always", "2"="sometimes", "3"="never"),
  midterm1_preparation_code = c("1"="alone", "2"="with friends", "3"="not applicable"),
  midterm2_preparation_code = c("1"="closest date to the exam", "2"="regularly during the semester", "3"="never"),
  taking_notes_code = c("1"="never", "2"="sometimes", "3"="always"),
  listening_in_class_code = c("1"="never", "2"="sometimes", "3"="always"),
  discussion_improves_success_code = c("1"="never", "2"="sometimes", "3"="always"),
  flip_classroom_code = c("1"="not useful", "2"="useful", "3"="not applicable"),
  last_semester_gpa_group_code = c("1"="<2.00", "2"="2.00-2.49", "3"="2.50-2.99", "4"="3.00-3.49", "5"="above 3.49"),
  expected_grad_gpa_group_code = c("1"="<2.00", "2"="2.00-2.49", "3"="2.50-2.99", "4"="3.00-3.49", "5"="above 3.49"),
  output_grade_code = c("0"="Fail", "1"="DD", "2"="DC", "3"="CC", "4"="CB", "5"="BB", "6"="BA", "7"="AA")
)

5) Validate Codes (Data Integrity Check)

If the source data includes a new/unknown code, we STOP early so the analyst can update the mapping. This prevents blank labels and bad reporting.

for (col in names(maps)) {
  if (!col %in% names(df)) {
    stop(paste0("Expected column '", col, "' was not found after renaming."))
  }

  present <- as.character(unique(stats::na.omit(df[[col]])))
  allowed <- names(maps[[col]])
  bad <- setdiff(present, allowed)

  if (length(bad) > 0) {
    stop(paste0(col, " has unexpected codes: {", paste(bad, collapse = ", "), "}"))
  }
}

cat("✅ All codes validated successfully.\n")

## ✅ All codes validated successfully.

6) Create Label Columns

Keeping both versions helps: BI modeling (codes can be stable keys) and readability (labels improve interpretability).

for (code_col in names(maps)) {
  label_col <- sub("_code$", "", code_col)
  df[[label_col]] <- unname(maps[[code_col]][as.character(df[[code_col]])])
}

7) Export for Power BI

write_excel_csv writes UTF-8 with BOM so Excel/Power BI read it cleanly.

readr::write_excel_csv(df, OUTPUT_PATH)
cat(sprintf("✅ Export complete: %s  (rows=%s, cols=%s)\n",
            OUTPUT_PATH, format(nrow(df), big.mark=","), ncol(df)))

## ✅ Export complete: C:/Users/yudit/Downloads/UWG/2nd-Round/Activities/R_PowerBI/students_db_R.csv  (rows=145, cols=64)

Exploratory Data Analysis (EDA)

Goal: Provide quick, stakeholder-ready summaries of overall performance, distributions of key behaviors/demographics, and performance differences by group.

# Helper functions
safe_mean <- function(x) mean(suppressWarnings(as.numeric(x)), na.rm = TRUE)

# Percent display helpers (presentation-friendly)
fmt_pct100 <- function(x) sprintf("%.2f%%", x)
fmt_pct01  <- function(x) sprintf("%.2f%%", x * 100)

# Frequency table helper
freq_table <- function(data, col, dropna = FALSE) {
  if (!col %in% names(data)) return(NULL)

  s <- data[[col]]
  if (dropna) s <- s[!is.na(s)]

  tibble::tibble(value = s) %>%
    dplyr::filter(!is.na(value)) %>%
    dplyr::count(value, name = "count", sort = TRUE) %>%
    dplyr::mutate(percent = fmt_pct100((count / sum(count)) * 100))
}

1) Data Snapshot

Quick sanity check that data loaded correctly and columns look right.

cat("Shape (rows, cols):", paste(dim(df), collapse = ", "), "\n")

## Shape (rows, cols): 145, 64

dplyr::as_tibble(head(df, 5))

## # A tibble: 5 × 64
##   student_id student_age_group_code sex_code high_school_type_code
##   <chr>                       <dbl>    <dbl>                 <dbl>
## 1 STUDENT1                        2        2                     3
## 2 STUDENT2                        2        2                     3
## 3 STUDENT3                        2        2                     2
## 4 STUDENT4                        1        1                     1
## 5 STUDENT5                        2        2                     1
## # ℹ 60 more variables: scholarship_type_code <dbl>, additional_work_code <dbl>,
## #   art_sports_activity_code <dbl>, has_partner_code <dbl>,
## #   salary_group_code <dbl>, transportation_code <dbl>,
## #   accommodation_code <dbl>, mother_education_code <dbl>,
## #   father_education_code <dbl>, siblings_group_code <dbl>,
## #   parental_status_code <dbl>, mother_occupation_code <dbl>,
## #   father_occupation_code <dbl>, weekly_study_hours_code <dbl>, …

2) Schema / Missing / Duplicates

Identify data quality issues early (types, nulls, duplicates).

# Column data types
dtypes <- tibble::tibble(
  column = names(df),
  dtype  = vapply(df, function(x) paste(class(x), collapse=","), character(1))
) %>%
  dplyr::arrange(dtype, column)

cat("Dtypes (sorted):\n")

## Dtypes (sorted):

knitr::kable(dtypes)

column	dtype
accommodation	character
additional_work	character
art_sports_activity	character
attends_seminars	character
class_attendance	character
discussion_improves_success	character
expected_grad_gpa_group	character
father_education	character
father_occupation	character
flip_classroom	character
has_partner	character
high_school_type	character
last_semester_gpa_group	character
listening_in_class	character
midterm1_preparation	character
midterm2_preparation	character
mother_education	character
mother_occupation	character
output_grade	character
parental_status	character
project_impact	character
reading_non_scientific	character
reading_scientific	character
salary_group	character
scholarship_type	character
sex	character
siblings_group	character
student_age_group	character
student_id	character
taking_notes	character
transportation	character
weekly_study_hours	character
accommodation_code	numeric
additional_work_code	numeric
art_sports_activity_code	numeric
attends_seminars_code	numeric
class_attendance_code	numeric
course_id	numeric
discussion_improves_success_code	numeric
expected_grad_gpa_group_code	numeric
father_education_code	numeric
father_occupation_code	numeric
flip_classroom_code	numeric
has_partner_code	numeric
high_school_type_code	numeric
last_semester_gpa_group_code	numeric
listening_in_class_code	numeric
midterm1_preparation_code	numeric
midterm2_preparation_code	numeric
mother_education_code	numeric
mother_occupation_code	numeric
output_grade_code	numeric
parental_status_code	numeric
project_impact_code	numeric
reading_non_scientific_code	numeric
reading_scientific_code	numeric
salary_group_code	numeric
scholarship_type_code	numeric
sex_code	numeric
siblings_group_code	numeric
student_age_group_code	numeric
taking_notes_code	numeric
transportation_code	numeric
weekly_study_hours_code	numeric

# Missing values table (count + %)
missing_tbl <- tibble::tibble(
  column = names(df),
  missing_n = colSums(is.na(df)),
  missing_pct = round((missing_n / nrow(df)) * 100, 2)
) %>%
  dplyr::arrange(dplyr::desc(missing_n))

cat("\nMissing values (top 25):\n")

## 
## Missing values (top 25):

if (all(missing_tbl$missing_n == 0)) {
  cat("✅ No missing values detected (all columns have 0 missing).\n\n")
}

## ✅ No missing values detected (all columns have 0 missing).

knitr::kable(dplyr::slice_head(missing_tbl, n = 25))

column	missing_n	missing_pct
student_id	0	0
student_age_group_code	0	0
sex_code	0	0
high_school_type_code	0	0
scholarship_type_code	0	0
additional_work_code	0	0
art_sports_activity_code	0	0
has_partner_code	0	0
salary_group_code	0	0
transportation_code	0	0
accommodation_code	0	0
mother_education_code	0	0
father_education_code	0	0
siblings_group_code	0	0
parental_status_code	0	0
mother_occupation_code	0	0
father_occupation_code	0	0
weekly_study_hours_code	0	0
reading_non_scientific_code	0	0
reading_scientific_code	0	0
attends_seminars_code	0	0
project_impact_code	0	0
class_attendance_code	0	0
midterm1_preparation_code	0	0
midterm2_preparation_code	0	0

# Duplicates
if ("student_id" %in% names(df)) {
  dup_n <- sum(duplicated(df$student_id))
  cat("\nDuplicate student_id:", dup_n, "\n")
} else {
  dup_n <- sum(duplicated(df))
  cat("\nDuplicate rows:", dup_n, "\n")
}

## 
## Duplicate student_id: 0

3) Outcome Setup

Translate grade categories into numeric GPA points for aggregation, then define pass/fail and at-risk for student success monitoring.

GRADE_POINTS_MAP <- c(
  "AA"=4.0, "BA"=3.5, "BB"=3.0,
  "CB"=2.5, "CC"=2.0,
  "DC"=1.5, "DD"=1.0,
  "Fail"=0.0
)

grade_col <- if ("output_grade" %in% names(df)) "output_grade" else NULL
gp_col <- if ("grade_points" %in% names(df)) "grade_points" else "Grade Points"

if (!is.null(grade_col) && !(gp_col %in% names(df))) {
  df[[gp_col]] <- unname(GRADE_POINTS_MAP[as.character(df[[grade_col]])])
}

FAIL_SET   <- c("Fail")
AT_RISK_SET <- c("DC", "DD", "Fail")

if (!is.null(grade_col)) {
  df$is_fail    <- df[[grade_col]] %in% FAIL_SET
  df$is_at_risk <- df[[grade_col]] %in% AT_RISK_SET
  df$is_pass    <- !df$is_fail
} else {
  cat("WARNING: 'output_grade' column not found. Outcome flags not created.\n")
}

created <- c("is_pass","is_fail","is_at_risk", gp_col)
cat("Created columns:", paste(created[created %in% names(df)], collapse=", "), "\n")

## Created columns: is_pass, is_fail, is_at_risk, Grade Points

4) Overall KPIs

High-level performance summary (health check).

N <- if ("student_id" %in% names(df)) dplyr::n_distinct(df$student_id) else nrow(df)
avg_gpa <- if (gp_col %in% names(df)) safe_mean(df[[gp_col]]) else NaN
pass_rate    <- if ("is_pass" %in% names(df)) mean(df$is_pass, na.rm = TRUE) else NaN
fail_rate    <- if ("is_fail" %in% names(df)) mean(df$is_fail, na.rm = TRUE) else NaN
at_risk_rate <- if ("is_at_risk" %in% names(df)) mean(df$is_at_risk, na.rm = TRUE) else NaN

cat("Total Students:", N, "\n")

## Total Students: 145

cat(sprintf("Avg GPA (grade-mapped): %.2f\n", avg_gpa))

## Avg GPA (grade-mapped): 2.09

cat("Pass Rate:", fmt_pct01(pass_rate), "\n")

## Pass Rate: 94.48%

cat("Fail Rate:", fmt_pct01(fail_rate), "\n")

## Fail Rate: 5.52%

cat("At-Risk (DC/DD/Fail):", fmt_pct01(at_risk_rate), "\n")

## At-Risk (DC/DD/Fail): 46.21%

Interpretation: If at-risk is high, focus analysis on groups with elevated at-risk rates.

5) Final Grade Distribution

Shows how outcomes are distributed—useful for spotting clustering at low grades.

if (!is.null(grade_col)) {
  freq_table(df, grade_col)
}

## # A tibble: 8 × 3
##   value count percent
##   <chr> <int> <chr>  
## 1 DD       35 24.14% 
## 2 DC       24 16.55% 
## 3 CC       21 14.48% 
## 4 AA       17 11.72% 
## 5 BB       17 11.72% 
## 6 BA       13 8.97%  
## 7 CB       10 6.90%  
## 8 Fail      8 5.52%

6) Behavior / Engagement Distributions

Highlights dominant study/attendance patterns that may relate to outcomes.

behavior_cols <- c("weekly_study_hours", "midterm2_preparation", "class_attendance", 
                   "attends_seminars", "additional_work")

for (c in behavior_cols) {
  if (c %in% names(df)) {
    cat("\n", c, ":\n", sep = "")
    print(freq_table(df, c))
  }
}

## 
## weekly_study_hours:
## # A tibble: 5 × 3
##   value              count percent
##   <chr>              <int> <chr>  
## 1 <5 hours              74 51.03% 
## 2 6-10 hours            30 20.69% 
## 3 0 hours               29 20.00% 
## 4 11-20 hours            8 5.52%  
## 5 more than 20 hours     4 2.76%  
## 
## midterm2_preparation:
## # A tibble: 3 × 3
##   value                         count percent
##   <chr>                         <int> <chr>  
## 1 closest date to the exam        123 84.83% 
## 2 regularly during the semester    20 13.79% 
## 3 never                             2 1.38%  
## 
## class_attendance:
## # A tibble: 2 × 3
##   value     count percent
##   <chr>     <int> <chr>  
## 1 always      110 75.86% 
## 2 sometimes    35 24.14% 
## 
## attends_seminars:
## # A tibble: 2 × 3
##   value count percent
##   <chr> <int> <chr>  
## 1 Yes     114 78.62% 
## 2 No       31 21.38% 
## 
## additional_work:
## # A tibble: 2 × 3
##   value count percent
##   <chr> <int> <chr>  
## 1 No       96 66.21% 
## 2 Yes      49 33.79%

Interpretation: Look for behaviors with both high prevalence and high at-risk rates.

7) Performance by Key Groups

Compares average GPA + at-risk/fail rates across groups to identify gaps.

perf_by_group <- function(data, group_col, min_n = 0) {
  if (!(group_col %in% names(data))) return(NULL)
  if (!(gp_col %in% names(data))) return(NULL)
  if (!("is_fail" %in% names(data)) || !("is_at_risk" %in% names(data))) return(NULL)

  out <- data %>%
    dplyr::group_by(.data[[group_col]], .drop = FALSE) %>%
    dplyr::summarise(
      students = if ("student_id" %in% names(data)) dplyr::n_distinct(student_id) else dplyr::n(),
      avg_gpa = safe_mean(.data[[gp_col]]),
      fail_rate = mean(is_fail, na.rm = TRUE),
      at_risk_rate = mean(is_at_risk, na.rm = TRUE),
      .groups = "drop"
    ) %>%
    dplyr::mutate(avg_gpa = round(avg_gpa, 2))

  if (min_n > 0) out <- dplyr::filter(out, students >= min_n)

  out <- dplyr::arrange(out, dplyr::desc(at_risk_rate), avg_gpa)

  out %>%
    dplyr::mutate(
      fail_rate = fmt_pct01(fail_rate),
      at_risk_rate = fmt_pct01(at_risk_rate)
    ) %>%
    { names(.)[1] <- group_col; . }
}

key_groups <- c("additional_work", "class_attendance", "attends_seminars", 
                "weekly_study_hours", "midterm2_preparation")

for (c in key_groups) {
  if (c %in% names(df)) {
    cat("\nPerformance by ", c, ":\n", sep = "")
    print(perf_by_group(df, c))
  }
}

## 
## Performance by additional_work:
## # A tibble: 2 × 5
##   additional_work students avg_gpa fail_rate at_risk_rate
##   <chr>              <int>   <dbl> <chr>     <chr>       
## 1 Yes                   49    1.81 10.20%    53.06%      
## 2 No                    96    2.23 3.12%     42.71%      
## 
## Performance by class_attendance:
## # A tibble: 2 × 5
##   class_attendance students avg_gpa fail_rate at_risk_rate
##   <chr>               <int>   <dbl> <chr>     <chr>       
## 1 sometimes              35    1.81 5.71%     62.86%      
## 2 always                110    2.17 5.45%     40.91%      
## 
## Performance by attends_seminars:
## # A tibble: 2 × 5
##   attends_seminars students avg_gpa fail_rate at_risk_rate
##   <chr>               <int>   <dbl> <chr>     <chr>       
## 1 No                     31    1.66 12.90%    64.52%      
## 2 Yes                   114    2.2  3.51%     41.23%      
## 
## Performance by weekly_study_hours:
## # A tibble: 5 × 5
##   weekly_study_hours students avg_gpa fail_rate at_risk_rate
##   <chr>                 <int>   <dbl> <chr>     <chr>       
## 1 more than 20 hours        4    1.38 25.00%    75.00%      
## 2 0 hours                  29    1.93 3.45%     55.17%      
## 3 <5 hours                 74    2.2  5.41%     44.59%      
## 4 6-10 hours               30    2.1  6.67%     43.33%      
## 5 11-20 hours               8    1.94 0.00%     25.00%      
## 
## Performance by midterm2_preparation:
## # A tibble: 3 × 5
##   midterm2_preparation          students avg_gpa fail_rate at_risk_rate
##   <chr>                            <int>   <dbl> <chr>     <chr>       
## 1 closest date to the exam           123    2.06 5.69%     47.97%      
## 2 regularly during the semester       20    2.15 5.00%     40.00%      
## 3 never                                2    3    0.00%     0.00%

Interpretation: Prioritize groups with larger student counts and higher at-risk rates.

8) Demographics

Useful for equity lens / segmentation while avoiding tiny-sample noise.

Distribution

demo_cols <- c("sex", "student_age_group", "scholarship_type", "high_school_type", "salary_group")

for (c in demo_cols) {
  if (c %in% names(df)) {
    cat("\n", c, ":\n", sep = "")
    print(freq_table(df, c))
  }
}

## 
## sex:
## # A tibble: 2 × 3
##   value  count percent
##   <chr>  <int> <chr>  
## 1 male      87 60.00% 
## 2 female    58 40.00% 
## 
## student_age_group:
## # A tibble: 3 × 3
##   value    count percent
##   <chr>    <int> <chr>  
## 1 22-25       70 48.28% 
## 2 18-21       65 44.83% 
## 3 above 26    10 6.90%  
## 
## scholarship_type:
## # A tibble: 5 × 3
##   value count percent
##   <chr> <int> <chr>  
## 1 50%      76 52.41% 
## 2 75%      42 28.97% 
## 3 Full     23 15.86% 
## 4 25%       3 2.07%  
## 5 0%        1 0.69%  
## 
## high_school_type:
## # A tibble: 3 × 3
##   value   count percent
##   <chr>   <int> <chr>  
## 1 state     103 71.03% 
## 2 private    25 17.24% 
## 3 other      17 11.72% 
## 
## salary_group:
## # A tibble: 5 × 3
##   value       count percent
##   <chr>       <int> <chr>  
## 1 USD 135-200    93 64.14% 
## 2 USD 201-270    27 18.62% 
## 3 USD 271-340    16 11.03% 
## 4 above 410       5 3.45%  
## 5 USD 341-410     4 2.76%

Performance by Demographics (n ≥ 5)

for (c in demo_cols) {
  if (c %in% names(df)) {
    cat("\n", c, " (n>=5):\n", sep = "")
    print(perf_by_group(df, c, min_n = 5))
  }
}

## 
## sex (n>=5):
## # A tibble: 2 × 5
##   sex    students avg_gpa fail_rate at_risk_rate
##   <chr>     <int>   <dbl> <chr>     <chr>       
## 1 female       58    1.59 13.79%    62.07%      
## 2 male         87    2.41 0.00%     35.63%      
## 
## student_age_group (n>=5):
## # A tibble: 3 × 5
##   student_age_group students avg_gpa fail_rate at_risk_rate
##   <chr>                <int>   <dbl> <chr>     <chr>       
## 1 22-25                   70    2.04 2.86%     48.57%      
## 2 18-21                   65    2.17 9.23%     44.62%      
## 3 above 26                10    1.85 0.00%     40.00%      
## 
## scholarship_type (n>=5):
## # A tibble: 3 × 5
##   scholarship_type students avg_gpa fail_rate at_risk_rate
##   <chr>               <int>   <dbl> <chr>     <chr>       
## 1 Full                   23    1.83 0.00%     56.52%      
## 2 50%                    76    1.92 3.95%     51.32%      
## 3 75%                    42    2.49 11.90%    33.33%      
## 
## high_school_type (n>=5):
## # A tibble: 3 × 5
##   high_school_type students avg_gpa fail_rate at_risk_rate
##   <chr>               <int>   <dbl> <chr>     <chr>       
## 1 private                25    1.76 8.00%     60.00%      
## 2 other                  17    2.18 0.00%     47.06%      
## 3 state                 103    2.15 5.83%     42.72%      
## 
## salary_group (n>=5):
## # A tibble: 4 × 5
##   salary_group students avg_gpa fail_rate at_risk_rate
##   <chr>           <int>   <dbl> <chr>     <chr>       
## 1 USD 271-340        16    1.72 6.25%     62.50%      
## 2 above 410           5    1.8  20.00%    60.00%      
## 3 USD 201-270        27    1.96 3.70%     59.26%      
## 4 USD 135-200        93    2.23 4.30%     38.71%

9) Course Summary

Helps identify courses with unusually high at-risk rates (potential action areas).

if ("course_id" %in% names(df)) {
  course_summary <- df %>%
    dplyr::group_by(course_id) %>%
    dplyr::summarise(
      students = if ("student_id" %in% names(df)) dplyr::n_distinct(student_id) else dplyr::n(),
      avg_gpa = safe_mean(.data[[gp_col]]),
      at_risk_rate = mean(is_at_risk, na.rm = TRUE),
      fail_rate = mean(is_fail, na.rm = TRUE),
      .groups = "drop"
    ) %>%
    dplyr::mutate(avg_gpa = round(avg_gpa, 2)) %>%
    dplyr::arrange(dplyr::desc(at_risk_rate), dplyr::desc(students)) %>%
    dplyr::mutate(
      at_risk_rate = fmt_pct01(at_risk_rate),
      fail_rate = fmt_pct01(fail_rate)
    )

  print(course_summary)
} else {
  cat("course_id column not found — skipping course summary.\n")
}

## # A tibble: 9 × 5
##   course_id students avg_gpa at_risk_rate fail_rate
##       <dbl>    <int>   <dbl> <chr>        <chr>    
## 1         8       14    1.14 100.00%      7.14%    
## 2         1       66    1.62 62.12%       4.55%    
## 3         9       21    1.5  52.38%       19.05%   
## 4         2        2    2    50.00%       0.00%    
## 5         7       15    3.67 0.00%        0.00%    
## 6         3        8    3.56 0.00%        0.00%    
## 7         6        8    3.44 0.00%        0.00%    
## 8         5        7    3.14 0.00%        0.00%    
## 9         4        4    2.75 0.00%        0.00%

Interpretation: Focus first on courses with high at-risk rates and larger enrollments.

Summary

This analysis prepared the student dataset for Power BI and identified key patterns:

Overall at-risk rate: ~46% of students are at-risk (DC/DD/Fail)
Key behavioral factors: Seminar attendance, class attendance, and study hours are associated with differences in at-risk rates
Courses of concern: Some courses show 100% at-risk rates and warrant further investigation

Student Dataset - Data Preparation + EDA

Yudheksha Govindarajan Kavitha

2025-12-31