STUDENT ATTENDANCE HEATMAP

Author

Aaditya Negi & Anshul Kumar

TEAM - 9

Q - Develop a heatmap of student attendance across schools per week during a calender year

Rpub link : https://rpubs.com/PhEnOMeoN/LA-1

Step 1: Load Packages and Data

# Load required libraries for data handling and visualization
library(tidyverse)     # Includes ggplot2 and dplyr for plotting and data wrangling

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)     # For working with date columns
library(scales)        # For formatting axis/legend labels (e.g., percentages)


Attaching package: 'scales'

The following object is masked from 'package:purrr':

    discard

The following object is masked from 'package:readr':

    col_factor

library(viridis)       # For colorblind-friendly color palettes

Loading required package: viridisLite

Attaching package: 'viridis'

The following object is masked from 'package:scales':

    viridis_pal

# Read the CSV file into R (update the filename/path if needed)
attendance_df <- read_csv("2018-2019_Daily_Attendance_20240429.csv")

Rows: 277153 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): School DBN
dbl (5): Date, Enrolled, Absent, Present, Released

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Step 2: Convert and Prepare Columns

attendance_df <- attendance_df %>%
  mutate(
    Date = ymd(Date),                           # Convert to date
    Week = isoweek(Date),                       # Extract ISO week
    AttendanceRate = Present / Enrolled         # Compute attendance rate
  )

Step 3: Group by School and Week

heatmap_data <- attendance_df %>%
  group_by(`School DBN`, Week) %>%                     # Group data by school and week number
  summarise(AvgAttendance = mean(AttendanceRate, na.rm = TRUE)) %>%  # Compute weekly average attendance
  ungroup()                                            # Remove grouping to return regular dataframe

`summarise()` has grouped output by 'School DBN'. You can override using the
`.groups` argument.

Step 4: Filter to Top 20 Schools

top_schools <- attendance_df %>%
  count(`School DBN`, sort = TRUE) %>%  # Count how many records each school has
  slice_head(n = 20) %>%                # Keep only the top 20 schools by record count
  pull(`School DBN`)                    # Extract just the school names into a vector

heatmap_filtered <- heatmap_data %>%
  filter(`School DBN` %in% top_schools)  # Keep only the records from the top 20 schools

Step 5: Create the Heatmap

library(ggplot2)
library(forcats)
library(scales)
library(viridis)

ggplot(heatmap_filtered, aes(
  x = factor(Week),
  y = fct_reorder(`School DBN`, AvgAttendance),
  fill = AvgAttendance
)) +
  geom_tile(color = "white") +                                # Create the heatmap tiles with white borders
  scale_fill_viridis_c(                                       # Use colorblind-friendly gradient for attendance
    name = "Attendance Rate",                                 # Legend title
    option = "D",
    labels = percent_format(accuracy = 1)                     # Format fill values as percentages
  ) +
  labs(
    title = "Weekly Attendance Heatmap by School (2018–2019)",          # Main plot title
    subtitle = "Each tile represents the average attendance rate per school per week",  # Subtitle
    caption = "Data Source: NYC DOE via Kaggle | Visualization by Your Name",           # Caption with attribution
    x = "Week Number",                                                  # X-axis label
    y = "School DBN"                                                    # Y-axis label
  ) +
  theme_minimal(base_size = 12) +                        # Apply clean minimal theme with larger font
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),   # Tilt x-axis labels for readability
    plot.title = element_text(face = "bold", size = 14), # Make the title bold and larger
    plot.caption = element_text(hjust = 0)               # Align caption to the left
  )