# Load required libraries for data handling and visualization
library(tidyverse) # Includes ggplot2 and dplyr for plotting and data wrangling
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.4
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate) # For working with date columns
library(scales) # For formatting axis/legend labels (e.g., percentages)
Attaching package: 'scales'
The following object is masked from 'package:purrr':
discard
The following object is masked from 'package:readr':
col_factor
library(viridis) # For colorblind-friendly color palettes
Loading required package: viridisLite
Attaching package: 'viridis'
The following object is masked from 'package:scales':
viridis_pal
# Read the CSV file into R (update the filename/path if needed)
# NOTE(review): readr's column specification reports `Date` as dbl, so it is
# numeric at this point rather than a Date; convert it before date arithmetic.
attendance_df <- read_csv("2018-2019_Daily_Attendance_20240429.csv")
Rows: 277153 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): School DBN
dbl (5): Date, Enrolled, Absent, Present, Released
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Average the attendance rate for each school within each week.
# NOTE(review): `Week` and `AttendanceRate` are not among the columns read from
# the CSV (School DBN, Date, Enrolled, Absent, Present, Released), so they must
# be derived on `attendance_df` before this step; confirm that step exists.
heatmap_data <- attendance_df %>%
  # Group data by school and week number
  group_by(`School DBN`, Week) %>%
  # Compute weekly average attendance, ignoring missing rates
  summarise(AvgAttendance = mean(AttendanceRate, na.rm = TRUE)) %>%
  # Remove the remaining school-level grouping to return a regular data frame
  ungroup()
`summarise()` has grouped output by 'School DBN'. You can override using the
`.groups` argument.
Step 4: Filter to Top 20 Schools
# Identify the 20 schools with the most attendance records
top_schools <- attendance_df %>%
  count(`School DBN`, sort = TRUE) %>% # Count how many records each school has
  slice_head(n = 20) %>% # Keep only the top 20 schools by record count
  pull(`School DBN`) # Extract just the School DBN values into a vector

# Restrict the weekly averages to those schools
heatmap_filtered <- heatmap_data %>%
  filter(`School DBN` %in% top_schools) # Keep only rows for the top 20 schools
Step 5: Create the Heatmap
# Packages used by the plot below (one call per line so each one is parsed)
library(ggplot2)
library(forcats)
library(scales)
library(viridis)

# Heatmap of average weekly attendance: one row per school, one column per
# week, tile colour encoding the attendance rate.
# NOTE(review): factor(Week) orders weeks by their numeric value; if Week is a
# calendar week number, a school year spanning two calendar years will not be
# shown in chronological order. Confirm this is the intended x-axis order.
ggplot(
  heatmap_filtered,
  aes(
    x = factor(Week),
    # Order schools on the y-axis by their attendance
    y = fct_reorder(`School DBN`, AvgAttendance),
    fill = AvgAttendance
  )
) +
  # Create the heatmap tiles with white borders
  geom_tile(color = "white") +
  # Use colorblind-friendly gradient for attendance
  scale_fill_viridis_c(
    name = "Attendance Rate", # Legend title
    option = "D",
    labels = percent_format(accuracy = 1) # Format fill values as percentages
  ) +
  labs(
    # Main plot title
    title = "Weekly Attendance Heatmap by School (2018–2019)",
    # Subtitle
    subtitle = "Each tile represents the average attendance rate per school per week",
    # Caption with attribution
    caption = "Data Source: NYC DOE via Kaggle | Visualization by Your Name",
    x = "Week Number", # X-axis label
    y = "School DBN" # Y-axis label
  ) +
  # Apply clean minimal theme with a 12 pt base font
  theme_minimal(base_size = 12) +
  theme(
    # Tilt x-axis labels for readability
    axis.text.x = element_text(angle = 45, hjust = 1),
    # Make the title bold and larger
    plot.title = element_text(face = "bold", size = 14),
    # Align caption to the left
    plot.caption = element_text(hjust = 0)
  )