import fitz  # PyMuPDF
import re
import pandas as pd
import os

# === Woke keywords ===
WOKE_TERMS = [
    "social justice", "colonialism", "land acknowledgement", "oppression",
    "marginalized", "intersectional", "equity", "inclusivity", "diversity",
    "privilege", "systemic", "anti-racism", "decolonization", "reparations",
    "intersectionality", "systemic racism", "white supremacy", "microaggressions",
    "cultural appropriation", "inclusion", "DEI", "postcolonial", "indigenous rights",
    "settler colonialism", "gender identity", "queer theory", "trans rights", "feminism",
    "critical race theory", "white privilege", "liberation", "critical theory",
    "racism", "imperialism", "decolonial", "anti-blackness", "xenophobia"
]
# === Department headings ===
DEPARTMENT_HEADINGS = [
    "African and African American Studies", "American Studies", "Anthropology", "Applied Computation",
    "Applied Mathematics", "Applied Physics", "Art, Film, and Visual Studies", "Astronomy",
    "Biological Sciences in Dental Medicine", "Biological Sciences in Public Health", "Biomedical Engineering",
    "Biophysics", "Biostatistics", "Celtic Languages and Literatures", "Chemical and Physical Biology",
    "Chemical Biology", "Chemistry and Chemical Biology", "Classics, The", "Comparative Literature",
    "Computer Science", "Earth and Planetary Sciences", "East Asian Languages and Civilizations", "Economics",
    "Education Studies", "Engineering Sciences", "English", "Environmental Science and Public Policy",
    "Environmental Science and Engineering", "Ethnicity, Migration, Rights", "Expository Writing",
    "Folklore and Mythology", "General Education", "Germanic Languages and Literatures",
    "Global Health and Health Policy", "Government", "Health Policy", "History", "History and Literature",
    "History of Art and Architecture", "History of Science", "Human Evolutionary Biology", "Humanities",
    "Inner Asian and Altaic Studies", "Linguistics", "Mathematics", "Medical Sciences", "Medieval Studies",
    "Mind, Brain, and Behavior", "Molecular and Cellular Biology", "Music", "Near Eastern Languages and Civilizations",
    "Neuroscience", "Organismic and Evolutionary Biology", "Philosophy", "Physics",
    "Political Economy and Government", "Population Health Sciences", "Psychology", "Public Policy",
    "Quantum Science & Engineering", "Regional Studies - East Asia", "Religion, The Study of",
    "Romance Languages and Literatures", "Russia, Eastern Europe, and Central Asia",
    "Slavic Languages and Literatures", "Social Policy", "Social Studies", "Sociology", "South Asian Studies",
    "Special Concentrations", "Statistics", "Stem Cell and Regenerative Biology", "Systems Biology",
    "Theater, Dance, and Media", "Ukrainian Studies", "Women, Gender, and Sexuality, Studies of"
]
# === Extract text from PDF ===
def extract_text(pdf_path):
    print("📄 Reading PDF...")
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        blocks = page.get_text("blocks")
        for block in blocks:
            block_text = block[4].strip()
            if block_text and not block_text.startswith("HARVARD UNIVERSITY"):
                text += block_text + "\n"
    doc.close()
    return text
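# Reference for the tuple layout used above (PyMuPDF "blocks" mode): each block
# is (x0, y0, x1, y1, text, block_no, block_type), so block[4] is the text.
# A quick, hypothetical peek at page one of the catalog:
# with fitz.open("fas_crse_cat.pdf") as doc:
#     for b in doc[0].get_text("blocks")[:3]:
#         print(b[:4], repr(b[4])[:60])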
# === Split and parse courses ===
def parse_courses(text):
    print("✂️ Splitting and parsing courses...")
    lines = text.splitlines()
    courses = []
    current_dept = "Unknown"
    buffer = []
    code_pattern = re.compile(r'^([A-Z]{3,15})\s+\d{1,4}[A-Z]*\s*(\(\d{1,2}\))?')
    dept_lookup = {d.lower(): d for d in DEPARTMENT_HEADINGS}
    for line in lines:
        line = line.strip()
        if not line:
            continue
        line_lower = line.lower()
        # Check if line is a department heading
        matched_dept = dept_lookup.get(line_lower)
        if matched_dept:
            # Flush the buffered course under the department it was read in,
            # then switch to the new department
            if buffer:
                courses.append(process_buffer(buffer, current_dept))
                buffer = []
            current_dept = matched_dept
            continue
        # Check if line starts a new course entry
        match = code_pattern.match(line)
        if match:
            if buffer:
                courses.append(process_buffer(buffer, current_dept))
            buffer = [line]
        elif buffer:
            # Continuation line; text before the first course code is dropped
            buffer.append(line)
    # Process the final buffer
    if buffer:
        courses.append(process_buffer(buffer, current_dept))
    print(f"📦 Found {len(courses)} course entries")
    return courses
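# Hedged spot check of the course-code regex used above; the sample lines are
# made up for illustration, not taken from the catalog:
_code_pattern = re.compile(r'^([A-Z]{3,15})\s+\d{1,4}[A-Z]*\s*(\(\d{1,2}\))?')
assert _code_pattern.match("AFRAMER 11 Introduction to Africana Studies")
assert _code_pattern.match("HIST 97A (1) Sophomore Tutorial")
assert not _code_pattern.match("An ordinary description line")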
# === Process course buffer ===
def process_buffer(buffer, dept):
    if not buffer:
        return None
    first_line = buffer[0]
    match = re.match(r'^([A-Z]{3,15})\s+(\S.*)', first_line)
    if not match:
        return None
    code = match.group(1)
    title = match.group(2)
    description = " ".join(buffer[1:]).strip()
    return {
        "code": code,
        "title": title,
        "description": description,
        "department": dept
    }
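# Illustration (hypothetical input) of what process_buffer() returns; note the
# title keeps the course number, since group(2) is everything after the code:
# process_buffer(
#     ["AFRAMER 11 Introduction to Africana Studies", "Surveys the field."],
#     "African and African American Studies",
# )
# -> {"code": "AFRAMER",
#     "title": "11 Introduction to Africana Studies",
#     "description": "Surveys the field.",
#     "department": "African and African American Studies"}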
# === Detect woke terms ===
def detect_woke_terms(courses):
    print("🕵️ Detecting woke matches...")
    parsed_courses = [c for c in courses if c]  # Remove None entries
    for course in parsed_courses:
        desc = course.get("description", "").lower()
        matched = []
        for term in WOKE_TERMS:
            if re.search(rf'\b{re.escape(term)}\b', desc, re.IGNORECASE):
                matched.append(term)
        matched = sorted(set(matched))  # dedupe and sort for reproducible output
        course["matched_terms"] = ", ".join(matched)
        course["num_matched_terms"] = len(matched)
    return parsed_courses
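# The \b anchors in detect_woke_terms() keep substring hits out; for example:
assert re.search(r'\bequity\b', "private equity markets")        # whole word: hit
assert not re.search(r'\bequity\b', "health inequity persists")  # substring: no hit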
# === Calculate percentages per code ===
def calculate_percentages(courses):
    df = pd.DataFrame(courses)
    summary = df.groupby("code").agg(
        total_courses=pd.NamedAgg(column="code", aggfunc="count"),
        matched_courses=pd.NamedAgg(column="num_matched_terms", aggfunc=lambda x: (x > 0).sum())
    ).reset_index()
    summary["percent_matched"] = summary["matched_courses"] / summary["total_courses"]
    return summary
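# Expected shape of the summary returned above (values are hypothetical):
#   code     total_courses  matched_courses  percent_matched
#   AFRAMER             40               12             0.30
# Note: percent_matched is a 0-1 proportion despite its name.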
# === Main pipeline ===
def main(pdf_path, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    # Extract text
    text = extract_text(pdf_path)
    # Parse courses
    courses = parse_courses(text)
    # Detect woke terms
    courses = detect_woke_terms(courses)
    # Create DataFrame
    df = pd.DataFrame(courses)
    # Save all courses
    print("💾 Saving output...")
    df.to_csv(os.path.join(output_dir, "all_courses.csv"), index=False)
    # Save woke courses
    df[df["num_matched_terms"] > 0].to_csv(os.path.join(output_dir, "woke_courses.csv"), index=False)
    # Calculate and save percentages
    summary = calculate_percentages(courses)
    summary.to_csv(os.path.join(output_dir, "tabs.csv"), index=False)
    print(f"✅ Parsed {len(courses)} courses")
    print(f"📊 Summary saved with {len(summary)} course codes")
    print("🎉 Done.")
# === Run the pipeline ===
if __name__ == "__main__":
    main(
        pdf_path="fas_crse_cat.pdf",  # Update with your PDF path
        output_dir="/Users/charleenadams/harvard/results"
    )
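# Optional: take the paths from the command line instead of hard-coding them.
# A minimal sketch using argparse (argument names are my own choice):
# import argparse
# parser = argparse.ArgumentParser(description="Flag catalog courses by keyword")
# parser.add_argument("pdf_path")
# parser.add_argument("output_dir")
# ns = parser.parse_args()
# main(pdf_path=ns.pdf_path, output_dir=ns.output_dir)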
# === End of script ===
library(data.table)
# Load the dataset
courses <- fread('/Users/charleenadams/harvard/results/all_courses.csv')
# Ensure num_matched_terms is stored as an integer
courses[, num_matched_terms := as.integer(num_matched_terms)]
# Sort in descending order by num_matched_terms
courses <- courses[order(-num_matched_terms)]
colnames(courses)
## [1] "code" "title" "description"
## [4] "department" "matched_terms" "num_matched_terms"
# Drop department; the summary below is keyed by course code only
courses[, department := NULL]
# Compute summary stats by course code
summary_by_code <- courses[, .(
  total_courses = .N,
  woke_courses = sum(num_matched_terms > 0),
  total_woke_terms = sum(num_matched_terms, na.rm = TRUE)
), by = code]
# Proportion of woke-flagged courses per code
summary_by_code[, proportion_woke_courses := round((woke_courses / total_courses), 2)]
# Grand total of matched woke-term hits across all codes (kept for reference)
total_woke_terms_all_codes <- sum(summary_by_code$total_woke_terms, na.rm = TRUE)
# Add cleaned list of unique matched terms for each code (remove NA, strip whitespace)
matched_terms_by_code <- courses[num_matched_terms > 0, .(
  matched_terms = paste(na.omit(unique(trimws(unlist(tstrsplit(matched_terms, ",\\s*"))))), collapse = ", ")
), by = code]
summary_by_code <- merge(summary_by_code, matched_terms_by_code, by = "code", all.x = TRUE)
# Put code first and matched_terms last
setcolorder(summary_by_code, c("code", "total_courses", "woke_courses", "total_woke_terms", "proportion_woke_courses", "matched_terms"))
# Sort by proportion_woke_courses, then total_woke_terms
summary_by_code <- summary_by_code[order(-proportion_woke_courses, -total_woke_terms)]
# View top rows
head(summary_by_code)
##        code total_courses woke_courses total_woke_terms proportion_woke_courses
##      <char>         <int>        <int>            <int>                   <num>
## 1:      EMR             7            5               14                    0.71
## 2: MODMDEST            15            6               12                    0.40
## 3: CLASPHIL             3            1                4                    0.33
## 4:  KORHIST             3            1                1                    0.33
## 5:  YIDDISH             3            1                1                    0.33
## 6:   WOMGEN            25            8               18                    0.32
## matched_terms
## <char>
## 1: decolonization, anti-blackness, xenophobia, decolonial, racism, reparations, imperialism, critical race theory, settler colonialism, intersectional, equity, feminism, colonialism
## 2: liberation, diversity, decolonization, colonialism
## 3: imperialism, colonialism, white supremacy, racism
## 4: liberation
## 5: diversity
## 6: critical race theory, cultural appropriation, racism, postcolonial, privilege, feminism, intersectional, liberation, queer theory, colonialism, marginalized, inclusion, anti-blackness
# Save results
fwrite(summary_by_code, '/Users/charleenadams/harvard/results/woke_summary_by_code_cleaned.csv')
library(DT)
datatable(
  courses,
  options = list(
    pageLength = 10,
    scrollX = TRUE,
    autoWidth = TRUE,
    searchHighlight = TRUE,
    dom = 'Bfrtip'
  ),
  filter = 'top',
  caption = htmltools::tags$caption(
    style = 'caption-side: top; text-align: left;',
    '📚 Full Harvard Courses Table'
  )
)
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
datatable(
  summary_by_code,
  options = list(
    pageLength = 25,
    scrollX = TRUE,
    autoWidth = TRUE,
    searchHighlight = TRUE,
    dom = 'Bfrtip'
  ),
  filter = 'top',
  caption = htmltools::tags$caption(
    style = 'caption-side: top; text-align: left;',
    '📊 Summary of Wokeness by Course Code'
  )
)