import fitz  # PyMuPDF
import re
import pandas as pd
import os

# === Woke keywords ===
WOKE_TERMS = [
    "social justice", "colonialism", "land acknowledgement", "oppression",
    "marginalized", "intersectional", "equity", "inclusivity", "diversity",
    "privilege", "systemic", "anti-racism", "decolonization", "reparations",
    "intersectionality", "systemic racism", "white supremacy", "microaggressions",
    "cultural appropriation", "inclusion", "DEI", "postcolonial", "indigenous rights",
    "settler colonialism", "gender identity", "queer theory", "trans rights", "feminism",
    "critical race theory", "white privilege", "liberation", "critical theory",
    "racism", "imperialism", "decolonial", "anti-blackness", "xenophobia"
]
# === Department headings ===
DEPARTMENT_HEADINGS = [
    "African and African American Studies", "American Studies", "Anthropology", "Applied Computation",
    "Applied Mathematics", "Applied Physics", "Art, Film, and Visual Studies", "Astronomy",
    "Biological Sciences in Dental Medicine", "Biological Sciences in Public Health", "Biomedical Engineering",
    "Biophysics", "Biostatistics", "Celtic Languages and Literatures", "Chemical and Physical Biology",
    "Chemical Biology", "Chemistry and Chemical Biology", "Classics, The", "Comparative Literature",
    "Computer Science", "Earth and Planetary Sciences", "East Asian Languages and Civilizations", "Economics",
    "Education Studies", "Engineering Sciences", "English", "Environmental Science and Public Policy",
    "Environmental Science and Engineering", "Ethnicity, Migration, Rights", "Expository Writing",
    "Folklore and Mythology", "General Education", "Germanic Languages and Literatures",
    "Global Health and Health Policy", "Government", "Health Policy", "History", "History and Literature",
    "History of Art and Architecture", "History of Science", "Human Evolutionary Biology", "Humanities",
    "Inner Asian and Altaic Studies", "Linguistics", "Mathematics", "Medical Sciences", "Medieval Studies",
    "Mind, Brain, and Behavior", "Molecular and Cellular Biology", "Music", "Near Eastern Languages and Civilizations",
    "Neuroscience", "Organismic and Evolutionary Biology", "Philosophy", "Physics",
    "Political Economy and Government", "Population Health Sciences", "Psychology", "Public Policy",
    "Quantum Science & Engineering", "Regional Studies - East Asia", "Religion, The Study of",
    "Romance Languages and Literatures", "Russia, Eastern Europe, and Central Asia",
    "Slavic Languages and Literatures", "Social Policy", "Social Studies", "Sociology", "South Asian Studies",
    "Special Concentrations", "Statistics", "Stem Cell and Regenerative Biology", "Systems Biology",
    "Theater, Dance, and Media", "Ukrainian Studies", "Women, Gender, and Sexuality, Studies of"
]
# === Extract text from PDF ===
def extract_text(pdf_path):
    print("📄 Reading PDF...")
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        blocks = page.get_text("blocks")
        for block in blocks:
            block_text = block[4].strip()
            if block_text and not block_text.startswith("HARVARD UNIVERSITY"):
                text += block_text + "\n"
    doc.close()
    return text
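# Reference for the tuple layout used above (PyMuPDF "blocks" mode): each block
# is (x0, y0, x1, y1, text, block_no, block_type), so block[4] is the text.
# A quick, hypothetical peek at page one of the catalog:
# with fitz.open("fas_crse_cat.pdf") as doc:
#     for b in doc[0].get_text("blocks")[:3]:
#         print(b[:4], repr(b[4])[:60])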
# === Split and parse courses ===
def parse_courses(text):
    print("✂️ Splitting and parsing courses...")
    lines = text.splitlines()
    courses = []
    current_dept = "Unknown"
    buffer = []
    code_pattern = re.compile(r'^([A-Z]{3,15})\s+\d{1,4}[A-Z]*\s*(\(\d{1,2}\))?')
    dept_lookup = {d.lower(): d for d in DEPARTMENT_HEADINGS}
    for line in lines:
        line = line.strip()
        if not line:
            continue
        line_lower = line.lower()
        # Check if line is a department heading
        matched_dept = dept_lookup.get(line_lower)
        if matched_dept:
            # Flush the buffered course under the department it was read in,
            # then switch to the new department
            if buffer:
                courses.append(process_buffer(buffer, current_dept))
                buffer = []
            current_dept = matched_dept
            continue
        # Check if line starts a new course entry
        match = code_pattern.match(line)
        if match:
            if buffer:
                courses.append(process_buffer(buffer, current_dept))
            buffer = [line]
        elif buffer:
            # Continuation line; text before the first course code is dropped
            buffer.append(line)
    # Process the final buffer
    if buffer:
        courses.append(process_buffer(buffer, current_dept))
    print(f"📦 Found {len(courses)} course entries")
    return courses
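# Hedged spot check of the course-code regex used above; the sample lines are
# made up for illustration, not taken from the catalog:
_code_pattern = re.compile(r'^([A-Z]{3,15})\s+\d{1,4}[A-Z]*\s*(\(\d{1,2}\))?')
assert _code_pattern.match("AFRAMER 11 Introduction to Africana Studies")
assert _code_pattern.match("HIST 97A (1) Sophomore Tutorial")
assert not _code_pattern.match("An ordinary description line")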
# === Process course buffer ===
def process_buffer(buffer, dept):
    if not buffer:
        return None
    first_line = buffer[0]
    match = re.match(r'^([A-Z]{3,15})\s+(\S.*)', first_line)
    if not match:
        return None
    code = match.group(1)
    title = match.group(2)
    description = " ".join(buffer[1:]).strip()
    return {
        "code": code,
        "title": title,
        "description": description,
        "department": dept
    }
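# Illustration (hypothetical input) of what process_buffer() returns; note the
# title keeps the course number, since group(2) is everything after the code:
# process_buffer(
#     ["AFRAMER 11 Introduction to Africana Studies", "Surveys the field."],
#     "African and African American Studies",
# )
# -> {"code": "AFRAMER",
#     "title": "11 Introduction to Africana Studies",
#     "description": "Surveys the field.",
#     "department": "African and African American Studies"}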
# === Detect woke terms ===
def detect_woke_terms(courses):
    print("🕵️ Detecting woke matches...")
    parsed_courses = [c for c in courses if c]  # Remove None entries
    for course in parsed_courses:
        desc = course.get("description", "").lower()
        matched = []
        for term in WOKE_TERMS:
            if re.search(rf'\b{re.escape(term)}\b', desc, re.IGNORECASE):
                matched.append(term)
        matched = sorted(set(matched))  # dedupe and sort for reproducible output
        course["matched_terms"] = ", ".join(matched)
        course["num_matched_terms"] = len(matched)
    return parsed_courses
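# The \b anchors in detect_woke_terms() keep substring hits out; for example:
assert re.search(r'\bequity\b', "private equity markets")        # whole word: hit
assert not re.search(r'\bequity\b', "health inequity persists")  # substring: no hit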
# === Calculate percentages per code ===
def calculate_percentages(courses):
    df = pd.DataFrame(courses)
    summary = df.groupby("code").agg(
        total_courses=pd.NamedAgg(column="code", aggfunc="count"),
        matched_courses=pd.NamedAgg(column="num_matched_terms", aggfunc=lambda x: (x > 0).sum())
    ).reset_index()
    summary["percent_matched"] = summary["matched_courses"] / summary["total_courses"]
    return summary
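# Expected shape of the summary returned above (values are hypothetical):
#   code     total_courses  matched_courses  percent_matched
#   AFRAMER             40               12             0.30
# Note: percent_matched is a 0-1 proportion despite its name.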
# === Main pipeline ===
def main(pdf_path, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    # Extract text
    text = extract_text(pdf_path)
    # Parse courses
    courses = parse_courses(text)
    # Detect woke terms
    courses = detect_woke_terms(courses)
    # Create DataFrame
    df = pd.DataFrame(courses)
    # Save all courses
    print("💾 Saving output...")
    df.to_csv(os.path.join(output_dir, "all_courses.csv"), index=False)
    # Save woke courses
    df[df["num_matched_terms"] > 0].to_csv(os.path.join(output_dir, "woke_courses.csv"), index=False)
    # Calculate and save percentages
    summary = calculate_percentages(courses)
    summary.to_csv(os.path.join(output_dir, "tabs.csv"), index=False)
    print(f"✅ Parsed {len(courses)} courses")
    print(f"📊 Summary saved with {len(summary)} course codes")
    print("🎉 Done.")
# === Run the pipeline ===
if __name__ == "__main__":
    main(
        pdf_path="fas_crse_cat.pdf",  # Update with your PDF path
        output_dir="/Users/charleenadams/harvard/results"
    )
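# Optional: take the paths from the command line instead of hard-coding them.
# A minimal sketch using argparse (argument names are my own choice):
# import argparse
# parser = argparse.ArgumentParser(description="Flag catalog courses by keyword")
# parser.add_argument("pdf_path")
# parser.add_argument("output_dir")
# ns = parser.parse_args()
# main(pdf_path=ns.pdf_path, output_dir=ns.output_dir)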
# === End of script ===
library(data.table)
# Load the dataset
courses <- fread('/Users/charleenadams/harvard/results/all_courses.csv')
# Ensure num_matched_terms is stored as an integer
courses[, num_matched_terms := as.integer(num_matched_terms)]
# Sort in descending order by num_matched_terms
courses <- courses[order(-num_matched_terms)]
colnames(courses)
## [1] "code" "title" "description"
## [4] "department" "matched_terms" "num_matched_terms"
# Drop department; the summary below is keyed by course code only
courses[, department := NULL]
# Compute summary stats by course code
summary_by_code <- courses[, .(
  total_courses = .N,
  woke_courses = sum(num_matched_terms > 0),
  total_woke_terms = sum(num_matched_terms, na.rm = TRUE)
), by = code]
# Proportion of woke-flagged courses per code
summary_by_code[, proportion_woke_courses := round((woke_courses / total_courses), 2)]
# Grand total of matched woke-term hits across all codes (kept for reference)
total_woke_terms_all_codes <- sum(summary_by_code$total_woke_terms, na.rm = TRUE)
# Add cleaned list of unique matched terms for each code (remove NA, strip whitespace)
matched_terms_by_code <- courses[num_matched_terms > 0, .(
  matched_terms = paste(na.omit(unique(trimws(unlist(tstrsplit(matched_terms, ",\\s*"))))), collapse = ", ")
), by = code]
summary_by_code <- merge(summary_by_code, matched_terms_by_code, by = "code", all.x = TRUE)
# Put code first and matched_terms last
setcolorder(summary_by_code, c("code", "total_courses", "woke_courses", "total_woke_terms", "proportion_woke_courses", "matched_terms"))
# Sort by proportion_woke_courses, then total_woke_terms
summary_by_code <- summary_by_code[order(-proportion_woke_courses, -total_woke_terms)]
# View top rows
head(summary_by_code)
##        code total_courses woke_courses total_woke_terms proportion_woke_courses
##      <char>         <int>        <int>            <int>                   <num>
## 1:      EMR             7            5               14                    0.71
## 2: MODMDEST            15            6               12                    0.40
## 3: CLASPHIL             3            1                4                    0.33
## 4:  KORHIST             3            1                1                    0.33
## 5:  YIDDISH             3            1                1                    0.33
## 6:   WOMGEN            25            8               18                    0.32
## matched_terms
## <char>
## 1: decolonization, anti-blackness, xenophobia, decolonial, racism, reparations, imperialism, critical race theory, settler colonialism, intersectional, equity, feminism, colonialism
## 2: liberation, diversity, decolonization, colonialism
## 3: imperialism, colonialism, white supremacy, racism
## 4: liberation
## 5: diversity
## 6: critical race theory, cultural appropriation, racism, postcolonial, privilege, feminism, intersectional, liberation, queer theory, colonialism, marginalized, inclusion, anti-blackness
# Save results
fwrite(summary_by_code, '/Users/charleenadams/harvard/results/woke_summary_by_code_cleaned.csv')
library(DT)
datatable(
  courses,
  options = list(
    pageLength = 10,
    scrollX = TRUE,
    autoWidth = TRUE,
    searchHighlight = TRUE,
    dom = 'Bfrtip'
  ),
  filter = 'top',
  caption = htmltools::tags$caption(
    style = 'caption-side: top; text-align: left;',
    '📚 Full Harvard Courses Table'
  )
)
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
datatable(
  summary_by_code,
  options = list(
    pageLength = 25,
    scrollX = TRUE,
    autoWidth = TRUE,
    searchHighlight = TRUE,
    dom = 'Bfrtip'
  ),
  filter = 'top',
  caption = htmltools::tags$caption(
    style = 'caption-side: top; text-align: left;',
    '📊 Summary of Wokeness by Course Code'
  )
)