To do before: Download the CSV from Qualtrics. Open in Excel/Numbers. Find the self_report_exp column. Manually replace any text responses in that column with the corresponding number (e.g., if they typed “Forensic Science”, change that cell to include 10). Save and run the script.

CHUNK 1: SETUP AND LIBRARIES

# This chunk loads the necessary toolkits for the analysis.

knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)

# Load packages
library(tidyverse)  # The main toolkit for data manipulation and plotting
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(knitr)      # For creating basic tables
library(kableExtra) # For making tables look professional in HTML
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(janitor)    # For cleaning column names (optional but helpful)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

CHUNK 2: DATA IMPORT AND CLEANING

# 1. READ DATA
# Replace the filename below with real data file name
filename <- "sycophancy_materials_pilot_2.csv"

# Read the CSV. We assume the first row contains the variable names (S1_T1_Decision, etc.)
raw_data <- read_csv(filename)

# 2. REMOVE EXTRA QUALTRICS HEADERS
# Qualtrics exports 3 header rows:
# Row 1: Variable Names (became the column headers in R)
# Row 2: Question Text (e.g., "Start Date", "Which option is better...")
# Row 3: Import IDs (e.g., "{ImportId...}")
# We need to remove rows 1 and 2 from the dataframe (which correspond to rows 2 and 3 of the file)
survey_data <- raw_data %>%
  slice(-c(1, 2)) %>%   # Remove the top two rows containing metadata
  type_convert()        # Convert text numbers ("1") into actual numbers (1)

# 3. FILTER OUT PREVIEWS (Optional but recommended)
# Often we want to exclude "Survey Preview" responses and only keep real data.
# The 'DistributionChannel' column usually contains 'preview' for tests.
# If you want to include previews for testing this script, comment out the next line.
# survey_data <- survey_data %>% filter(DistributionChannel != "preview")

# 4. QUICK CHECK
# Print the number of participants and columns to verify the load
print(paste("Data loaded successfully."))
## [1] "Data loaded successfully."
print(paste("Total Participants:", nrow(survey_data)))
## [1] "Total Participants: 40"
print(paste("Total Columns:", ncol(survey_data)))
## [1] "Total Columns: 254"
# Show a quick preview of the decision columns to ensure they are numeric (0/1)
# Using select(matches...) to pick columns that look like decisions
survey_data %>% 
  select(matches("_T1_Decision")) %>% 
  head() %>% 
  kable(caption = "Preview of Decision Data (Should be 0 or 1)") %>% 
  kable_styling()
Preview of Decision Data (Should be 0 or 1)
S1_T1_Decision S3_T1_Decision S4_T1_Decision S5_T1_Decision S6_T1_Decision S7_T1_Decision S8_T1_Decision
0 0 1 0 1 0 0
NA NA NA NA NA NA NA
1 1 1 1 0 0 1
0 0 1 0 0 0 1
1 0 0 1 1 1 1
NA NA NA NA NA NA NA

CHUNK 3: DATA WRANGLING & EXPERTISE MAPPING

# Helper function (Robust "Nuclear" Version)
# This prevents crashes if Qualtrics mixes text and numbers in the same column
get_first_value <- function(data, pattern) {
  data %>%
    select(matches(pattern)) %>% 
    select(-contains("TEXT"), -contains("text")) %>% 
    mutate(across(everything(), ~suppressWarnings(as.numeric(.)))) %>%
    mutate(combined = coalesce(!!!.)) %>%
    pull(combined)
}

clean_data <- survey_data %>%
  
  # 1. CONSOLIDATE PRIMING DATA
  # Merging the 7 randomized review slots into single columns per scenario
  mutate(
    # Did they detect the priming? (1 = Yes, 0 = No)
    S1_Priming_Detected = ifelse(get_first_value(., "^S1_.*Review$") == 1, 1, 0),
    S3_Priming_Detected = ifelse(get_first_value(., "^S3_.*Review$") == 1, 1, 0),
    S4_Priming_Detected = ifelse(get_first_value(., "^S4_.*Review$") == 1, 1, 0),
    S5_Priming_Detected = ifelse(get_first_value(., "^S5_.*Review$") == 1, 1, 0),
    S6_Priming_Detected = ifelse(get_first_value(., "^S6_.*Review$") == 1, 1, 0),
    S7_Priming_Detected = ifelse(get_first_value(., "^S7_.*Review$") == 1, 1, 0),
    S8_Priming_Detected = ifelse(get_first_value(., "^S8_.*Review$") == 1, 1, 0),
    
    # How did it affect them? (1 = Helpful, -1 = Suspicious, 0 = No effect)
    S1_Priming_Effect = get_first_value(., "S1_Review_p3"),
    S3_Priming_Effect = get_first_value(., "S3_Review_p3"),
    S4_Priming_Effect = get_first_value(., "S4_Review_p3"),
    S5_Priming_Effect = get_first_value(., "S5_Review_p3"),
    S6_Priming_Effect = get_first_value(., "S6_Review_p3"),
    S7_Priming_Effect = get_first_value(., "S7_Review_p3"),
    S8_Priming_Effect = get_first_value(., "S8_Review_p3")
  ) %>%
  
  # 2. MAP EXPERTISE (Create TRUE/FALSE flags)
  # R simply scans your manually cleaned CSV column for these numbers.
  # 1=Psych, 2=Health Sci, 3=Methods, 4=Data, 5=Clin Psych, 6=Edu, 7=Learning, 
  # 8=Healthcare, 9=Medicine, 10=Crim, 11=Social Work, 12=Nutr, 13=Pub Health, 14=Business
  mutate(
    S1_Is_Expert = str_detect(self_report_exp, "\\b1\\b|\\b2\\b|\\b3\\b|\\b4\\b"), # Psych Research
    S3_Is_Expert = str_detect(self_report_exp, "\\b1\\b|\\b5\\b"),                 # Anxiety
    S4_Is_Expert = str_detect(self_report_exp, "\\b6\\b|\\b7\\b"),                 # Maths/Edu
    S5_Is_Expert = str_detect(self_report_exp, "\\b8\\b|\\b9\\b"),                 # MRI/Med
    S6_Is_Expert = str_detect(self_report_exp, "\\b10\\b|\\b11\\b"),               # Crim/Youth
    S7_Is_Expert = str_detect(self_report_exp, "\\b12\\b|\\b13\\b"),               # Nutrition
    S8_Is_Expert = str_detect(self_report_exp, "\\b14\\b")                         # Inventory/Biz
  ) %>%
  
  # 3. DEFINE ACCURACY COLUMNS
  # Ensure they are numeric (1 or 0)
  mutate(
    S1_Accuracy = as.numeric(S1_T1_Decision),
    S3_Accuracy = as.numeric(S3_T1_Decision),
    S4_Accuracy = as.numeric(S4_T1_Decision),
    S5_Accuracy = as.numeric(S5_T1_Decision),
    S6_Accuracy = as.numeric(S6_T1_Decision),
    S7_Accuracy = as.numeric(S7_T1_Decision),
    S8_Accuracy = as.numeric(S8_T1_Decision)
  )

# 4. PREVIEW
clean_data %>%
  select(S1_Accuracy, S1_Is_Expert, S1_Priming_Detected, S1_Priming_Effect) %>%
  head() %>%
  kable(caption = "Data Wrangling Check") %>%
  kable_styling()
Data Wrangling Check
S1_Accuracy S1_Is_Expert S1_Priming_Detected S1_Priming_Effect
0 NA NA NA
NA NA NA NA
1 FALSE NA NA
0 TRUE 0 NA
1 FALSE 0 NA
NA NA NA NA

CHUNK 4: MAIN VALIDATION ANALYSIS (CRITERIA A-E)

# 1. RESHAPE DATA (Wide to Long)
# Instead of S1, S3, S4 columns, we stack them into one big list.
# This allows us to calculate stats for all scenarios instantly.

long_data <- clean_data %>%
  select(ResponseId, ends_with("_Accuracy"), ends_with("_Is_Expert")) %>%
  # Pivot Accuracy columns
  pivot_longer(
    cols = ends_with("_Accuracy"),
    names_to = "Scenario_Label",
    values_to = "Score"
  ) %>%
  # Pivot Expert columns
  pivot_longer(
    cols = ends_with("_Is_Expert"),
    names_to = "Expert_Label",
    values_to = "Is_Expert"
  ) %>%
  # Clean up names to ensure we match S1 Accuracy to S1 Expertise
  mutate(
    Scenario_ID = str_extract(Scenario_Label, "^S\\d+"), # Extracts "S1", "S3"
    Expert_Match_ID = str_extract(Expert_Label, "^S\\d+")
  ) %>%
  # Keep only matching rows (e.g. S1 Accuracy with S1 Expertise)
  filter(Scenario_ID == Expert_Match_ID) %>%
  select(ResponseId, Scenario = Scenario_ID, Score, Is_Expert) %>%
  # Remove missing data
  drop_na(Score, Is_Expert)

# 2. CALCULATE STATISTICS PER SCENARIO
validation_stats <- long_data %>%
  group_by(Scenario, Is_Expert) %>%
  summarise(
    Avg_Accuracy = mean(Score),
    Count = n(),
    .groups = 'drop'
  ) %>%
  # Pivot to see Experts and Novices side-by-side
  pivot_wider(
    names_from = Is_Expert,
    values_from = c(Avg_Accuracy, Count),
    names_prefix = "Exp_"
  ) %>%
  # Rename for clarity (Exp_FALSE = Novice, Exp_TRUE = Expert)
  rename(
    Novice_Acc = Avg_Accuracy_Exp_FALSE,
    Expert_Acc = Avg_Accuracy_Exp_TRUE,
    Novice_N = Count_Exp_FALSE,
    Expert_N = Count_Exp_TRUE
  ) %>%
  # Calculate the Gap and Incorrect Rates
  mutate(
    Gap = Expert_Acc - Novice_Acc,
    Novice_Incorrect = 1 - Novice_Acc,
    Expert_Incorrect = 1 - Expert_Acc
  )

# 3. APPLY YOUR LOGIC (CRITERIA A-E)
final_decision_table <- validation_stats %>%
  mutate(
    # A) Ceiling (Novice >= .85)
    Flag_Ceiling = ifelse(Novice_Acc >= 0.85, "FAIL", "Pass"),
    
    # B) Floor (Novice <= .20)
    Flag_Floor = ifelse(Novice_Acc <= 0.20, "FAIL", "Pass"),
    
    # C) Ambiguity (Expert <= .65)
    Flag_Ambiguity = ifelse(Expert_Acc <= 0.65, "FAIL", "Pass"),
    
    # D) Discrimination (Gap < .15)
    Flag_Discrim = ifelse(Gap < 0.15, "FAIL", "Pass"),
    
    # E) FINAL KEEP DECISION
    # Rules: Novice .25-.75 AND Expert >= .80 AND Gap >= .15
    Recommendation = case_when(
      Novice_Acc >= 0.25 & Novice_Acc <= 0.75 &
      Expert_Acc >= 0.80 &
      Gap >= 0.15 ~ "KEEP",
      TRUE ~ "REVISE"
    )
  ) %>%
  
  # F) SAMPLE SIZE WARNINGS
  # Flag scenarios where groups are too small for reliable conclusions
  mutate(
    # Flag if Novice group has fewer than 10 people
    Small_Novice_N = ifelse(Novice_N < 10, "⚠️ LOW N", ""),
    
    # Flag if Expert group has fewer than 10 people
    Small_Expert_N = ifelse(Expert_N < 10, "⚠️ LOW N", ""),
    
    # Create a combined sample size warning
    Sample_Warning = case_when(
      Novice_N < 10 & Expert_N < 10 ~ "⚠️ BOTH GROUPS LOW",
      Novice_N < 10 ~ "⚠️ Novice N low",
      Expert_N < 10 ~ "⚠️ Expert N low",
      TRUE ~ "OK"
    ),
    
    # Update Recommendation if sample size is too small
    Recommendation = ifelse(
      Sample_Warning != "OK",
      paste0(Recommendation, " (SMALL N)"),
      Recommendation
    )
  )

# Print a warning summary
sample_size_issues <- final_decision_table %>%
  filter(Sample_Warning != "OK") %>%
  select(Scenario, Novice_N, Expert_N, Sample_Warning)

if (nrow(sample_size_issues) > 0) {
  print("⚠️⚠️⚠️ SAMPLE SIZE WARNINGS ⚠️⚠️⚠️")
  print("The following scenarios have small sample sizes (< 10 in at least one group):")
  print(kable(sample_size_issues, caption = "Scenarios with Insufficient Sample Sizes") %>% 
          kable_styling(bootstrap_options = "striped"))
  print("Recommendations based on these scenarios should be interpreted with caution!")
} else {
  print("✅ All scenarios have adequate sample sizes (N ≥ 10 in both groups)")
}
## [1] "⚠️⚠️⚠️ SAMPLE SIZE WARNINGS ⚠️⚠️⚠️"
## [1] "The following scenarios have small sample sizes (< 10 in at least one group):"
## <table class="table table-striped" style="margin-left: auto; margin-right: auto;">
## <caption>Scenarios with Insufficient Sample Sizes</caption>
##  <thead>
##   <tr>
##    <th style="text-align:left;"> Scenario </th>
##    <th style="text-align:right;"> Novice_N </th>
##    <th style="text-align:right;"> Expert_N </th>
##    <th style="text-align:left;"> Sample_Warning </th>
##   </tr>
##  </thead>
## <tbody>
##   <tr>
##    <td style="text-align:left;"> S1 </td>
##    <td style="text-align:right;"> 11 </td>
##    <td style="text-align:right;"> 3 </td>
##    <td style="text-align:left;"> ⚠️ Expert N low </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;"> S3 </td>
##    <td style="text-align:right;"> 12 </td>
##    <td style="text-align:right;"> 2 </td>
##    <td style="text-align:left;"> ⚠️ Expert N low </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;"> S4 </td>
##    <td style="text-align:right;"> 13 </td>
##    <td style="text-align:right;"> 1 </td>
##    <td style="text-align:left;"> ⚠️ Expert N low </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;"> S5 </td>
##    <td style="text-align:right;"> 11 </td>
##    <td style="text-align:right;"> 3 </td>
##    <td style="text-align:left;"> ⚠️ Expert N low </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;"> S6 </td>
##    <td style="text-align:right;"> 13 </td>
##    <td style="text-align:right;"> 1 </td>
##    <td style="text-align:left;"> ⚠️ Expert N low </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;"> S7 </td>
##    <td style="text-align:right;"> 12 </td>
##    <td style="text-align:right;"> 2 </td>
##    <td style="text-align:left;"> ⚠️ Expert N low </td>
##   </tr>
## </tbody>
## </table>[1] "Recommendations based on these scenarios should be interpreted with caution!"
# 👆 NEW CODE ENDS HERE 👆

# 4. DISPLAY THE RESULTS TABLE
final_decision_table %>%
  select(Scenario, Novice_N, Expert_N, Sample_Warning,
         Novice_Acc, Expert_Acc, Gap, Recommendation,
         Flag_Ceiling, Flag_Floor, Flag_Ambiguity, Flag_Discrim) %>%
  kable(digits = 2, caption = "Pilot Validation Results: Keep or Revise?") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  column_spec(7, bold = TRUE, color = "white",  # 👈 Changed from column 5 to 7 (Recommendation moved)
              background = ifelse(grepl("KEEP", final_decision_table$Recommendation), "green", "red"))
Pilot Validation Results: Keep or Revise?
Scenario Novice_N Expert_N Sample_Warning Novice_Acc Expert_Acc Gap Recommendation Flag_Ceiling Flag_Floor Flag_Ambiguity Flag_Discrim
S1 11 3 ⚠️ Expert N low
      0.6
# 5. OVERALL INCORRECT RATES (ALL SCENARIOS COMBINED)
# This tells you if you're hitting your 50% or lower target

overall_stats <- long_data %>%
  group_by(Is_Expert) %>%
  summarise(
    Total_Decisions = n(),
    Correct = sum(Score),
    Incorrect = sum(Score == 0),
    Accuracy = mean(Score),
    Incorrect_Rate = 1 - mean(Score)
  ) %>%
  mutate(Group = ifelse(Is_Expert, "Experts", "Novices")) %>%
  select(Group, Total_Decisions, Correct, Incorrect, Accuracy, Incorrect_Rate)

# Calculate the grand total (everyone, all scenarios)
grand_total <- long_data %>%
  summarise(
    Group = "ALL PARTICIPANTS",
    Total_Decisions = n(),
    Correct = sum(Score),
    Incorrect = sum(Score == 0),
    Accuracy = mean(Score),
    Incorrect_Rate = 1 - mean(Score)
  )

# Combine into one table
overall_table <- bind_rows(overall_stats, grand_total)

# Display the results
print("--- OVERALL INCORRECT RATES ---")
## [1] "--- OVERALL INCORRECT RATES ---"
overall_table %>%
  kable(digits = 3, caption = "Overall Performance Across All Scenarios") %>%
  kable_styling(bootstrap_options = c("striped", "hover")) %>%
  row_spec(3, bold = TRUE, background = "#f0f0f0")  # Highlight the ALL PARTICIPANTS row
Overall Performance Across All Scenarios
Group Total_Decisions Correct Incorrect Accuracy Incorrect_Rate
Novices 86 48 38 0.558 0.442
Experts 12 5 7 0.417 0.583
ALL PARTICIPANTS 98 53 45 0.541 0.459

CHUNK 5: PRIMING ANALYSIS & VISUALIZATION

# 1. PRIMING DETECTION RATES
# How many people answered "Yes" to "Did the wording make one option seem right?"
priming_summary <- clean_data %>%
  select(starts_with("S"), ends_with("_Priming_Detected")) %>%
  pivot_longer(cols = ends_with("_Detected"), names_to = "Scenario", values_to = "Detected") %>%
  mutate(Scenario = str_extract(Scenario, "^S\\d+")) %>%
  group_by(Scenario) %>%
  summarise(
    Detection_Rate = mean(Detected, na.rm = TRUE), # % who said Yes
    Count_Detected = sum(Detected, na.rm = TRUE)
  )

print("--- PRIMING DETECTION RATES ---")
## [1] "--- PRIMING DETECTION RATES ---"
kable(priming_summary, digits = 2, caption = "Did participants notice the wording?") %>% kable_styling()
Did participants notice the wording?
Scenario Detection_Rate Count_Detected
S1 0.0 0
S3 0.5 1
S4 0.0 0
S5 0.0 0
S6 1.0 2
S7 0.5 1
S8 0.5 1
# 2. PRIMING EFFECT DIRECTION
# For those who noticed, did it help (1) or backfire (-1)?
priming_effect <- clean_data %>%
  select(starts_with("S"), ends_with("_Priming_Effect")) %>%
  pivot_longer(cols = ends_with("_Effect"), names_to = "Scenario", values_to = "Effect") %>%
  mutate(Scenario = str_extract(Scenario, "^S\\d+")) %>%
  filter(!is.na(Effect)) %>% # Only look at people who answered this
  count(Scenario, Effect) %>%
  mutate(
    Effect_Type = case_when(
      Effect == 1 ~ "Worked (Chose Option)",
      Effect == -1 ~ "Backfired (Suspicious)",
      Effect == 0 ~ "No Effect",
      TRUE ~ "Other"
    )
  ) %>%
  pivot_wider(names_from = Effect_Type, values_from = n, values_fill = 0)

print("--- PRIMING BEHAVIORAL EFFECT ---")
## [1] "--- PRIMING BEHAVIORAL EFFECT ---"
kable(priming_effect, caption = "How did the wording affect choices?") %>% kable_styling()
How did the wording affect choices?
Scenario Effect No Effect Backfired (Suspicious) Other
S3 0 1 0 0
S6 -1 0 1 0
S6 0 1 0 0
S7 99 0 0 1
S8 -1 0 1 0
# 3. PRIMING SUCCESS METRICS
# This shows whether your positively-worded incorrect option actually influenced people

priming_success <- clean_data %>%
  select(starts_with("S"), ends_with("_Priming_Detected"), ends_with("_Priming_Effect")) %>%
  pivot_longer(
    cols = ends_with("_Detected"), 
    names_to = "Scenario_Detected", 
    values_to = "Detected"
  ) %>%
  pivot_longer(
    cols = ends_with("_Effect"), 
    names_to = "Scenario_Effect", 
    values_to = "Effect"
  ) %>%
  # Match scenario names
  mutate(
    Scenario_Det = str_extract(Scenario_Detected, "^S\\d+"),
    Scenario_Eff = str_extract(Scenario_Effect, "^S\\d+")
  ) %>%
  filter(Scenario_Det == Scenario_Eff) %>%  # Keep matching rows
  rename(Scenario = Scenario_Det) %>%
  select(Scenario, Detected, Effect) %>%
  
  # Calculate success metrics
  group_by(Scenario) %>%
  summarise(
    # How many people saw the scenario
    Total_Responses = n(),
    
    # How many detected the priming
    Detected_Count = sum(Detected == 1, na.rm = TRUE),
    Detection_Rate = mean(Detected == 1, na.rm = TRUE),
    
    # Of those who detected, how many were influenced? (Effect = 1)
    Influenced_Count = sum(Detected == 1 & Effect == 1, na.rm = TRUE),
    
    # Of those who detected, how many became suspicious? (Effect = -1)
    Backfire_Count = sum(Detected == 1 & Effect == -1, na.rm = TRUE),
    
    # Of those who detected, how many weren't affected? (Effect = 0)
    No_Effect_Count = sum(Detected == 1 & Effect == 0, na.rm = TRUE),
    
    # THE KEY METRIC: Success rate = influenced / detected
    Success_Rate = ifelse(Detected_Count > 0, 
                          Influenced_Count / Detected_Count, 
                          NA),
    
    # Backfire rate = suspicious / detected  
    Backfire_Rate = ifelse(Detected_Count > 0, 
                           Backfire_Count / Detected_Count, 
                           NA)
  )

# Display the success metrics
print("--- PRIMING SUCCESS METRICS ---")
## [1] "--- PRIMING SUCCESS METRICS ---"
print("Of those who noticed the wording, did it work as intended?")
## [1] "Of those who noticed the wording, did it work as intended?"
priming_success %>%
  select(Scenario, Detection_Rate, Detected_Count, 
         Influenced_Count, Success_Rate, Backfire_Rate) %>%
  kable(
    digits = 3, 
    caption = "Priming Effectiveness: Did the positive wording influence choices?",
    col.names = c("Scenario", "Detection Rate", "N Detected", 
                  "N Influenced", "Success Rate", "Backfire Rate")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover")) %>%
  # Highlight high success rates (> 50%) in green
  column_spec(5, bold = TRUE, 
              color = ifelse(is.na(priming_success$Success_Rate), "gray",
                            ifelse(priming_success$Success_Rate > 0.5, "darkgreen", "black")))
Priming Effectiveness: Did the positive wording influence choices?
Scenario Detection Rate N Detected N Influenced Success Rate Backfire Rate
S1 0.0 0 0 NA NA
S3 0.5 1 0 0 0.0
S4 0.0 0 0 NA NA
S5 0.0 0 0 NA NA
S6 1.0 2 0 0 0.5
S7 0.5 1 0 0 0.0
S8 0.5 1 0 0 1.0
# 3. VISUALIZATION: EXPERT VS NOVICE ACCURACY
# Uses 'final_decision_table' created in Chunk 4
ggplot(final_decision_table, aes(x = Scenario)) +
  # Add bars for Novices
  geom_bar(aes(y = Novice_Acc, fill = "Novice"), stat = "identity", position = position_nudge(x = -0.2), width = 0.4) +
  # Add bars for Experts
  geom_bar(aes(y = Expert_Acc, fill = "Expert"), stat = "identity", position = position_nudge(x = 0.2), width = 0.4) +
  # Add dashed line for 85% ceiling
  geom_hline(yintercept = 0.85, linetype = "dashed", color = "red", alpha = 0.5) +
  annotate("text", x = 1, y = 0.87, label = "Ceiling (.85)", color = "red", size = 3) +
  # Formatting
  scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
  scale_fill_manual(values = c("Novice" = "#A8D0E6", "Expert" = "#2C3E50")) +
  labs(
    title = "Expert vs. Novice Accuracy by Scenario",
    subtitle = "Comparing performance gaps to identify valid scenarios",
    y = "Accuracy (%)",
    x = NULL,
    fill = "Group"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")