To do before: Download the CSV from Qualtrics. Open in Excel/Numbers. Find the self_report_exp column. Manually replace any text responses in that column with the corresponding number (e.g., if they typed “Forensic Science”, change that cell to include 10). Save and run the script.

CHUNK 1: SETUP AND LIBRARIES

# This chunk loads the necessary toolkits for the analysis.

knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)

# Load packages
library(tidyverse)  # The main toolkit for data manipulation and plotting

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(knitr)      # For creating basic tables
library(kableExtra) # For making tables look professional in HTML

## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

library(janitor)    # For cleaning column names (optional but helpful)

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

CHUNK 2: DATA IMPORT AND CLEANING

# 1. READ DATA
# Replace the filename below with real data file name
filename <- "sycophancy_materials_pilot_2.csv"

# Read the CSV. We assume the first row contains the variable names (S1_T1_Decision, etc.)
raw_data <- read_csv(filename)

# 2. REMOVE EXTRA QUALTRICS HEADERS
# Qualtrics exports 3 header rows:
# Row 1: Variable Names (became the column headers in R)
# Row 2: Question Text (e.g., "Start Date", "Which option is better...")
# Row 3: Import IDs (e.g., "{ImportId...}")
# We need to remove rows 1 and 2 from the dataframe (which correspond to rows 2 and 3 of the file)
survey_data <- raw_data %>%
  slice(-c(1, 2)) %>%   # Remove the top two rows containing metadata
  type_convert()        # Convert text numbers ("1") into actual numbers (1)

# 3. FILTER OUT PREVIEWS (Optional but recommended)
# Often we want to exclude "Survey Preview" responses and only keep real data.
# The 'DistributionChannel' column usually contains 'preview' for tests.
# If you want to include previews for testing this script, comment out the next line.
# survey_data <- survey_data %>% filter(DistributionChannel != "preview")

# 4. QUICK CHECK
# Print the number of participants and columns to verify the load
print(paste("Data loaded successfully."))

## [1] "Data loaded successfully."

print(paste("Total Participants:", nrow(survey_data)))

## [1] "Total Participants: 40"

print(paste("Total Columns:", ncol(survey_data)))

## [1] "Total Columns: 254"

# Show a quick preview of the decision columns to ensure they are numeric (0/1)
# Using select(matches...) to pick columns that look like decisions
survey_data %>% 
  select(matches("_T1_Decision")) %>% 
  head() %>% 
  kable(caption = "Preview of Decision Data (Should be 0 or 1)") %>% 
  kable_styling()

Preview of Decision Data (Should be 0 or 1)
S1_T1_Decision	S3_T1_Decision	S4_T1_Decision	S5_T1_Decision	S6_T1_Decision	S7_T1_Decision	S8_T1_Decision
0	0	1	0	1	0	0
NA	NA	NA	NA	NA	NA	NA
1	1	1	1	0	0	1
0	0	1	0	0	0	1
1	0	0	1	1	1	1
NA	NA	NA	NA	NA	NA	NA

CHUNK 3: DATA WRANGLING & EXPERTISE MAPPING

# Helper function (Robust "Nuclear" Version)
# This prevents crashes if Qualtrics mixes text and numbers in the same column
get_first_value <- function(data, pattern) {
  data %>%
    select(matches(pattern)) %>% 
    select(-contains("TEXT"), -contains("text")) %>% 
    mutate(across(everything(), ~suppressWarnings(as.numeric(.)))) %>%
    mutate(combined = coalesce(!!!.)) %>%
    pull(combined)
}

clean_data <- survey_data %>%
  
  # 1. CONSOLIDATE PRIMING DATA
  # Merging the 7 randomized review slots into single columns per scenario
  mutate(
    # Did they detect the priming? (1 = Yes, 0 = No)
    S1_Priming_Detected = ifelse(get_first_value(., "^S1_.*Review$") == 1, 1, 0),
    S3_Priming_Detected = ifelse(get_first_value(., "^S3_.*Review$") == 1, 1, 0),
    S4_Priming_Detected = ifelse(get_first_value(., "^S4_.*Review$") == 1, 1, 0),
    S5_Priming_Detected = ifelse(get_first_value(., "^S5_.*Review$") == 1, 1, 0),
    S6_Priming_Detected = ifelse(get_first_value(., "^S6_.*Review$") == 1, 1, 0),
    S7_Priming_Detected = ifelse(get_first_value(., "^S7_.*Review$") == 1, 1, 0),
    S8_Priming_Detected = ifelse(get_first_value(., "^S8_.*Review$") == 1, 1, 0),
    
    # How did it affect them? (1 = Helpful, -1 = Suspicious, 0 = No effect)
    S1_Priming_Effect = get_first_value(., "S1_Review_p3"),
    S3_Priming_Effect = get_first_value(., "S3_Review_p3"),
    S4_Priming_Effect = get_first_value(., "S4_Review_p3"),
    S5_Priming_Effect = get_first_value(., "S5_Review_p3"),
    S6_Priming_Effect = get_first_value(., "S6_Review_p3"),
    S7_Priming_Effect = get_first_value(., "S7_Review_p3"),
    S8_Priming_Effect = get_first_value(., "S8_Review_p3")
  ) %>%
  
  # 2. MAP EXPERTISE (Create TRUE/FALSE flags)
  # R simply scans your manually cleaned CSV column for these numbers.
  # 1=Psych, 2=Health Sci, 3=Methods, 4=Data, 5=Clin Psych, 6=Edu, 7=Learning, 
  # 8=Healthcare, 9=Medicine, 10=Crim, 11=Social Work, 12=Nutr, 13=Pub Health, 14=Business
  mutate(
    S1_Is_Expert = str_detect(self_report_exp, "\\b1\\b|\\b2\\b|\\b3\\b|\\b4\\b"), # Psych Research
    S3_Is_Expert = str_detect(self_report_exp, "\\b1\\b|\\b5\\b"),                 # Anxiety
    S4_Is_Expert = str_detect(self_report_exp, "\\b6\\b|\\b7\\b"),                 # Maths/Edu
    S5_Is_Expert = str_detect(self_report_exp, "\\b8\\b|\\b9\\b"),                 # MRI/Med
    S6_Is_Expert = str_detect(self_report_exp, "\\b10\\b|\\b11\\b"),               # Crim/Youth
    S7_Is_Expert = str_detect(self_report_exp, "\\b12\\b|\\b13\\b"),               # Nutrition
    S8_Is_Expert = str_detect(self_report_exp, "\\b14\\b")                         # Inventory/Biz
  ) %>%
  
  # 3. DEFINE ACCURACY COLUMNS
  # Ensure they are numeric (1 or 0)
  mutate(
    S1_Accuracy = as.numeric(S1_T1_Decision),
    S3_Accuracy = as.numeric(S3_T1_Decision),
    S4_Accuracy = as.numeric(S4_T1_Decision),
    S5_Accuracy = as.numeric(S5_T1_Decision),
    S6_Accuracy = as.numeric(S6_T1_Decision),
    S7_Accuracy = as.numeric(S7_T1_Decision),
    S8_Accuracy = as.numeric(S8_T1_Decision)
  )

# 4. PREVIEW
clean_data %>%
  select(S1_Accuracy, S1_Is_Expert, S1_Priming_Detected, S1_Priming_Effect) %>%
  head() %>%
  kable(caption = "Data Wrangling Check") %>%
  kable_styling()

Data Wrangling Check
S1_Accuracy	S1_Is_Expert	S1_Priming_Detected	S1_Priming_Effect
0	NA	NA	NA
NA	NA	NA	NA
1	FALSE	NA	NA
0	TRUE	0	NA
1	FALSE	0	NA
NA	NA	NA	NA

CHUNK 4: MAIN VALIDATION ANALYSIS (CRITERIA A-E)

# 1. RESHAPE DATA (Wide to Long)
# Instead of S1, S3, S4 columns, we stack them into one big list.
# This allows us to calculate stats for all scenarios instantly.

long_data <- clean_data %>%
  select(ResponseId, ends_with("_Accuracy"), ends_with("_Is_Expert")) %>%
  # Pivot Accuracy columns
  pivot_longer(
    cols = ends_with("_Accuracy"),
    names_to = "Scenario_Label",
    values_to = "Score"
  ) %>%
  # Pivot Expert columns
  pivot_longer(
    cols = ends_with("_Is_Expert"),
    names_to = "Expert_Label",
    values_to = "Is_Expert"
  ) %>%
  # Clean up names to ensure we match S1 Accuracy to S1 Expertise
  mutate(
    Scenario_ID = str_extract(Scenario_Label, "^S\\d+"), # Extracts "S1", "S3"
    Expert_Match_ID = str_extract(Expert_Label, "^S\\d+")
  ) %>%
  # Keep only matching rows (e.g. S1 Accuracy with S1 Expertise)
  filter(Scenario_ID == Expert_Match_ID) %>%
  select(ResponseId, Scenario = Scenario_ID, Score, Is_Expert) %>%
  # Remove missing data
  drop_na(Score, Is_Expert)

# 2. CALCULATE STATISTICS PER SCENARIO
validation_stats <- long_data %>%
  group_by(Scenario, Is_Expert) %>%
  summarise(
    Avg_Accuracy = mean(Score),
    Count = n(),
    .groups = 'drop'
  ) %>%
  # Pivot to see Experts and Novices side-by-side
  pivot_wider(
    names_from = Is_Expert,
    values_from = c(Avg_Accuracy, Count),
    names_prefix = "Exp_"
  ) %>%
  # Rename for clarity (Exp_FALSE = Novice, Exp_TRUE = Expert)
  rename(
    Novice_Acc = Avg_Accuracy_Exp_FALSE,
    Expert_Acc = Avg_Accuracy_Exp_TRUE,
    Novice_N = Count_Exp_FALSE,
    Expert_N = Count_Exp_TRUE
  ) %>%
  # Calculate the Gap and Incorrect Rates
  mutate(
    Gap = Expert_Acc - Novice_Acc,
    Novice_Incorrect = 1 - Novice_Acc,
    Expert_Incorrect = 1 - Expert_Acc
  )

# 3. APPLY YOUR LOGIC (CRITERIA A-E)
final_decision_table <- validation_stats %>%
  mutate(
    # A) Ceiling (Novice >= .85)
    Flag_Ceiling = ifelse(Novice_Acc >= 0.85, "FAIL", "Pass"),
    
    # B) Floor (Novice <= .20)
    Flag_Floor = ifelse(Novice_Acc <= 0.20, "FAIL", "Pass"),
    
    # C) Ambiguity (Expert <= .65)
    Flag_Ambiguity = ifelse(Expert_Acc <= 0.65, "FAIL", "Pass"),
    
    # D) Discrimination (Gap < .15)
    Flag_Discrim = ifelse(Gap < 0.15, "FAIL", "Pass"),
    
    # E) FINAL KEEP DECISION
    # Rules: Novice .25-.75 AND Expert >= .80 AND Gap >= .15
    Recommendation = case_when(
      Novice_Acc >= 0.25 & Novice_Acc <= 0.75 &
      Expert_Acc >= 0.80 &
      Gap >= 0.15 ~ "KEEP",
      TRUE ~ "REVISE"
    )
  ) %>%
  
  # F) SAMPLE SIZE WARNINGS
  # Flag scenarios where groups are too small for reliable conclusions
  mutate(
    # Flag if Novice group has fewer than 10 people
    Small_Novice_N = ifelse(Novice_N < 10, "⚠️ LOW N", ""),
    
    # Flag if Expert group has fewer than 10 people
    Small_Expert_N = ifelse(Expert_N < 10, "⚠️ LOW N", ""),
    
    # Create a combined sample size warning
    Sample_Warning = case_when(
      Novice_N < 10 & Expert_N < 10 ~ "⚠️ BOTH GROUPS LOW",
      Novice_N < 10 ~ "⚠️ Novice N low",
      Expert_N < 10 ~ "⚠️ Expert N low",
      TRUE ~ "OK"
    ),
    
    # Update Recommendation if sample size is too small
    Recommendation = ifelse(
      Sample_Warning != "OK",
      paste0(Recommendation, " (SMALL N)"),
      Recommendation
    )
  )

# Print a warning summary
sample_size_issues <- final_decision_table %>%
  filter(Sample_Warning != "OK") %>%
  select(Scenario, Novice_N, Expert_N, Sample_Warning)

if (nrow(sample_size_issues) > 0) {
  print("⚠️⚠️⚠️ SAMPLE SIZE WARNINGS ⚠️⚠️⚠️")
  print("The following scenarios have small sample sizes (< 10 in at least one group):")
  print(kable(sample_size_issues, caption = "Scenarios with Insufficient Sample Sizes") %>% 
          kable_styling(bootstrap_options = "striped"))
  print("Recommendations based on these scenarios should be interpreted with caution!")
} else {
  print("✅ All scenarios have adequate sample sizes (N ≥ 10 in both groups)")
}

## [1] "⚠️⚠️⚠️ SAMPLE SIZE WARNINGS ⚠️⚠️⚠️"
## [1] "The following scenarios have small sample sizes (< 10 in at least one group):"
## <table class="table table-striped" style="margin-left: auto; margin-right: auto;">
## <caption>Scenarios with Insufficient Sample Sizes</caption>
##  <thead>
##   <tr>
##    <th style="text-align:left;"> Scenario </th>
##    <th style="text-align:right;"> Novice_N </th>
##    <th style="text-align:right;"> Expert_N </th>
##    <th style="text-align:left;"> Sample_Warning </th>
##   </tr>
##  </thead>
## <tbody>
##   <tr>
##    <td style="text-align:left;"> S1 </td>
##    <td style="text-align:right;"> 11 </td>
##    <td style="text-align:right;"> 3 </td>
##    <td style="text-align:left;"> ⚠️ Expert N low </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;"> S3 </td>
##    <td style="text-align:right;"> 12 </td>
##    <td style="text-align:right;"> 2 </td>
##    <td style="text-align:left;"> ⚠️ Expert N low </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;"> S4 </td>
##    <td style="text-align:right;"> 13 </td>
##    <td style="text-align:right;"> 1 </td>
##    <td style="text-align:left;"> ⚠️ Expert N low </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;"> S5 </td>
##    <td style="text-align:right;"> 11 </td>
##    <td style="text-align:right;"> 3 </td>
##    <td style="text-align:left;"> ⚠️ Expert N low </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;"> S6 </td>
##    <td style="text-align:right;"> 13 </td>
##    <td style="text-align:right;"> 1 </td>
##    <td style="text-align:left;"> ⚠️ Expert N low </td>
##   </tr>
##   <tr>
##    <td style="text-align:left;"> S7 </td>
##    <td style="text-align:right;"> 12 </td>
##    <td style="text-align:right;"> 2 </td>
##    <td style="text-align:left;"> ⚠️ Expert N low </td>
##   </tr>
## </tbody>
## </table>[1] "Recommendations based on these scenarios should be interpreted with caution!"

# 👆 NEW CODE ENDS HERE 👆

# 4. DISPLAY THE RESULTS TABLE
final_decision_table %>%
  select(Scenario, Novice_N, Expert_N, Sample_Warning,
         Novice_Acc, Expert_Acc, Gap, Recommendation,
         Flag_Ceiling, Flag_Floor, Flag_Ambiguity, Flag_Discrim) %>%
  kable(digits = 2, caption = "Pilot Validation Results: Keep or Revise?") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  column_spec(7, bold = TRUE, color = "white",  # 👈 Changed from column 5 to 7 (Recommendation moved)
              background = ifelse(grepl("KEEP", final_decision_table$Recommendation), "green", "red"))

Pilot Validation Results: Keep or Revise?
Scenario	Novice_N	Expert_N	Sample_Warning	Novice_Acc	Expert_Acc	Gap	Recommendation	Flag_Ceiling	Flag_Floor	Flag_Ambiguity	Flag_Discrim
S1	11	3	⚠️ Expert N low	0.6

# 5. OVERALL INCORRECT RATES (ALL SCENARIOS COMBINED)
# This tells you if you're hitting your 50% or lower target

overall_stats <- long_data %>%
  group_by(Is_Expert) %>%
  summarise(
    Total_Decisions = n(),
    Correct = sum(Score),
    Incorrect = sum(Score == 0),
    Accuracy = mean(Score),
    Incorrect_Rate = 1 - mean(Score)
  ) %>%
  mutate(Group = ifelse(Is_Expert, "Experts", "Novices")) %>%
  select(Group, Total_Decisions, Correct, Incorrect, Accuracy, Incorrect_Rate)

# Calculate the grand total (everyone, all scenarios)
grand_total <- long_data %>%
  summarise(
    Group = "ALL PARTICIPANTS",
    Total_Decisions = n(),
    Correct = sum(Score),
    Incorrect = sum(Score == 0),
    Accuracy = mean(Score),
    Incorrect_Rate = 1 - mean(Score)
  )

# Combine into one table
overall_table <- bind_rows(overall_stats, grand_total)

# Display the results
print("--- OVERALL INCORRECT RATES ---")

## [1] "--- OVERALL INCORRECT RATES ---"

overall_table %>%
  kable(digits = 3, caption = "Overall Performance Across All Scenarios") %>%
  kable_styling(bootstrap_options = c("striped", "hover")) %>%
  row_spec(3, bold = TRUE, background = "#f0f0f0")  # Highlight the ALL PARTICIPANTS row

Overall Performance Across All Scenarios
Group	Total_Decisions	Correct	Incorrect	Accuracy	Incorrect_Rate
Novices	86	48	38	0.558	0.442
Experts	12	5	7	0.417	0.583
ALL PARTICIPANTS	98	53	45	0.541	0.459

CHUNK 5: PRIMING ANALYSIS & VISUALIZATION

# 1. PRIMING DETECTION RATES
# How many people answered "Yes" to "Did the wording make one option seem right?"
priming_summary <- clean_data %>%
  select(starts_with("S"), ends_with("_Priming_Detected")) %>%
  pivot_longer(cols = ends_with("_Detected"), names_to = "Scenario", values_to = "Detected") %>%
  mutate(Scenario = str_extract(Scenario, "^S\\d+")) %>%
  group_by(Scenario) %>%
  summarise(
    Detection_Rate = mean(Detected, na.rm = TRUE), # % who said Yes
    Count_Detected = sum(Detected, na.rm = TRUE)
  )

print("--- PRIMING DETECTION RATES ---")

## [1] "--- PRIMING DETECTION RATES ---"

kable(priming_summary, digits = 2, caption = "Did participants notice the wording?") %>% kable_styling()

Did participants notice the wording?
Scenario	Detection_Rate	Count_Detected
S1	0.0	0
S3	0.5	1
S4	0.0	0
S5	0.0	0
S6	1.0	2
S7	0.5	1
S8	0.5	1

# 2. PRIMING EFFECT DIRECTION
# For those who noticed, did it help (1) or backfire (-1)?
priming_effect <- clean_data %>%
  select(starts_with("S"), ends_with("_Priming_Effect")) %>%
  pivot_longer(cols = ends_with("_Effect"), names_to = "Scenario", values_to = "Effect") %>%
  mutate(Scenario = str_extract(Scenario, "^S\\d+")) %>%
  filter(!is.na(Effect)) %>% # Only look at people who answered this
  count(Scenario, Effect) %>%
  mutate(
    Effect_Type = case_when(
      Effect == 1 ~ "Worked (Chose Option)",
      Effect == -1 ~ "Backfired (Suspicious)",
      Effect == 0 ~ "No Effect",
      TRUE ~ "Other"
    )
  ) %>%
  pivot_wider(names_from = Effect_Type, values_from = n, values_fill = 0)

print("--- PRIMING BEHAVIORAL EFFECT ---")

## [1] "--- PRIMING BEHAVIORAL EFFECT ---"

kable(priming_effect, caption = "How did the wording affect choices?") %>% kable_styling()

How did the wording affect choices?
Scenario	Effect	No Effect	Backfired (Suspicious)	Other
S3	0	1	0	0
S6	-1	0	1	0
S6	0	1	0	0
S7	99	0	0	1
S8	-1	0	1	0

# 3. PRIMING SUCCESS METRICS
# This shows whether your positively-worded incorrect option actually influenced people

priming_success <- clean_data %>%
  select(starts_with("S"), ends_with("_Priming_Detected"), ends_with("_Priming_Effect")) %>%
  pivot_longer(
    cols = ends_with("_Detected"), 
    names_to = "Scenario_Detected", 
    values_to = "Detected"
  ) %>%
  pivot_longer(
    cols = ends_with("_Effect"), 
    names_to = "Scenario_Effect", 
    values_to = "Effect"
  ) %>%
  # Match scenario names
  mutate(
    Scenario_Det = str_extract(Scenario_Detected, "^S\\d+"),
    Scenario_Eff = str_extract(Scenario_Effect, "^S\\d+")
  ) %>%
  filter(Scenario_Det == Scenario_Eff) %>%  # Keep matching rows
  rename(Scenario = Scenario_Det) %>%
  select(Scenario, Detected, Effect) %>%
  
  # Calculate success metrics
  group_by(Scenario) %>%
  summarise(
    # How many people saw the scenario
    Total_Responses = n(),
    
    # How many detected the priming
    Detected_Count = sum(Detected == 1, na.rm = TRUE),
    Detection_Rate = mean(Detected == 1, na.rm = TRUE),
    
    # Of those who detected, how many were influenced? (Effect = 1)
    Influenced_Count = sum(Detected == 1 & Effect == 1, na.rm = TRUE),
    
    # Of those who detected, how many became suspicious? (Effect = -1)
    Backfire_Count = sum(Detected == 1 & Effect == -1, na.rm = TRUE),
    
    # Of those who detected, how many weren't affected? (Effect = 0)
    No_Effect_Count = sum(Detected == 1 & Effect == 0, na.rm = TRUE),
    
    # THE KEY METRIC: Success rate = influenced / detected
    Success_Rate = ifelse(Detected_Count > 0, 
                          Influenced_Count / Detected_Count, 
                          NA),
    
    # Backfire rate = suspicious / detected  
    Backfire_Rate = ifelse(Detected_Count > 0, 
                           Backfire_Count / Detected_Count, 
                           NA)
  )

# Display the success metrics
print("--- PRIMING SUCCESS METRICS ---")

## [1] "--- PRIMING SUCCESS METRICS ---"

print("Of those who noticed the wording, did it work as intended?")

## [1] "Of those who noticed the wording, did it work as intended?"

priming_success %>%
  select(Scenario, Detection_Rate, Detected_Count, 
         Influenced_Count, Success_Rate, Backfire_Rate) %>%
  kable(
    digits = 3, 
    caption = "Priming Effectiveness: Did the positive wording influence choices?",
    col.names = c("Scenario", "Detection Rate", "N Detected", 
                  "N Influenced", "Success Rate", "Backfire Rate")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover")) %>%
  # Highlight high success rates (> 50%) in green
  column_spec(5, bold = TRUE, 
              color = ifelse(is.na(priming_success$Success_Rate), "gray",
                            ifelse(priming_success$Success_Rate > 0.5, "darkgreen", "black")))

Priming Effectiveness: Did the positive wording influence choices?
Scenario	Detection Rate	N Detected	Success Rate	Backfire Rate
S1	0.0	0	NA	NA
S3	0.5	1	0	0.0
S4	0.0	0	NA	NA
S5	0.0	0	NA	NA
S6	1.0	2	0	0.5
S7	0.5	1	0	0.0
S8	0.5	1	0	1.0

# 3. VISUALIZATION: EXPERT VS NOVICE ACCURACY
# Uses 'final_decision_table' created in Chunk 4
ggplot(final_decision_table, aes(x = Scenario)) +
  # Add bars for Novices
  geom_bar(aes(y = Novice_Acc, fill = "Novice"), stat = "identity", position = position_nudge(x = -0.2), width = 0.4) +
  # Add bars for Experts
  geom_bar(aes(y = Expert_Acc, fill = "Expert"), stat = "identity", position = position_nudge(x = 0.2), width = 0.4) +
  # Add dashed line for 85% ceiling
  geom_hline(yintercept = 0.85, linetype = "dashed", color = "red", alpha = 0.5) +
  annotate("text", x = 1, y = 0.87, label = "Ceiling (.85)", color = "red", size = 3) +
  # Formatting
  scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
  scale_fill_manual(values = c("Novice" = "#A8D0E6", "Expert" = "#2C3E50")) +
  labs(
    title = "Expert vs. Novice Accuracy by Scenario",
    subtitle = "Comparing performance gaps to identify valid scenarios",
    y = "Accuracy (%)",
    x = NULL,
    fill = "Group"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

pilot_real_full_script

Lena

2026-02-08

CHUNK 1: SETUP AND LIBRARIES

CHUNK 2: DATA IMPORT AND CLEANING

CHUNK 3: DATA WRANGLING & EXPERTISE MAPPING

CHUNK 4: MAIN VALIDATION ANALYSIS (CRITERIA A-E)

CHUNK 5: PRIMING ANALYSIS & VISUALIZATION

Scenario	Detection Rate	N Detected	Success Rate	Backfire Rate
S1	0.0	0	NA	NA
S3	0.5	1	0	0.0
S4	0.0	0	NA	NA
S5	0.0	0	NA	NA
S6	1.0	2	0	0.5
S7	0.5	1	0	0.0
S8	0.5	1	0	1.0

Scenario	Detection Rate	N Detected	Success Rate	Backfire Rate
S1	0.0	0	NA	NA
S3	0.5	1	0	0.0
S4	0.0	0	NA	NA
S5	0.0	0	NA	NA
S6	1.0	2	0	0.5
S7	0.5	1	0	0.0
S8	0.5	1	0	1.0

Scenario	Detection Rate	N Detected	Success Rate	Backfire Rate
S1	0.0	0	NA	NA
S3	0.5	1	0	0.0
S4	0.0	0	NA	NA
S5	0.0	0	NA	NA
S6	1.0	2	0	0.5
S7	0.5	1	0	0.0
S8	0.5	1	0	1.0