Before running: download the CSV from Qualtrics and open it in Excel or Numbers. Find the self_report_exp column and manually replace any free-text responses in that column with the corresponding numeric code (e.g., if a participant typed “Forensic Science”, change that cell to 10). Save the file, then run the script.
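If you would rather script this step than edit the CSV by hand, the sketch below does the same recoding in R after import (run it after Chunk 2). The label-to-code mapping is an assumption; only the “Forensic Science” example from above is shown, and it must be extended to cover every text label that actually appears in your data.
# Optional alternative: recode free-text expertise labels in R instead of
# editing the CSV manually. The mapping is illustrative, not the full list.
label_map <- c("Forensic Science" = "10")  # assumed mapping; extend as needed
survey_data <- survey_data %>%
  mutate(self_report_exp = str_replace_all(self_report_exp, label_map))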
CHUNK 1: SETUP AND LIBRARIES
# This chunk loads the necessary toolkits for the analysis.
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
# Load packages
library(tidyverse) # The main toolkit for data manipulation and plotting
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(knitr) # For creating basic tables
library(kableExtra) # For making tables look professional in HTML
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
library(janitor) # For cleaning column names (optional but helpful)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
CHUNK 2: DATA IMPORT AND CLEANING
# 1. READ DATA
# Replace the filename below with your real data file name
filename <- "sycophancy_materials_pilot_2.csv"
# Read the CSV. We assume the first row contains the variable names (S1_T1_Decision, etc.)
raw_data <- read_csv(filename)
# 2. REMOVE EXTRA QUALTRICS HEADERS
# Qualtrics exports 3 header rows:
# Row 1: Variable Names (became the column headers in R)
# Row 2: Question Text (e.g., "Start Date", "Which option is better...")
# Row 3: Import IDs (e.g., "{ImportId...}")
# We need to remove rows 1 and 2 from the dataframe (which correspond to rows 2 and 3 of the file)
survey_data <- raw_data %>%
slice(-c(1, 2)) %>% # Remove the top two rows containing metadata
type_convert() # Convert text numbers ("1") into actual numbers (1)
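An equivalent import (a sketch, not what runs above) skips the two metadata rows at read time: read the header row alone to recover the variable names, then re-read with skip = 3. This also lets read_csv() guess column types from the real data, so type_convert() becomes unnecessary.
# Alternative import (sketch): grab the variable names from row 1, then
# skip the header plus both metadata rows when reading the data itself.
col_names <- names(read_csv(filename, n_max = 0, show_col_types = FALSE))
survey_data <- read_csv(filename, col_names = col_names, skip = 3,
                        show_col_types = FALSE)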
# 3. FILTER OUT PREVIEWS (Optional but recommended)
# Often we want to exclude "Survey Preview" responses and only keep real data.
# The 'DistributionChannel' column usually contains 'preview' for tests.
# The filter below is currently commented out, so preview responses are kept
# while testing this script. Uncomment it to exclude previews from real data.
# survey_data <- survey_data %>% filter(DistributionChannel != "preview")
# 4. QUICK CHECK
# Print the number of participants and columns to verify the load
print(paste("Data loaded successfully."))
## [1] "Data loaded successfully."
print(paste("Total Participants:", nrow(survey_data)))
## [1] "Total Participants: 40"
print(paste("Total Columns:", ncol(survey_data)))
## [1] "Total Columns: 254"
# Show a quick preview of the decision columns to ensure they are numeric (0/1)
# Using select(matches...) to pick columns that look like decisions
survey_data %>%
select(matches("_T1_Decision")) %>%
head() %>%
kable(caption = "Preview of Decision Data (Should be 0 or 1)") %>%
kable_styling()
Preview of Decision Data (Should be 0 or 1)

| S1_T1_Decision | S3_T1_Decision | S4_T1_Decision | S5_T1_Decision | S6_T1_Decision | S7_T1_Decision | S8_T1_Decision |
|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 0 | 1 | 0 | 0 |
| NA | NA | NA | NA | NA | NA | NA |
| 1 | 1 | 1 | 1 | 0 | 0 | 1 |
| 0 | 0 | 1 | 0 | 0 | 0 | 1 |
| 1 | 0 | 0 | 1 | 1 | 1 | 1 |
| NA | NA | NA | NA | NA | NA | NA |
CHUNK 3: DATA WRANGLING & EXPERTISE MAPPING
# Helper function (Robust "Nuclear" Version)
# This prevents crashes if Qualtrics mixes text and numbers in the same column
get_first_value <- function(data, pattern) {
data %>%
select(matches(pattern)) %>%
select(-contains("TEXT"), -contains("text")) %>%
mutate(across(everything(), ~suppressWarnings(as.numeric(.)))) %>%
mutate(combined = coalesce(!!!.)) %>% # take the first non-NA value across the matched columns
pull(combined)
}
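A toy check of the helper (made-up values): each row has a response in exactly one of three randomized slots, as Qualtrics randomization produces, and get_first_value() coalesces them into a single vector.
# Quick illustration with fabricated data: three randomized "slots",
# one non-NA value per row, coalesced into one column.
demo <- tibble(
  S1_A_Review = c(1, NA, NA),
  S1_B_Review = c(NA, 0, NA),
  S1_C_Review = c(NA, NA, 1)
)
get_first_value(demo, "^S1_.*Review$")
# Expected: 1 0 1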
clean_data <- survey_data %>%
# 1. CONSOLIDATE PRIMING DATA
# Merging the 7 randomized review slots into single columns per scenario
mutate(
# Did they detect the priming? (1 = Yes, 0 = No)
S1_Priming_Detected = ifelse(get_first_value(., "^S1_.*Review$") == 1, 1, 0),
S3_Priming_Detected = ifelse(get_first_value(., "^S3_.*Review$") == 1, 1, 0),
S4_Priming_Detected = ifelse(get_first_value(., "^S4_.*Review$") == 1, 1, 0),
S5_Priming_Detected = ifelse(get_first_value(., "^S5_.*Review$") == 1, 1, 0),
S6_Priming_Detected = ifelse(get_first_value(., "^S6_.*Review$") == 1, 1, 0),
S7_Priming_Detected = ifelse(get_first_value(., "^S7_.*Review$") == 1, 1, 0),
S8_Priming_Detected = ifelse(get_first_value(., "^S8_.*Review$") == 1, 1, 0),
# How did it affect them? (1 = Helpful, -1 = Suspicious, 0 = No effect)
S1_Priming_Effect = get_first_value(., "S1_Review_p3"),
S3_Priming_Effect = get_first_value(., "S3_Review_p3"),
S4_Priming_Effect = get_first_value(., "S4_Review_p3"),
S5_Priming_Effect = get_first_value(., "S5_Review_p3"),
S6_Priming_Effect = get_first_value(., "S6_Review_p3"),
S7_Priming_Effect = get_first_value(., "S7_Review_p3"),
S8_Priming_Effect = get_first_value(., "S8_Review_p3")
) %>%
# 2. MAP EXPERTISE (Create TRUE/FALSE flags)
# R simply scans your manually cleaned CSV column for these numbers.
# 1=Psych, 2=Health Sci, 3=Methods, 4=Data, 5=Clin Psych, 6=Edu, 7=Learning,
# 8=Healthcare, 9=Medicine, 10=Crim, 11=Social Work, 12=Nutr, 13=Pub Health, 14=Business
mutate(
S1_Is_Expert = str_detect(self_report_exp, "\\b1\\b|\\b2\\b|\\b3\\b|\\b4\\b"), # Psych Research
S3_Is_Expert = str_detect(self_report_exp, "\\b1\\b|\\b5\\b"), # Anxiety
S4_Is_Expert = str_detect(self_report_exp, "\\b6\\b|\\b7\\b"), # Maths/Edu
S5_Is_Expert = str_detect(self_report_exp, "\\b8\\b|\\b9\\b"), # MRI/Med
S6_Is_Expert = str_detect(self_report_exp, "\\b10\\b|\\b11\\b"), # Crim/Youth
S7_Is_Expert = str_detect(self_report_exp, "\\b12\\b|\\b13\\b"), # Nutrition
S8_Is_Expert = str_detect(self_report_exp, "\\b14\\b") # Inventory/Biz
) %>%
# 3. DEFINE ACCURACY COLUMNS
# Ensure they are numeric (1 or 0)
mutate(
S1_Accuracy = as.numeric(S1_T1_Decision),
S3_Accuracy = as.numeric(S3_T1_Decision),
S4_Accuracy = as.numeric(S4_T1_Decision),
S5_Accuracy = as.numeric(S5_T1_Decision),
S6_Accuracy = as.numeric(S6_T1_Decision),
S7_Accuracy = as.numeric(S7_T1_Decision),
S8_Accuracy = as.numeric(S8_T1_Decision)
)
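A quick sanity check on the word-boundary patterns used above (illustrative inputs): without the \\b anchors, the pattern "1" would also match inside "10", "11", or "12".
# Why the \b anchors matter (made-up inputs):
str_detect("10,11", "\\b1\\b") # FALSE: no standalone "1" present
str_detect("1,5", "\\b1\\b")   # TRUE: matches the standalone "1"
str_detect("12", "\\b2\\b")    # FALSE: "2" inside "12" is not a whole token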
# 4. PREVIEW
clean_data %>%
select(S1_Accuracy, S1_Is_Expert, S1_Priming_Detected, S1_Priming_Effect) %>%
head() %>%
kable(caption = "Data Wrangling Check") %>%
kable_styling()
Data Wrangling Check

| S1_Accuracy | S1_Is_Expert | S1_Priming_Detected | S1_Priming_Effect |
|---|---|---|---|
| 0 | NA | NA | NA |
| NA | NA | NA | NA |
| 1 | FALSE | NA | NA |
| 0 | TRUE | 0 | NA |
| 1 | FALSE | 0 | NA |
| NA | NA | NA | NA |
CHUNK 4: MAIN VALIDATION ANALYSIS (CRITERIA A-E)
# 1. RESHAPE DATA (Wide to Long)
# Instead of separate S1, S3, S4, ... columns, we stack them into one long table.
# This lets us calculate statistics for all scenarios at once.
long_data <- clean_data %>%
select(ResponseId, ends_with("_Accuracy"), ends_with("_Is_Expert")) %>%
# Pivot Accuracy columns
pivot_longer(
cols = ends_with("_Accuracy"),
names_to = "Scenario_Label",
values_to = "Score"
) %>%
# Pivot Expert columns
pivot_longer(
cols = ends_with("_Is_Expert"),
names_to = "Expert_Label",
values_to = "Is_Expert"
) %>%
# Clean up names to ensure we match S1 Accuracy to S1 Expertise
mutate(
Scenario_ID = str_extract(Scenario_Label, "^S\\d+"), # Extracts "S1", "S3"
Expert_Match_ID = str_extract(Expert_Label, "^S\\d+")
) %>%
# Keep only matching rows (e.g. S1 Accuracy with S1 Expertise)
filter(Scenario_ID == Expert_Match_ID) %>%
select(ResponseId, Scenario = Scenario_ID, Score, Is_Expert) %>%
# Remove missing data
drop_na(Score, Is_Expert)
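An equivalent reshape (a sketch, not what runs above) uses a single pivot_longer() with names_pattern, which avoids the double-pivot-then-filter step:
# Alternative one-step reshape (sketch; should yield the same long table):
long_alt <- clean_data %>%
  select(ResponseId, ends_with("_Accuracy"), ends_with("_Is_Expert")) %>%
  pivot_longer(
    cols = -ResponseId,
    names_to = c("Scenario", ".value"),
    names_pattern = "^(S\\d+)_(.*)$"
  ) %>%
  rename(Score = Accuracy) %>%
  drop_na(Score, Is_Expert)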
# 2. CALCULATE STATISTICS PER SCENARIO
validation_stats <- long_data %>%
group_by(Scenario, Is_Expert) %>%
summarise(
Avg_Accuracy = mean(Score),
Count = n(),
.groups = 'drop'
) %>%
# Pivot to see Experts and Novices side-by-side
pivot_wider(
names_from = Is_Expert,
values_from = c(Avg_Accuracy, Count),
names_prefix = "Exp_"
) %>%
# Rename for clarity (Exp_FALSE = Novice, Exp_TRUE = Expert)
rename(
Novice_Acc = Avg_Accuracy_Exp_FALSE,
Expert_Acc = Avg_Accuracy_Exp_TRUE,
Novice_N = Count_Exp_FALSE,
Expert_N = Count_Exp_TRUE
) %>%
# Calculate the Gap and Incorrect Rates
mutate(
Gap = Expert_Acc - Novice_Acc,
Novice_Incorrect = 1 - Novice_Acc,
Expert_Incorrect = 1 - Expert_Acc
)
# 3. APPLY YOUR LOGIC (CRITERIA A-E)
final_decision_table <- validation_stats %>%
mutate(
# A) Ceiling (Novice >= .85)
Flag_Ceiling = ifelse(Novice_Acc >= 0.85, "FAIL", "Pass"),
# B) Floor (Novice <= .20)
Flag_Floor = ifelse(Novice_Acc <= 0.20, "FAIL", "Pass"),
# C) Ambiguity (Expert <= .65)
Flag_Ambiguity = ifelse(Expert_Acc <= 0.65, "FAIL", "Pass"),
# D) Discrimination (Gap < .15)
Flag_Discrim = ifelse(Gap < 0.15, "FAIL", "Pass"),
# E) FINAL KEEP DECISION
# Rules: Novice .25-.75 AND Expert >= .80 AND Gap >= .15
Recommendation = case_when(
Novice_Acc >= 0.25 & Novice_Acc <= 0.75 &
Expert_Acc >= 0.80 &
Gap >= 0.15 ~ "KEEP",
TRUE ~ "REVISE"
)
) %>%
# F) SAMPLE SIZE WARNINGS
# Flag scenarios where groups are too small for reliable conclusions
mutate(
# Flag if Novice group has fewer than 10 people
Small_Novice_N = ifelse(Novice_N < 10, "⚠️ LOW N", ""),
# Flag if Expert group has fewer than 10 people
Small_Expert_N = ifelse(Expert_N < 10, "⚠️ LOW N", ""),
# Create a combined sample size warning
Sample_Warning = case_when(
Novice_N < 10 & Expert_N < 10 ~ "⚠️ BOTH GROUPS LOW",
Novice_N < 10 ~ "⚠️ Novice N low",
Expert_N < 10 ~ "⚠️ Expert N low",
TRUE ~ "OK"
),
# Update Recommendation if sample size is too small
Recommendation = ifelse(
Sample_Warning != "OK",
paste0(Recommendation, " (SMALL N)"),
Recommendation
)
)
# Print a warning summary
sample_size_issues <- final_decision_table %>%
filter(Sample_Warning != "OK") %>%
select(Scenario, Novice_N, Expert_N, Sample_Warning)
if (nrow(sample_size_issues) > 0) {
print("⚠️⚠️⚠️ SAMPLE SIZE WARNINGS ⚠️⚠️⚠️")
print("The following scenarios have small sample sizes (< 10 in at least one group):")
print(kable(sample_size_issues, caption = "Scenarios with Insufficient Sample Sizes") %>%
kable_styling(bootstrap_options = "striped"))
print("Recommendations based on these scenarios should be interpreted with caution!")
} else {
print("✅ All scenarios have adequate sample sizes (N ≥ 10 in both groups)")
}
## [1] "⚠️⚠️⚠️ SAMPLE SIZE WARNINGS ⚠️⚠️⚠️"
## [1] "The following scenarios have small sample sizes (< 10 in at least one group):"
## <table class="table table-striped" style="margin-left: auto; margin-right: auto;">
## <caption>Scenarios with Insufficient Sample Sizes</caption>
## <thead>
## <tr>
## <th style="text-align:left;"> Scenario </th>
## <th style="text-align:right;"> Novice_N </th>
## <th style="text-align:right;"> Expert_N </th>
## <th style="text-align:left;"> Sample_Warning </th>
## </tr>
## </thead>
## <tbody>
## <tr>
## <td style="text-align:left;"> S1 </td>
## <td style="text-align:right;"> 11 </td>
## <td style="text-align:right;"> 3 </td>
## <td style="text-align:left;"> ⚠️ Expert N low </td>
## </tr>
## <tr>
## <td style="text-align:left;"> S3 </td>
## <td style="text-align:right;"> 12 </td>
## <td style="text-align:right;"> 2 </td>
## <td style="text-align:left;"> ⚠️ Expert N low </td>
## </tr>
## <tr>
## <td style="text-align:left;"> S4 </td>
## <td style="text-align:right;"> 13 </td>
## <td style="text-align:right;"> 1 </td>
## <td style="text-align:left;"> ⚠️ Expert N low </td>
## </tr>
## <tr>
## <td style="text-align:left;"> S5 </td>
## <td style="text-align:right;"> 11 </td>
## <td style="text-align:right;"> 3 </td>
## <td style="text-align:left;"> ⚠️ Expert N low </td>
## </tr>
## <tr>
## <td style="text-align:left;"> S6 </td>
## <td style="text-align:right;"> 13 </td>
## <td style="text-align:right;"> 1 </td>
## <td style="text-align:left;"> ⚠️ Expert N low </td>
## </tr>
## <tr>
## <td style="text-align:left;"> S7 </td>
## <td style="text-align:right;"> 12 </td>
## <td style="text-align:right;"> 2 </td>
## <td style="text-align:left;"> ⚠️ Expert N low </td>
## </tr>
## </tbody>
## </table>[1] "Recommendations based on these scenarios should be interpreted with caution!"
# 4. DISPLAY THE RESULTS TABLE
final_decision_table %>%
select(Scenario, Novice_N, Expert_N, Sample_Warning,
Novice_Acc, Expert_Acc, Gap, Recommendation,
Flag_Ceiling, Flag_Floor, Flag_Ambiguity, Flag_Discrim) %>%
kable(digits = 2, caption = "Pilot Validation Results: Keep or Revise?") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
column_spec(7, bold = TRUE, color = "white", # Column 7 = Recommendation
background = ifelse(grepl("KEEP", final_decision_table$Recommendation), "green", "red"))
Pilot Validation Results: Keep or Revise?

| Scenario | Novice_N | Expert_N | Sample_Warning | Novice_Acc | Expert_Acc | Gap | Recommendation | Flag_Ceiling | Flag_Floor | Flag_Ambiguity | Flag_Discrim |
|---|---|---|---|---|---|---|---|---|---|---|---|
| S1 | 11 | 3 | ⚠️ Expert N low | 0.6 | | | | | | | |
# 5. OVERALL INCORRECT RATES (ALL SCENARIOS COMBINED)
# This tells you if you're hitting your 50% or lower target
overall_stats <- long_data %>%
group_by(Is_Expert) %>%
summarise(
Total_Decisions = n(),
Correct = sum(Score),
Incorrect = sum(Score == 0),
Accuracy = mean(Score),
Incorrect_Rate = 1 - mean(Score)
) %>%
mutate(Group = ifelse(Is_Expert, "Experts", "Novices")) %>%
select(Group, Total_Decisions, Correct, Incorrect, Accuracy, Incorrect_Rate)
# Calculate the grand total (everyone, all scenarios)
grand_total <- long_data %>%
summarise(
Group = "ALL PARTICIPANTS",
Total_Decisions = n(),
Correct = sum(Score),
Incorrect = sum(Score == 0),
Accuracy = mean(Score),
Incorrect_Rate = 1 - mean(Score)
)
# Combine into one table
overall_table <- bind_rows(overall_stats, grand_total)
# Display the results
print("--- OVERALL INCORRECT RATES ---")
## [1] "--- OVERALL INCORRECT RATES ---"
overall_table %>%
kable(digits = 3, caption = "Overall Performance Across All Scenarios") %>%
kable_styling(bootstrap_options = c("striped", "hover")) %>%
row_spec(3, bold = TRUE, background = "#f0f0f0") # Highlight the ALL PARTICIPANTS row
Overall Performance Across All Scenarios

| Group | Total_Decisions | Correct | Incorrect | Accuracy | Incorrect_Rate |
|---|---|---|---|---|---|
| Novices | 86 | 48 | 38 | 0.558 | 0.442 |
| Experts | 12 | 5 | 7 | 0.417 | 0.583 |
| ALL PARTICIPANTS | 98 | 53 | 45 | 0.541 | 0.459 |
CHUNK 5: PRIMING ANALYSIS & VISUALIZATION
# 1. PRIMING DETECTION RATES
# How many people answered "Yes" to "Did the wording make one option seem right?"
priming_summary <- clean_data %>%
select(starts_with("S"), ends_with("_Priming_Detected")) %>%
pivot_longer(cols = ends_with("_Detected"), names_to = "Scenario", values_to = "Detected") %>%
mutate(Scenario = str_extract(Scenario, "^S\\d+")) %>%
group_by(Scenario) %>%
summarise(
Detection_Rate = mean(Detected, na.rm = TRUE), # % who said Yes
Count_Detected = sum(Detected, na.rm = TRUE)
)
print("--- PRIMING DETECTION RATES ---")
## [1] "--- PRIMING DETECTION RATES ---"
kable(priming_summary, digits = 2, caption = "Did participants notice the wording?") %>% kable_styling()
Did participants notice the wording?

| Scenario | Detection_Rate | Count_Detected |
|---|---|---|
| S1 | 0.0 | 0 |
| S3 | 0.5 | 1 |
| S4 | 0.0 | 0 |
| S5 | 0.0 | 0 |
| S6 | 1.0 | 2 |
| S7 | 0.5 | 1 |
| S8 | 0.5 | 1 |
# 2. PRIMING EFFECT DIRECTION
# For those who noticed, did it help (1) or backfire (-1)?
priming_effect <- clean_data %>%
select(starts_with("S"), ends_with("_Priming_Effect")) %>%
pivot_longer(cols = ends_with("_Effect"), names_to = "Scenario", values_to = "Effect") %>%
mutate(Scenario = str_extract(Scenario, "^S\\d+")) %>%
filter(!is.na(Effect)) %>% # Only look at people who answered this
count(Scenario, Effect) %>%
mutate(
Effect_Type = case_when(
Effect == 1 ~ "Worked (Chose Option)",
Effect == -1 ~ "Backfired (Suspicious)",
Effect == 0 ~ "No Effect",
TRUE ~ "Other"
)
) %>%
pivot_wider(names_from = Effect_Type, values_from = n, values_fill = 0)
print("--- PRIMING BEHAVIORAL EFFECT ---")
## [1] "--- PRIMING BEHAVIORAL EFFECT ---"
kable(priming_effect, caption = "How did the wording affect choices?") %>% kable_styling()
How did the wording affect choices?

| Scenario | Effect | No Effect | Backfired (Suspicious) | Other |
|---|---|---|---|---|
| S3 | 0 | 1 | 0 | 0 |
| S6 | -1 | 0 | 1 | 0 |
| S6 | 0 | 1 | 0 | 0 |
| S7 | 99 | 0 | 0 | 1 |
| S8 | -1 | 0 | 1 | 0 |
# 3. PRIMING SUCCESS METRICS
# This shows whether your positively-worded incorrect option actually influenced people
priming_success <- clean_data %>%
select(starts_with("S"), ends_with("_Priming_Detected"), ends_with("_Priming_Effect")) %>%
pivot_longer(
cols = ends_with("_Detected"),
names_to = "Scenario_Detected",
values_to = "Detected"
) %>%
pivot_longer(
cols = ends_with("_Effect"),
names_to = "Scenario_Effect",
values_to = "Effect"
) %>%
# Match scenario names
mutate(
Scenario_Det = str_extract(Scenario_Detected, "^S\\d+"),
Scenario_Eff = str_extract(Scenario_Effect, "^S\\d+")
) %>%
filter(Scenario_Det == Scenario_Eff) %>% # Keep matching rows
rename(Scenario = Scenario_Det) %>%
select(Scenario, Detected, Effect) %>%
# Calculate success metrics
group_by(Scenario) %>%
summarise(
# How many people saw the scenario
Total_Responses = n(),
# How many detected the priming
Detected_Count = sum(Detected == 1, na.rm = TRUE),
Detection_Rate = mean(Detected == 1, na.rm = TRUE),
# Of those who detected, how many were influenced? (Effect = 1)
Influenced_Count = sum(Detected == 1 & Effect == 1, na.rm = TRUE),
# Of those who detected, how many became suspicious? (Effect = -1)
Backfire_Count = sum(Detected == 1 & Effect == -1, na.rm = TRUE),
# Of those who detected, how many weren't affected? (Effect = 0)
No_Effect_Count = sum(Detected == 1 & Effect == 0, na.rm = TRUE),
# THE KEY METRIC: Success rate = influenced / detected
Success_Rate = ifelse(Detected_Count > 0,
Influenced_Count / Detected_Count,
NA),
# Backfire rate = suspicious / detected
Backfire_Rate = ifelse(Detected_Count > 0,
Backfire_Count / Detected_Count,
NA)
)
# Display the success metrics
print("--- PRIMING SUCCESS METRICS ---")
## [1] "--- PRIMING SUCCESS METRICS ---"
print("Of those who noticed the wording, did it work as intended?")
## [1] "Of those who noticed the wording, did it work as intended?"
priming_success %>%
select(Scenario, Detection_Rate, Detected_Count,
Influenced_Count, Success_Rate, Backfire_Rate) %>%
kable(
digits = 3,
caption = "Priming Effectiveness: Did the positive wording influence choices?",
col.names = c("Scenario", "Detection Rate", "N Detected",
"N Influenced", "Success Rate", "Backfire Rate")
) %>%
kable_styling(bootstrap_options = c("striped", "hover")) %>%
# Highlight high success rates (> 50%) in green
column_spec(5, bold = TRUE,
color = ifelse(is.na(priming_success$Success_Rate), "gray",
ifelse(priming_success$Success_Rate > 0.5, "darkgreen", "black")))
Priming Effectiveness: Did the positive wording influence choices?

| Scenario | Detection Rate | N Detected | N Influenced | Success Rate | Backfire Rate |
|---|---|---|---|---|---|
| S1 | 0.0 | 0 | 0 | NA | NA |
| S3 | 0.5 | 1 | 0 | 0 | 0.0 |
| S4 | 0.0 | 0 | 0 | NA | NA |
| S5 | 0.0 | 0 | 0 | NA | NA |
| S6 | 1.0 | 2 | 0 | 0 | 0.5 |
| S7 | 0.5 | 1 | 0 | 0 | 0.0 |
| S8 | 0.5 | 1 | 0 | 0 | 1.0 |
# 4. VISUALIZATION: EXPERT VS NOVICE ACCURACY
# Uses 'final_decision_table' created in Chunk 4
ggplot(final_decision_table, aes(x = Scenario)) +
# Bars for Novices, nudged left
geom_col(aes(y = Novice_Acc, fill = "Novice"), position = position_nudge(x = -0.2), width = 0.4) +
# Bars for Experts, nudged right
geom_col(aes(y = Expert_Acc, fill = "Expert"), position = position_nudge(x = 0.2), width = 0.4) +
# Add dashed line for 85% ceiling
geom_hline(yintercept = 0.85, linetype = "dashed", color = "red", alpha = 0.5) +
annotate("text", x = 1, y = 0.87, label = "Ceiling (.85)", color = "red", size = 3) +
# Formatting
scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
scale_fill_manual(values = c("Novice" = "#A8D0E6", "Expert" = "#2C3E50")) +
labs(
title = "Expert vs. Novice Accuracy by Scenario",
subtitle = "Comparing performance gaps to identify valid scenarios",
y = "Accuracy (%)",
x = NULL,
fill = "Group"
) +
theme_minimal() +
theme(legend.position = "bottom")
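The same figure can be built more conventionally (a sketch, equivalent in intent) by pivoting the two accuracy columns long and letting position_dodge() place the bars:
# Alternative construction (sketch; same data, dodged bars instead of nudges):
final_decision_table %>%
  pivot_longer(c(Novice_Acc, Expert_Acc), names_to = "Group", values_to = "Acc") %>%
  mutate(Group = ifelse(Group == "Expert_Acc", "Expert", "Novice")) %>%
  ggplot(aes(x = Scenario, y = Acc, fill = Group)) +
  geom_col(position = position_dodge(width = 0.5), width = 0.4) +
  geom_hline(yintercept = 0.85, linetype = "dashed", color = "red", alpha = 0.5) +
  scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
  scale_fill_manual(values = c("Novice" = "#A8D0E6", "Expert" = "#2C3E50")) +
  labs(title = "Expert vs. Novice Accuracy by Scenario", y = "Accuracy (%)",
       x = NULL, fill = "Group") +
  theme_minimal() +
  theme(legend.position = "bottom")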
