Item-level analysis of the Confidence Calibration Test from TakeTest.xyz. The test presents 120 true/false statements spanning geography, history, science, culture, and more. For each statement, respondents indicate whether they believe it is true or false and rate their confidence from 50% (pure guess) to 100% (certain).
Data from N=520 unique first-time completions.
library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1.9000 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
##
##
## Attaching package: 'magrittr'
##
##
## The following object is masked from 'package:purrr':
##
## set_names
##
##
## The following object is masked from 'package:tidyr':
##
## extract
##
##
## Loading required package: weights
##
## Loading required package: assertthat
##
##
## Attaching package: 'assertthat'
##
##
## The following object is masked from 'package:tibble':
##
## has_name
##
##
## Loading required package: psych
##
##
## Attaching package: 'psych'
##
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
##
##
## Loading required package: robustbase
##
##
## Attaching package: 'kirkegaard'
##
##
## The following object is masked from 'package:psych':
##
## rescale
##
##
## The following object is masked from 'package:assertthat':
##
## are_equal
##
##
## The following object is masked from 'package:purrr':
##
## is_logical
##
##
## The following object is masked from 'package:base':
##
## +
load_packages(scales, mirt)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:kirkegaard':
##
## rescale
##
## The following objects are masked from 'package:psych':
##
## alpha, rescale
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
##
## Loading required package: stats4
## Loading required package: lattice
theme_set(theme_bw())
answers <- read.csv("../data/calibration_answers.csv")
questions <- read.csv("../data/calibration_questions.csv")
# Merge
d <- answers %>%
left_join(questions, by = "question_index") %>%
mutate(
correct_answer = correct == "True",
is_correct = (answered_true == 1) == correct_answer,
conf_decimal = confidence / 100,
# p(true) as stated by respondent
p_true = ifelse(answered_true == 1, conf_decimal, 1 - conf_decimal),
outcome = as.numeric(correct_answer),
# confidence in chosen answer
conf_in_answer = conf_decimal
)
sprintf("N respondents: %d, N items: %d, N answers: %d",
length(unique(d$visitor_id)), length(unique(d$question_index)), nrow(d))
## [1] "N respondents: 520, N items: 120, N answers: 62390"
cal_summary <- d %>%
group_by(confidence) %>%
summarize(
n = n(),
n_correct = sum(is_correct),
accuracy = mean(is_correct),
.groups = "drop"
)
cal_summary
ggplot(cal_summary, aes(x = confidence, y = accuracy * 100)) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray50") +
geom_point(aes(size = n), color = "#b8860b") +
geom_line(color = "#b8860b") +
geom_text(aes(label = sprintf("n=%s", comma(n))), vjust = -1.2, size = 3) +
scale_x_continuous(breaks = seq(50, 100, 10), limits = c(45, 105)) +
scale_y_continuous(breaks = seq(50, 100, 10), limits = c(45, 105)) +
labs(
x = "Stated confidence (%)",
y = "Actual accuracy (%)",
title = "Overall calibration curve (N=520)",
subtitle = "Dashed line = perfect calibration"
) +
guides(size = guide_legend(title = "Responses")) +
coord_equal()
Which statements do people get right most and least often?
item_stats <- d %>%
group_by(question_index, text, correct_answer) %>%
summarize(
n = n(),
pct_correct = mean(is_correct) * 100,
mean_confidence = mean(confidence),
mean_conf_correct = mean(confidence[is_correct]),
mean_conf_incorrect = mean(confidence[!is_correct]),
pct_true = mean(answered_true == 1) * 100,
.groups = "drop"
) %>%
arrange(pct_correct)
item_stats
item_stats %>%
head(20) %>%
mutate(text = str_trunc(text, 60)) %>%
select(text, correct_answer, pct_correct, mean_confidence)
item_stats %>%
head(20) %>%
mutate(text = str_trunc(text, 55) %>% fct_reorder(pct_correct)) %>%
ggplot(aes(x = pct_correct, y = text)) +
geom_col(fill = "#b8860b", alpha = 0.7) +
geom_vline(xintercept = 50, linetype = "dashed", color = "gray50") +
labs(x = "% correct", y = NULL, title = "20 hardest items") +
scale_x_continuous(limits = c(0, 100))
item_stats %>%
tail(20) %>%
arrange(desc(pct_correct)) %>%
mutate(text = str_trunc(text, 60)) %>%
select(text, correct_answer, pct_correct, mean_confidence)
Where are people most and least calibrated per item? We compute the gap between mean confidence and actual accuracy.
item_stats <- item_stats %>%
mutate(
overconfidence = mean_confidence - pct_correct,
abs_overconfidence = abs(overconfidence)
)
item_stats %>%
arrange(desc(overconfidence)) %>%
head(20) %>%
mutate(text = str_trunc(text, 60)) %>%
select(text, correct_answer, pct_correct, mean_confidence, overconfidence)
item_stats %>%
arrange(desc(overconfidence)) %>%
head(20) %>%
mutate(text = str_trunc(text, 55) %>% fct_reorder(overconfidence)) %>%
ggplot(aes(x = overconfidence, y = text)) +
geom_col(fill = "#c0392b", alpha = 0.7) +
labs(x = "Overconfidence (mean confidence - % correct)", y = NULL,
title = "20 most overconfident items",
subtitle = "Items where people are most wrong about being right") +
geom_vline(xintercept = 0, linetype = "dashed")
item_stats %>%
arrange(overconfidence) %>%
head(20) %>%
mutate(text = str_trunc(text, 60)) %>%
select(text, correct_answer, pct_correct, mean_confidence, overconfidence)
d %>%
count(confidence) %>%
ggplot(aes(x = confidence, y = n)) +
geom_col(fill = "#b8860b", alpha = 0.7) +
labs(x = "Confidence level (%)", y = "Count",
title = "Distribution of confidence ratings across all responses")
person_stats <- d %>%
group_by(visitor_id) %>%
summarize(
n_answered = n(),
pct_correct = mean(is_correct) * 100,
mean_confidence = mean(confidence),
overconfidence = mean_confidence - pct_correct,
brier = mean((p_true - outcome)^2),
mae = mean(abs(p_true - outcome)),
.groups = "drop"
)
ggplot(person_stats, aes(x = pct_correct)) +
geom_histogram(binwidth = 2, fill = "#b8860b", alpha = 0.7, color = "gray30") +
labs(x = "% correct", y = "Count", title = "Distribution of knowledge scores")
ggplot(person_stats, aes(x = overconfidence)) +
geom_histogram(binwidth = 2, fill = "#c0392b", alpha = 0.7, color = "gray30") +
geom_vline(xintercept = 0, linetype = "dashed") +
labs(x = "Overconfidence (mean confidence - % correct)", y = "Count",
title = "Distribution of overconfidence",
subtitle = "Positive = overconfident, negative = underconfident")
GG_scatter(person_stats, "mean_confidence", "pct_correct") +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray50")
## `geom_smooth()` using formula = 'y ~ x'
GG_scatter(person_stats, "pct_correct", "overconfidence") +
geom_hline(yintercept = 0, linetype = "dashed", color = "gray50")
## `geom_smooth()` using formula = 'y ~ x'
ggplot(person_stats, aes(x = brier)) +
geom_histogram(binwidth = 0.01, fill = "#b8860b", alpha = 0.7, color = "gray30") +
labs(x = "Brier score", y = "Count",
title = "Distribution of Brier scores",
subtitle = "Lower = better calibrated; 0.25 = uninformed baseline")
For the most-missed items, what do people actually answer?
hardest <- item_stats %>% head(10)
d %>%
filter(question_index %in% hardest$question_index) %>%
group_by(question_index, text, correct_answer) %>%
summarize(
pct_said_true = mean(answered_true == 1) * 100,
pct_correct = mean(is_correct) * 100,
conf_when_wrong = mean(confidence[!is_correct]),
conf_when_right = mean(confidence[is_correct]),
.groups = "drop"
) %>%
mutate(text = str_trunc(text, 55)) %>%
arrange(pct_correct)
Fit a 2-parameter logistic IRT model to the binary (correct/incorrect) responses. The discrimination parameter (a) tells us how well each item separates high- from low-ability respondents. Negative or very low a values indicate bad items.
# Create item response matrix (visitors x items)
item_matrix <- d %>%
select(visitor_id, question_index, is_correct) %>%
mutate(is_correct = as.integer(is_correct)) %>%
pivot_wider(
names_from = question_index,
values_from = is_correct,
names_prefix = "q"
) %>%
select(-visitor_id)
sprintf("Item matrix: %d respondents x %d items", nrow(item_matrix), ncol(item_matrix))
## [1] "Item matrix: 520 respondents x 120 items"
mod_2pl <- mirt(item_matrix, 1, itemtype = "2PL", verbose = FALSE)
irt_params <- coef(mod_2pl, IRTpars = TRUE, simplify = TRUE)$items %>%
as.data.frame() %>%
rownames_to_column("item") %>%
as_tibble() %>%
mutate(question_index = as.integer(str_extract(item, "\\d+")))
# Get factor loadings directly from mirt summary
loadings_df <- summary(mod_2pl, verbose = FALSE)$rotF %>%
as.data.frame() %>%
rownames_to_column("item") %>%
rename(loading = F1)
irt_params <- irt_params %>%
left_join(loadings_df, by = "item") %>%
left_join(questions, by = "question_index")
sprintf("Negative discrimination (a < 0): %d items", sum(irt_params$a < 0))
## [1] "Negative discrimination (a < 0): 9 items"
sprintf("Low discrimination (a < 0.2): %d items", sum(irt_params$a < 0.2))
## [1] "Low discrimination (a < 0.2): 24 items"
sprintf("Good discrimination (a > 0.5): %d items", sum(irt_params$a > 0.5))
## [1] "Good discrimination (a > 0.5): 54 items"
sprintf("Excellent discrimination (a > 1.0): %d items", sum(irt_params$a > 1.0))
## [1] "Excellent discrimination (a > 1.0): 18 items"
irt_params %>%
arrange(a) %>%
head(15) %>%
mutate(text = str_trunc(text, 55)) %>%
select(item, a, loading, b, text, correct)
irt_params %>%
arrange(desc(a)) %>%
head(15) %>%
mutate(text = str_trunc(text, 55)) %>%
select(item, a, loading, b, text, correct)
ggplot(irt_params, aes(x = a)) +
geom_histogram(binwidth = 0.1, fill = "#b8860b", alpha = 0.7, color = "gray30") +
geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
labs(x = "Discrimination (a)", y = "Count",
title = "Distribution of IRT discrimination parameters",
subtitle = "Items left of dashed line have negative discrimination")
For each item, compute the average confidence that respondents place in the correct answer. For those who chose correctly, this is their stated confidence. For those who chose incorrectly, it is 100 minus their stated confidence (since confidence in the wrong answer implies 100-conf in the right one).
item_stats <- item_stats %>%
left_join(
d %>%
mutate(
# confidence in the correct answer
conf_in_correct = ifelse(is_correct, confidence, 100 - confidence)
) %>%
group_by(question_index) %>%
summarize(mean_conf_in_correct = mean(conf_in_correct), .groups = "drop"),
by = "question_index"
)
cor.test(item_stats$pct_correct, item_stats$mean_conf_in_correct)
##
## Pearson's product-moment correlation
##
## data: item_stats$pct_correct and item_stats$mean_conf_in_correct
## t = 27.903, df = 118, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9035535 0.9520878
## sample estimates:
## cor
## 0.931874
GG_scatter(item_stats, "pct_correct", "mean_conf_in_correct") +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray50")
## `geom_smooth()` using formula = 'y ~ x'
tibble(
metric = c("N respondents", "N items", "Mean % correct", "SD % correct",
"Mean overconfidence", "Mean Brier", "Mean MAE",
"% overconfident (>0)", "% underconfident (<0)"),
value = c(
nrow(person_stats),
length(unique(d$question_index)),
sprintf("%.1f%%", mean(person_stats$pct_correct)),
sprintf("%.1f%%", sd(person_stats$pct_correct)),
sprintf("%.1f", mean(person_stats$overconfidence)),
sprintf("%.3f", mean(person_stats$brier)),
sprintf("%.3f", mean(person_stats$mae)),
sprintf("%.0f%%", mean(person_stats$overconfidence > 0) * 100),
sprintf("%.0f%%", mean(person_stats$overconfidence < 0) * 100)
)
)