About

Item-level analysis of the Confidence Calibration Test from TakeTest.xyz. The test presents 120 true/false statements spanning geography, history, science, culture, and more. For each statement, respondents indicate whether they believe it is true or false and rate their confidence from 50% (pure guess) to 100% (certain).

Data from N=520 unique first-time completions.

Init

library(kirkegaard)

## Loading required package: tidyverse

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4          ✔ readr     2.1.5     
## ✔ forcats   1.0.1          ✔ stringr   1.6.0     
## ✔ ggplot2   4.0.1.9000     ✔ tibble    3.3.0     
## ✔ lubridate 1.9.4          ✔ tidyr     1.3.1     
## ✔ purrr     1.2.0          
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
## 
## 
## Attaching package: 'magrittr'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## 
## Loading required package: weights
## 
## Loading required package: assertthat
## 
## 
## Attaching package: 'assertthat'
## 
## 
## The following object is masked from 'package:tibble':
## 
##     has_name
## 
## 
## Loading required package: psych
## 
## 
## Attaching package: 'psych'
## 
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## 
## 
## Loading required package: robustbase
## 
## 
## Attaching package: 'kirkegaard'
## 
## 
## The following object is masked from 'package:psych':
## 
##     rescale
## 
## 
## The following object is masked from 'package:assertthat':
## 
##     are_equal
## 
## 
## The following object is masked from 'package:purrr':
## 
##     is_logical
## 
## 
## The following object is masked from 'package:base':
## 
##     +

load_packages(scales, mirt)

## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:kirkegaard':
## 
##     rescale
## 
## The following objects are masked from 'package:psych':
## 
##     alpha, rescale
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
## 
## Loading required package: stats4
## Loading required package: lattice

theme_set(theme_bw())

answers <- read.csv("../data/calibration_answers.csv")
questions <- read.csv("../data/calibration_questions.csv")

# Merge
d <- answers %>%
  left_join(questions, by = "question_index") %>%
  mutate(
    correct_answer = correct == "True",
    is_correct = (answered_true == 1) == correct_answer,
    conf_decimal = confidence / 100,
    # p(true) as stated by respondent
    p_true = ifelse(answered_true == 1, conf_decimal, 1 - conf_decimal),
    outcome = as.numeric(correct_answer),
    # confidence in chosen answer
    conf_in_answer = conf_decimal
  )

sprintf("N respondents: %d, N items: %d, N answers: %d",
        length(unique(d$visitor_id)), length(unique(d$question_index)), nrow(d))

## [1] "N respondents: 520, N items: 120, N answers: 62390"

Overall calibration

cal_summary <- d %>%
  group_by(confidence) %>%
  summarize(
    n = n(),
    n_correct = sum(is_correct),
    accuracy = mean(is_correct),
    .groups = "drop"
  )

cal_summary

ggplot(cal_summary, aes(x = confidence, y = accuracy * 100)) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray50") +
  geom_point(aes(size = n), color = "#b8860b") +
  geom_line(color = "#b8860b") +
  geom_text(aes(label = sprintf("n=%s", comma(n))), vjust = -1.2, size = 3) +
  scale_x_continuous(breaks = seq(50, 100, 10), limits = c(45, 105)) +
  scale_y_continuous(breaks = seq(50, 100, 10), limits = c(45, 105)) +
  labs(
    x = "Stated confidence (%)",
    y = "Actual accuracy (%)",
    title = "Overall calibration curve (N=520)",
    subtitle = "Dashed line = perfect calibration"
  ) +
  guides(size = guide_legend(title = "Responses")) +
  coord_equal()

Item difficulty

Which statements do people get right most and least often?

item_stats <- d %>%
  group_by(question_index, text, correct_answer) %>%
  summarize(
    n = n(),
    pct_correct = mean(is_correct) * 100,
    mean_confidence = mean(confidence),
    mean_conf_correct = mean(confidence[is_correct]),
    mean_conf_incorrect = mean(confidence[!is_correct]),
    pct_true = mean(answered_true == 1) * 100,
    .groups = "drop"
  ) %>%
  arrange(pct_correct)

item_stats

Hardest items (most people get wrong)

item_stats %>%
  head(20) %>%
  mutate(text = str_trunc(text, 60)) %>%
  select(text, correct_answer, pct_correct, mean_confidence)

item_stats %>%
  head(20) %>%
  mutate(text = str_trunc(text, 55) %>% fct_reorder(pct_correct)) %>%
  ggplot(aes(x = pct_correct, y = text)) +
  geom_col(fill = "#b8860b", alpha = 0.7) +
  geom_vline(xintercept = 50, linetype = "dashed", color = "gray50") +
  labs(x = "% correct", y = NULL, title = "20 hardest items") +
  scale_x_continuous(limits = c(0, 100))

Easiest items

item_stats %>%
  tail(20) %>%
  arrange(desc(pct_correct)) %>%
  mutate(text = str_trunc(text, 60)) %>%
  select(text, correct_answer, pct_correct, mean_confidence)

Overconfidence by item

Where are people most and least calibrated per item? We compute the gap between mean confidence and actual accuracy.

item_stats <- item_stats %>%
  mutate(
    overconfidence = mean_confidence - pct_correct,
    abs_overconfidence = abs(overconfidence)
  )

Most overconfident items (people think they know but don’t)

item_stats %>%
  arrange(desc(overconfidence)) %>%
  head(20) %>%
  mutate(text = str_trunc(text, 60)) %>%
  select(text, correct_answer, pct_correct, mean_confidence, overconfidence)

item_stats %>%
  arrange(desc(overconfidence)) %>%
  head(20) %>%
  mutate(text = str_trunc(text, 55) %>% fct_reorder(overconfidence)) %>%
  ggplot(aes(x = overconfidence, y = text)) +
  geom_col(fill = "#c0392b", alpha = 0.7) +
  labs(x = "Overconfidence (mean confidence - % correct)", y = NULL,
       title = "20 most overconfident items",
       subtitle = "Items where people are most wrong about being right") +
  geom_vline(xintercept = 0, linetype = "dashed")

Most underconfident items (people know more than they think)

item_stats %>%
  arrange(overconfidence) %>%
  head(20) %>%
  mutate(text = str_trunc(text, 60)) %>%
  select(text, correct_answer, pct_correct, mean_confidence, overconfidence)

Confidence distribution

d %>%
  count(confidence) %>%
  ggplot(aes(x = confidence, y = n)) +
  geom_col(fill = "#b8860b", alpha = 0.7) +
  labs(x = "Confidence level (%)", y = "Count",
       title = "Distribution of confidence ratings across all responses")

Per-person summary scores

person_stats <- d %>%
  group_by(visitor_id) %>%
  summarize(
    n_answered = n(),
    pct_correct = mean(is_correct) * 100,
    mean_confidence = mean(confidence),
    overconfidence = mean_confidence - pct_correct,
    brier = mean((p_true - outcome)^2),
    mae = mean(abs(p_true - outcome)),
    .groups = "drop"
  )

ggplot(person_stats, aes(x = pct_correct)) +
  geom_histogram(binwidth = 2, fill = "#b8860b", alpha = 0.7, color = "gray30") +
  labs(x = "% correct", y = "Count", title = "Distribution of knowledge scores")

ggplot(person_stats, aes(x = overconfidence)) +
  geom_histogram(binwidth = 2, fill = "#c0392b", alpha = 0.7, color = "gray30") +
  geom_vline(xintercept = 0, linetype = "dashed") +
  labs(x = "Overconfidence (mean confidence - % correct)", y = "Count",
       title = "Distribution of overconfidence",
       subtitle = "Positive = overconfident, negative = underconfident")

GG_scatter(person_stats, "mean_confidence", "pct_correct") +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray50")

## `geom_smooth()` using formula = 'y ~ x'

Knowledge vs. overconfidence (Dunning-Kruger)

GG_scatter(person_stats, "pct_correct", "overconfidence") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray50")

## `geom_smooth()` using formula = 'y ~ x'

ggplot(person_stats, aes(x = brier)) +
  geom_histogram(binwidth = 0.01, fill = "#b8860b", alpha = 0.7, color = "gray30") +
  labs(x = "Brier score", y = "Count",
       title = "Distribution of Brier scores",
       subtitle = "Lower = better calibrated; 0.25 = uninformed baseline")

Item-level: answer patterns for hardest items

For the most-missed items, what do people actually answer?

hardest <- item_stats %>% head(10)

d %>%
  filter(question_index %in% hardest$question_index) %>%
  group_by(question_index, text, correct_answer) %>%
  summarize(
    pct_said_true = mean(answered_true == 1) * 100,
    pct_correct = mean(is_correct) * 100,
    conf_when_wrong = mean(confidence[!is_correct]),
    conf_when_right = mean(confidence[is_correct]),
    .groups = "drop"
  ) %>%
  mutate(text = str_trunc(text, 55)) %>%
  arrange(pct_correct)

2PL IRT model

Fit a 2-parameter logistic IRT model to the binary (correct/incorrect) responses. The discrimination parameter (a) tells us how well each item separates high- from low-ability respondents. Negative or very low a values indicate bad items.

# Create item response matrix (visitors x items)
item_matrix <- d %>%
  select(visitor_id, question_index, is_correct) %>%
  mutate(is_correct = as.integer(is_correct)) %>%
  pivot_wider(
    names_from = question_index,
    values_from = is_correct,
    names_prefix = "q"
  ) %>%
  select(-visitor_id)

sprintf("Item matrix: %d respondents x %d items", nrow(item_matrix), ncol(item_matrix))

## [1] "Item matrix: 520 respondents x 120 items"

mod_2pl <- mirt(item_matrix, 1, itemtype = "2PL", verbose = FALSE)

irt_params <- coef(mod_2pl, IRTpars = TRUE, simplify = TRUE)$items %>%
  as.data.frame() %>%
  rownames_to_column("item") %>%
  as_tibble() %>%
  mutate(question_index = as.integer(str_extract(item, "\\d+")))

# Get factor loadings directly from mirt summary
loadings_df <- summary(mod_2pl, verbose = FALSE)$rotF %>%
  as.data.frame() %>%
  rownames_to_column("item") %>%
  rename(loading = F1)

irt_params <- irt_params %>%
  left_join(loadings_df, by = "item") %>%
  left_join(questions, by = "question_index")

sprintf("Negative discrimination (a < 0): %d items", sum(irt_params$a < 0))

## [1] "Negative discrimination (a < 0): 9 items"

sprintf("Low discrimination (a < 0.2): %d items", sum(irt_params$a < 0.2))

## [1] "Low discrimination (a < 0.2): 24 items"

sprintf("Good discrimination (a > 0.5): %d items", sum(irt_params$a > 0.5))

## [1] "Good discrimination (a > 0.5): 54 items"

sprintf("Excellent discrimination (a > 1.0): %d items", sum(irt_params$a > 1.0))

## [1] "Excellent discrimination (a > 1.0): 18 items"

Worst discriminating items

irt_params %>%
  arrange(a) %>%
  head(15) %>%
  mutate(text = str_trunc(text, 55)) %>%
  select(item, a, loading, b, text, correct)

Best discriminating items

irt_params %>%
  arrange(desc(a)) %>%
  head(15) %>%
  mutate(text = str_trunc(text, 55)) %>%
  select(item, a, loading, b, text, correct)

ggplot(irt_params, aes(x = a)) +
  geom_histogram(binwidth = 0.1, fill = "#b8860b", alpha = 0.7, color = "gray30") +
  geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
  labs(x = "Discrimination (a)", y = "Count",
       title = "Distribution of IRT discrimination parameters",
       subtitle = "Items left of dashed line have negative discrimination")

Pass rate vs. mean confidence in correct answer

For each item, compute the average confidence that respondents place in the correct answer. For those who chose correctly, this is their stated confidence. For those who chose incorrectly, it is 100 minus their stated confidence (since confidence in the wrong answer implies 100-conf in the right one).

item_stats <- item_stats %>%
  left_join(
    d %>%
      mutate(
        # confidence in the correct answer
        conf_in_correct = ifelse(is_correct, confidence, 100 - confidence)
      ) %>%
      group_by(question_index) %>%
      summarize(mean_conf_in_correct = mean(conf_in_correct), .groups = "drop"),
    by = "question_index"
  )

cor.test(item_stats$pct_correct, item_stats$mean_conf_in_correct)

## 
##  Pearson's product-moment correlation
## 
## data:  item_stats$pct_correct and item_stats$mean_conf_in_correct
## t = 27.903, df = 118, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9035535 0.9520878
## sample estimates:
##      cor 
## 0.931874

GG_scatter(item_stats, "pct_correct", "mean_conf_in_correct") +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray50")

## `geom_smooth()` using formula = 'y ~ x'

Summary statistics

tibble(
  metric = c("N respondents", "N items", "Mean % correct", "SD % correct",
             "Mean overconfidence", "Mean Brier", "Mean MAE",
             "% overconfident (>0)", "% underconfident (<0)"),
  value = c(
    nrow(person_stats),
    length(unique(d$question_index)),
    sprintf("%.1f%%", mean(person_stats$pct_correct)),
    sprintf("%.1f%%", sd(person_stats$pct_correct)),
    sprintf("%.1f", mean(person_stats$overconfidence)),
    sprintf("%.3f", mean(person_stats$brier)),
    sprintf("%.3f", mean(person_stats$mae)),
    sprintf("%.0f%%", mean(person_stats$overconfidence > 0) * 100),
    sprintf("%.0f%%", mean(person_stats$overconfidence < 0) * 100)
  )
)

Confidence Calibration Test: Item-level analysis