Init

library(pacman)
p_load(stringr, forcats, weights, psych, tidyverse, googlesheets, kirkegaard)

Data

# Authorise the googlesheets package before any sheet is accessed.
googlesheets::gs_auth()

# Register the spreadsheet by its browser URL; both worksheets read below
# come from this one document.
google_sheet <- googlesheets::gs_url(
  "https://docs.google.com/spreadsheets/d/1s2BYTK3EO9U9bmGPMYIFWjTXXVpI069dU0i8K6wn0Zo/edit#gid=576355961"
)
## Sheet-identifying info appears to be a browser URL.
## googlesheets will attempt to extract sheet key from the URL.
## Putative key: 1s2BYTK3EO9U9bmGPMYIFWjTXXVpI069dU0i8K6wn0Zo
## Sheet successfully identified: "Science knowledge questions dataset"
# Scored answers: one integer column per test taker (see column spec below).
d_test_scored <- google_sheet %>%
  googlesheets::gs_read(ws = "scored_answers")
## Accessing worksheet titled 'scored_answers'.
## Parsed with column specification:
## cols(
##   Nima = col_integer(),
##   Jens = col_integer(),
##   Oliver = col_integer(),
##   Jonatan = col_integer(),
##   Noah = col_integer()
## )
# Item metadata: every column is read as text, including the estimated
# pass rate.
d_test_meta <- google_sheet %>%
  googlesheets::gs_read(ws = "proposed_items")
## Accessing worksheet titled 'proposed_items'.
## Parsed with column specification:
## cols(
##   Fields = col_character(),
##   `Estimated pass rate` = col_character(),
##   Text = col_character(),
##   Other = col_character(),
##   Option_1 = col_character(),
##   Option_2 = col_character(),
##   Option_3 = col_character(),
##   Option_4 = col_character(),
##   Option_5 = col_character(),
##   Option_6 = col_character(),
##   Option_7 = col_character(),
##   Option_8 = col_character(),
##   Option_9 = col_character(),
##   Option_10 = col_character(),
##   `Correct_option (WHITE TEXT)` = col_character(),
##   Notes = col_character(),
##   `Task type` = col_character()
## )

Analyses

# Total number of proposed items (rows of the metadata sheet).
d_test_meta %>% nrow()
## [1] 212
# Items per field. Field labels are lower-cased before tabulating, and
# n = Inf makes the printed tibble show every row.
print(table2(str_to_lower(d_test_meta$Fields)), n = Inf)
## # A tibble: 17 x 3
##    Group            Count Percent
##    <chr>            <dbl>   <dbl>
##  1 biology            33.   15.6 
##  2 geography          27.   12.7 
##  3 history            23.   10.8 
##  4 physics            19.    8.96
##  5 medicine           16.    7.55
##  6 chemistry          13.    6.13
##  7 psychology         13.    6.13
##  8 astronomy          12.    5.66
##  9 math               12.    5.66
## 10 economics           9.    4.25
## 11 statistics          8.    3.77
## 12 computer science    6.    2.83
## 13 geology             6.    2.83
## 14 linguistics         5.    2.36
## 15 nutrition           5.    2.36
## 16 paleontology        5.    2.36
## 17 <NA>                0.    0.
# basic item level stats --------------------------------------------------
# The scored sheet has one column per test taker, so transpose it to put
# items in columns before computing descriptive statistics. The resulting
# `mean` column is used below as the observed pass rate of each item.
d_item_stats <- d_test_scored %>%
  t() %>%
  psych::describe()


# analyses ----------------------------------------------------------------
# correlate est. pass rate with observed
# The estimated pass rate column is read as text, so strip the "%" sign and
# convert to numeric explicitly rather than handing wtd.cors() a character
# vector and depending on how it coerces its input internally.
est_pass_rate <- d_test_meta$`Estimated pass rate` %>%
  str_replace(pattern = "%", replacement = "") %>%
  as.numeric()

# The two vectors are paired by position only, so they must describe the
# same number of items; fail loudly instead of correlating misaligned data.
stopifnot(length(est_pass_rate) == nrow(d_item_stats))

wtd.cors(d_item_stats$mean, est_pass_rate)
##           [,1]
## [1,] 0.4047746
# item difficulty by topic ------------------------------------------------
# Disabled: would attach each item's field to its statistics and summarise
# the item count and mean pass rate per field.
#d_field_means = cbind(d_item_stats, field = d_test_meta$Fields) %>% group_by(field) %>% summarise(count = n(), mean = mean(mean))

#plot
# Disabled: dot plot of mean pass rate per field (fields ordered by mean,
# point size = number of items), saved to figures/field_pass_rates.png.
# Depends on d_field_means, which is also commented out above.
# d_field_means %>% ggplot() +
#   geom_point(aes(mean, fct_reorder(field, mean), size = count)) +
#   xlab("Mean pass rate") + ylab("Field") +
#   scale_size_continuous(name = "Number of items") +
#   scale_x_continuous(breaks = seq(0, 1, .1))
# ggsave("figures/field_pass_rates.png")

# hardest and easiest items -----------------------------------------------

# Disabled: interactive listing of items sorted by observed pass rate.
# NOTE(review): cbind() of a numeric vector with character vectors yields a
# character matrix, so pass_rate would be sorted as text by arrange(); build
# the table with tibble() instead if this is re-enabled.
# cbind(pass_rate = d_item_stats$mean,
#       quest = d_test_meta$Text,
#       field = d_test_meta$Fields) %>% 
#   as_data_frame() %>% 
#   arrange(pass_rate) %>% 
#   View()