Init
library(pacman)
p_load(stringr, forcats, weights, psych, tidyverse, googlesheets, kirkegaard)
Data
# Authenticate against Google, then identify the spreadsheet from its browser
# URL (googlesheets extracts the sheet key from the URL).
googlesheets::gs_auth()
google_sheet = "https://docs.google.com/spreadsheets/d/1s2BYTK3EO9U9bmGPMYIFWjTXXVpI069dU0i8K6wn0Zo/edit#gid=576355961" %>%
  googlesheets::gs_url()
## Sheet-identifying info appears to be a browser URL.
## googlesheets will attempt to extract sheet key from the URL.
## Putative key: 1s2BYTK3EO9U9bmGPMYIFWjTXXVpI069dU0i8K6wn0Zo
## Sheet successfully identified: "Science knowledge questions dataset"
# Scored answers: read the "scored_answers" worksheet of the sheet above.
d_test_scored = google_sheet %>%
  googlesheets::gs_read(ws = "scored_answers")
## Accessing worksheet titled 'scored_answers'.
## Parsed with column specification:
## cols(
## Nima = col_integer(),
## Jens = col_integer(),
## Oliver = col_integer(),
## Jonatan = col_integer(),
## Noah = col_integer()
## )
# Item metadata (field, estimated pass rate, question text, options, ...):
# read the "proposed_items" worksheet of the same sheet.
d_test_meta = google_sheet %>%
  googlesheets::gs_read(ws = "proposed_items")
## Accessing worksheet titled 'proposed_items'.
## Parsed with column specification:
## cols(
## Fields = col_character(),
## `Estimated pass rate` = col_character(),
## Text = col_character(),
## Other = col_character(),
## Option_1 = col_character(),
## Option_2 = col_character(),
## Option_3 = col_character(),
## Option_4 = col_character(),
## Option_5 = col_character(),
## Option_6 = col_character(),
## Option_7 = col_character(),
## Option_8 = col_character(),
## Option_9 = col_character(),
## Option_10 = col_character(),
## `Correct_option (WHITE TEXT)` = col_character(),
## Notes = col_character(),
## `Task type` = col_character()
## )
Analyses
# Number of items = number of rows in the item metadata
d_test_meta %>% nrow()
## [1] 212
# Item count by category: lower-case the field labels before tabulating so
# differently capitalised spellings are counted together, then print all rows.
d_test_meta[["Fields"]] %>%
  str_to_lower() %>%
  table2() %>%
  print(n = Inf)
## # A tibble: 17 x 3
## Group Count Percent
## <chr> <dbl> <dbl>
## 1 biology 33. 15.6
## 2 geography 27. 12.7
## 3 history 23. 10.8
## 4 physics 19. 8.96
## 5 medicine 16. 7.55
## 6 chemistry 13. 6.13
## 7 psychology 13. 6.13
## 8 astronomy 12. 5.66
## 9 math 12. 5.66
## 10 economics 9. 4.25
## 11 statistics 8. 3.77
## 12 computer science 6. 2.83
## 13 geology 6. 2.83
## 14 linguistics 5. 2.36
## 15 nutrition 5. 2.36
## 16 paleontology 5. 2.36
## 17 <NA> 0. 0.
# basic item level stats --------------------------------------------------
# d_test_scored has one column per respondent; transpose it so that
# psych::describe() summarises each item (one row of statistics per item).
d_item_stats = d_test_scored %>% t() %>% psych::describe()
# analyses ----------------------------------------------------------------
# Correlate the estimated pass rate with the observed one (the item mean).
# The estimates are read from the sheet as text, so strip the "%" sign and
# convert to numeric explicitly instead of handing wtd.cors() a character
# vector and relying on it to coerce.
# NOTE(review): assumes the rows of d_test_meta are in the same order (and of
# the same number) as the items in d_item_stats -- confirm against the sheet.
wtd.cors(
  d_item_stats$mean,
  d_test_meta$`Estimated pass rate` %>%
    str_replace(pattern = "%", replacement = "") %>%
    as.numeric()
)
## [,1]
## [1,] 0.4047746
# item difficulty by topic ------------------------------------------------
#d_field_means = cbind(d_item_stats, field = d_test_meta$Fields) %>% group_by(field) %>% summarise(count = n(), mean = mean(mean))
#plot
# d_field_means %>% ggplot() +
# geom_point(aes(mean, fct_reorder(field, mean), size = count)) +
# xlab("Mean pass rate") + ylab("Field") +
# scale_size_continuous(name = "Number of items") +
# scale_x_continuous(breaks = seq(0, 1, .1))
# ggsave("figures/field_pass_rates.png")
# hardest and easiest items -----------------------------------------------
# cbind(pass_rate = d_item_stats$mean,
# quest = d_test_meta$Text,
# field = d_test_meta$Fields) %>%
# as_data_frame() %>%
# arrange(pass_rate) %>%
# View()