Init

options(
  digits = 3
        )

library(pacman)
p_load(kirkegaard, 
       rms, 
       lubridate
       )

theme_set(theme_bw())

select = dplyr::select
describe = function(...) psych::describe(...) %>% as.matrix()

Data

#quiz data
quiz25 = read_csv("data/20200914074101-SurveyExport.csv") %>%
  df_legalize_names()

## Parsed with column specification:
## cols(
##   .default = col_character(),
##   `Response ID` = col_double(),
##   `Time Started` = col_datetime(format = ""),
##   `Date Submitted` = col_datetime(format = ""),
##   `Contact ID` = col_logical(),
##   `Legacy Comments` = col_logical(),
##   Comments = col_logical(),
##   Tags = col_logical(),
##   Longitude = col_double(),
##   Latitude = col_double(),
##   `What is your age?` = col_double(),
##   `What is your google scholar h-index? The h-index of someone who has never published a paper is 0.` = col_double(),
##   `Which number does the binary number 1010 correspond to in the decimal system?` = col_double(),
##   `With regards to your knowledge of science, what percentile of the general population do you think you are in?The 30th centile is lower than 30% of popuation, and lower than 70% of population, while the 80th centile is higher than 80% of the population and lower than 20% of the population.` = col_double(),
##   `The previous page featured 25 questions testing your knowledge. How many correct answers do you think you gave?  ` = col_double(),
##   `We have calculated your score.` = col_double()
## )

## See spec(...) for full column specifications.

#recode
quiz25$score = quiz25$We_have_calculated_your_score %>%
  as.character() %>%
  as.numeric()

quiz25$score_guess =
  quiz25$The_previous_page_featured_25_questions_testing_your_knowledge_How_many_correct_answers_do_you_think_you_gave %>%
  as.character() %>%
  as.numeric()

quiz25$centile_guess =
  quiz25$With_regards_to_your_knowledge_of_science_what_percentile_of_the_general_population_do_you_think_you_are_in_The_30th_centile_is_lower_than_30pct_of_popuation_and_lower_than_70pct_of_population_while_the_80th_centile_is_higher_than_80pct_of_the_population_and_lower_than_20pct_of_the_population %>%
  as.character() %>%
  as.numeric()

quiz25$education = 
  quiz25$What_is_the_highest_level_of_education_you_have_completed %>% 
  ordered(levels = c("None", "High school", "Other", "Bachelor", "Master", "Doctorate or higher"))

quiz25$age = 
  quiz25$What_is_your_age %>% 
  as.character() %>% 
  as.numeric()

#parsing is bad bc we changed the format halfway, so fuck the remaining data
quiz25$time_started = quiz25$Time_Started %>% unlist() %>% as.numeric() %>% as_datetime()
quiz25$time_ended = quiz25$Date_Submitted %>% unlist() %>% as.numeric() %>% as_datetime()

quiz25$time_taken = (quiz25$time_ended - quiz25$time_started) %>% 
  as.numeric()
quiz25$time_taken %>% describe()

##    vars    n mean    sd median trimmed mad min    max  range skew kurtosis  se
## X1    1 2413 1211 10179    531     567 227  22 337171 337149 27.4      821 207

#knowledge item names
test_item_names = quiz25 %>% 
  select(Almost_all_plants_are_of_the_following_type:In_which_ancient_Greek_city_state_did_Socrates_live) %>% 
  names()

Analysis

#score key finding algo
#it looks at each response options relation to the total score
#only using this because too lazy to copy in the correct answers

# score_key = map_chr(test_item_names, function(v) {
#   # browser()
#   #score by key
#   desc = describeBy(quiz25$score, quiz25[[v]], mat = T)
#   
#   #highest is the right answer
#   desc %>% 
#     filter(n > 10) %>% 
#     arrange(-mean) %>% 
#     .$group1 %>% 
#     .[1] %>% 
#     as.character()
# })
# score_key[10] = "Vowels and consonants." #overwrite because algo got it wrong!

score_key = c("Multicellular eukaryotes", 
              "Create a second group of participants with ear infections who do not use any ear drops", 
              "Inserting a gene into plants that makes them resistant to insects.", 
              "The tilt of the Earth’s axis in relation to the sun", 
              "Marc", 
              "The dodo hypothesis.",
              "The Stroop effect.",
              "The Gaussian distribution.",
              "About 50%.",
              "Vowels and consonants.",
              "Typology.",
              "Water boils at a lower temperature in Denver than Los Angeles",
              "Amplitude or height", 
              "Jonas Salk", 
              "Astrology", 
              "Aspirin", 
              "Brazil", 
              "10", 
              "Nitrogen", 
              "Herbert Spencer", 
              "Iridium.", 
              "Paul Erdős.", 
              "A standardized measure of the linear relationship between two variables.", 
              "66 million years", 
              "Athens")

#score items
scored_items = quiz25[test_item_names] %>% 
  score_items(key = score_key) %>% 
  set_names("Q" + 1:25)

#IRT
IRT_fit = irt.fa(scored_items)

IRT_fit$fa

## Factor Analysis using method =  minres
## Call: fa(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, 
##     fm = fm)
## Standardized loadings (pattern matrix) based upon correlation matrix
##      MR1      h2   u2 com
## Q1  0.38 1.4e-01 0.86   1
## Q2  0.56 3.1e-01 0.69   1
## Q3  0.51 2.6e-01 0.74   1
## Q4  0.50 2.5e-01 0.75   1
## Q5  0.38 1.4e-01 0.86   1
## Q6  0.17 2.7e-02 0.97   1
## Q7  0.22 4.7e-02 0.95   1
## Q8  0.56 3.1e-01 0.69   1
## Q9  0.54 2.9e-01 0.71   1
## Q10 0.01 8.8e-05 1.00   1
## Q11 0.05 2.8e-03 1.00   1
## Q12 0.52 2.7e-01 0.73   1
## Q13 0.59 3.5e-01 0.65   1
## Q14 0.56 3.1e-01 0.69   1
## Q15 0.40 1.6e-01 0.84   1
## Q16 0.27 7.1e-02 0.93   1
## Q17 0.18 3.3e-02 0.97   1
## Q18 0.49 2.4e-01 0.76   1
## Q19 0.62 3.8e-01 0.62   1
## Q20 0.45 2.0e-01 0.80   1
## Q21 0.34 1.1e-01 0.89   1
## Q22 0.67 4.5e-01 0.55   1
## Q23 0.36 1.3e-01 0.87   1
## Q24 0.38 1.4e-01 0.86   1
## Q25 0.37 1.4e-01 0.86   1
## 
##                 MR1
## SS loadings    4.79
## Proportion Var 0.19
## 
## Mean item complexity =  1
## Test of the hypothesis that 1 factor is sufficient.
## 
## The degrees of freedom for the null model are  300  and the objective function was  5.77 with Chi Square of  13868
## The degrees of freedom for the model are 275  and the objective function was  2.19 
## 
## The root mean square of the residuals (RMSR) is  0.07 
## The df corrected root mean square of the residuals is  0.08 
## 
## The harmonic number of observations is  2413 with the empirical chi square  7866  with prob <  0 
## The total number of observations was  2413  with Likelihood Chi Square =  5269  with prob <  0 
## 
## Tucker Lewis Index of factoring reliability =  0.598
## RMSEA index =  0.087  and the 90 % confidence intervals are  0.085 0.089
## BIC =  3127
## Fit based upon off diagonal values = 0.87
## Measures of factor score adequacy             
##                                                    MR1
## Correlation of (regression) scores with factors   0.93
## Multiple R square of scores with factors          0.87
## Minimum correlation of possible factor scores     0.74

#item data
item_data = tibble(
  item = names(scored_items),
  question = test_item_names %>% str_clean(),
  correct = score_key,
  pass_rate = scored_items %>% colMeans(na.rm = T),
  g_loading = IRT_fit$fa$loadings[, 1]
)

#print as data table, round digits to 2
item_data %>% df_round() %>% DT::datatable()

#item stats
GG_scatter(item_data, "pass_rate", "g_loading", case_names = "item")

## `geom_smooth()` using formula 'y ~ x'

#summary
describe(quiz25$score)

##    vars    n mean   sd median trimmed  mad min max range   skew kurtosis     se
## X1    1 2413 14.7 3.33     15    14.8 2.97   0  25    25 -0.129    0.178 0.0678

ggplot(quiz25, aes(score)) +
  geom_bar(stat = "count") +
  scale_x_continuous(breaks = 0:25, limits = c(0, 25))

## Warning: Removed 2 rows containing missing values (geom_bar).

#quantiles
#estimate the empirical cumulative distribution function
empirical_cdf = ecdf(quiz25$score)

#get centiles for scores 0-25
empirical_cdf(0:25) %>%
  set_names(0:25)

##        0        1        2        3        4        5        6        7 
## 0.000414 0.000414 0.001243 0.001658 0.002072 0.004144 0.008288 0.017406 
##        8        9       10       11       12       13       14       15 
## 0.032739 0.054704 0.101533 0.162453 0.246167 0.347286 0.462909 0.587650 
##       16       17       18       19       20       21       22       23 
## 0.701616 0.800663 0.873601 0.928305 0.956900 0.980937 0.992126 0.997513 
##       24       25 
## 0.999586 1.000000

#correlations among scores and guesses at own ability
quiz25 %>%
  select(score, score_guess, centile_guess) %>%
  wtd.cors()

##               score score_guess centile_guess
## score         1.000       0.499         0.436
## score_guess   0.499       1.000         0.610
## centile_guess 0.436       0.610         1.000

#regressions
#model
ols(score ~ score_guess, data = quiz25)

## Frequencies of Missing Values Due to Each Variable
##       score score_guess 
##           0           5 
## 
## Linear Regression Model
##  
##  ols(formula = score ~ score_guess, data = quiz25)
##  
##  
##                  Model Likelihood    Discrimination    
##                        Ratio Test           Indexes    
##  Obs    2408    LR chi2    688.06    R2       0.249    
##  sigma2.8747    d.f.            1    R2 adj   0.248    
##  d.f.   2406    Pr(> chi2) 0.0000    g        1.870    
##  
##  Residuals
##  
##        Min        1Q    Median        3Q       Max 
##  -16.95587  -1.94424  -0.02832   1.89923   9.75434 
##  
##  
##              Coef   S.E.   t     Pr(>|t|)
##  Intercept   9.3181 0.2008 46.40 <0.0001 
##  score_guess 0.3855 0.0137 28.21 <0.0001 
##

#plot
GG_scatter(quiz25, "score_guess", "score") +
  geom_count() +
  geom_smooth() +
  ggtitle("Score on 25 item science knowledge test and own estimate of this score after taking test",
          str_glue("Data is from {nrow(quiz25)} people from Twitter"))

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

GG_save("trial/score_guess.png")

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

#model
ols(score ~ centile_guess, data = quiz25)

## Frequencies of Missing Values Due to Each Variable
##         score centile_guess 
##             0             7 
## 
## Linear Regression Model
##  
##  ols(formula = score ~ centile_guess, data = quiz25)
##  
##  
##                  Model Likelihood    Discrimination    
##                        Ratio Test           Indexes    
##  Obs    2406    LR chi2    508.30    R2       0.190    
##  sigma2.9845    d.f.            1    R2 adj   0.190    
##  d.f.   2404    Pr(> chi2) 0.0000    g        1.607    
##  
##  Residuals
##  
##        Min        1Q    Median        3Q       Max 
##  -15.01005  -1.93334  -0.07784   1.85437  10.35819 
##  
##  
##                Coef   S.E.   t     Pr(>|t|)
##  Intercept     9.8490 0.2144 45.94 <0.0001 
##  centile_guess 0.0716 0.0030 23.78 <0.0001 
##

#plot
GG_scatter(quiz25, "centile_guess", "score") +
  geom_count() +
  geom_smooth() +
  ggtitle("Score on 25 item science knowledge test and own centile estimate of this score after taking test",
          str_glue("Data is from {nrow(quiz25)} people from Twitter"))

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

GG_save("trial/score_guess2.png")

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ols(score ~ score_guess + centile_guess, data = quiz25)

## Frequencies of Missing Values Due to Each Variable
##         score   score_guess centile_guess 
##             0             5             7 
## 
## Linear Regression Model
##  
##  ols(formula = score ~ score_guess + centile_guess, data = quiz25)
##  
##  
##                  Model Likelihood    Discrimination    
##                        Ratio Test           Indexes    
##  Obs    2406    LR chi2    778.29    R2       0.276    
##  sigma2.8222    d.f.            2    R2 adj   0.276    
##  d.f.   2403    Pr(> chi2) 0.0000    g        1.974    
##  
##  Residuals
##  
##        Min        1Q    Median        3Q       Max 
##  -16.96538  -1.80380  -0.08731   1.84164   8.98498 
##  
##  
##                Coef   S.E.   t     Pr(>|t|)
##  Intercept     8.3566 0.2211 37.79 <0.0001 
##  score_guess   0.2860 0.0169 16.89 <0.0001 
##  centile_guess 0.0346 0.0036  9.62 <0.0001 
##

#standardized data
ols(score ~ score_guess + centile_guess, data = quiz25 %>% select(score, score_guess, centile_guess) %>% df_standardize())

## Frequencies of Missing Values Due to Each Variable
##         score   score_guess centile_guess 
##             0             5             7 
## 
## Linear Regression Model
##  
##  ols(formula = score ~ score_guess + centile_guess, data = quiz25 %>% 
##      select(score, score_guess, centile_guess) %>% df_standardize())
##  
##  
##                  Model Likelihood    Discrimination    
##                        Ratio Test           Indexes    
##  Obs    2406    LR chi2    778.29    R2       0.276    
##  sigma0.8474    d.f.            2    R2 adj   0.276    
##  d.f.   2403    Pr(> chi2) 0.0000    g        0.593    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -5.09428 -0.54164 -0.02622  0.55300  2.69796 
##  
##  
##                Coef    S.E.   t     Pr(>|t|)
##  Intercept     -0.0001 0.0173  0.00 0.9966  
##  score_guess    0.3683 0.0218 16.89 <0.0001 
##  centile_guess  0.2098 0.0218  9.62 <0.0001 
##

#education as a data test
GG_group_means(quiz25, "score", "education")

## Missing values were removed.

#Cohen d
SMD_matrix(quiz25$score*-1, quiz25$education)

##                        None High school    Other Bachelor Master
## None                     NA      0.0894  0.00489   0.2709 0.3470
## High school         0.08936          NA -0.08447   0.1816 0.2577
## Other               0.00489     -0.0845       NA   0.2660 0.3422
## Bachelor            0.27091      0.1816  0.26602       NA 0.0761
## Master              0.34705      0.2577  0.34216   0.0761     NA
## Doctorate or higher 0.63493      0.5456  0.63005   0.3640 0.2879
##                     Doctorate or higher
## None                              0.635
## High school                       0.546
## Other                             0.630
## Bachelor                          0.364
## Master                            0.288
## Doctorate or higher                  NA

#age
GG_scatter(quiz25, "age", "score") +
  geom_smooth()

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

#time taken
GG_scatter(quiz25, "time_taken", "score") +
  geom_smooth()

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

#random guess distribution
quiz25[test_item_names] %>% 
  map_dbl(~table(.) %>% length()) %>% 
  map_df(function(x) {
    sample(c(T, F), size = nrow(quiz25), replace = T, prob = c(1/x, 1-(1/x)))
  }) %>% 
  rowSums() %>% 
  {
    tibble(score = .)
  } %>% 
  ggplot(aes(score)) +
  geom_bar(stat = "count", fill = "red", alpha = .5) +
  scale_x_continuous(breaks = 0:25, limits = c(0, 25)) +
  geom_bar(data = quiz25, stat = "count", fill = "black", alpha = .5) +
  ggtitle("Comparison of real scores vs. simulated random guesses",
          "Red = simulated, black = real")

## Warning: Removed 2 rows containing missing values (geom_bar).

Science quiz 25-item test results

Init

Data

Analysis

Meta