Data
#quiz data
quiz25 = read_csv("data/20200914074101-SurveyExport.csv") %>%
df_legalize_names()
## Parsed with column specification:
## cols(
## .default = col_character(),
## `Response ID` = col_double(),
## `Time Started` = col_datetime(format = ""),
## `Date Submitted` = col_datetime(format = ""),
## `Contact ID` = col_logical(),
## `Legacy Comments` = col_logical(),
## Comments = col_logical(),
## Tags = col_logical(),
## Longitude = col_double(),
## Latitude = col_double(),
## `What is your age?` = col_double(),
## `What is your google scholar h-index? The h-index of someone who has never published a paper is 0.` = col_double(),
## `Which number does the binary number 1010 correspond to in the decimal system?` = col_double(),
## `With regards to your knowledge of science, what percentile of the general population do you think you are in?The 30th centile is lower than 30% of popuation, and lower than 70% of population, while the 80th centile is higher than 80% of the population and lower than 20% of the population.` = col_double(),
## `The previous page featured 25 questions testing your knowledge. How many correct answers do you think you gave? Â ` = col_double(),
## `We have calculated your score.` = col_double()
## )
## See spec(...) for full column specifications.
#recode
quiz25$score = quiz25$We_have_calculated_your_score %>%
as.character() %>%
as.numeric()
quiz25$score_guess =
quiz25$The_previous_page_featured_25_questions_testing_your_knowledge_How_many_correct_answers_do_you_think_you_gave %>%
as.character() %>%
as.numeric()
quiz25$centile_guess =
quiz25$With_regards_to_your_knowledge_of_science_what_percentile_of_the_general_population_do_you_think_you_are_in_The_30th_centile_is_lower_than_30pct_of_popuation_and_lower_than_70pct_of_population_while_the_80th_centile_is_higher_than_80pct_of_the_population_and_lower_than_20pct_of_the_population %>%
as.character() %>%
as.numeric()
quiz25$education =
quiz25$What_is_the_highest_level_of_education_you_have_completed %>%
ordered(levels = c("None", "High school", "Other", "Bachelor", "Master", "Doctorate or higher"))
quiz25$age =
quiz25$What_is_your_age %>%
as.character() %>%
as.numeric()
#parsing is bad bc we changed the format halfway, so fuck the remaining data
quiz25$time_started = quiz25$Time_Started %>% unlist() %>% as.numeric() %>% as_datetime()
quiz25$time_ended = quiz25$Date_Submitted %>% unlist() %>% as.numeric() %>% as_datetime()
quiz25$time_taken = (quiz25$time_ended - quiz25$time_started) %>%
as.numeric()
quiz25$time_taken %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 2413 1211 10179 531 567 227 22 337171 337149 27.4 821 207
#knowledge item names
test_item_names = quiz25 %>%
select(Almost_all_plants_are_of_the_following_type:In_which_ancient_Greek_city_state_did_Socrates_live) %>%
names()
Analysis
#score key finding algo
#it looks at each response options relation to the total score
#only using this because too lazy to copy in the correct answers
# score_key = map_chr(test_item_names, function(v) {
# # browser()
# #score by key
# desc = describeBy(quiz25$score, quiz25[[v]], mat = T)
#
# #highest is the right answer
# desc %>%
# filter(n > 10) %>%
# arrange(-mean) %>%
# .$group1 %>%
# .[1] %>%
# as.character()
# })
# score_key[10] = "Vowels and consonants." #overwrite because algo got it wrong!
score_key = c("Multicellular eukaryotes",
"Create a second group of participants with ear infections who do not use any ear drops",
"Inserting a gene into plants that makes them resistant to insects.",
"The tilt of the Earth’s axis in relation to the sun",
"Marc",
"The dodo hypothesis.",
"The Stroop effect.",
"The Gaussian distribution.",
"About 50%.",
"Vowels and consonants.",
"Typology.",
"Water boils at a lower temperature in Denver than Los Angeles",
"Amplitude or height",
"Jonas Salk",
"Astrology",
"Aspirin",
"Brazil",
"10",
"Nitrogen",
"Herbert Spencer",
"Iridium.",
"Paul Erdős.",
"A standardized measure of the linear relationship between two variables.",
"66 million years",
"Athens")
#score items
scored_items = quiz25[test_item_names] %>%
score_items(key = score_key) %>%
set_names("Q" + 1:25)
#IRT
IRT_fit = irt.fa(scored_items)

IRT_fit$fa
## Factor Analysis using method = minres
## Call: fa(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate,
## fm = fm)
## Standardized loadings (pattern matrix) based upon correlation matrix
## MR1 h2 u2 com
## Q1 0.38 1.4e-01 0.86 1
## Q2 0.56 3.1e-01 0.69 1
## Q3 0.51 2.6e-01 0.74 1
## Q4 0.50 2.5e-01 0.75 1
## Q5 0.38 1.4e-01 0.86 1
## Q6 0.17 2.7e-02 0.97 1
## Q7 0.22 4.7e-02 0.95 1
## Q8 0.56 3.1e-01 0.69 1
## Q9 0.54 2.9e-01 0.71 1
## Q10 0.01 8.8e-05 1.00 1
## Q11 0.05 2.8e-03 1.00 1
## Q12 0.52 2.7e-01 0.73 1
## Q13 0.59 3.5e-01 0.65 1
## Q14 0.56 3.1e-01 0.69 1
## Q15 0.40 1.6e-01 0.84 1
## Q16 0.27 7.1e-02 0.93 1
## Q17 0.18 3.3e-02 0.97 1
## Q18 0.49 2.4e-01 0.76 1
## Q19 0.62 3.8e-01 0.62 1
## Q20 0.45 2.0e-01 0.80 1
## Q21 0.34 1.1e-01 0.89 1
## Q22 0.67 4.5e-01 0.55 1
## Q23 0.36 1.3e-01 0.87 1
## Q24 0.38 1.4e-01 0.86 1
## Q25 0.37 1.4e-01 0.86 1
##
## MR1
## SS loadings 4.79
## Proportion Var 0.19
##
## Mean item complexity = 1
## Test of the hypothesis that 1 factor is sufficient.
##
## The degrees of freedom for the null model are 300 and the objective function was 5.77 with Chi Square of 13868
## The degrees of freedom for the model are 275 and the objective function was 2.19
##
## The root mean square of the residuals (RMSR) is 0.07
## The df corrected root mean square of the residuals is 0.08
##
## The harmonic number of observations is 2413 with the empirical chi square 7866 with prob < 0
## The total number of observations was 2413 with Likelihood Chi Square = 5269 with prob < 0
##
## Tucker Lewis Index of factoring reliability = 0.598
## RMSEA index = 0.087 and the 90 % confidence intervals are 0.085 0.089
## BIC = 3127
## Fit based upon off diagonal values = 0.87
## Measures of factor score adequacy
## MR1
## Correlation of (regression) scores with factors 0.93
## Multiple R square of scores with factors 0.87
## Minimum correlation of possible factor scores 0.74
#item data
item_data = tibble(
item = names(scored_items),
question = test_item_names %>% str_clean(),
correct = score_key,
pass_rate = scored_items %>% colMeans(na.rm = T),
g_loading = IRT_fit$fa$loadings[, 1]
)
#print as data table, round digits to 2
item_data %>% df_round() %>% DT::datatable()
#item stats
GG_scatter(item_data, "pass_rate", "g_loading", case_names = "item")
## `geom_smooth()` using formula 'y ~ x'

#summary
describe(quiz25$score)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 2413 14.7 3.33 15 14.8 2.97 0 25 25 -0.129 0.178 0.0678
ggplot(quiz25, aes(score)) +
geom_bar(stat = "count") +
scale_x_continuous(breaks = 0:25, limits = c(0, 25))
## Warning: Removed 2 rows containing missing values (geom_bar).

#quantiles
#estimate the empirical cumulative distribution function
empirical_cdf = ecdf(quiz25$score)
#get centiles for scores 0-25
empirical_cdf(0:25) %>%
set_names(0:25)
## 0 1 2 3 4 5 6 7
## 0.000414 0.000414 0.001243 0.001658 0.002072 0.004144 0.008288 0.017406
## 8 9 10 11 12 13 14 15
## 0.032739 0.054704 0.101533 0.162453 0.246167 0.347286 0.462909 0.587650
## 16 17 18 19 20 21 22 23
## 0.701616 0.800663 0.873601 0.928305 0.956900 0.980937 0.992126 0.997513
## 24 25
## 0.999586 1.000000
#correlations among scores and guesses at own ability
quiz25 %>%
select(score, score_guess, centile_guess) %>%
wtd.cors()
## score score_guess centile_guess
## score 1.000 0.499 0.436
## score_guess 0.499 1.000 0.610
## centile_guess 0.436 0.610 1.000
#regressions
#model
ols(score ~ score_guess, data = quiz25)
## Frequencies of Missing Values Due to Each Variable
## score score_guess
## 0 5
##
## Linear Regression Model
##
## ols(formula = score ~ score_guess, data = quiz25)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 2408 LR chi2 688.06 R2 0.249
## sigma2.8747 d.f. 1 R2 adj 0.248
## d.f. 2406 Pr(> chi2) 0.0000 g 1.870
##
## Residuals
##
## Min 1Q Median 3Q Max
## -16.95587 -1.94424 -0.02832 1.89923 9.75434
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 9.3181 0.2008 46.40 <0.0001
## score_guess 0.3855 0.0137 28.21 <0.0001
##
#plot
GG_scatter(quiz25, "score_guess", "score") +
geom_count() +
geom_smooth() +
ggtitle("Score on 25 item science knowledge test and own estimate of this score after taking test",
str_glue("Data is from {nrow(quiz25)} people from Twitter"))
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

GG_save("trial/score_guess.png")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
#model
ols(score ~ centile_guess, data = quiz25)
## Frequencies of Missing Values Due to Each Variable
## score centile_guess
## 0 7
##
## Linear Regression Model
##
## ols(formula = score ~ centile_guess, data = quiz25)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 2406 LR chi2 508.30 R2 0.190
## sigma2.9845 d.f. 1 R2 adj 0.190
## d.f. 2404 Pr(> chi2) 0.0000 g 1.607
##
## Residuals
##
## Min 1Q Median 3Q Max
## -15.01005 -1.93334 -0.07784 1.85437 10.35819
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 9.8490 0.2144 45.94 <0.0001
## centile_guess 0.0716 0.0030 23.78 <0.0001
##
#plot
GG_scatter(quiz25, "centile_guess", "score") +
geom_count() +
geom_smooth() +
ggtitle("Score on 25 item science knowledge test and own centile estimate of this score after taking test",
str_glue("Data is from {nrow(quiz25)} people from Twitter"))
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

GG_save("trial/score_guess2.png")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ols(score ~ score_guess + centile_guess, data = quiz25)
## Frequencies of Missing Values Due to Each Variable
## score score_guess centile_guess
## 0 5 7
##
## Linear Regression Model
##
## ols(formula = score ~ score_guess + centile_guess, data = quiz25)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 2406 LR chi2 778.29 R2 0.276
## sigma2.8222 d.f. 2 R2 adj 0.276
## d.f. 2403 Pr(> chi2) 0.0000 g 1.974
##
## Residuals
##
## Min 1Q Median 3Q Max
## -16.96538 -1.80380 -0.08731 1.84164 8.98498
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 8.3566 0.2211 37.79 <0.0001
## score_guess 0.2860 0.0169 16.89 <0.0001
## centile_guess 0.0346 0.0036 9.62 <0.0001
##
#standardized data
ols(score ~ score_guess + centile_guess, data = quiz25 %>% select(score, score_guess, centile_guess) %>% df_standardize())
## Frequencies of Missing Values Due to Each Variable
## score score_guess centile_guess
## 0 5 7
##
## Linear Regression Model
##
## ols(formula = score ~ score_guess + centile_guess, data = quiz25 %>%
## select(score, score_guess, centile_guess) %>% df_standardize())
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 2406 LR chi2 778.29 R2 0.276
## sigma0.8474 d.f. 2 R2 adj 0.276
## d.f. 2403 Pr(> chi2) 0.0000 g 0.593
##
## Residuals
##
## Min 1Q Median 3Q Max
## -5.09428 -0.54164 -0.02622 0.55300 2.69796
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -0.0001 0.0173 0.00 0.9966
## score_guess 0.3683 0.0218 16.89 <0.0001
## centile_guess 0.2098 0.0218 9.62 <0.0001
##
#education as a data test
GG_group_means(quiz25, "score", "education")
## Missing values were removed.

#Cohen d
SMD_matrix(quiz25$score*-1, quiz25$education)
## None High school Other Bachelor Master
## None NA 0.0894 0.00489 0.2709 0.3470
## High school 0.08936 NA -0.08447 0.1816 0.2577
## Other 0.00489 -0.0845 NA 0.2660 0.3422
## Bachelor 0.27091 0.1816 0.26602 NA 0.0761
## Master 0.34705 0.2577 0.34216 0.0761 NA
## Doctorate or higher 0.63493 0.5456 0.63005 0.3640 0.2879
## Doctorate or higher
## None 0.635
## High school 0.546
## Other 0.630
## Bachelor 0.364
## Master 0.288
## Doctorate or higher NA
#age
GG_scatter(quiz25, "age", "score") +
geom_smooth()
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

#time taken
GG_scatter(quiz25, "time_taken", "score") +
geom_smooth()
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

#random guess distribution
quiz25[test_item_names] %>%
map_dbl(~table(.) %>% length()) %>%
map_df(function(x) {
sample(c(T, F), size = nrow(quiz25), replace = T, prob = c(1/x, 1-(1/x)))
}) %>%
rowSums() %>%
{
tibble(score = .)
} %>%
ggplot(aes(score)) +
geom_bar(stat = "count", fill = "red", alpha = .5) +
scale_x_continuous(breaks = 0:25, limits = c(0, 25)) +
geom_bar(data = quiz25, stat = "count", fill = "black", alpha = .5) +
ggtitle("Comparison of real scores vs. simulated random guesses",
"Red = simulated, black = real")
## Warning: Removed 2 rows containing missing values (geom_bar).
