Init

#R options
options(
  digits = 3
  )

#packages
library(pacman)
p_load(
  kirkegaard, 
  rms, 
  lubridate,
  mirt,
  patchwork
  )

#theme
theme_set(theme_bw())

Functions

#overwrite psych function to prevent bug
describe = function(...) psych::describe(...) %>% as.matrix()

#loop convenience function
add_quantile_smooths = function(x, y, data, quantiles, method = "qgam", k = 30) {
  #loop x vars and quantiles to add
  todo = expand_grid(
    x = x,
    quantile = quantiles
  )
  
  # browser()
  #loop rows and add
  for (i in seq_along_rows(todo)) {
    i_var = todo$x[i]
    i_quantile = todo$quantile[i]
    
    #make var
    data[[str_glue("{y}_q{i_quantile*100}")]] = NA_real_
    
    #get values
    i_vals = quantile_smooth(y = data[[y]], 
                             x = data[[i_var]], 
                             quantile = i_quantile, 
                             method = method, 
                             k = k)
    
    #beware, functions differ in dealing with NA!
    if (method == "qgam") {
      data[[str_glue("{y}_q{i_quantile*100}")]][as.numeric(names(i_vals))] = i_vals
    } else {
      data[[str_glue("{y}_q{i_quantile*100}")]] = i_vals
    }
    
  }
  
  data
}

Data

#quiz data
quiz25 = read_csv("data/20200914074101-SurveyExport.csv") %>%
  df_legalize_names()
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_character(),
##   `Response ID` = col_double(),
##   `Time Started` = col_datetime(format = ""),
##   `Date Submitted` = col_datetime(format = ""),
##   `Contact ID` = col_logical(),
##   `Legacy Comments` = col_logical(),
##   Comments = col_logical(),
##   Tags = col_logical(),
##   Longitude = col_double(),
##   Latitude = col_double(),
##   `What is your age?` = col_double(),
##   `What is your google scholar h-index? The h-index of someone who has never published a paper is 0.` = col_double(),
##   `Which number does the binary number 1010 correspond to in the decimal system?` = col_double(),
##   `With regards to your knowledge of science, what percentile of the general population do you think you are in?The 30th centile is lower than 30% of popuation, and lower than 70% of population, while the 80th centile is higher than 80% of the population and lower than 20% of the population.` = col_double(),
##   `The previous page featured 25 questions testing your knowledge. How many correct answers do you think you gave?  ` = col_double(),
##   `We have calculated your score.` = col_double()
## )
## ℹ Use `spec()` for the full column specifications.
#recode
quiz25$score_survey = quiz25$We_have_calculated_your_score %>%
  as.character() %>%
  as.numeric()

quiz25$score_guess =
  quiz25$The_previous_page_featured_25_questions_testing_your_knowledge_How_many_correct_answers_do_you_think_you_gave %>%
  as.character() %>%
  as.numeric()

quiz25$centile_guess =
  quiz25$With_regards_to_your_knowledge_of_science_what_percentile_of_the_general_population_do_you_think_you_are_in_The_30th_centile_is_lower_than_30pct_of_popuation_and_lower_than_70pct_of_population_while_the_80th_centile_is_higher_than_80pct_of_the_population_and_lower_than_20pct_of_the_population %>%
  as.character() %>%
  as.numeric()

quiz25$education = 
  quiz25$What_is_the_highest_level_of_education_you_have_completed %>% 
  ordered(levels = c("None", "High school", "Other", "Bachelor", "Master", "Doctorate or higher"))

quiz25$age = 
  quiz25$What_is_your_age %>% 
  as.character() %>% 
  as.numeric()

#parsing is bad bc we changed the format halfway, so fuck the remaining data
quiz25$time_started = quiz25$Time_Started %>% unlist() %>% as.numeric() %>% as_datetime()
quiz25$time_ended = quiz25$Date_Submitted %>% unlist() %>% as.numeric() %>% as_datetime()

quiz25$time_taken = (quiz25$time_ended - quiz25$time_started) %>% 
  as.numeric()
quiz25$time_taken %>% describe()
##    vars    n mean    sd median trimmed mad min    max  range skew kurtosis  se
## X1    1 2413 1211 10179    531     567 227  22 337171 337149 27.4      821 207
#knowledge item names
test_item_names = quiz25 %>% 
  select(Almost_all_plants_are_of_the_following_type:In_which_ancient_Greek_city_state_did_Socrates_live) %>% 
  names()

Analyses

Score science test

#scoring key
score_key = c("Multicellular eukaryotes", 
              "Create a second group of participants with ear infections who do not use any ear drops", 
              "Inserting a gene into plants that makes them resistant to insects.", 
              "The tilt of the Earth’s axis in relation to the sun", 
              "Marc", 
              "The dodo hypothesis.",
              "The Stroop effect.",
              "The Gaussian distribution.",
              "About 50%.",
              "Vowels and consonants.",
              "Typology.",
              "Water boils at a lower temperature in Denver than Los Angeles",
              "Amplitude or height", 
              "Jonas Salk", 
              "Astrology", 
              "Aspirin", 
              "Brazil", 
              "10", 
              "Nitrogen", 
              "Herbert Spencer", 
              "Iridium.", 
              "Paul Erdős.", 
              "A standardized measure of the linear relationship between two variables.", 
              "66 million years", 
              "Athens")

#score items
scored_items = quiz25[test_item_names] %>% 
  score_items(key = score_key) %>% 
  set_names("Q" + 1:25)

#compute score sum, maybe the survey item got it wrong
quiz25$score = scored_items %>% rowSums(na.rm = T)

#IRT
IRT_fit = mirt(scored_items, model = 1, technical = list(removeEmptyRows=T))
## 
Iteration: 1, Log-Lik: -29859.155, Max-Change: 0.55443
Iteration: 2, Log-Lik: -29470.260, Max-Change: 0.34782
Iteration: 3, Log-Lik: -29417.784, Max-Change: 0.12366
Iteration: 4, Log-Lik: -29406.222, Max-Change: 0.07332
Iteration: 5, Log-Lik: -29403.783, Max-Change: 0.02881
Iteration: 6, Log-Lik: -29403.072, Max-Change: 0.01829
Iteration: 7, Log-Lik: -29402.917, Max-Change: 0.01148
Iteration: 8, Log-Lik: -29402.838, Max-Change: 0.00454
Iteration: 9, Log-Lik: -29402.820, Max-Change: 0.00269
Iteration: 10, Log-Lik: -29402.812, Max-Change: 0.00167
Iteration: 11, Log-Lik: -29402.811, Max-Change: 0.00063
Iteration: 12, Log-Lik: -29402.810, Max-Change: 0.00034
Iteration: 13, Log-Lik: -29402.810, Max-Change: 0.00026
Iteration: 14, Log-Lik: -29402.810, Max-Change: 0.00010
IRT_fit %>% summary()
##         F1      h2
## Q1  0.3639 0.13245
## Q2  0.5938 0.35256
## Q3  0.5994 0.35924
## Q4  0.5399 0.29153
## Q5  0.3899 0.15201
## Q6  0.2455 0.06027
## Q7  0.2373 0.05630
## Q8  0.6050 0.36604
## Q9  0.5760 0.33179
## Q10 0.0405 0.00164
## Q11 0.0628 0.00394
## Q12 0.5339 0.28506
## Q13 0.6438 0.41453
## Q14 0.6085 0.37032
## Q15 0.4508 0.20319
## Q16 0.2582 0.06664
## Q17 0.1598 0.02553
## Q18 0.5240 0.27452
## Q19 0.6436 0.41423
## Q20 0.4696 0.22050
## Q21 0.3093 0.09568
## Q22 0.6928 0.48002
## Q23 0.3646 0.13296
## Q24 0.3597 0.12938
## Q25 0.3736 0.13959
## 
## SS loadings:  5.36 
## Proportion Var:  0.214 
## 
## Factor correlations: 
## 
##    F1
## F1  1
IRT_fit %>% plot(type = "trace")

#scores
scored_items_counts = miss_by_case(scored_items, reverse = T)
IRT_fit_scores = fscores(IRT_fit, full.scores = T, full.scores.SE = T)
quiz25$g = NA
quiz25$g[scored_items_counts > 0] = IRT_fit_scores[, 1] %>% standardize()

#reliability
empirical_rxx(IRT_fit_scores)
##    F1 
## 0.736
marginal_rxx(IRT_fit)
## [1] 0.734
alpha(scored_items)
## 
## Reliability analysis   
## Call: alpha(x = scored_items)
## 
##   raw_alpha std.alpha G6(smc) average_r S/N    ase mean   sd median_r
##       0.67      0.68    0.69     0.079 2.1 0.0094 0.61 0.14    0.071
## 
##  lower alpha upper     95% confidence boundaries
## 0.66 0.67 0.69 
## 
##  Reliability if an item is dropped:
##     raw_alpha std.alpha G6(smc) average_r S/N alpha se  var.r med.r
## Q1       0.66      0.67    0.68     0.078 2.0   0.0098 0.0042 0.070
## Q2       0.66      0.67    0.68     0.078 2.0   0.0097 0.0041 0.070
## Q3       0.67      0.68    0.69     0.080 2.1   0.0095 0.0041 0.072
## Q4       0.67      0.67    0.68     0.079 2.0   0.0097 0.0041 0.070
## Q5       0.66      0.67    0.68     0.079 2.0   0.0097 0.0042 0.070
## Q6       0.67      0.68    0.69     0.082 2.2   0.0095 0.0040 0.074
## Q7       0.67      0.68    0.69     0.081 2.1   0.0095 0.0041 0.075
## Q8       0.66      0.67    0.67     0.077 2.0   0.0099 0.0040 0.070
## Q9       0.65      0.66    0.67     0.076 2.0   0.0101 0.0036 0.070
## Q10      0.68      0.69    0.70     0.085 2.2   0.0091 0.0037 0.078
## Q11      0.68      0.69    0.69     0.084 2.2   0.0092 0.0039 0.078
## Q12      0.66      0.66    0.67     0.076 2.0   0.0100 0.0038 0.070
## Q13      0.66      0.66    0.67     0.076 2.0   0.0100 0.0038 0.069
## Q14      0.67      0.67    0.68     0.078 2.0   0.0097 0.0041 0.069
## Q15      0.67      0.68    0.69     0.081 2.1   0.0095 0.0042 0.072
## Q16      0.67      0.68    0.69     0.082 2.1   0.0094 0.0040 0.072
## Q17      0.68      0.68    0.69     0.082 2.1   0.0093 0.0041 0.077
## Q18      0.66      0.67    0.67     0.077 2.0   0.0100 0.0037 0.070
## Q19      0.65      0.66    0.67     0.075 1.9   0.0101 0.0036 0.068
## Q20      0.66      0.67    0.68     0.078 2.0   0.0098 0.0041 0.071
## Q21      0.67      0.68    0.68     0.080 2.1   0.0096 0.0041 0.071
## Q22      0.64      0.66    0.66     0.074 1.9   0.0103 0.0035 0.068
## Q23      0.66      0.67    0.68     0.078 2.0   0.0098 0.0041 0.070
## Q24      0.66      0.67    0.68     0.078 2.0   0.0098 0.0042 0.070
## Q25      0.67      0.67    0.68     0.079 2.1   0.0096 0.0042 0.070
## 
##  Item statistics 
##        n raw.r std.r r.cor r.drop mean   sd
## Q1  2412  0.38  0.36 0.304  0.252 0.66 0.47
## Q2  2412  0.32  0.38 0.326  0.248 0.92 0.27
## Q3  2412  0.21  0.29 0.221  0.166 0.97 0.18
## Q4  2412  0.31  0.35 0.299  0.233 0.91 0.29
## Q5  2412  0.36  0.35 0.293  0.247 0.24 0.42
## Q6  2412  0.22  0.23 0.144  0.130 0.12 0.33
## Q7  2412  0.30  0.27 0.199  0.166 0.38 0.49
## Q8  2412  0.40  0.41 0.374  0.308 0.84 0.36
## Q9  2412  0.46  0.43 0.406  0.339 0.42 0.49
## Q10 2412  0.17  0.14 0.028  0.030 0.35 0.48
## Q11 2412  0.21  0.18 0.073  0.064 0.41 0.49
## Q12 2412  0.44  0.43 0.394  0.320 0.69 0.46
## Q13 2412  0.43  0.44 0.413  0.337 0.83 0.38
## Q14 2412  0.31  0.36 0.306  0.238 0.92 0.26
## Q15 2412  0.22  0.29 0.212  0.157 0.95 0.23
## Q16 2412  0.21  0.24 0.154  0.109 0.86 0.35
## Q17 2412  0.26  0.24 0.145  0.115 0.47 0.50
## Q18 2412  0.44  0.41 0.376  0.310 0.57 0.49
## Q19 2412  0.48  0.48 0.460  0.376 0.73 0.44
## Q20 2412  0.37  0.37 0.323  0.272 0.17 0.38
## Q21 2408  0.31  0.31 0.237  0.183 0.74 0.44
## Q22 2408  0.54  0.51 0.505  0.423 0.37 0.48
## Q23 2408  0.39  0.36 0.302  0.256 0.39 0.49
## Q24 2408  0.39  0.37 0.312  0.262 0.53 0.50
## Q25 2408  0.31  0.32 0.257  0.206 0.84 0.36
## 
## Non missing response frequency for each item
##        0    1 miss
## Q1  0.34 0.66    0
## Q2  0.08 0.92    0
## Q3  0.03 0.97    0
## Q4  0.09 0.91    0
## Q5  0.76 0.24    0
## Q6  0.88 0.12    0
## Q7  0.62 0.38    0
## Q8  0.16 0.84    0
## Q9  0.58 0.42    0
## Q10 0.65 0.35    0
## Q11 0.59 0.41    0
## Q12 0.31 0.69    0
## Q13 0.17 0.83    0
## Q14 0.08 0.92    0
## Q15 0.05 0.95    0
## Q16 0.14 0.86    0
## Q17 0.53 0.47    0
## Q18 0.43 0.57    0
## Q19 0.27 0.73    0
## Q20 0.83 0.17    0
## Q21 0.26 0.74    0
## Q22 0.63 0.37    0
## Q23 0.61 0.39    0
## Q24 0.47 0.53    0
## Q25 0.16 0.84    0
#item data
item_data = tibble(
  item = names(scored_items),
  question = test_item_names %>% str_clean(),
  correct = score_key,
  pass_rate = scored_items %>% colMeans(na.rm = T),
  loading = IRT_fit %>% summary(verbose = F) %>% .$rotF %>% as.numeric()
)

#print as data table, round digits to 2
item_data %>% df_round() %>% DT::datatable()
#item stats
GG_scatter(item_data, "pass_rate", "loading", case_names = "item")
## `geom_smooth()` using formula 'y ~ x'

#summary
quiz25 %>% select(score, g) %>% describe()
##       vars    n      mean   sd  median  trimmed  mad   min   max range    skew
## score    1 2413  1.53e+01 3.48 15.0000 1.53e+01 2.97  0.00 25.00  25.0 -0.1549
## g        2 2412 -2.43e-17 1.00  0.0021 2.95e-04 1.04 -3.84  2.66   6.5 -0.0309
##       kurtosis     se
## score   0.0275 0.0708
## g      -0.2282 0.0204
#distributions
ggplot(quiz25, aes(score)) +
  geom_bar(stat = "count") +
  scale_x_continuous(breaks = 0:25, limits = c(0, 25)) +
  GG_denhist(quiz25, "g", vline = F)
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing non-finite values (stat_density).
## Warning: Removed 2 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing non-finite values (stat_density).

GG_save("figs/score_dists.png")
## Warning: Removed 2 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).

## Warning: Removed 1 rows containing non-finite values (stat_density).
#quantiles
#estimate the empirical cumulative distribution function
empirical_cdf = ecdf(quiz25$score)

#get centiles for scores 0-25
empirical_cdf(0:25) %>%
  set_names(0:25)
##        0        1        2        3        4        5        6        7 
## 0.000414 0.000414 0.000829 0.001658 0.001658 0.003315 0.006631 0.012433 
##        8        9       10       11       12       13       14       15 
## 0.025280 0.053046 0.090344 0.143804 0.211355 0.297555 0.398259 0.512225 
##       16       17       18       19       20       21       22       23 
## 0.632408 0.732283 0.824285 0.891007 0.940323 0.964360 0.984252 0.994198 
##       24       25 
## 0.998757 1.000000

Main results

#correlations among scores and guesses at own ability
quiz25 %>%
  select(score, g, score_guess, centile_guess) %>%
  cor_matrix(p_val = T)
##               score     g         score_guess centile_guess
## score         "1"       "0.95***" "0.52***"   "0.46***"    
## g             "0.95***" "1"       "0.53***"   "0.48***"    
## score_guess   "0.52***" "0.53***" "1"         "0.61***"    
## centile_guess "0.46***" "0.48***" "0.61***"   "1"
#plot
GG_scatter(quiz25, "score_guess", "centile_guess")
## `geom_smooth()` using formula 'y ~ x'

GG_save("figs/scatter_score_guess_centile.png")
## `geom_smooth()` using formula 'y ~ x'
#main nonlinear tests
ggplot(quiz25, aes(score, score_guess)) +
  geom_count() +
  geom_smooth() +
  geom_smooth(method = "lm", alpha = .5) +
  geom_smooth(method = "lm", color = "orange", alpha = .5) +
  scale_x_continuous(breaks = seq(0, 25, 5), limits = c(0, 25)) +
  ylab("Self-estimated score") +
  ggplot(quiz25, aes(g, centile_guess)) +
  geom_point() +
  geom_smooth(color = "blue", alpha = .5) +
  geom_smooth(method = "lm", color = "orange", alpha = .5) +
  ylab("Self-estimated ability centile")
## Warning: Removed 5 rows containing non-finite values (stat_sum).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 7 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 7 rows containing non-finite values (stat_smooth).
## Warning: Removed 7 rows containing missing values (geom_point).

GG_save("figs/nonlinear_fits.png")
## Warning: Removed 5 rows containing non-finite values (stat_sum).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 7 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 7 rows containing non-finite values (stat_smooth).

## Warning: Removed 7 rows containing missing values (geom_point).
#proportion with < -2z
mean(quiz25$g < -2, na.rm=T)
## [1] 0.0207
#regressions

#formal test nonlinear
lrtest(
  ols(score ~ score_guess, data = quiz25),
  ols(score ~ rcs(score_guess), data = quiz25)
)
## 
## Model 1: score ~ score_guess
## Model 2: score ~ rcs(score_guess)
## 
## L.R. Chisq       d.f.          P 
##      4.046      3.000      0.256
lrtest(
  ols(g ~ centile_guess, data = quiz25),
  ols(g ~ rcs(centile_guess), data = quiz25)
)
## 
## Model 1: g ~ centile_guess
## Model 2: g ~ rcs(centile_guess)
## 
## L.R. Chisq       d.f.          P 
##   4.69e+01   3.00e+00   3.63e-10
#model fits to get R2
list(
  raw_linear = ols(score ~ score_guess, data = quiz25),
  raw_spline = ols(score ~ rcs(score_guess), data = quiz25),
  g_linear = ols(g ~ centile_guess, data = quiz25),
  g_spline = ols(g ~ rcs(centile_guess), data = quiz25)
) %>% summarize_models()
#incremental validity?
#compare R2s
ols(g ~ score_guess + centile_guess, data = quiz25)
## Frequencies of Missing Values Due to Each Variable
##             g   score_guess centile_guess 
##             1             5             7 
## 
## Linear Regression Model
##  
##  ols(formula = g ~ score_guess + centile_guess, data = quiz25)
##  
##  
##                  Model Likelihood    Discrimination    
##                        Ratio Test           Indexes    
##  Obs    2406    LR chi2    943.38    R2       0.324    
##  sigma0.8199    d.f.            2    R2 adj   0.324    
##  d.f.   2403    Pr(> chi2) 0.0000    g        0.642    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -5.19255 -0.53968 -0.01491  0.55539  2.62168 
##  
##  
##                Coef    S.E.   t      Pr(>|t|)
##  Intercept     -2.0936 0.0642 -32.59 <0.0001 
##  score_guess    0.0878 0.0049  17.85 <0.0001 
##  centile_guess  0.0125 0.0010  12.01 <0.0001 
## 
ols(score ~ score_guess + centile_guess, data = quiz25)
## Frequencies of Missing Values Due to Each Variable
##         score   score_guess centile_guess 
##             0             5             7 
## 
## Linear Regression Model
##  
##  ols(formula = score ~ score_guess + centile_guess, data = quiz25)
##  
##  
##                  Model Likelihood    Discrimination    
##                        Ratio Test           Indexes    
##  Obs    2406    LR chi2    862.90    R2       0.301    
##  sigma2.8976    d.f.            2    R2 adj   0.301    
##  d.f.   2403    Pr(> chi2) 0.0000    g        2.153    
##  
##  Residuals
##  
##        Min        1Q    Median        3Q       Max 
##  -17.87365  -1.90617   0.04497   1.89687   9.60008 
##  
##  
##                Coef   S.E.   t     Pr(>|t|)
##  Intercept     8.3028 0.2270 36.57 <0.0001 
##  score_guess   0.3066 0.0174 17.63 <0.0001 
##  centile_guess 0.0391 0.0037 10.59 <0.0001 
## 
#lr tests
lrtest(
  ols(g ~ centile_guess, data = quiz25),
  ols(g ~ centile_guess + score_guess, data = quiz25)
)
## 
## Model 1: g ~ centile_guess
## Model 2: g ~ centile_guess + score_guess
## 
## L.R. Chisq       d.f.          P 
##        300          1          0
lrtest(
  ols(score ~ score_guess, data = quiz25),
  ols(score ~ score_guess + centile_guess, data = quiz25)
)
## 
## Model 1: score ~ score_guess
## Model 2: score ~ score_guess + centile_guess
## 
## L.R. Chisq       d.f.          P 
##        109          1          0
#model models, save and test residuals
(raw_score_model = ols(score_guess ~ score, data = quiz25))
## Frequencies of Missing Values Due to Each Variable
## score_guess       score 
##           5           0 
## 
## Linear Regression Model
##  
##  ols(formula = score_guess ~ score, data = quiz25)
##  
##  
##                  Model Likelihood    Discrimination    
##                        Ratio Test           Indexes    
##  Obs    2408    LR chi2    754.02    R2       0.269    
##  sigma3.6670    d.f.            1    R2 adj   0.269    
##  d.f.   2406    Pr(> chi2) 0.0000    g        2.504    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -14.7274  -2.4440   0.1229   2.4812  19.4653 
##  
##  
##            Coef   S.E.   t     Pr(>|t|)
##  Intercept 4.2513 0.3380 12.58 <0.0001 
##  score     0.6417 0.0216 29.74 <0.0001 
## 
(g_score_model = ols(centile_guess ~ g, data = quiz25))
## Frequencies of Missing Values Due to Each Variable
## centile_guess             g 
##             7             1 
## 
## Linear Regression Model
##  
##  ols(formula = centile_guess ~ g, data = quiz25)
##  
##  
##                   Model Likelihood    Discrimination    
##                         Ratio Test           Indexes    
##  Obs     2406    LR chi2    643.84    R2       0.235    
##  sigma17.6824    d.f.            1    R2 adj   0.234    
##  d.f.    2404    Pr(> chi2) 0.0000    g       11.116    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -77.324  -9.005   2.390  11.929  69.392 
##  
##  
##            Coef    S.E.   t      Pr(>|t|)
##  Intercept 68.2999 0.3605 189.46 <0.0001 
##  g          9.8219 0.3616  27.16 <0.0001 
## 
#test residuals for HS
test_HS(resid = resid(raw_score_model), x = quiz25$score)
test_HS(resid = resid(g_score_model), x = quiz25$g)
#add 10th and 90th quantiles
quiz25 = add_quantile_smooths(data = quiz25, x = "score", y = "score_guess", quantiles = c(.1, .9), k = 5)
## Estimating learning rate. Each dot corresponds to a loss evaluation. 
## qu = 0.1..............done 
## Estimating learning rate. Each dot corresponds to a loss evaluation. 
## qu = 0.9.............done
quiz25 = add_quantile_smooths(data = quiz25, x = "g", y = "centile_guess", quantiles = c(.1, .9), k = 5)
## Estimating learning rate. Each dot corresponds to a loss evaluation. 
## qu = 0.1.................done 
## Estimating learning rate. Each dot corresponds to a loss evaluation. 
## qu = 0.9...........done
#plots
quiz25 %>% 
  filter(
    !is.na(score), !is.na(score_guess)
  ) %>% 
  ggplot(aes(score, score_guess)) +
  geom_point(alpha = .2) +
  geom_smooth(method = lm, se = F) +
  geom_ribbon(mapping = aes(
    ymin = score_guess_q10,
    ymax = score_guess_q90
  ), alpha = .4) +
  quiz25 %>%
  filter(
    !is.na(g), !is.na(score_guess)
  ) %>% 
  ggplot(aes(g, centile_guess)) +
  geom_point(alpha = .2) +
  geom_smooth(method = lm, se = F) +
  geom_ribbon(mapping = aes(
    ymin = centile_guess_q10,
    ymax = centile_guess_q90
  ), alpha = .4)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).

GG_save("figs/HS_plots.png")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).

## Warning: Removed 2 rows containing missing values (geom_point).

Without outlier

#remove outlier
quiz25_noOutlier = quiz25 %>% filter(g > -2.5, g < 2.5)
nrow(quiz25_noOutlier) - nrow(quiz25)
## [1] -21
#cors
quiz25_noOutlier %>%
  select(score, g, score_guess, centile_guess) %>%
  cor_matrix(p_val = T)
##               score     g         score_guess centile_guess
## score         "1"       "0.95***" "0.53***"   "0.47***"    
## g             "0.95***" "1"       "0.54***"   "0.50***"    
## score_guess   "0.53***" "0.54***" "1"         "0.61***"    
## centile_guess "0.47***" "0.50***" "0.61***"   "1"
#combined
#cors
combine_upperlower(
  quiz25_noOutlier %>%
  select(score, g, score_guess, centile_guess) %>%
  wtd.cors(),
  quiz25 %>%
  select(score, g, score_guess, centile_guess) %>%
  wtd.cors(), .diag = 1
)
##               score     g score_guess centile_guess
## score         1.000 0.951       0.528         0.470
## g             0.954 1.000       0.543         0.496
## score_guess   0.519 0.533       1.000         0.613
## centile_guess 0.459 0.485       0.610         1.000
#formal test nonlinear
lrtest(
  ols(score ~ score_guess, data = quiz25_noOutlier),
  ols(score ~ rcs(score_guess), data = quiz25_noOutlier)
)
## 
## Model 1: score ~ score_guess
## Model 2: score ~ rcs(score_guess)
## 
## L.R. Chisq       d.f.          P 
##   13.09073    3.00000    0.00444
lrtest(
  ols(g ~ centile_guess, data = quiz25_noOutlier),
  ols(g ~ rcs(centile_guess), data = quiz25_noOutlier)
)
## 
## Model 1: g ~ centile_guess
## Model 2: g ~ rcs(centile_guess)
## 
## L.R. Chisq       d.f.          P 
##   5.04e+01   3.00e+00   6.53e-11
#model fits to get R2
list(
  raw_linear = ols(score ~ score_guess, data = quiz25_noOutlier),
  raw_spline = ols(score ~ rcs(score_guess), data = quiz25_noOutlier),
  g_linear = ols(g ~ centile_guess, data = quiz25_noOutlier),
  g_spline = ols(g ~ rcs(centile_guess), data = quiz25_noOutlier)
) %>% summarize_models()
#nonlinear fits
ggplot(quiz25_noOutlier, aes(score, score_guess)) +
  geom_count() +
  geom_smooth() +
  geom_smooth(method = "lm", alpha = .5) +
  geom_smooth(method = "lm", color = "orange", alpha = .5) +
  scale_x_continuous(breaks = seq(0, 25, 5), limits = c(0, 25)) +
  ylab("Self-estimated score") +
  ggplot(quiz25_noOutlier, aes(g, centile_guess)) +
  geom_point() +
  geom_smooth(color = "blue", alpha = .5) +
  geom_smooth(method = "lm", color = "orange", alpha = .5) +
  ylab("Self-estimated ability centile")
## Warning: Removed 4 rows containing non-finite values (stat_sum).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 6 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 6 rows containing non-finite values (stat_smooth).
## Warning: Removed 6 rows containing missing values (geom_point).

GG_save("figs/nonlinear_fits_noOutlier.png")
## Warning: Removed 4 rows containing non-finite values (stat_sum).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 6 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 6 rows containing non-finite values (stat_smooth).

## Warning: Removed 6 rows containing missing values (geom_point).
#model models, save and test residuals
(raw_score_model = ols(score_guess ~ score, data = quiz25_noOutlier))
## Frequencies of Missing Values Due to Each Variable
## score_guess       score 
##           4           0 
## 
## Linear Regression Model
##  
##  ols(formula = score_guess ~ score, data = quiz25_noOutlier)
##  
##  
##                  Model Likelihood    Discrimination    
##                        Ratio Test           Indexes    
##  Obs    2388    LR chi2    781.82    R2       0.279    
##  sigma3.6091    d.f.            1    R2 adj   0.279    
##  d.f.   2386    Pr(> chi2) 0.0000    g        2.542    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -14.8498  -2.5017   0.1623   2.4903  14.4983 
##  
##  
##            Coef   S.E.   t     Pr(>|t|)
##  Intercept 3.8215 0.3445 11.09 <0.0001 
##  score     0.6680 0.0220 30.40 <0.0001 
## 
(g_score_model = ols(centile_guess ~ g, data = quiz25_noOutlier))
## Frequencies of Missing Values Due to Each Variable
## centile_guess             g 
##             6             0 
## 
## Linear Regression Model
##  
##  ols(formula = centile_guess ~ g, data = quiz25_noOutlier)
##  
##  
##                   Model Likelihood    Discrimination    
##                         Ratio Test           Indexes    
##  Obs     2386    LR chi2    675.05    R2       0.246    
##  sigma17.5382    d.f.            1    R2 adj   0.246    
##  d.f.    2384    Pr(> chi2) 0.0000    g       11.442    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -77.763  -8.856   2.499  11.887  47.726 
##  
##  
##            Coef    S.E.   t      Pr(>|t|)
##  Intercept 68.1823 0.3591 189.89 <0.0001 
##  g         10.3666 0.3713  27.92 <0.0001 
## 
#test residuals for HS
test_HS(resid = resid(raw_score_model), x = quiz25_noOutlier$score)
test_HS(resid = resid(g_score_model), x = quiz25_noOutlier$g)
#add 10th and 90th quantiles
quiz25_noOutlier = add_quantile_smooths(data = quiz25_noOutlier, x = "score", y = "score_guess", quantiles = c(.1, .9), k = 5)
## Estimating learning rate. Each dot corresponds to a loss evaluation. 
## qu = 0.1..............done 
## Estimating learning rate. Each dot corresponds to a loss evaluation. 
## qu = 0.9.........done
quiz25_noOutlier = add_quantile_smooths(data = quiz25_noOutlier, x = "g", y = "centile_guess", quantiles = c(.1, .9), k = 5)
## Estimating learning rate. Each dot corresponds to a loss evaluation. 
## qu = 0.1................done 
## Estimating learning rate. Each dot corresponds to a loss evaluation. 
## qu = 0.9.........done
#plots
quiz25_noOutlier %>% 
  filter(
    !is.na(score), !is.na(score_guess)
  ) %>% 
  ggplot(aes(score, score_guess)) +
  geom_point(alpha = .2) +
  geom_smooth(method = lm, se = F) +
  geom_ribbon(mapping = aes(
    ymin = score_guess_q10,
    ymax = score_guess_q90
  ), alpha = .4) +
  quiz25_noOutlier %>%
  filter(
    !is.na(g), !is.na(score_guess)
  ) %>% 
  ggplot(aes(g, centile_guess)) +
  geom_point(alpha = .2) +
  geom_smooth(method = lm, se = F) +
  geom_ribbon(mapping = aes(
    ymin = centile_guess_q10,
    ymax = centile_guess_q90
  ), alpha = .4)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).

GG_save("figs/HS_plots_noOutlier.png")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).

## Warning: Removed 2 rows containing missing values (geom_point).
#incremental validity in R2
quiz25_noOutlier_z = quiz25_noOutlier %>% df_standardize(messages = F)
list(
  raw1 = ols(score ~ score_guess, data = quiz25_noOutlier_z),
  raw2 = ols(score ~ centile_guess, data = quiz25_noOutlier_z),
  raw_combined = ols(score ~ score_guess + centile_guess, data = quiz25_noOutlier_z),
  g_1 = ols(g ~ centile_guess, data = quiz25_noOutlier_z),
  g_2 = ols(g ~ score_guess, data = quiz25_noOutlier_z),
  g_combined = ols(g ~ centile_guess + score_guess, data = quiz25_noOutlier_z)
) %>% summarize_models()

Meta

write_sessioninfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 19.3
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices datasets  utils     methods  
## [8] base     
## 
## other attached packages:
##  [1] patchwork_1.1.1       mirt_1.33.2           lubridate_1.7.9.2    
##  [4] rms_6.1-0             SparseM_1.78          kirkegaard_2021-02-08
##  [7] metafor_2.4-0         Matrix_1.3-2          psych_2.0.12         
## [10] magrittr_2.0.1        assertthat_0.2.1      weights_1.0.1        
## [13] mice_3.13.0           gdata_2.18.0          Hmisc_4.4-2          
## [16] Formula_1.2-4         survival_3.2-7        lattice_0.20-41      
## [19] forcats_0.5.1         stringr_1.4.0         dplyr_1.0.4          
## [22] purrr_0.3.4           readr_1.4.0           tidyr_1.1.2          
## [25] tibble_3.0.6          ggplot2_3.3.3         tidyverse_1.3.0      
## [28] pacman_0.5.1         
## 
## loaded via a namespace (and not attached):
##   [1] TH.data_1.0-10        colorspace_2.0-0      ellipsis_0.3.1       
##   [4] htmlTable_2.1.0       base64enc_0.1-3       fs_1.5.0             
##   [7] rstudioapi_0.13       farver_2.0.3          Deriv_4.1.2          
##  [10] MatrixModels_0.4-1    DT_0.17               mvtnorm_1.1-1        
##  [13] xml2_1.3.2            codetools_0.2-18      splines_4.0.3        
##  [16] doParallel_1.0.16     mnormt_2.0.2          knitr_1.31           
##  [19] jsonlite_1.7.2        broom_0.7.4           cluster_2.1.0        
##  [22] dbplyr_2.0.0          png_0.1-7             shiny_1.5.0          
##  [25] compiler_4.0.3        httr_1.4.2            backports_1.2.1      
##  [28] fastmap_1.0.1         cli_2.3.0             later_1.1.0.1        
##  [31] htmltools_0.5.1.1     quantreg_5.82         tools_4.0.3          
##  [34] gtable_0.3.0          glue_1.4.2            Rcpp_1.0.6           
##  [37] cellranger_1.1.0      vctrs_0.3.6           nlme_3.1-151         
##  [40] conquer_1.0.2         iterators_1.0.13      crosstalk_1.1.1      
##  [43] multilevel_2.6        xfun_0.20             rvest_0.3.6          
##  [46] mime_0.9              lifecycle_0.2.0       renv_0.12.5          
##  [49] dcurver_0.9.2         gtools_3.8.2          polspline_1.1.19     
##  [52] MASS_7.3-53           zoo_1.8-8             scales_1.1.1         
##  [55] promises_1.1.1        hms_1.0.0             parallel_4.0.3       
##  [58] sandwich_3.0-0        RColorBrewer_1.1-2    psychometric_2.2     
##  [61] yaml_2.2.1            gridExtra_2.3         rpart_4.1-15         
##  [64] latticeExtra_0.6-29   stringi_1.5.3         highr_0.8            
##  [67] foreach_1.5.1         checkmate_2.0.0       permute_0.9-5        
##  [70] qgam_1.3.2            rlang_0.4.10          pkgconfig_2.0.3      
##  [73] matrixStats_0.57.0    evaluate_0.14         labeling_0.4.2       
##  [76] htmlwidgets_1.5.3     tidyselect_1.1.0      plyr_1.8.6           
##  [79] R6_2.5.0              generics_0.1.0        multcomp_1.4-15      
##  [82] DBI_1.1.1             pillar_1.4.7          haven_2.3.1          
##  [85] foreign_0.8-81        withr_2.4.1           mgcv_1.8-33          
##  [88] nnet_7.3-14           modelr_0.1.8          crayon_1.4.0         
##  [91] tmvnsim_1.0-2         rmarkdown_2.6         jpeg_0.1-8.1         
##  [94] grid_4.0.3            readxl_1.3.1          data.table_1.13.6    
##  [97] vegan_2.5-7           reprex_1.0.0.9000     digest_0.6.27        
## [100] xtable_1.8-4          httpuv_1.5.5          GPArotation_2014.11-1
## [103] munsell_0.5.0

Upload

if (F) {
  renv::init()
  
  library(osfr)
  osf_auth(read_lines("~/.config/osf_token"))
  osf_proj = osf_retrieve_node("https://osf.io/fhqap/")
  osf_upload(osf_proj, conflicts = "overwrite", path = 
               c(
                 "data", "figs", "renv.lock",
                 "notebook.html", "notebook.Rmd",
                 "Heteroscedasticity.html", "Heteroscedasticity.Rmd",
                 "science_quiz_25.html", "science_quiz_25.Rmd",
                 "sessions_info.txt"
               ))
}