Score science test
#scoring key
score_key = c("Multicellular eukaryotes",
"Create a second group of participants with ear infections who do not use any ear drops",
"Inserting a gene into plants that makes them resistant to insects.",
"The tilt of the Earth’s axis in relation to the sun",
"Marc",
"The dodo hypothesis.",
"The Stroop effect.",
"The Gaussian distribution.",
"About 50%.",
"Vowels and consonants.",
"Typology.",
"Water boils at a lower temperature in Denver than Los Angeles",
"Amplitude or height",
"Jonas Salk",
"Astrology",
"Aspirin",
"Brazil",
"10",
"Nitrogen",
"Herbert Spencer",
"Iridium.",
"Paul Erdős.",
"A standardized measure of the linear relationship between two variables.",
"66 million years",
"Athens")
#score items
scored_items = quiz25[test_item_names] %>%
score_items(key = score_key) %>%
set_names("Q" + 1:25)
#compute score sum, maybe the survey item got it wrong
quiz25$score = scored_items %>% rowSums(na.rm = T)
#IRT
IRT_fit = mirt(scored_items, model = 1, technical = list(removeEmptyRows=T))
##
Iteration: 1, Log-Lik: -29859.155, Max-Change: 0.55443
Iteration: 2, Log-Lik: -29470.260, Max-Change: 0.34782
Iteration: 3, Log-Lik: -29417.784, Max-Change: 0.12366
Iteration: 4, Log-Lik: -29406.222, Max-Change: 0.07332
Iteration: 5, Log-Lik: -29403.783, Max-Change: 0.02881
Iteration: 6, Log-Lik: -29403.072, Max-Change: 0.01829
Iteration: 7, Log-Lik: -29402.917, Max-Change: 0.01148
Iteration: 8, Log-Lik: -29402.838, Max-Change: 0.00454
Iteration: 9, Log-Lik: -29402.820, Max-Change: 0.00269
Iteration: 10, Log-Lik: -29402.812, Max-Change: 0.00167
Iteration: 11, Log-Lik: -29402.811, Max-Change: 0.00063
Iteration: 12, Log-Lik: -29402.810, Max-Change: 0.00034
Iteration: 13, Log-Lik: -29402.810, Max-Change: 0.00026
Iteration: 14, Log-Lik: -29402.810, Max-Change: 0.00010
IRT_fit %>% summary()
## F1 h2
## Q1 0.3639 0.13245
## Q2 0.5938 0.35256
## Q3 0.5994 0.35924
## Q4 0.5399 0.29153
## Q5 0.3899 0.15201
## Q6 0.2455 0.06027
## Q7 0.2373 0.05630
## Q8 0.6050 0.36604
## Q9 0.5760 0.33179
## Q10 0.0405 0.00164
## Q11 0.0628 0.00394
## Q12 0.5339 0.28506
## Q13 0.6438 0.41453
## Q14 0.6085 0.37032
## Q15 0.4508 0.20319
## Q16 0.2582 0.06664
## Q17 0.1598 0.02553
## Q18 0.5240 0.27452
## Q19 0.6436 0.41423
## Q20 0.4696 0.22050
## Q21 0.3093 0.09568
## Q22 0.6928 0.48002
## Q23 0.3646 0.13296
## Q24 0.3597 0.12938
## Q25 0.3736 0.13959
##
## SS loadings: 5.36
## Proportion Var: 0.214
##
## Factor correlations:
##
## F1
## F1 1
IRT_fit %>% plot(type = "trace")

#scores
scored_items_counts = miss_by_case(scored_items, reverse = T)
IRT_fit_scores = fscores(IRT_fit, full.scores = T, full.scores.SE = T)
quiz25$g = NA
quiz25$g[scored_items_counts > 0] = IRT_fit_scores[, 1] %>% standardize()
#reliability
empirical_rxx(IRT_fit_scores)
## F1
## 0.736
marginal_rxx(IRT_fit)
## [1] 0.734
alpha(scored_items)
##
## Reliability analysis
## Call: alpha(x = scored_items)
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.67 0.68 0.69 0.079 2.1 0.0094 0.61 0.14 0.071
##
## lower alpha upper 95% confidence boundaries
## 0.66 0.67 0.69
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## Q1 0.66 0.67 0.68 0.078 2.0 0.0098 0.0042 0.070
## Q2 0.66 0.67 0.68 0.078 2.0 0.0097 0.0041 0.070
## Q3 0.67 0.68 0.69 0.080 2.1 0.0095 0.0041 0.072
## Q4 0.67 0.67 0.68 0.079 2.0 0.0097 0.0041 0.070
## Q5 0.66 0.67 0.68 0.079 2.0 0.0097 0.0042 0.070
## Q6 0.67 0.68 0.69 0.082 2.2 0.0095 0.0040 0.074
## Q7 0.67 0.68 0.69 0.081 2.1 0.0095 0.0041 0.075
## Q8 0.66 0.67 0.67 0.077 2.0 0.0099 0.0040 0.070
## Q9 0.65 0.66 0.67 0.076 2.0 0.0101 0.0036 0.070
## Q10 0.68 0.69 0.70 0.085 2.2 0.0091 0.0037 0.078
## Q11 0.68 0.69 0.69 0.084 2.2 0.0092 0.0039 0.078
## Q12 0.66 0.66 0.67 0.076 2.0 0.0100 0.0038 0.070
## Q13 0.66 0.66 0.67 0.076 2.0 0.0100 0.0038 0.069
## Q14 0.67 0.67 0.68 0.078 2.0 0.0097 0.0041 0.069
## Q15 0.67 0.68 0.69 0.081 2.1 0.0095 0.0042 0.072
## Q16 0.67 0.68 0.69 0.082 2.1 0.0094 0.0040 0.072
## Q17 0.68 0.68 0.69 0.082 2.1 0.0093 0.0041 0.077
## Q18 0.66 0.67 0.67 0.077 2.0 0.0100 0.0037 0.070
## Q19 0.65 0.66 0.67 0.075 1.9 0.0101 0.0036 0.068
## Q20 0.66 0.67 0.68 0.078 2.0 0.0098 0.0041 0.071
## Q21 0.67 0.68 0.68 0.080 2.1 0.0096 0.0041 0.071
## Q22 0.64 0.66 0.66 0.074 1.9 0.0103 0.0035 0.068
## Q23 0.66 0.67 0.68 0.078 2.0 0.0098 0.0041 0.070
## Q24 0.66 0.67 0.68 0.078 2.0 0.0098 0.0042 0.070
## Q25 0.67 0.67 0.68 0.079 2.1 0.0096 0.0042 0.070
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## Q1 2412 0.38 0.36 0.304 0.252 0.66 0.47
## Q2 2412 0.32 0.38 0.326 0.248 0.92 0.27
## Q3 2412 0.21 0.29 0.221 0.166 0.97 0.18
## Q4 2412 0.31 0.35 0.299 0.233 0.91 0.29
## Q5 2412 0.36 0.35 0.293 0.247 0.24 0.42
## Q6 2412 0.22 0.23 0.144 0.130 0.12 0.33
## Q7 2412 0.30 0.27 0.199 0.166 0.38 0.49
## Q8 2412 0.40 0.41 0.374 0.308 0.84 0.36
## Q9 2412 0.46 0.43 0.406 0.339 0.42 0.49
## Q10 2412 0.17 0.14 0.028 0.030 0.35 0.48
## Q11 2412 0.21 0.18 0.073 0.064 0.41 0.49
## Q12 2412 0.44 0.43 0.394 0.320 0.69 0.46
## Q13 2412 0.43 0.44 0.413 0.337 0.83 0.38
## Q14 2412 0.31 0.36 0.306 0.238 0.92 0.26
## Q15 2412 0.22 0.29 0.212 0.157 0.95 0.23
## Q16 2412 0.21 0.24 0.154 0.109 0.86 0.35
## Q17 2412 0.26 0.24 0.145 0.115 0.47 0.50
## Q18 2412 0.44 0.41 0.376 0.310 0.57 0.49
## Q19 2412 0.48 0.48 0.460 0.376 0.73 0.44
## Q20 2412 0.37 0.37 0.323 0.272 0.17 0.38
## Q21 2408 0.31 0.31 0.237 0.183 0.74 0.44
## Q22 2408 0.54 0.51 0.505 0.423 0.37 0.48
## Q23 2408 0.39 0.36 0.302 0.256 0.39 0.49
## Q24 2408 0.39 0.37 0.312 0.262 0.53 0.50
## Q25 2408 0.31 0.32 0.257 0.206 0.84 0.36
##
## Non missing response frequency for each item
## 0 1 miss
## Q1 0.34 0.66 0
## Q2 0.08 0.92 0
## Q3 0.03 0.97 0
## Q4 0.09 0.91 0
## Q5 0.76 0.24 0
## Q6 0.88 0.12 0
## Q7 0.62 0.38 0
## Q8 0.16 0.84 0
## Q9 0.58 0.42 0
## Q10 0.65 0.35 0
## Q11 0.59 0.41 0
## Q12 0.31 0.69 0
## Q13 0.17 0.83 0
## Q14 0.08 0.92 0
## Q15 0.05 0.95 0
## Q16 0.14 0.86 0
## Q17 0.53 0.47 0
## Q18 0.43 0.57 0
## Q19 0.27 0.73 0
## Q20 0.83 0.17 0
## Q21 0.26 0.74 0
## Q22 0.63 0.37 0
## Q23 0.61 0.39 0
## Q24 0.47 0.53 0
## Q25 0.16 0.84 0
#item data
item_data = tibble(
item = names(scored_items),
question = test_item_names %>% str_clean(),
correct = score_key,
pass_rate = scored_items %>% colMeans(na.rm = T),
loading = IRT_fit %>% summary(verbose = F) %>% .$rotF %>% as.numeric()
)
#print as data table, round digits to 2
item_data %>% df_round() %>% DT::datatable()
#item stats
GG_scatter(item_data, "pass_rate", "loading", case_names = "item")
## `geom_smooth()` using formula 'y ~ x'

#summary
quiz25 %>% select(score, g) %>% describe()
## vars n mean sd median trimmed mad min max range skew
## score 1 2413 1.53e+01 3.48 15.0000 1.53e+01 2.97 0.00 25.00 25.0 -0.1549
## g 2 2412 -2.43e-17 1.00 0.0021 2.95e-04 1.04 -3.84 2.66 6.5 -0.0309
## kurtosis se
## score 0.0275 0.0708
## g -0.2282 0.0204
#distributions
ggplot(quiz25, aes(score)) +
geom_bar(stat = "count") +
scale_x_continuous(breaks = 0:25, limits = c(0, 25)) +
GG_denhist(quiz25, "g", vline = F)
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing non-finite values (stat_density).
## Warning: Removed 2 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing non-finite values (stat_density).

GG_save("figs/score_dists.png")
## Warning: Removed 2 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing non-finite values (stat_density).
#quantiles
#estimate the empirical cumulative distribution function
empirical_cdf = ecdf(quiz25$score)
#get centiles for scores 0-25
empirical_cdf(0:25) %>%
set_names(0:25)
## 0 1 2 3 4 5 6 7
## 0.000414 0.000414 0.000829 0.001658 0.001658 0.003315 0.006631 0.012433
## 8 9 10 11 12 13 14 15
## 0.025280 0.053046 0.090344 0.143804 0.211355 0.297555 0.398259 0.512225
## 16 17 18 19 20 21 22 23
## 0.632408 0.732283 0.824285 0.891007 0.940323 0.964360 0.984252 0.994198
## 24 25
## 0.998757 1.000000
Main results
#correlations among scores and guesses at own ability
quiz25 %>%
select(score, g, score_guess, centile_guess) %>%
cor_matrix(p_val = T)
## score g score_guess centile_guess
## score "1" "0.95***" "0.52***" "0.46***"
## g "0.95***" "1" "0.53***" "0.48***"
## score_guess "0.52***" "0.53***" "1" "0.61***"
## centile_guess "0.46***" "0.48***" "0.61***" "1"
#plot
GG_scatter(quiz25, "score_guess", "centile_guess")
## `geom_smooth()` using formula 'y ~ x'

GG_save("figs/scatter_score_guess_centile.png")
## `geom_smooth()` using formula 'y ~ x'
#main nonlinear tests
ggplot(quiz25, aes(score, score_guess)) +
geom_count() +
geom_smooth() +
geom_smooth(method = "lm", alpha = .5) +
geom_smooth(method = "lm", color = "orange", alpha = .5) +
scale_x_continuous(breaks = seq(0, 25, 5), limits = c(0, 25)) +
ylab("Self-estimated score") +
ggplot(quiz25, aes(g, centile_guess)) +
geom_point() +
geom_smooth(color = "blue", alpha = .5) +
geom_smooth(method = "lm", color = "orange", alpha = .5) +
ylab("Self-estimated ability centile")
## Warning: Removed 5 rows containing non-finite values (stat_sum).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 7 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 7 rows containing non-finite values (stat_smooth).
## Warning: Removed 7 rows containing missing values (geom_point).

GG_save("figs/nonlinear_fits.png")
## Warning: Removed 5 rows containing non-finite values (stat_sum).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 7 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 7 rows containing non-finite values (stat_smooth).
## Warning: Removed 7 rows containing missing values (geom_point).
#proportion with < -2z
mean(quiz25$g < -2, na.rm=T)
## [1] 0.0207
#regressions
#formal test nonlinear
lrtest(
ols(score ~ score_guess, data = quiz25),
ols(score ~ rcs(score_guess), data = quiz25)
)
##
## Model 1: score ~ score_guess
## Model 2: score ~ rcs(score_guess)
##
## L.R. Chisq d.f. P
## 4.046 3.000 0.256
lrtest(
ols(g ~ centile_guess, data = quiz25),
ols(g ~ rcs(centile_guess), data = quiz25)
)
##
## Model 1: g ~ centile_guess
## Model 2: g ~ rcs(centile_guess)
##
## L.R. Chisq d.f. P
## 4.69e+01 3.00e+00 3.63e-10
#model fits to get R2
list(
raw_linear = ols(score ~ score_guess, data = quiz25),
raw_spline = ols(score ~ rcs(score_guess), data = quiz25),
g_linear = ols(g ~ centile_guess, data = quiz25),
g_spline = ols(g ~ rcs(centile_guess), data = quiz25)
) %>% summarize_models()
#incremental validity?
#compare R2s
ols(g ~ score_guess + centile_guess, data = quiz25)
## Frequencies of Missing Values Due to Each Variable
## g score_guess centile_guess
## 1 5 7
##
## Linear Regression Model
##
## ols(formula = g ~ score_guess + centile_guess, data = quiz25)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 2406 LR chi2 943.38 R2 0.324
## sigma0.8199 d.f. 2 R2 adj 0.324
## d.f. 2403 Pr(> chi2) 0.0000 g 0.642
##
## Residuals
##
## Min 1Q Median 3Q Max
## -5.19255 -0.53968 -0.01491 0.55539 2.62168
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -2.0936 0.0642 -32.59 <0.0001
## score_guess 0.0878 0.0049 17.85 <0.0001
## centile_guess 0.0125 0.0010 12.01 <0.0001
##
ols(score ~ score_guess + centile_guess, data = quiz25)
## Frequencies of Missing Values Due to Each Variable
## score score_guess centile_guess
## 0 5 7
##
## Linear Regression Model
##
## ols(formula = score ~ score_guess + centile_guess, data = quiz25)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 2406 LR chi2 862.90 R2 0.301
## sigma2.8976 d.f. 2 R2 adj 0.301
## d.f. 2403 Pr(> chi2) 0.0000 g 2.153
##
## Residuals
##
## Min 1Q Median 3Q Max
## -17.87365 -1.90617 0.04497 1.89687 9.60008
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 8.3028 0.2270 36.57 <0.0001
## score_guess 0.3066 0.0174 17.63 <0.0001
## centile_guess 0.0391 0.0037 10.59 <0.0001
##
#lr tests
lrtest(
ols(g ~ centile_guess, data = quiz25),
ols(g ~ centile_guess + score_guess, data = quiz25)
)
##
## Model 1: g ~ centile_guess
## Model 2: g ~ centile_guess + score_guess
##
## L.R. Chisq d.f. P
## 300 1 0
lrtest(
ols(score ~ score_guess, data = quiz25),
ols(score ~ score_guess + centile_guess, data = quiz25)
)
##
## Model 1: score ~ score_guess
## Model 2: score ~ score_guess + centile_guess
##
## L.R. Chisq d.f. P
## 109 1 0
#model models, save and test residuals
(raw_score_model = ols(score_guess ~ score, data = quiz25))
## Frequencies of Missing Values Due to Each Variable
## score_guess score
## 5 0
##
## Linear Regression Model
##
## ols(formula = score_guess ~ score, data = quiz25)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 2408 LR chi2 754.02 R2 0.269
## sigma3.6670 d.f. 1 R2 adj 0.269
## d.f. 2406 Pr(> chi2) 0.0000 g 2.504
##
## Residuals
##
## Min 1Q Median 3Q Max
## -14.7274 -2.4440 0.1229 2.4812 19.4653
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 4.2513 0.3380 12.58 <0.0001
## score 0.6417 0.0216 29.74 <0.0001
##
(g_score_model = ols(centile_guess ~ g, data = quiz25))
## Frequencies of Missing Values Due to Each Variable
## centile_guess g
## 7 1
##
## Linear Regression Model
##
## ols(formula = centile_guess ~ g, data = quiz25)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 2406 LR chi2 643.84 R2 0.235
## sigma17.6824 d.f. 1 R2 adj 0.234
## d.f. 2404 Pr(> chi2) 0.0000 g 11.116
##
## Residuals
##
## Min 1Q Median 3Q Max
## -77.324 -9.005 2.390 11.929 69.392
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 68.2999 0.3605 189.46 <0.0001
## g 9.8219 0.3616 27.16 <0.0001
##
#test residuals for HS
test_HS(resid = resid(raw_score_model), x = quiz25$score)
test_HS(resid = resid(g_score_model), x = quiz25$g)
#add 10th and 90th quantiles
quiz25 = add_quantile_smooths(data = quiz25, x = "score", y = "score_guess", quantiles = c(.1, .9), k = 5)
## Estimating learning rate. Each dot corresponds to a loss evaluation.
## qu = 0.1..............done
## Estimating learning rate. Each dot corresponds to a loss evaluation.
## qu = 0.9.............done
quiz25 = add_quantile_smooths(data = quiz25, x = "g", y = "centile_guess", quantiles = c(.1, .9), k = 5)
## Estimating learning rate. Each dot corresponds to a loss evaluation.
## qu = 0.1.................done
## Estimating learning rate. Each dot corresponds to a loss evaluation.
## qu = 0.9...........done
#plots
quiz25 %>%
filter(
!is.na(score), !is.na(score_guess)
) %>%
ggplot(aes(score, score_guess)) +
geom_point(alpha = .2) +
geom_smooth(method = lm, se = F) +
geom_ribbon(mapping = aes(
ymin = score_guess_q10,
ymax = score_guess_q90
), alpha = .4) +
quiz25 %>%
filter(
!is.na(g), !is.na(score_guess)
) %>%
ggplot(aes(g, centile_guess)) +
geom_point(alpha = .2) +
geom_smooth(method = lm, se = F) +
geom_ribbon(mapping = aes(
ymin = centile_guess_q10,
ymax = centile_guess_q90
), alpha = .4)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).

GG_save("figs/HS_plots.png")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
Without outlier
#remove outlier
quiz25_noOutlier = quiz25 %>% filter(g > -2.5, g < 2.5)
nrow(quiz25_noOutlier) - nrow(quiz25)
## [1] -21
#cors
quiz25_noOutlier %>%
select(score, g, score_guess, centile_guess) %>%
cor_matrix(p_val = T)
## score g score_guess centile_guess
## score "1" "0.95***" "0.53***" "0.47***"
## g "0.95***" "1" "0.54***" "0.50***"
## score_guess "0.53***" "0.54***" "1" "0.61***"
## centile_guess "0.47***" "0.50***" "0.61***" "1"
#combined
#cors
combine_upperlower(
quiz25_noOutlier %>%
select(score, g, score_guess, centile_guess) %>%
wtd.cors(),
quiz25 %>%
select(score, g, score_guess, centile_guess) %>%
wtd.cors(), .diag = 1
)
## score g score_guess centile_guess
## score 1.000 0.951 0.528 0.470
## g 0.954 1.000 0.543 0.496
## score_guess 0.519 0.533 1.000 0.613
## centile_guess 0.459 0.485 0.610 1.000
#formal test nonlinear
lrtest(
ols(score ~ score_guess, data = quiz25_noOutlier),
ols(score ~ rcs(score_guess), data = quiz25_noOutlier)
)
##
## Model 1: score ~ score_guess
## Model 2: score ~ rcs(score_guess)
##
## L.R. Chisq d.f. P
## 13.09073 3.00000 0.00444
lrtest(
ols(g ~ centile_guess, data = quiz25_noOutlier),
ols(g ~ rcs(centile_guess), data = quiz25_noOutlier)
)
##
## Model 1: g ~ centile_guess
## Model 2: g ~ rcs(centile_guess)
##
## L.R. Chisq d.f. P
## 5.04e+01 3.00e+00 6.53e-11
#model fits to get R2
list(
raw_linear = ols(score ~ score_guess, data = quiz25_noOutlier),
raw_spline = ols(score ~ rcs(score_guess), data = quiz25_noOutlier),
g_linear = ols(g ~ centile_guess, data = quiz25_noOutlier),
g_spline = ols(g ~ rcs(centile_guess), data = quiz25_noOutlier)
) %>% summarize_models()
#nonlinear fits
ggplot(quiz25_noOutlier, aes(score, score_guess)) +
geom_count() +
geom_smooth() +
geom_smooth(method = "lm", alpha = .5) +
geom_smooth(method = "lm", color = "orange", alpha = .5) +
scale_x_continuous(breaks = seq(0, 25, 5), limits = c(0, 25)) +
ylab("Self-estimated score") +
ggplot(quiz25_noOutlier, aes(g, centile_guess)) +
geom_point() +
geom_smooth(color = "blue", alpha = .5) +
geom_smooth(method = "lm", color = "orange", alpha = .5) +
ylab("Self-estimated ability centile")
## Warning: Removed 4 rows containing non-finite values (stat_sum).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 6 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 6 rows containing non-finite values (stat_smooth).
## Warning: Removed 6 rows containing missing values (geom_point).

GG_save("figs/nonlinear_fits_noOutlier.png")
## Warning: Removed 4 rows containing non-finite values (stat_sum).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 6 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 6 rows containing non-finite values (stat_smooth).
## Warning: Removed 6 rows containing missing values (geom_point).
#model models, save and test residuals
(raw_score_model = ols(score_guess ~ score, data = quiz25_noOutlier))
## Frequencies of Missing Values Due to Each Variable
## score_guess score
## 4 0
##
## Linear Regression Model
##
## ols(formula = score_guess ~ score, data = quiz25_noOutlier)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 2388 LR chi2 781.82 R2 0.279
## sigma3.6091 d.f. 1 R2 adj 0.279
## d.f. 2386 Pr(> chi2) 0.0000 g 2.542
##
## Residuals
##
## Min 1Q Median 3Q Max
## -14.8498 -2.5017 0.1623 2.4903 14.4983
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 3.8215 0.3445 11.09 <0.0001
## score 0.6680 0.0220 30.40 <0.0001
##
(g_score_model = ols(centile_guess ~ g, data = quiz25_noOutlier))
## Frequencies of Missing Values Due to Each Variable
## centile_guess g
## 6 0
##
## Linear Regression Model
##
## ols(formula = centile_guess ~ g, data = quiz25_noOutlier)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 2386 LR chi2 675.05 R2 0.246
## sigma17.5382 d.f. 1 R2 adj 0.246
## d.f. 2384 Pr(> chi2) 0.0000 g 11.442
##
## Residuals
##
## Min 1Q Median 3Q Max
## -77.763 -8.856 2.499 11.887 47.726
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 68.1823 0.3591 189.89 <0.0001
## g 10.3666 0.3713 27.92 <0.0001
##
#test residuals for HS
test_HS(resid = resid(raw_score_model), x = quiz25_noOutlier$score)
test_HS(resid = resid(g_score_model), x = quiz25_noOutlier$g)
#add 10th and 90th quantiles
quiz25_noOutlier = add_quantile_smooths(data = quiz25_noOutlier, x = "score", y = "score_guess", quantiles = c(.1, .9), k = 5)
## Estimating learning rate. Each dot corresponds to a loss evaluation.
## qu = 0.1..............done
## Estimating learning rate. Each dot corresponds to a loss evaluation.
## qu = 0.9.........done
quiz25_noOutlier = add_quantile_smooths(data = quiz25_noOutlier, x = "g", y = "centile_guess", quantiles = c(.1, .9), k = 5)
## Estimating learning rate. Each dot corresponds to a loss evaluation.
## qu = 0.1................done
## Estimating learning rate. Each dot corresponds to a loss evaluation.
## qu = 0.9.........done
#plots
quiz25_noOutlier %>%
filter(
!is.na(score), !is.na(score_guess)
) %>%
ggplot(aes(score, score_guess)) +
geom_point(alpha = .2) +
geom_smooth(method = lm, se = F) +
geom_ribbon(mapping = aes(
ymin = score_guess_q10,
ymax = score_guess_q90
), alpha = .4) +
quiz25_noOutlier %>%
filter(
!is.na(g), !is.na(score_guess)
) %>%
ggplot(aes(g, centile_guess)) +
geom_point(alpha = .2) +
geom_smooth(method = lm, se = F) +
geom_ribbon(mapping = aes(
ymin = centile_guess_q10,
ymax = centile_guess_q90
), alpha = .4)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).

GG_save("figs/HS_plots_noOutlier.png")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
#incremental validity in R2
quiz25_noOutlier_z = quiz25_noOutlier %>% df_standardize(messages = F)
list(
raw1 = ols(score ~ score_guess, data = quiz25_noOutlier_z),
raw2 = ols(score ~ centile_guess, data = quiz25_noOutlier_z),
raw_combined = ols(score ~ score_guess + centile_guess, data = quiz25_noOutlier_z),
g_1 = ols(g ~ centile_guess, data = quiz25_noOutlier_z),
g_2 = ols(g ~ score_guess, data = quiz25_noOutlier_z),
g_combined = ols(g ~ centile_guess + score_guess, data = quiz25_noOutlier_z)
) %>% summarize_models()