A reviewer asked me to look at possible heteroscedasticity (HS) in some models I ran. It took me on a detour that ended with making some new convenience functions for testing for HS, whether linear or nonlinear. HS is when the variance of the residuals depends on the predictor values, i.e., the amount of error is a function of the predictors, whereas the errors are supposed to be random normal values with constant variance. HS biases the standard errors of the model, though it does not normally bias the effect sizes/slopes (I think).
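As a quick illustration of that last point, here is a minimal sketch (not part of the analysis below, and assuming the sandwich and lmtest packages are installed): with heteroscedastic errors, the slope estimate is still about right, but the usual lm() standard errors differ from the heteroscedasticity-consistent ones.
#sketch: compare naive and heteroscedasticity-consistent (robust) standard errors
#assumes sandwich and lmtest are installed; not used in the rest of the post
library(sandwich)
library(lmtest)
set.seed(1)
hs_demo = data.frame(x = runif(500, 0, 10))
hs_demo$y = hs_demo$x + rnorm(500, sd = hs_demo$x) #error SD grows with x -> linear HS
hs_demo_fit = lm(y ~ x, data = hs_demo)
coeftest(hs_demo_fit) #naive standard errors
coeftest(hs_demo_fit, vcov. = vcovHC(hs_demo_fit, type = "HC3")) #robust standard errors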
options(
digits = 2
)
library(pacman)
p_load(kirkegaard,
readxl,
rms,
osfr,
olsrr,
haven
)
theme_set(theme_bw())
#get model R2 adj
get_r2adj = function(x) {
x %>% summary.lm() %>% .$adj.r.squared
}
#boomer code
#https://rdrr.io/cran/rms/src/R/ols.s#sym-print.ols
get_p = function(x) {
1 - pchisq(x$stats['Model L.R.'], x$stats['d.f.'])
}
#make function to get smoothed quantiles
quantile_smooth = function(x, y, quantile, method = c("qgam", "Rq", "running"), k = 50) {
  #pick method (first one by default)
  method = match.arg(method)
  if (method == "qgam") {
    #same as in example here
    #https://rdrr.io/cran/qgam/man/qgam.html
    fit = qgam::qgam(list(y ~ s(x, k = k, bs = "cr"), ~ s(x, k = k, bs = "cr")),
                     data = data.frame(x = x, y = y), qu = quantile)
    return(predict(fit))
  } else if (method == "Rq") {
    #fit quantile regression with a spline
    #https://rdrr.io/cran/rms/man/Rq.html
    fit = rms::Rq(y ~ rcs(x), data = data.frame(x = x, y = y), tau = quantile)
    return(fitted(fit) %>% as.vector())
  } else if (method == "running") {
    #beware, need to sort y by x first
    #and then return the result to the original order
    x_order = order(x)
    return(caTools::runquantile(y[x_order], k = k, probs = quantile)[order(x_order)])
  } else {
    stop(str_glue("method not recognized: {method}"), call. = F)
  }
}
Simulate data from 3 data generating functions: one with no HS, one with linear HS, one with nonlinear HS. Then plot them to check that they look as intended.
#settings
HS_strength = 5
HS_n = 1000
set.seed(2)
#simulate
tibble(
x = seq(0, 10, length.out = HS_n),
y_noHS = x * 1 + rnorm(HS_n),
y_linHS = x * 1 + rnorm(HS_n)*seq(1, HS_strength, length.out = HS_n),
y_nonlinHS = x * 1 + rnorm(HS_n)*c(seq(HS_strength, 1, length.out = HS_n/2),
seq(1, HS_strength, length.out = HS_n/2))
) %>% pivot_longer(cols = -x) %>%
mutate(
name = factor(name %>% mapvalues(c("y_noHS", "y_linHS", "y_nonlinHS"), c("no HS", "linear HS", "nonlinear HS")), levels = c("no HS", "linear HS", "nonlinear HS"))
) %>%
#smoothers for ribbons
plyr::ddply("name", function(x) {
x %>%
mutate(
# value_90_rq = caTools::runquantile(value, k = 50, probs = .10),
# value_10_rq = caTools::runquantile(value, k = 50, probs = .90),
value_90_running = quantile_smooth(x, value, k = 50, quantile = .10, method = "running"),
value_10_running = quantile_smooth(x, value, k = 50, quantile = .90, method = "running"),
value_90_quantreg = quantile_smooth(x, value, quantile = .10, method = "Rq"),
value_10_quantreg = quantile_smooth(x, value, quantile = .90, method = "Rq"),
value_90_qgam = quantile_smooth(x, value, quantile = .10, method = "qgam"),
value_10_qgam = quantile_smooth(x, value, quantile = .90, method = "qgam")
)
}) ->
HS_sim_data
## Estimating learning rate. Each dot corresponds to a loss evaluation.
## qu = 0.1.............done
## Estimating learning rate. Each dot corresponds to a loss evaluation.
## qu = 0.9..............done
## Warning: Problem with `mutate()` input `value_10_quantreg`.
## ℹ 29 non-positive fis
## ℹ Input `value_10_quantreg` is `quantile_smooth(x, value, quantile = 0.9, method = "Rq")`.
## Warning in quantreg::summary.rq(fit, covariance = TRUE, se = se, hs = hs): 29
## non-positive fis
## Estimating learning rate. Each dot corresponds to a loss evaluation.
## qu = 0.1...........done
## Estimating learning rate. Each dot corresponds to a loss evaluation.
## qu = 0.9..............done
## Estimating learning rate. Each dot corresponds to a loss evaluation.
## qu = 0.1............done
## Estimating learning rate. Each dot corresponds to a loss evaluation.
## qu = 0.9........
## qu = 0.9, log(sigma) = 0.067717 : outer Newton did not converge fully.
## ...done
#this is nice https://stackoverflow.com/questions/37326686/ggplot2-geom-ribbon-with-alpha-dependent-on-data-density-along-y-axis-for-each
#running quantiles
HS_sim_data %>%
ggplot(aes(x, value)) +
geom_point(alpha = .2) +
geom_smooth(method = lm, se = F) +
geom_ribbon(mapping = aes(
ymin = value_90_running,
ymax = value_10_running
), alpha = .4) +
facet_wrap("name", labeller = )
## `geom_smooth()` using formula 'y ~ x'
GG_save("figs/HS_examples_running.png")
## `geom_smooth()` using formula 'y ~ x'
#quantile reg with spline
HS_sim_data %>%
ggplot(aes(x, value)) +
geom_point(alpha = .2) +
geom_smooth(method = lm, se = F) +
geom_ribbon(mapping = aes(
ymin = value_90_quantreg,
ymax = value_10_quantreg
), alpha = .4) +
facet_wrap("name")
## `geom_smooth()` using formula 'y ~ x'
GG_save("figs/HS_examples_quantreg.png")
## `geom_smooth()` using formula 'y ~ x'
#quantile gam
HS_sim_data %>%
ggplot(aes(x, value)) +
geom_point(alpha = .2) +
geom_smooth(method = lm, se = F) +
geom_ribbon(mapping = aes(
ymin = value_90_qgam,
ymax = value_10_qgam
), alpha = .4) +
facet_wrap("name")
## `geom_smooth()` using formula 'y ~ x'
GG_save("figs/HS_examples_qgam.png")
## `geom_smooth()` using formula 'y ~ x'
#visualize the residuals
fit_lin_mod = ols(value ~ x, data = HS_sim_data)
HS_sim_data$resid = resid(fit_lin_mod)
#add 10th and 90th centiles by group
HS_sim_data %<>%
plyr::ddply("name", function(dd) {
dd$resid_10 = quantile_smooth(dd$x, dd$resid, quantile = .10, method = "qgam")
dd$resid_90 = quantile_smooth(dd$x, dd$resid, quantile = .90, method = "qgam")
dd
})
## Estimating learning rate. Each dot corresponds to a loss evaluation.
## qu = 0.1.........done
## Estimating learning rate. Each dot corresponds to a loss evaluation.
## qu = 0.9................done
## Estimating learning rate. Each dot corresponds to a loss evaluation.
## qu = 0.1...........done
## Estimating learning rate. Each dot corresponds to a loss evaluation.
## qu = 0.9.................done
## Estimating learning rate. Each dot corresponds to a loss evaluation.
## qu = 0.1...........
## qu = 0.1, log(sigma) = -0.023419 : outer Newton did not converge fully.
## done
## Estimating learning rate. Each dot corresponds to a loss evaluation.
## qu = 0.9...........done
#plot
HS_sim_data %>%
ggplot(aes(x, resid)) +
geom_point() +
geom_ribbon(mapping = aes(
ymin = resid_10,
ymax = resid_90
), alpha = .3, color = "blue", fill = "blue") +
facet_wrap("name")
#absolute residuals
HS_sim_data %>%
ggplot(aes(x, abs(resid))) +
geom_point() +
geom_smooth() +
geom_smooth(method = "lm", color = "orange") +
facet_wrap("name")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula 'y ~ x'
#squared residuals
HS_sim_data %>%
ggplot(aes(x, resid^2)) +
geom_point() +
geom_smooth() +
geom_smooth(method = "lm", color = "orange") +
facet_wrap("name")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula 'y ~ x'
Some standard methods are implemented in olsrr: https://cran.r-project.org/web/packages/olsrr/vignettes/heteroskedasticity.html
#try them on our data
plyr::ddply(HS_sim_data, "name", function(dd) {
# browser()
#fit model
fit = lm(value ~ x, data = dd)
#standard methods
tibble(
bartlett = ols_test_bartlett(dd, 'value', 'x')$pval,
#breusch_pagan = ols_test_breusch_pagan(fit, rhs = T)$pval, #produced error??
test_score = ols_test_score(fit, rhs = T)$p,
test_f = ols_test_f(fit)$p
)
})
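Note that olsrr's Breusch-Pagan test produced an error here (hence the commented-out line above). As a cross-check, not used in the rest of the post, the same test is available as bptest() in the lmtest package; a minimal sketch on the simulated data, assuming lmtest is installed:
#sketch only: Breusch-Pagan test via lmtest, as an alternative to the olsrr version
plyr::ddply(HS_sim_data, "name", function(dd) {
  fit = lm(value ~ x, data = dd)
  tibble(breusch_pagan_p = lmtest::bptest(fit)$p.value)
})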
Despite the existence of these standard tests, we make our own attempt, because the standard methods are susceptible to outliers. One can adjust for that by using ranked data, but we build it from the bottom up just because. The function below takes the model residuals and the predictor of interest, then: (1) standardizes the absolute residuals and the predictor, and also rank-transforms both; (2) regresses the absolute residuals on the predictor with a linear fit and with a restricted cubic spline, on both the raw and the ranked data; (3) returns the adjusted R2 and p value of each fit, where the spline p values come from a likelihood ratio test against the corresponding linear fit, i.e., a test of nonlinear HS specifically.
#function
test_HS = function(resid, x) {
#data
d = tibble(
resid = standardize(abs(resid)),
x = standardize(x),
resid_rank = rank(resid),
x_rank = rank(x)
)
#fits
mods = list(
fit_linear = ols(resid ~ x, data = d),
fit_spline = ols(resid ~ rcs(x), data = d),
fit_linear_rank = ols(resid_rank ~ x_rank, data = d),
fit_spline_rank = ols(resid_rank ~ rcs(x_rank), data = d)
)
#summarize
y = tibble(
test = c("linear raw", "spline raw", "linear rank", "spline rank"),
r2adj = map_dbl(mods, get_r2adj),
p = map_dbl(mods, get_p),
fit = mods
)
#for the rcs, we have to compute the model improvement vs. linear
#https://stackoverflow.com/questions/47937065/how-can-i-interpret-the-p-value-of-nonlinear-term-under-rms-anova
#https://stats.stackexchange.com/questions/405961/is-there-formal-test-of-non-linearity-in-linear-regression
y$p[2] = lrtest(mods$fit_linear, mods$fit_spline)$stats["P"]
y$p[4] = lrtest(mods$fit_linear_rank, mods$fit_spline_rank)$stats["P"]
y %>% mutate(log10_p = -log10(p))
}
#test each for HS
HS_sim_data %>%
plyr::ddply("name", function(x) {
fit = ols(value ~ x, data = x)
test_HS(resid = fit$residuals, x = x$x)
}) %>% as_tibble()
#function
run_sim = function(i, n, HS_strength = 5) {
#simulate data
HS_n = n
#make data
tibble(
x = seq(0, 10, length.out = HS_n),
y_noHS = x * 1 + rnorm(HS_n),
y_linHS = x * 1 + rnorm(HS_n)*seq(1, HS_strength, length.out = HS_n),
y_nonlinHS = x * 1 + rnorm(HS_n)*c(seq(HS_strength, 1, length.out = HS_n/2),
seq(1, HS_strength, length.out = HS_n/2))
) %>%
pivot_longer(cols = -x) %>%
mutate(
name = factor(name %>% mapvalues(c("y_noHS", "y_linHS", "y_nonlinHS"), c("no HS", "linear HS", "nonlinear HS")), levels = c("no HS", "linear HS", "nonlinear HS"))
) ->
HS_sim_data
#do the tests
HS_sim_data %>%
plyr::ddply("name", function(x) {
fit = ols(value ~ x, data = x)
test_HS(resid = fit$residuals, x = x$x)
}) %>% as_tibble() %>%
#we dont want to save model fits
dplyr::select(-fit) %>%
mutate(
i = i
)
}
#run
set.seed(1)
HS_sim_tests = map_df(1:10000, run_sim, n = 200, HS_strength = 5)
#do p values behave as expected?
HS_sim_tests %>%
ggplot(aes(p)) +
geom_histogram(boundary = 0) +
facet_wrap(c("name", "test"), scales = c("free_y"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
GG_save("figs/p_values_tests.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#below .05 proportions
HS_sim_tests %>% plyr::ddply(c("name", "test"), function(dd) {
tibble(
p_mean = mean(dd$p),
p_05 = mean(dd$p < .05),
p_01 = mean(dd$p < .01)
)
})
So:
We ran 10,000 simulations (per the code above) at n = 200, with 3 conditions and quite strong HS.
We test for HS by regressing the absolute residuals on the predictor, with linear and nonlinear (spline) fits, on both raw and ranked data.
The p values should follow specific distributions:
First row (no HS): we want uniform distributions, since there is no HS (a quick formal uniformity check is sketched below).
Second row (linear HS): we want p values close to 0 in columns 1-2 (the linear tests) and uniform in columns 3-4 (the spline tests).
Third row (nonlinear HS): we want p values close to 0 in columns 3-4 and uniform in columns 1-2.
In fact, the p values are somewhat pushed towards 0 in rows 2-3 in the off-condition, i.e., we get inflated false positive rates.
This means the tests do find the HS, but can get its kind wrong, i.e., flagging linear HS when the HS is actually nonlinear, or nonlinear HS when it is linear.
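As mentioned above, a quick formal check of uniformity in the no-HS condition can be done with a Kolmogorov-Smirnov test against the uniform distribution. A minimal sketch reusing the HS_sim_tests object (not part of the original analysis):
#sketch: KS test of p value uniformity in the no-HS condition, by test type
HS_sim_tests %>%
  dplyr::filter(name == "no HS") %>%
  plyr::ddply("test", function(dd) {
    tibble(ks_p = ks.test(dd$p, "punif")$p.value)
  })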
Same as above, but with n = 1000, to see whether the misidentification problem goes away.
#run
set.seed(1)
HS_sim_tests2 = map_df(1:100, run_sim, n = 1000, HS_strength = 5)
#do p values behave as expected?
HS_sim_tests2 %>%
ggplot(aes(p)) +
geom_histogram(boundary = 0) +
facet_wrap(c("name", "test"), scales = c("free_y"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#below .05 proportions
HS_sim_tests2 %>% plyr::ddply(c("name", "test"), function(dd) {
tibble(
p_mean = mean(dd$p),
p_05 = mean(dd$p < .05),
p_01 = mean(dd$p < .01)
)
})
Finally, let's try this on some real data: the Dunning-Kruger study published in Intelligence (https://www.sciencedirect.com/science/article/abs/pii/S0160289620300271?via%3Dihub#f0005), with data available at https://osf.io/dg547/.
#read their data
dk = read_spss("Dunning_Kruger_INTELL_UPLOAD.sav")
#prep data
dk %<>% mutate(
SAI_z = SAI_winsorised %>% standardize(),
raven_z = Raven_Total %>% standardize(),
raven_90_qgam = quantile_smooth(SAI_z, raven_z, quantile = .10, method = "qgam", k = 5),
raven_10_qgam = quantile_smooth(SAI_z, raven_z, quantile = .90, method = "qgam", k = 5)
)
## Estimating learning rate. Each dot corresponds to a loss evaluation.
## qu = 0.1.........done
## Estimating learning rate. Each dot corresponds to a loss evaluation.
## qu = 0.9...........done
#plot
dk %>%
ggplot(aes(SAI_z, raven_z)) +
geom_point(alpha = .2) +
geom_smooth(method = lm, se = F) +
geom_ribbon(mapping = aes(
ymin = raven_90_qgam,
ymax = raven_10_qgam
), alpha = .4) +
scale_y_continuous("Raven test standard score (z)") +
scale_x_continuous("Self-estimated intelligence standard score (z)")
## `geom_smooth()` using formula 'y ~ x'
GG_save("figs/DK_HS.png")
## `geom_smooth()` using formula 'y ~ x'
#test results
DK_fit = lm(raven_z ~ SAI_z, data = dk)
summary(DK_fit)
##
## Call:
## lm(formula = raven_z ~ SAI_z, data = dk)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.568 -0.505 0.145 0.607 2.516
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.14e-15 3.15e-02 0.00 1
## SAI_z 2.82e-01 3.15e-02 8.97 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.96 on 927 degrees of freedom
## Multiple R-squared: 0.0798, Adjusted R-squared: 0.0788
## F-statistic: 80.4 on 1 and 927 DF, p-value: <2e-16
test_HS(resid = resid(DK_fit), x = dk$SAI_z)
#standard tests
ols_test_bartlett(dk, 'raven_z', 'SAI_z')
##
## Bartlett's Test of Homogenity of Variances
## ------------------------------------------------
## Ho: Variances are equal across groups
## Ha: Variances are unequal for atleast two groups
##
## Data
## ------------------------
## Variables: raven_z SAI_z
##
## Test Summary
## ------------------------
## DF = 1
## Chi2 = 0.000
## Prob > Chi2 = 1.000
# ols_test_breusch_pagan(DK_fit, rhs = T)
ols_test_score(DK_fit, rhs = T)
##
## Score Test for Heteroskedasticity
## ---------------------------------
## Ho: Variance is homogenous
## Ha: Variance is not homogenous
##
## Variables: m2
##
## Test Summary
## -------------------------
## DF = 1
## Chi2 = 7.371
## Prob > Chi2 = 0.0066
ols_test_f(DK_fit)
##
## F Test for Heteroskedasticity
## -----------------------------
## Ho: Variance is homogenous
## Ha: Variance is not homogenous
##
## Variables: fitted values of raven_z
##
## Test Summary
## ----------------------
## Num DF = 1
## Den DF = 927
## F = 7.414
## Prob > F = 0.0066