About

A reviewer asked me to look at possible heteroscedasticity (HS) in some models I ran. It took me on a detour that ended with making some new convenience functions for testing for HS, whether linear or nonlinear. HS is when the variance of the residuals depends on the predictor values; i.e., the amount of error is a function of the predictors, whereas the errors are supposed to be purely random normal values. HS biases the standard errors of the model, though it does not normally bias the effect sizes/slopes (I think).
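To make the standard-error point concrete, here is a minimal base R sketch (not part of the analysis below; the helper name sim_one is just for illustration). It repeatedly simulates a heteroscedastic regression and compares the actual sampling spread of the slope to the average standard error that OLS reports:

#minimal sketch: under HS the slope stays (roughly) unbiased, but the usual OLS SE is off
sim_one = function(n = 200) {
  x = runif(n, 0, 10)
  y = x + rnorm(n, sd = x)  #error SD grows with x, i.e. heteroscedasticity
  fit = lm(y ~ x)
  c(slope = unname(coef(fit)["x"]),
    se = summary(fit)$coefficients["x", "Std. Error"])
}
sims = t(replicate(2000, sim_one()))
mean(sims[, "slope"])  #close to the true slope of 1
sd(sims[, "slope"])    #actual sampling SD of the slope
mean(sims[, "se"])     #average reported SE; under HS it need not match the line above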

Init

options(
  digits = 2
)
library(pacman)
p_load(kirkegaard, 
       readxl, 
       rms, 
       osfr, 
       olsrr,
       haven
       )
theme_set(theme_bw())

Functions

#get model R2 adj
get_r2adj = function(x) {
  x %>% summary.lm() %>% .$adj.r.squared
}

#boomer code
#https://rdrr.io/cran/rms/src/R/ols.s#sym-print.ols
get_p = function(x) {
  1 - pchisq(x$stats['Model L.R.'], x$stats['d.f.'])
}

#make function to get smoothed quantiles
quantile_smooth = function(x, y, quantile, method = c("qgam", "Rq", "running"), k = 50) {
  
  #method (defaults to the first option)
  method = match.arg(method)
  
  if (method == "qgam") {
    #same as in example here
    #https://rdrr.io/cran/qgam/man/qgam.html
    fit = qgam::qgam(list(y ~ s(x, k = k, bs = "cr"), ~ s(x, k = k, bs = "cr")), 
               data = data.frame(x = x, y = y), qu = quantile)
    return(predict(fit))
    
  } else if (method == "Rq") {
    #quantile regression with a restricted cubic spline
    #https://rdrr.io/cran/rms/man/Rq.html
    fit = rms::Rq(y ~ rcs(x), data = data.frame(x = x, y = y), tau = quantile)
    return(fitted(fit) %>% as.vector())
    
  } else if (method == "running") {
    #running quantile needs y sorted by x,
    #so compute on the sorted values and put the result back in the original order
    x_order = order(x)
    out = rep(NA_real_, length(y))
    out[x_order] = caTools::runquantile(y[x_order], k = k, probs = quantile)
    return(out)
    
  } else {
    stop(str_glue("method not recognized: {method}"), call. = F)
  }
  
}
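A quick standalone usage check on toy data (made up here just to illustrate the interface; the three methods should give broadly similar 90th-centile curves):

#toy example: 90th centile of y as a smooth function of x, by each method
toy = tibble(x = runif(500, 0, 10))
toy$y = rnorm(500, sd = toy$x + 1)
toy$q90_qgam = quantile_smooth(toy$x, toy$y, quantile = .90, method = "qgam")
toy$q90_rq = quantile_smooth(toy$x, toy$y, quantile = .90, method = "Rq")
toy$q90_running = quantile_smooth(toy$x, toy$y, quantile = .90, method = "running")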

Simulating some data

Simulate data for 3 outcomes: one with no HS, one with linear HS, and one with nonlinear HS. Then plot them to see if they look as they should.

#settings
HS_strength = 5
HS_n = 1000
set.seed(2)

#simulate
tibble(
  x = seq(0, 10, length.out = HS_n),
  y_noHS = x * 1 + rnorm(HS_n),
  y_linHS = x * 1 + rnorm(HS_n)*seq(1, HS_strength, length.out = HS_n),
  y_nonlinHS = x * 1 + rnorm(HS_n)*c(seq(HS_strength, 1, length.out = HS_n/2),
                                 seq(1, HS_strength, length.out = HS_n/2))
) %>% pivot_longer(cols = -x) %>% 
  mutate(
    name = name %>% 
      mapvalues(c("y_noHS", "y_linHS", "y_nonlinHS"), c("no HS", "linear HS", "nonlinear HS")) %>% 
      factor(levels = c("no HS", "linear HS", "nonlinear HS"))
  )  %>% 
  #smoothers for ribbons
  plyr::ddply("name", function(x) {
    x %>%
      mutate(
        # value_90_rq = caTools::runquantile(value, k = 50, probs = .10),
        # value_10_rq = caTools::runquantile(value, k = 50, probs = .90),
        value_90_running = quantile_smooth(x, value, k = 50, quantile = .10, method = "running"),
        value_10_running = quantile_smooth(x, value, k = 50, quantile = .90, method = "running"),
        value_90_quantreg = quantile_smooth(x, value, quantile = .10, method = "Rq"),
        value_10_quantreg = quantile_smooth(x, value, quantile = .90, method = "Rq"),
        value_90_qgam = quantile_smooth(x, value, quantile = .10, method = "qgam"),
        value_10_qgam = quantile_smooth(x, value, quantile = .90, method = "qgam")
    )
  }) -> 
  HS_sim_data
## Estimating learning rate. Each dot corresponds to a loss evaluation. 
## qu = 0.1.............done 
## Estimating learning rate. Each dot corresponds to a loss evaluation. 
## qu = 0.9..............done
## Warning: Problem with `mutate()` input `value_10_quantreg`.
## ℹ 29 non-positive fis
## ℹ Input `value_10_quantreg` is `quantile_smooth(x, value, quantile = 0.9, method = "Rq")`.
## Warning in quantreg::summary.rq(fit, covariance = TRUE, se = se, hs = hs): 29
## non-positive fis
## Estimating learning rate. Each dot corresponds to a loss evaluation. 
## qu = 0.1...........done 
## Estimating learning rate. Each dot corresponds to a loss evaluation. 
## qu = 0.9..............done 
## Estimating learning rate. Each dot corresponds to a loss evaluation. 
## qu = 0.1............done 
## Estimating learning rate. Each dot corresponds to a loss evaluation. 
## qu = 0.9........
## qu = 0.9, log(sigma) = 0.067717 : outer Newton did not converge fully.
## ...done
#this is nice https://stackoverflow.com/questions/37326686/ggplot2-geom-ribbon-with-alpha-dependent-on-data-density-along-y-axis-for-each
#running quantiles
HS_sim_data %>% 
  ggplot(aes(x, value)) +
  geom_point(alpha = .2) +
  geom_smooth(method = lm, se = F) +
  geom_ribbon(mapping = aes(
    ymin = value_90_running,
    ymax = value_10_running
  ), alpha = .4) +
  facet_wrap("name", labeller = )
## `geom_smooth()` using formula 'y ~ x'

GG_save("figs/HS_examples_running.png")
## `geom_smooth()` using formula 'y ~ x'
#quantile reg with spline
HS_sim_data %>% 
  ggplot(aes(x, value)) +
  geom_point(alpha = .2) +
  geom_smooth(method = lm, se = F) +
  geom_ribbon(mapping = aes(
    ymin = value_90_quantreg,
    ymax = value_10_quantreg
  ), alpha = .4) +
  facet_wrap("name")
## `geom_smooth()` using formula 'y ~ x'

GG_save("figs/HS_examples_quantreg.png")
## `geom_smooth()` using formula 'y ~ x'
#quantile gam
HS_sim_data %>% 
  ggplot(aes(x, value)) +
  geom_point(alpha = .2) +
  geom_smooth(method = lm, se = F) +
  geom_ribbon(mapping = aes(
    ymin = value_90_qgam,
    ymax = value_10_qgam
  ), alpha = .4) +
  facet_wrap("name")
## `geom_smooth()` using formula 'y ~ x'

GG_save("figs/HS_examples_qgam.png")
## `geom_smooth()` using formula 'y ~ x'

Model residuals

#visualize the residuals
#note: a single pooled fit across the three simulated datasets (they share the same true regression line)
fit_lin_mod = ols(value ~ x, data = HS_sim_data)
HS_sim_data$resid = resid(fit_lin_mod)

#add 90th centiles by group
HS_sim_data %<>% 
  plyr::ddply("name", function(dd) {
    dd$resid_10 = quantile_smooth(dd$x, dd$resid, quantile = .10, method = "qgam")
    dd$resid_90 = quantile_smooth(dd$x, dd$resid, quantile = .90, method = "qgam")
    dd
  })
## Estimating learning rate. Each dot corresponds to a loss evaluation. 
## qu = 0.1.........done 
## Estimating learning rate. Each dot corresponds to a loss evaluation. 
## qu = 0.9................done 
## Estimating learning rate. Each dot corresponds to a loss evaluation. 
## qu = 0.1...........done 
## Estimating learning rate. Each dot corresponds to a loss evaluation. 
## qu = 0.9.................done 
## Estimating learning rate. Each dot corresponds to a loss evaluation. 
## qu = 0.1...........
## qu = 0.1, log(sigma) = -0.023419 : outer Newton did not converge fully.
## done 
## Estimating learning rate. Each dot corresponds to a loss evaluation. 
## qu = 0.9...........done
#plot
HS_sim_data %>% 
  ggplot(aes(x, resid)) +
  geom_point() +
  geom_ribbon(mapping = aes(
    ymin = resid_10,
    ymax = resid_90
  ), alpha = .3, color = "blue", fill = "blue") +
  facet_wrap("name")

#absolute residuals
HS_sim_data %>% 
  ggplot(aes(x, abs(resid))) +
  geom_point() +
  geom_smooth() +
  geom_smooth(method = "lm", color = "orange") +
  facet_wrap("name")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula 'y ~ x'

#squared residuals
HS_sim_data %>% 
  ggplot(aes(x, resid^2)) +
  geom_point() +
  geom_smooth() +
  geom_smooth(method = "lm", color = "orange") +
  facet_wrap("name")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula 'y ~ x'

Detecting heteroscedasticity

Standard functions

Some standard methods are implemented in olsrr: https://cran.r-project.org/web/packages/olsrr/vignettes/heteroskedasticity.html

#try them on our data
plyr::ddply(HS_sim_data, "name", function(dd) {
  # browser()
  #fit model
  fit = lm(value ~ x, data = dd)
  
  #standard methods
  tibble(
    bartlett = ols_test_bartlett(dd, 'value', 'x')$pval,
    #breusch_pagan = ols_test_breusch_pagan(fit, rhs = T)$pval, #produced error??
    test_score = ols_test_score(fit, rhs = T)$p,
    test_f = ols_test_f(fit)$p
  )
})

So:

  1. No method finds HS when it isn’t there
  2. All methods find linear HS when linear HS is there
  3. Only Bartlett’s method finds the nonlinear HS when it’s there

Despite that, we take our own stab at it, because these standard methods are susceptible to outliers. One can adjust for that by using ranked data, but we build it from the bottom up just because. The function below takes the model residuals and the predictor of interest, then:

  1. Takes absolute values of the residuals
  2. Standardizes both variables
  3. Creates rank transforms of both
  4. Fits linear and spline models to predict the transformed residuals from the predictor
  5. Derives p values for these comparisons, as well as effect sizes as adjusted R2.

#function
test_HS = function(resid, x) {
  #data
  d = tibble(
    resid = standardize(abs(resid)),
    x = standardize(x),
    resid_rank = rank(resid),
    x_rank = rank(x)
  )
  
  #fits
  mods = list(
    fit_linear = ols(resid ~ x, data = d),
    fit_spline = ols(resid ~ rcs(x), data = d),
    fit_linear_rank = ols(resid_rank ~ x_rank, data = d),
    fit_spline_rank = ols(resid_rank ~ rcs(x_rank), data = d)
  )
  
  #summarize
  y = tibble(
    test = c("linear raw", "spline raw", "linear rank", "spline rank"),
    r2adj = map_dbl(mods, get_r2adj),
    p = map_dbl(mods, get_p),
    fit = mods
  )
  
  #for the rcs, we have to compute the model improvement vs. linear
  #https://stackoverflow.com/questions/47937065/how-can-i-interpret-the-p-value-of-nonlinear-term-under-rms-anova
  #https://stats.stackexchange.com/questions/405961/is-there-formal-test-of-non-linearity-in-linear-regression
  y$p[2] = lrtest(mods$fit_linear, mods$fit_spline)$stats["P"]
  y$p[4] = lrtest(mods$fit_linear_rank, mods$fit_spline_rank)$stats["P"]

  y %>% mutate(log10_p = -log10(p))
}

#test each for HS
HS_sim_data %>% 
  plyr::ddply("name", function(x) {
    fit = ols(value ~ x, data = x)
    test_HS(resid = fit$residuals, x = x$x)
  }) %>% as_tibble() 

Do the p values work as expected?

#function
run_sim = function(i, n, HS_strength = 5) {
  #simulate data
  HS_n = n
  
  #make data
  tibble(
    x = seq(0, 10, length.out = HS_n),
    y_noHS = x * 1 + rnorm(HS_n),
    y_linHS = x * 1 + rnorm(HS_n)*seq(1, HS_strength, length.out = HS_n),
    y_nonlinHS = x * 1 + rnorm(HS_n)*c(seq(HS_strength, 1, length.out = HS_n/2),
                                   seq(1, HS_strength, length.out = HS_n/2))
  ) %>% 
    pivot_longer(cols = -x) %>% 
  mutate(
    name = name %>% 
      mapvalues(c("y_noHS", "y_linHS", "y_nonlinHS"), c("no HS", "linear HS", "nonlinear HS")) %>% 
      factor(levels = c("no HS", "linear HS", "nonlinear HS"))
  ) -> 
    HS_sim_data
  
  
  #do the tests
  HS_sim_data %>% 
    plyr::ddply("name", function(x) {
      fit = ols(value ~ x, data = x)
      test_HS(resid = fit$residuals, x = x$x)
    }) %>% as_tibble() %>% 
    #we dont want to save model fits
    dplyr::select(-fit) %>% 
    mutate(
      i = i
    )
}

#run
set.seed(1)
HS_sim_tests = map_df(1:10000, run_sim, n = 200, HS_strength = 5)

#do p values behave as expected?
HS_sim_tests %>% 
  ggplot(aes(p)) +
  geom_histogram(boundary = 0) +
  facet_wrap(c("name", "test"), scales = c("free_y"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figs/p_values_tests.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#below .05 proportions
HS_sim_tests %>% plyr::ddply(c("name", "test"), function(dd) {
  tibble(
    p_mean = mean(dd$p),
    p_05 = mean(dd$p < .05),
    p_01 = mean(dd$p < .01)
  )
})

So:

With more power

As above, but with n = 1000 to see if the detection problem goes away.

#run
set.seed(1)
HS_sim_tests2 = map_df(1:100, run_sim, n = 1000, HS_strength = 5)

#do p values behave as expected?
HS_sim_tests2 %>% 
  ggplot(aes(p)) +
  geom_histogram(boundary = 0) +
  facet_wrap(c("name", "test"), scales = c("free_y"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#below .05 proportions
HS_sim_tests2 %>% plyr::ddply(c("name", "test"), function(dd) {
  tibble(
    p_mean = mean(dd$p),
    p_05 = mean(dd$p < .05),
    p_01 = mean(dd$p < .01)
  )
})

Dunning Kruger: Gignac & Zajenkowski 2020

https://www.sciencedirect.com/science/article/abs/pii/S0160289620300271?via%3Dihub#f0005 https://osf.io/dg547/
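I read the data from a local copy below, but since osfr is already loaded in the Init section, one could fetch the file from the OSF project roughly like this (a sketch, assuming the file is listed on OSF under the same name as the local copy):

#sketch: download the SPSS file from the OSF repository (adjust the name if the listing differs)
osf_retrieve_node("dg547") %>% 
  osf_ls_files() %>% 
  filter(name == "Dunning_Kruger_INTELL_UPLOAD.sav") %>% 
  osf_download()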

#read their data
dk = read_spss("Dunning_Kruger_INTELL_UPLOAD.sav")

#prep data
dk %<>% mutate(
  SAI_z = SAI_winsorised %>% standardize(),
  raven_z = Raven_Total %>% standardize(),
  raven_90_qgam = quantile_smooth(SAI_z, raven_z, quantile = .10, method = "qgam", k = 5),
  raven_10_qgam = quantile_smooth(SAI_z, raven_z, quantile = .90, method = "qgam", k = 5)
  )
## Estimating learning rate. Each dot corresponds to a loss evaluation. 
## qu = 0.1.........done 
## Estimating learning rate. Each dot corresponds to a loss evaluation. 
## qu = 0.9...........done
#plot
dk %>% 
  ggplot(aes(SAI_z, raven_z)) +
  geom_point(alpha = .2) +
  geom_smooth(method = lm, se = F) +
  geom_ribbon(mapping = aes(
    ymin = raven_90_qgam,
    ymax = raven_10_qgam
  ), alpha = .4) +
  scale_y_continuous("Raven test standard score (z)") +
  scale_x_continuous("Self-estimated intelligence standard score (z)")
## `geom_smooth()` using formula 'y ~ x'

GG_save("figs/DK_HS.png")
## `geom_smooth()` using formula 'y ~ x'
#test results
DK_fit = lm(raven_z ~ SAI_z, data = dk)
summary(DK_fit)
## 
## Call:
## lm(formula = raven_z ~ SAI_z, data = dk)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.568 -0.505  0.145  0.607  2.516 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.14e-15   3.15e-02    0.00        1    
## SAI_z       2.82e-01   3.15e-02    8.97   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.96 on 927 degrees of freedom
## Multiple R-squared:  0.0798, Adjusted R-squared:  0.0788 
## F-statistic: 80.4 on 1 and 927 DF,  p-value: <2e-16
test_HS(resid = resid(DK_fit), x = dk$SAI_z)
#standard tests
ols_test_bartlett(dk, 'raven_z', 'SAI_z')
## 
##     Bartlett's Test of Homogenity of Variances    
## ------------------------------------------------
## Ho: Variances are equal across groups
## Ha: Variances are unequal for atleast two groups
## 
##           Data           
##  ------------------------
##  Variables: raven_z SAI_z 
## 
##       Test Summary       
##  ------------------------
##  DF            =    1 
##  Chi2          =    0.000 
##  Prob > Chi2   =    1.000
# ols_test_breusch_pagan(DK_fit, rhs = T)
ols_test_score(DK_fit, rhs = T)
## 
##  Score Test for Heteroskedasticity
##  ---------------------------------
##  Ho: Variance is homogenous
##  Ha: Variance is not homogenous
## 
##  Variables: m2 
## 
##       Test Summary        
##  -------------------------
##  DF            =    1 
##  Chi2          =    7.371 
##  Prob > Chi2   =    0.0066
ols_test_f(DK_fit)
## 
##  F Test for Heteroskedasticity
##  -----------------------------
##  Ho: Variance is homogenous
##  Ha: Variance is not homogenous
## 
##  Variables: fitted values of raven_z 
## 
##      Test Summary      
##  ----------------------
##  Num DF     =    1 
##  Den DF     =    927 
##  F          =    7.414 
##  Prob > F   =    0.0066