About

Understanding the relationship between group differences and inferences about individuals.

Init

options(
  digits = 2
)

library(pacman)
p_load(
  kirkegaard
)

theme_set(theme_bw())

#overwrite describe
describe = function(...) {
 y = psych::describe(...)
 class(y) = "data.frame"
 y
}

Simulations

Simulation 1: large gap

n = 10000
set.seed(1)
s1 = tibble(
  group = rep(c("blue", "green"), each = n),
  value = c(rnorm(n, mean = 50, sd = 5), rnorm(n, mean = 55, sd = 5))
)

#describe the data
describe(s1$value)
describeBy(s1$value, s1$group)
## 
##  Descriptive statistics by group 
## group: blue
##    vars     n mean  sd median trimmed mad min max range  skew kurtosis   se
## X1    1 10000   50 5.1     50      50   5  32  69    37 -0.02    -0.04 0.05
## ------------------------------------------------------------ 
## group: green
##    vars     n mean sd median trimmed mad min max range skew kurtosis   se
## X1    1 10000   55  5     55      55   5  33  74    40 0.06     0.03 0.05
#histogram
s1 %>% 
  ggplot(aes(value, fill = group)) +
  geom_histogram(position = "dodge") +
  scale_fill_manual(values = c("blue", "green")) +
  scale_x_continuous(breaks = seq(0, 100, by = 5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#violin
s1 %>% 
  ggplot(aes(group, value, fill = group)) +
  geom_violin(position = "dodge") +
  scale_fill_manual(values = c("blue", "green")) +
  scale_y_continuous(breaks = seq(0, 100, by = 5))

#fit models
model_with_group = lm(value ~ group, data = s1)
model_with_group %>% summary()
## 
## Call:
## lm(formula = value ~ group, data = s1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -21.49  -3.37  -0.06   3.36  19.08 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  49.9673     0.0501   997.7   <2e-16 ***
## groupgreen    5.0117     0.0708    70.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5 on 19998 degrees of freedom
## Multiple R-squared:   0.2,   Adjusted R-squared:   0.2 
## F-statistic: 5.01e+03 on 1 and 19998 DF,  p-value: <2e-16
#fit model without
model_without_group = lm(value ~ 1, data = s1)
model_without_group %>% summary()
## 
## Call:
## lm(formula = value ~ 1, data = s1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.830  -3.768  -0.014   3.829  21.167 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  52.4732     0.0396    1325   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.6 on 19999 degrees of freedom
#predict some new cases, with uncertainties (prediction interval)
s1_newdata = tibble(
  group = c("green", "blue")
)

#add and make long format (difficult because this is column grouped format)
s1_newdata = bind_rows(
  predict(model_with_group, s1_newdata, interval = "prediction") %>% data.frame(),
  predict(model_without_group, s1_newdata, interval = "prediction") %>% data.frame()
) %>% mutate(
  group = rep(c("green", "blue"), 2),
  model = rep(c("with groups", "without groups"), each = 2),
  interval_width = upr - lwr
)

#plot results
s1_newdata %>% 
  ggplot(aes(group, fit, color = group)) +
  geom_point(size = 5) +
  geom_errorbar(aes(ymin = lwr, ymax = upr)) +
  scale_color_manual(values = c("blue", "green")) +
  facet_wrap("model")

#with ggeffects
#does not work for intercept only models
# ggpredict(model_with_group, terms = "group", interval = "prediction") %>% plot()
# ggpredict(model_without_group, terms = c(), interval = "prediction") %>% plot()

Simulation 2: larger gap

n = 10000
set.seed(1)
s2 = tibble(
  group = rep(c("blue", "green"), each = n),
  value = c(rnorm(n, mean = 50, sd = 5), rnorm(n, mean = 60, sd = 5))
)

#describe the data
describe(s2$value)
describeBy(s2$value, s2$group)
## 
##  Descriptive statistics by group 
## group: blue
##    vars     n mean  sd median trimmed mad min max range  skew kurtosis   se
## X1    1 10000   50 5.1     50      50   5  32  69    37 -0.02    -0.04 0.05
## ------------------------------------------------------------ 
## group: green
##    vars     n mean sd median trimmed mad min max range skew kurtosis   se
## X1    1 10000   60  5     60      60   5  38  79    40 0.06     0.03 0.05
#histogram
s2 %>% 
  ggplot(aes(value, fill = group)) +
  geom_histogram(position = "dodge") +
  scale_fill_manual(values = c("blue", "green")) +
  scale_x_continuous(breaks = seq(0, 100, by = 5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#violin
s2 %>% 
  ggplot(aes(group, value, fill = group)) +
  geom_violin(position = "dodge") +
  scale_fill_manual(values = c("blue", "green")) +
  scale_y_continuous(breaks = seq(0, 100, by = 5))

#fit models
model_with_group = lm(value ~ group, data = s2)
model_with_group %>% summary()
## 
## Call:
## lm(formula = value ~ group, data = s2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -21.49  -3.37  -0.06   3.36  19.08 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  49.9673     0.0501     998   <2e-16 ***
## groupgreen   10.0117     0.0708     141   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5 on 19998 degrees of freedom
## Multiple R-squared:   0.5,   Adjusted R-squared:   0.5 
## F-statistic: 2e+04 on 1 and 19998 DF,  p-value: <2e-16
#fit model without
model_without_group = lm(value ~ 1, data = s2)
model_without_group %>% summary()
## 
## Call:
## lm(formula = value ~ 1, data = s2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -23.330  -5.237   0.099   5.228  23.667 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  54.9732     0.0501    1098   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.1 on 19999 degrees of freedom
#predict some new cases, with uncertainties (prediction interval)
s2_newdata = tibble(
  group = c("green", "blue")
)

#add and make long format (difficult because this is column grouped format)
s2_newdata = bind_rows(
  predict(model_with_group, s2_newdata, interval = "prediction") %>% data.frame(),
  predict(model_without_group, s2_newdata, interval = "prediction") %>% data.frame()
) %>% mutate(
  group = rep(c("green", "blue"), 2),
  model = rep(c("with groups", "without groups"), each = 2),
  interval_width = upr - lwr
)

#plot results
s2_newdata %>% 
  ggplot(aes(group, fit, color = group)) +
  geom_point(size = 5) +
  geom_errorbar(aes(ymin = lwr, ymax = upr)) +
  scale_color_manual(values = c("blue", "green")) +
  facet_wrap("model")

Simulation 3: more groups

n = 10000
set.seed(1)
s3 = tibble(
  group = rep(c("blue", "green"), each = n) %>% sample(),
  age_group = rep(c("young", "middle", "old"), length.out = n*2) %>% sample() %>% ordered(levels = c("young", "middle", "old"))
) %>% mutate(
  value = rnorm(n()) + 
    case_when(
      group == "blue" ~ rnorm(n(), mean = 10, sd = 5),
      group == "green" ~ rnorm(n(), mean = 0, sd = 5)
      ) + 
    case_when(
      age_group == "young" ~ rnorm(n(), mean = -5, sd = 5),
      age_group == "middle" ~ rnorm(n(), mean = 0, sd = 5),
      age_group == "old" ~ rnorm(n(), mean = 5, sd = 5)
      ) + 50
)

#describe the data
describe(s3$value)
describeBy(s3$value, s3$group)
## 
##  Descriptive statistics by group 
## group: blue
##    vars     n mean  sd median trimmed mad min max range  skew kurtosis   se
## X1    1 10000   60 8.2     60      60 8.4  28  93    66 -0.03    -0.11 0.08
## ------------------------------------------------------------ 
## group: green
##    vars     n mean  sd median trimmed mad min max range  skew kurtosis   se
## X1    1 10000   50 8.2     50      50 8.4  22  80    58 -0.01     -0.1 0.08
describeBy(s3$value, s3$age_group)
## 
##  Descriptive statistics by group 
## group: young
##    vars    n mean  sd median trimmed mad min max range skew kurtosis   se
## X1    1 6667   50 8.7     50      50   9  22  79    58    0    -0.25 0.11
## ------------------------------------------------------------ 
## group: middle
##    vars    n mean  sd median trimmed mad min max range  skew kurtosis   se
## X1    1 6667   55 8.8     55      55   9  23  85    62 -0.03    -0.18 0.11
## ------------------------------------------------------------ 
## group: old
##    vars    n mean  sd median trimmed mad min max range  skew kurtosis   se
## X1    1 6666   60 8.7     60      60 8.9  32  93    61 -0.02    -0.21 0.11
#histogram
s3 %>% 
  ggplot(aes(value, fill = group)) +
  geom_histogram(position = "dodge") +
  scale_fill_manual(values = c("blue", "green")) +
  scale_x_continuous(breaks = seq(0, 100, by = 10)) +
  facet_wrap("age_group")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#violin
s3 %>% 
  ggplot(aes(group, value, fill = group)) +
  geom_violin(position = "dodge") +
  scale_fill_manual(values = c("blue", "green")) +
  scale_y_continuous(breaks = seq(0, 100, by = 10)) +
  facet_wrap("age_group")

#fit models
model_with_group = lm(value ~ group + age_group, data = s3)
model_with_group %>% summary()
## 
## Call:
## lm(formula = value ~ group + age_group, data = s3)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31.64  -4.77   0.04   4.85  29.13 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  59.9183     0.0715  837.77   <2e-16 ***
## groupgreen   -9.8693     0.1011  -97.57   <2e-16 ***
## age_group.L   7.1157     0.0876   81.23   <2e-16 ***
## age_group.Q   0.0305     0.0876    0.35     0.73    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.2 on 19996 degrees of freedom
## Multiple R-squared:  0.446,  Adjusted R-squared:  0.446 
## F-statistic: 5.36e+03 on 3 and 19996 DF,  p-value: <2e-16
#fit model without
model_without_group = lm(value ~ 1, data = s3)
model_without_group %>% summary()
## 
## Call:
## lm(formula = value ~ 1, data = s3)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33.47  -6.70   0.08   6.68  38.14 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  54.9834     0.0679     809   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.6 on 19999 degrees of freedom
#predict some new cases, with uncertainties (prediction interval)
s3_newdata = expand_grid(
  group = c("green", "blue"),
  age_group = levels(s3$age_group)
)

#add and make long format (difficult because this is column grouped format)
s3_newdata = bind_rows(
  predict(model_with_group, s3_newdata, interval = "prediction") %>% data.frame(),
  predict(model_without_group, s3_newdata, interval = "prediction") %>% data.frame()
) %>% mutate(
  group = rep(s3_newdata$group, length.out = n()),
  age_group = rep(s3_newdata$age_group, length.out = n()) %>% ordered(levels = levels(s3$age_group)),
  model = rep(c("with groups", "without groups"), each = nrow(s3_newdata)),
  interval_width = upr - lwr
)

#plot results
s3_newdata %>% 
  ggplot(aes(group, fit, color = group, linetype = age_group)) +
  geom_point(size = 5, position = position_dodge(width = 1)) +
  geom_errorbar(aes(ymin = lwr, ymax = upr), position = position_dodge(width = 1)) +
  scale_color_manual(values = c("blue", "green")) +
  facet_wrap(c("model"))

Information collection

#simple visualization
info_gather = tibble(
  information = seq(0, 1, by = .01)
) %>% mutate(
  precision_of_estimate = information^(1/5)
)

#plot
info_gather %>% 
  ggplot(aes(information, precision_of_estimate)) +
  geom_line() +
  geom_vline(xintercept = c(.20, .50, .90), linetype = "dotted")

GG_save("figs/information_precision.png")