Understanding the relationship between group differences and inferences about individuals.
options(
digits = 2
)
library(pacman)
p_load(
kirkegaard
)
theme_set(theme_bw())
#overwrite describe
describe = function(...) {
y = psych::describe(...)
class(y) = "data.frame"
y
}
n = 10000
set.seed(1)
s1 = tibble(
group = rep(c("blue", "green"), each = n),
value = c(rnorm(n, mean = 50, sd = 5), rnorm(n, mean = 55, sd = 5))
)
#describe the data
describe(s1$value)
describeBy(s1$value, s1$group)
##
## Descriptive statistics by group
## group: blue
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 10000 50 5.1 50 50 5 32 69 37 -0.02 -0.04 0.05
## ------------------------------------------------------------
## group: green
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 10000 55 5 55 55 5 33 74 40 0.06 0.03 0.05
#histogram
s1 %>%
ggplot(aes(value, fill = group)) +
geom_histogram(position = "dodge") +
scale_fill_manual(values = c("blue", "green")) +
scale_x_continuous(breaks = seq(0, 100, by = 5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#violin
s1 %>%
ggplot(aes(group, value, fill = group)) +
geom_violin(position = "dodge") +
scale_fill_manual(values = c("blue", "green")) +
scale_y_continuous(breaks = seq(0, 100, by = 5))
#fit models
model_with_group = lm(value ~ group, data = s1)
model_with_group %>% summary()
##
## Call:
## lm(formula = value ~ group, data = s1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.49 -3.37 -0.06 3.36 19.08
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.9673 0.0501 997.7 <2e-16 ***
## groupgreen 5.0117 0.0708 70.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5 on 19998 degrees of freedom
## Multiple R-squared: 0.2, Adjusted R-squared: 0.2
## F-statistic: 5.01e+03 on 1 and 19998 DF, p-value: <2e-16
#fit model without
model_without_group = lm(value ~ 1, data = s1)
model_without_group %>% summary()
##
## Call:
## lm(formula = value ~ 1, data = s1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.830 -3.768 -0.014 3.829 21.167
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52.4732 0.0396 1325 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.6 on 19999 degrees of freedom
#predict some new cases, with uncertainties (prediction interval)
s1_newdata = tibble(
group = c("green", "blue")
)
#add and make long format (difficult because this is column grouped format)
s1_newdata = bind_rows(
predict(model_with_group, s1_newdata, interval = "prediction") %>% data.frame(),
predict(model_without_group, s1_newdata, interval = "prediction") %>% data.frame()
) %>% mutate(
group = rep(c("green", "blue"), 2),
model = rep(c("with groups", "without groups"), each = 2),
interval_width = upr - lwr
)
#plot results
s1_newdata %>%
ggplot(aes(group, fit, color = group)) +
geom_point(size = 5) +
geom_errorbar(aes(ymin = lwr, ymax = upr)) +
scale_color_manual(values = c("blue", "green")) +
facet_wrap("model")
#with ggeffects
#does not work for intercept only models
# ggpredict(model_with_group, terms = "group", interval = "prediction") %>% plot()
# ggpredict(model_without_group, terms = c(), interval = "prediction") %>% plot()
n = 10000
set.seed(1)
s2 = tibble(
group = rep(c("blue", "green"), each = n),
value = c(rnorm(n, mean = 50, sd = 5), rnorm(n, mean = 60, sd = 5))
)
#describe the data
describe(s2$value)
describeBy(s2$value, s2$group)
##
## Descriptive statistics by group
## group: blue
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 10000 50 5.1 50 50 5 32 69 37 -0.02 -0.04 0.05
## ------------------------------------------------------------
## group: green
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 10000 60 5 60 60 5 38 79 40 0.06 0.03 0.05
#histogram
s2 %>%
ggplot(aes(value, fill = group)) +
geom_histogram(position = "dodge") +
scale_fill_manual(values = c("blue", "green")) +
scale_x_continuous(breaks = seq(0, 100, by = 5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#violin
s2 %>%
ggplot(aes(group, value, fill = group)) +
geom_violin(position = "dodge") +
scale_fill_manual(values = c("blue", "green")) +
scale_y_continuous(breaks = seq(0, 100, by = 5))
#fit models
model_with_group = lm(value ~ group, data = s2)
model_with_group %>% summary()
##
## Call:
## lm(formula = value ~ group, data = s2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.49 -3.37 -0.06 3.36 19.08
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.9673 0.0501 998 <2e-16 ***
## groupgreen 10.0117 0.0708 141 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5 on 19998 degrees of freedom
## Multiple R-squared: 0.5, Adjusted R-squared: 0.5
## F-statistic: 2e+04 on 1 and 19998 DF, p-value: <2e-16
#fit model without
model_without_group = lm(value ~ 1, data = s2)
model_without_group %>% summary()
##
## Call:
## lm(formula = value ~ 1, data = s2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.330 -5.237 0.099 5.228 23.667
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 54.9732 0.0501 1098 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.1 on 19999 degrees of freedom
#predict some new cases, with uncertainties (prediction interval)
s2_newdata = tibble(
group = c("green", "blue")
)
#add and make long format (difficult because this is column grouped format)
s2_newdata = bind_rows(
predict(model_with_group, s2_newdata, interval = "prediction") %>% data.frame(),
predict(model_without_group, s2_newdata, interval = "prediction") %>% data.frame()
) %>% mutate(
group = rep(c("green", "blue"), 2),
model = rep(c("with groups", "without groups"), each = 2),
interval_width = upr - lwr
)
#plot results
s2_newdata %>%
ggplot(aes(group, fit, color = group)) +
geom_point(size = 5) +
geom_errorbar(aes(ymin = lwr, ymax = upr)) +
scale_color_manual(values = c("blue", "green")) +
facet_wrap("model")
n = 10000
set.seed(1)
s3 = tibble(
group = rep(c("blue", "green"), each = n) %>% sample(),
age_group = rep(c("young", "middle", "old"), length.out = n*2) %>% sample() %>% ordered(levels = c("young", "middle", "old"))
) %>% mutate(
value = rnorm(n()) +
case_when(
group == "blue" ~ rnorm(n(), mean = 10, sd = 5),
group == "green" ~ rnorm(n(), mean = 0, sd = 5)
) +
case_when(
age_group == "young" ~ rnorm(n(), mean = -5, sd = 5),
age_group == "middle" ~ rnorm(n(), mean = 0, sd = 5),
age_group == "old" ~ rnorm(n(), mean = 5, sd = 5)
) + 50
)
#describe the data
describe(s3$value)
describeBy(s3$value, s3$group)
##
## Descriptive statistics by group
## group: blue
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 10000 60 8.2 60 60 8.4 28 93 66 -0.03 -0.11 0.08
## ------------------------------------------------------------
## group: green
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 10000 50 8.2 50 50 8.4 22 80 58 -0.01 -0.1 0.08
describeBy(s3$value, s3$age_group)
##
## Descriptive statistics by group
## group: young
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 6667 50 8.7 50 50 9 22 79 58 0 -0.25 0.11
## ------------------------------------------------------------
## group: middle
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 6667 55 8.8 55 55 9 23 85 62 -0.03 -0.18 0.11
## ------------------------------------------------------------
## group: old
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 6666 60 8.7 60 60 8.9 32 93 61 -0.02 -0.21 0.11
#histogram
s3 %>%
ggplot(aes(value, fill = group)) +
geom_histogram(position = "dodge") +
scale_fill_manual(values = c("blue", "green")) +
scale_x_continuous(breaks = seq(0, 100, by = 10)) +
facet_wrap("age_group")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#violin
s3 %>%
ggplot(aes(group, value, fill = group)) +
geom_violin(position = "dodge") +
scale_fill_manual(values = c("blue", "green")) +
scale_y_continuous(breaks = seq(0, 100, by = 10)) +
facet_wrap("age_group")
#fit models
model_with_group = lm(value ~ group + age_group, data = s3)
model_with_group %>% summary()
##
## Call:
## lm(formula = value ~ group + age_group, data = s3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31.64 -4.77 0.04 4.85 29.13
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 59.9183 0.0715 837.77 <2e-16 ***
## groupgreen -9.8693 0.1011 -97.57 <2e-16 ***
## age_group.L 7.1157 0.0876 81.23 <2e-16 ***
## age_group.Q 0.0305 0.0876 0.35 0.73
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.2 on 19996 degrees of freedom
## Multiple R-squared: 0.446, Adjusted R-squared: 0.446
## F-statistic: 5.36e+03 on 3 and 19996 DF, p-value: <2e-16
#fit model without
model_without_group = lm(value ~ 1, data = s3)
model_without_group %>% summary()
##
## Call:
## lm(formula = value ~ 1, data = s3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33.47 -6.70 0.08 6.68 38.14
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 54.9834 0.0679 809 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.6 on 19999 degrees of freedom
#predict some new cases, with uncertainties (prediction interval)
s3_newdata = expand_grid(
group = c("green", "blue"),
age_group = levels(s3$age_group)
)
#add and make long format (difficult because this is column grouped format)
s3_newdata = bind_rows(
predict(model_with_group, s3_newdata, interval = "prediction") %>% data.frame(),
predict(model_without_group, s3_newdata, interval = "prediction") %>% data.frame()
) %>% mutate(
group = rep(s3_newdata$group, length.out = n()),
age_group = rep(s3_newdata$age_group, length.out = n()) %>% ordered(levels = levels(s3$age_group)),
model = rep(c("with groups", "without groups"), each = nrow(s3_newdata)),
interval_width = upr - lwr
)
#plot results
s3_newdata %>%
ggplot(aes(group, fit, color = group, linetype = age_group)) +
geom_point(size = 5, position = position_dodge(width = 1)) +
geom_errorbar(aes(ymin = lwr, ymax = upr), position = position_dodge(width = 1)) +
scale_color_manual(values = c("blue", "green")) +
facet_wrap(c("model"))
#simple visualization
info_gather = tibble(
information = seq(0, 1, by = .01)
) %>% mutate(
precision_of_estimate = information^(1/5)
)
#plot
info_gather %>%
ggplot(aes(information, precision_of_estimate)) +
geom_line() +
geom_vline(xintercept = c(.20, .50, .90), linetype = "dotted")
GG_save("figs/information_precision.png")