Init

library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
## 
## 
## Attaching package: 'magrittr'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## 
## Loading required package: weights
## 
## Loading required package: Hmisc
## 
## 
## Attaching package: 'Hmisc'
## 
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## 
## 
## Loading required package: assertthat
## 
## 
## Attaching package: 'assertthat'
## 
## 
## The following object is masked from 'package:tibble':
## 
##     has_name
## 
## 
## Loading required package: psych
## 
## 
## Attaching package: 'psych'
## 
## 
## The following object is masked from 'package:Hmisc':
## 
##     describe
## 
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## 
## 
## 
## Attaching package: 'kirkegaard'
## 
## 
## The following object is masked from 'package:psych':
## 
##     rescale
## 
## 
## The following object is masked from 'package:assertthat':
## 
##     are_equal
## 
## 
## The following object is masked from 'package:purrr':
## 
##     is_logical
## 
## 
## The following object is masked from 'package:base':
## 
##     +
load_packages(
  haven,
  mgcv,
  ggeffects
)
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## 
## The following object is masked from 'package:dplyr':
## 
##     collapse
## 
## This is mgcv 1.8-39. For overview type 'help("mgcv-package")'.
#theme
theme_set(theme_bw())

Data

nhanes2018_files = dir("data/2018", full.names = T)
nhanes2018 = map(nhanes2018_files, read_xpt)

#which is the id variable?
nhanes2018[[1]]$SEQN %>% head()
## [1] 93703 93704 93705 93706 93707 93708
#join each dataset by id
d = reduce(nhanes2018, .f = full_join, by = "SEQN")

#variable list
d_vars = df_var_table(d)

2000

nhanes2000_files = dir("data/2000", full.names = T)
nhanes2000 = map(nhanes2000_files, read_xpt)

#which is the id variable?
nhanes2000[[1]]$SEQN %>% head()
## [1] 1 2 3 4 5 6
#join each dataset by id
d2000 = reduce(nhanes2000, .f = full_join, by = "SEQN")

#variable list
d2000_vars = df_var_table(d2000)

#BMI
d2000$BMI = d2000$BMXBMI %>% as.numeric()

#age
d2000$age = d2000$RIDAGEYR %>% as.numeric()

#subsets
d2000_adult = d2000 %>% filter(age >= 20)

Recode

#body fat
d$body_fat = (d$DXDTOFAT/1000 / d$BMXWT) %>% as.numeric()

#BMI
d$BMI = d$BMXBMI %>% as.numeric()

#weight
d$weight = d$BMXWT %>% as.numeric()

#height
d$height = d$BMXHT %>% as.numeric()

#alternative BMI's
BMI_scaling_factors = seq(1, 3, by = .1)
for (x in BMI_scaling_factors) {
  d[[str_glue("BMI_" + x) %>% str_legalize()]] = d$weight / (d$height/100)^x
}

#waist
d$waist = d$BMXWAIST %>% as.numeric()

#height to waist ratio
d$whr = d$waist / d$height

#sex
d$sex = d$RIAGENDR %>% mapvalues(1:2, c("male", "female")) %>% factor()

#age
d$age = d$RIDAGEYR %>% as.numeric()

#race
d$race = d$RIDRETH3 %>% case_match(
  1 ~ "Mexican",
  2 ~ "Other Hispanic",
  3 ~ "White",
  4 ~ "Black",
  6 ~ "Asian",
  7 ~ "Other_multiple"
)

#combined sex and race
d$sex_race = d$sex + " " + d$race

#adults only
d_adults = d %>% filter(age >= 20)

#white adults only
d_white_adults = d %>% filter(age >= 20, race == "White")

Analysis

BMI and weight

#weight
d_adults %>% 
  GG_denhist("weight", "sex") +
  scale_x_continuous("Weight (kg)") +
    labs(
      title = "Weight by sex",
      subtitle = "NHANES 2018, US adults"
    )
## Warning in GG_denhist(., "weight", "sex"): There were groups without any data.
## These were removed
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figs/weight by sex.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#height
d_adults %>% 
  GG_denhist("height", "sex") +
  scale_x_continuous("Height (cm)") +
    labs(
      title = "Height by sex",
      subtitle = "NHANES 2018, US adults"
    )
## Warning in GG_denhist(., "height", "sex"): There were groups without any data.
## These were removed
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figs/height by sex.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#numbers
describe2(
  d_adults$height,
  d_adults$sex
)
## New names:
## • `` -> `...1`
describe2(
  d_adults$height,
  d_adults$sex_race
)
## New names:
## • `` -> `...1`
SMD_matrix(d_adults$height, d_adults$sex_race)
##                       female Asian female Black female Mexican
## female Asian                    NA -0.860565854    -0.06929056
## female Black           -0.86056585           NA     0.79127530
## female Mexican         -0.06929056  0.791275296             NA
## female Other Hispanic  -0.18064197  0.679923882    -0.11135141
## female Other_multiple  -0.85804019  0.002525667    -0.78874963
## female White           -0.73902899  0.121536863    -0.66973843
## male Asian             -1.99783459 -1.137268735    -1.92854403
## male Black             -2.83701959 -1.976453734    -2.76772903
## male Mexican           -2.01398084 -1.153414985    -1.94469028
## male Other Hispanic    -1.91009684 -1.049530987    -1.84080628
## male Other_multiple    -2.90301670 -2.042450850    -2.83372615
## male White             -2.74905492 -1.888489064    -2.67976436
##                       female Other Hispanic female Other_multiple female White
## female Asian                     -0.1806420          -0.858040188   -0.7390290
## female Black                      0.6799239           0.002525667    0.1215369
## female Mexican                   -0.1113514          -0.788749629   -0.6697384
## female Other Hispanic                    NA          -0.677398216   -0.5583870
## female Other_multiple            -0.6773982                    NA    0.1190112
## female White                     -0.5583870           0.119011196           NA
## male Asian                       -1.8171926          -1.139794401   -1.2588056
## male Black                       -2.6563776          -1.978979400   -2.0979906
## male Mexican                     -1.8333389          -1.155940651   -1.2749518
## male Other Hispanic              -1.7294549          -1.052056654   -1.1710678
## male Other_multiple              -2.7223747          -2.044976517   -2.1639877
## male White                       -2.5684129          -1.891014730   -2.0100259
##                        male Asian  male Black male Mexican male Other Hispanic
## female Asian          -1.99783459 -2.83701959  -2.01398084         -1.91009684
## female Black          -1.13726873 -1.97645373  -1.15341498         -1.04953099
## female Mexican        -1.92854403 -2.76772903  -1.94469028         -1.84080628
## female Other Hispanic -1.81719262 -2.65637762  -1.83333887         -1.72945487
## female Other_multiple -1.13979440 -1.97897940  -1.15594065         -1.05205665
## female White          -1.25880560 -2.09799060  -1.27495185         -1.17106785
## male Asian                     NA -0.83918500  -0.01614625          0.08773775
## male Black            -0.83918500          NA   0.82303875          0.92692275
## male Mexican          -0.01614625  0.82303875           NA          0.10388400
## male Other Hispanic    0.08773775  0.92692275   0.10388400                  NA
## male Other_multiple   -0.90518212 -0.06599712  -0.88903587         -0.99291986
## male White            -0.75122033  0.08796467  -0.73507408         -0.83895808
##                       male Other_multiple  male White
## female Asian                  -2.90301670 -2.74905492
## female Black                  -2.04245085 -1.88848906
## female Mexican                -2.83372615 -2.67976436
## female Other Hispanic         -2.72237473 -2.56841295
## female Other_multiple         -2.04497652 -1.89101473
## female White                  -2.16398771 -2.01002593
## male Asian                    -0.90518212 -0.75122033
## male Black                    -0.06599712  0.08796467
## male Mexican                  -0.88903587 -0.73507408
## male Other Hispanic           -0.99291986 -0.83895808
## male Other_multiple                    NA  0.15396179
## male White                     0.15396179          NA
#BMI
d_adults %>% 
  GG_denhist("BMI", "sex") +
  scale_x_continuous("BMI (weight / height²)") +
    labs(
      title = "BMI by sex",
      subtitle = "NHANES 2018, US adults"
    )
## Warning in GG_denhist(., "BMI", "sex"): There were groups without any data.
## These were removed
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figs/BMI by sex.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
describe2(
  d_adults$BMI,
  d_adults$sex_race
)
## New names:
## • `` -> `...1`

Spline residualization

#can we outperform the BMI with splines?
weight_fit = gam(weight ~ sex + s(height, by = sex), data = d_adults, na.action = "na.omit")
weight_fit %>% summary()
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## weight ~ sex + s(height, by = sex)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  83.9639     0.7936 105.796  < 2e-16 ***
## sexmale      -4.3422     0.9698  -4.478 7.71e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                      edf Ref.df      F p-value    
## s(height):sexfemale 3.58  4.464  80.23  <2e-16 ***
## s(height):sexmale   1.00  1.000 579.03  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.204   Deviance explained = 20.5%
## GCV = 419.02  Scale est. = 418.49    n = 5175
d_adults$weight_resid = NA
d_adults$weight_resid[-weight_fit$na.action] = resid(weight_fit)

plot(ggeffects::ggpredict(weight_fit, terms = c("height", "sex"))) +
  labs(
    title = "Predict body fat % from sex, height using splines",
    subtitle = "NHANES 2018, US adults",
    y = "Body fat % (x ray)"
  ) +
  scale_y_continuous(labels = scales::percent) +
  theme_bw()
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.

#predict BF% from residuals
d_adults %>% 
  ggplot(aes(weight_resid, body_fat, color = sex)) +
  geom_point(alpha = .3) +
  geom_smooth() +
  scale_x_continuous(limits = c(NA, 80)) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "Spline residualization of weight from height, by sex",
    subtitle = "NHANES 2018, US adults",
    x = "Weight residual (kg)",
    y = "Body fat % (x ray)"
  )
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 3331 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 3331 rows containing missing values (`geom_point()`).

GG_save("figs/spline resid weight BF.png")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 3331 rows containing non-finite values (`stat_smooth()`).
## Removed 3331 rows containing missing values (`geom_point()`).
#correlations
d_adults %>% 
  group_by(sex) %>% 
  summarise(r = cor(weight_resid, body_fat, use = "pair"))

Alternative BMIs

#compute correlations for each alternative BMI
BMI_alts = tibble(
  var = d %>% select(BMI_1:BMI_3) %>% names(),
  scaling_factor = BMI_scaling_factors
) %>% mutate(
  cor_BF_male = d_adults %>% filter(sex == "male") %>% select(body_fat, BMI_1:BMI_3) %>% cor(use = "pair") %>% extract(1, -1) %>% as.numeric(),
  cor_BF_female = d_adults %>% filter(sex == "female") %>% select(body_fat, BMI_1:BMI_3) %>% cor(use = "pair") %>% extract(1, -1) %>% as.numeric()
) %>% 
  pivot_longer(cols = cor_BF_male:cor_BF_female) %>% 
  mutate(
    sex = str_remove(name, "cor_BF_")
  ) 

BMI_alts %>% 
  ggplot(aes(scaling_factor, value, color = sex)) +
  geom_line() +
  labs(
    title = "Alternative BMI scaling factors",
    subtitle = "NHANES 2018, US adults",
    x = "Scaling factor",
    y = "Correlation with body fat %",
    color = "Sex"
  )

GG_save("figs/alt BMIs.png")

BMI_alts %>% 
  group_by(sex) %>% 
  slice_max(value, n = 1)

Body fat

#by by sex
GG_denhist(d_adults, "body_fat", "sex", auto_fraction_bounary = F) +
  scale_x_continuous("Body fat % (x-ray)", labels = scales::percent, limits = c(.1, NA)) +
  labs(
    title = "Body fat % by sex",
    subtitle = "NHANES 2018, US adults"
  )
## Warning in GG_denhist(d_adults, "body_fat", "sex", auto_fraction_bounary = F):
## There were groups without any data. These were removed
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (`geom_bar()`).

GG_save("figs/body fat sex.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (`geom_bar()`).
#relationship to BMI by sex
d_adults %>% 
  ggplot(aes(BMI, body_fat, color = sex)) +
  geom_point(alpha = .1) +
  geom_smooth() +
  geom_smooth(method = lm, linetype = "dotted") +
  scale_y_continuous("Body fat % (x-ray)", labels = scales::percent) +
  scale_x_continuous(limits = c(NA, 60)) +
  labs(
    title = "Body fat % as predicted by BMI",
    subtitle = "NHANES 2018, US adults"
  )
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 3331 rows containing non-finite values (`stat_smooth()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 3331 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 3331 rows containing missing values (`geom_point()`).

GG_save("figs/body fat from BMI sex.png")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 3331 rows containing non-finite values (`stat_smooth()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 3331 rows containing non-finite values (`stat_smooth()`).
## Removed 3331 rows containing missing values (`geom_point()`).
#correlations
d_adults %>% 
  group_by(sex) %>% 
  summarise(r = cor(BMI, body_fat, use = "pair"))
#ols regression
lm(body_fat ~ BMI * sex + age, data = d_adults) %>% summary()
## 
## Call:
## lm(formula = body_fat ~ BMI * sex + age, data = d_adults)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.180717 -0.026588  0.001356  0.027483  0.212488 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.694e-01  5.614e-03  30.175  < 2e-16 ***
## BMI          6.609e-03  1.616e-04  40.888  < 2e-16 ***
## sexmale     -1.387e-01  7.923e-03 -17.501  < 2e-16 ***
## age          7.082e-04  7.535e-05   9.399  < 2e-16 ***
## BMI:sexmale  9.446e-04  2.666e-04   3.543 0.000403 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04122 on 2235 degrees of freedom
##   (3329 observations deleted due to missingness)
## Multiple R-squared:  0.7674, Adjusted R-squared:  0.767 
## F-statistic:  1844 on 4 and 2235 DF,  p-value: < 2.2e-16
#gam
gam_fit = gam(body_fat ~ sex + s(BMI, by = sex) + s(age, by = sex), data = d_adults)
gam_fit %>% summary()
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## body_fat ~ sex + s(BMI, by = sex) + s(age, by = sex)
## 
## Parametric coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.391985   0.001145  342.43   <2e-16 ***
## sexmale     -0.114842   0.001666  -68.94   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                    edf Ref.df      F  p-value    
## s(BMI):sexfemale 4.994  6.108 347.05  < 2e-16 ***
## s(BMI):sexmale   3.391  4.269 340.50  < 2e-16 ***
## s(age):sexfemale 2.079  2.594  14.89 1.14e-07 ***
## s(age):sexmale   1.000  1.000  30.50  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.793   Deviance explained = 79.4%
## GCV = 0.0015167  Scale est. = 0.0015076  n = 2240
plot(ggeffects::ggpredict(gam_fit, terms = c("BMI", "sex"))) +
  labs(
    title = "Predict body fat % from age, sex, BMI using splines",
    subtitle = "NHANES 2018, US adults",
    y = "Body fat % (x ray)"
  ) +
  scale_y_continuous(labels = scales::percent) +
  theme_bw()
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.

GG_save("figs/BF from BMI by sex gam.png")

#whr
d_adults %>% 
  ggplot(aes(whr, body_fat, color = sex)) +
  geom_point(alpha = .1) +
  geom_smooth() +
  geom_smooth(method = lm, linetype = "dotted") +
  scale_y_continuous("Body fat % (x-ray)", labels = scales::percent) +
  # scale_x_continuous() +
  labs(
    title = "Body fat % as predicted by WHR",
    subtitle = "NHANES 2018, US adults",
    x = "Waist-height ratio"
  )
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 3344 rows containing non-finite values (`stat_smooth()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 3344 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 3344 rows containing missing values (`geom_point()`).

GG_save("figs/body fat from WHR sex.png")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 3344 rows containing non-finite values (`stat_smooth()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 3344 rows containing non-finite values (`stat_smooth()`).
## Removed 3344 rows containing missing values (`geom_point()`).
#correlations
d_adults %>% 
  group_by(sex) %>% 
  summarise(r = cor(whr, body_fat, use = "pair"))
#ols regression
lm(body_fat ~ whr * sex + age, data = d_adults) %>% summary()
## 
## Call:
## lm(formula = body_fat ~ whr * sex + age, data = d_adults)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.155130 -0.025113  0.000391  0.024315  0.187796 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.003e-01  6.662e-03  15.062  < 2e-16 ***
## whr          4.832e-01  1.057e-02  45.714  < 2e-16 ***
## sexmale     -1.585e-01  9.883e-03 -16.040  < 2e-16 ***
## age          7.838e-05  7.051e-05   1.112    0.266    
## whr:sexmale  9.655e-02  1.678e-02   5.753 9.95e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03767 on 2220 degrees of freedom
##   (3344 observations deleted due to missingness)
## Multiple R-squared:  0.8054, Adjusted R-squared:  0.805 
## F-statistic:  2296 on 4 and 2220 DF,  p-value: < 2.2e-16
#gam
gam_fit = gam(body_fat ~ sex + s(whr, by = sex) + s(age, by = sex), data = d_adults)
gam_fit %>% summary()
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## body_fat ~ sex + s(whr, by = sex) + s(age, by = sex)
## 
## Parametric coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.386650   0.001040  371.85   <2e-16 ***
## sexmale     -0.104128   0.001519  -68.57   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                    edf Ref.df       F p-value    
## s(whr):sexfemale 4.819  5.981 436.013 < 2e-16 ***
## s(whr):sexmale   5.320  6.389 347.318 < 2e-16 ***
## s(age):sexfemale 1.463  1.790   0.576 0.43163    
## s(age):sexmale   2.105  2.627   4.315 0.00893 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.831   Deviance explained = 83.2%
## GCV = 0.0012393  Scale est. = 0.0012305  n = 2225
plot(ggeffects::ggpredict(gam_fit, terms = c("whr", "sex"))) +
  labs(
    title = "Predict body fat % from age, sex, WHR using splines",
    subtitle = "NHANES 2018, US adults",
    y = "Body fat % (x ray)",
    x = "Waist-height ratio"
  ) +
  scale_y_continuous(labels = scales::percent) +
  theme_bw()
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.

GG_save("figs/BF from WHR by sex gam.png")

Whites only

#whites only
d_white_adults %>% 
  ggplot(aes(BMI, body_fat, color = sex)) +
  geom_point(alpha = .1) +
  geom_smooth() +
  geom_smooth(method = lm, linetype = "dotted") +
  scale_y_continuous("Body fat % (x-ray)", labels = scales::percent) +
  scale_x_continuous(limits = c(NA, 60)) +
  labs(
    title = "Body fat % as predicted by BMI",
    subtitle = "NHANES 2018, US adults"
  )
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 1255 rows containing non-finite values (`stat_smooth()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1255 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1255 rows containing missing values (`geom_point()`).

#correlations
d_white_adults %>% 
  group_by(sex) %>% 
  summarise(r = cor(BMI, body_fat, use = "pair"))
#ols regression
lm(body_fat ~ BMI * sex + age, data = d_white_adults) %>% summary()
## 
## Call:
## lm(formula = body_fat ~ BMI * sex + age, data = d_white_adults)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.177487 -0.026754  0.001141  0.027726  0.147798 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.1604735  0.0096218  16.678  < 2e-16 ***
## BMI          0.0067636  0.0002709  24.965  < 2e-16 ***
## sexmale     -0.1389525  0.0138791 -10.012  < 2e-16 ***
## age          0.0007441  0.0001379   5.395 9.46e-08 ***
## BMI:sexmale  0.0011454  0.0004644   2.466   0.0139 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04186 on 677 degrees of freedom
##   (1253 observations deleted due to missingness)
## Multiple R-squared:  0.7645, Adjusted R-squared:  0.7631 
## F-statistic: 549.4 on 4 and 677 DF,  p-value: < 2.2e-16
#gam
gam_fit = gam(body_fat ~ sex + s(BMI, k = 5, by = sex) + s(age, by = sex), data = d_white_adults)
gam_fit %>% summary()
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## body_fat ~ sex + s(BMI, k = 5, by = sex) + s(age, by = sex)
## 
## Parametric coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.389096   0.002018  192.85   <2e-16 ***
## sexmale     -0.108768   0.003035  -35.83   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                    edf Ref.df       F  p-value    
## s(BMI):sexfemale 3.540  3.885 208.241  < 2e-16 ***
## s(BMI):sexmale   2.103  2.567 200.156  < 2e-16 ***
## s(age):sexfemale 1.309  1.555   9.877 0.000261 ***
## s(age):sexmale   1.635  2.028   2.439 0.083678 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.795   Deviance explained = 79.8%
## GCV = 0.0015369  Scale est. = 0.001513  n = 682
plot(ggeffects::ggpredict(gam_fit, terms = c("BMI", "sex"))) +
  labs(
    title = "Predict body fat % from age, sex, BMI",
    subtitle = "NHANES 2018, US adults",
    y = "Body fat % (x ray)"
  ) +
  scale_y_continuous(labels = scales::percent)
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.

Normal distribution origins

#simulate data with increasing number of independent causes
set.seed(1)
d_norm = map_dfr(c(1:11, 100), \(x) {
  tibble(
    rolls = x,
    mean = rbinom(1000, size = x, prob = 1/2) / x
  )
})

#plot hist with facets
d_norm %>% 
  ggplot(aes(mean)) + 
  geom_histogram(bins = 21) +
  facet_wrap("rolls", scales = c("free_y")) +
  labs(
    title = "From binomial distribution to normal distribution"
  )

GG_save("figs/binom to norm.png")

Moving BMI distribution and obesity rate

bmi_stats = describe2(
  d_adults$BMI,
  d_adults$sex
)
## New names:
## • `` -> `...1`
bmi_stats
#simulate norm data with moving mean
set.seed(1)
d_bmi = map_dfr(seq(22.5, 50, length.out = 9) %>% round(1), \(x) {
  tibble(
    mean_BMI = x %>% as.factor(),
    BMI = rnorm(1000, mean = x, sd = bmi_stats$sd[2]),
    obese = (BMI >= 30) %>% as.factor()
  )
})

#manually
d_bmi_dens <- d_bmi %>% 
  group_split(mean_BMI) %>% 
  map_dfr(\(dd) {
    dens = density(dd$BMI)
    
    density_df <- tibble(
      BMI = dens$x,
      density = dens$y
    ) %>% 
      mutate(
        obese = BMI >= 30,
        mean_BMI = dd$mean_BMI[1]
        )
  })

#facet plot
d_bmi_dens %>% 
  ggplot(aes(BMI, y = density, fill = obese)) +
  geom_area(aes(fill = obese), alpha = 0.7) +
  geom_vline(xintercept = 30, linetype = "dotted") +
  facet_wrap("mean_BMI") +
  labs(
    title = "Obesity rate as function of mean BMI changes"
  )

GG_save("figs/BMI means moving obesity rate.png")

#many distributions, but only summary stats
set.seed(1)
d_bmi2 = map_dfr(seq(18, 50, length.out = 100), \(x) {
  tibble(
    mean_BMI = x,
    obesity_rate = rnorm(1e5, mean = x, sd = bmi_stats$sd[2]) %>% is_weakly_greater_than(30) %>% mean()
  )
})

d_bmi2 %>% 
  ggplot(aes(mean_BMI, obesity_rate)) +
  geom_line() +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "Obesity rate as function of changes in mean BMI",
    x = str_glue("Mean BMI (SD = {round(bmi_stats$sd[2], 1)})"),
    y = "Obesity rate"
  )

GG_save("figs/BMI means moving obesity rate 2.png")

BMI dist 2000 vs. 2018

#compare 2000 vs 2018
d_combo = bind_rows(
  d_adults %>% select(BMI, age) %>% mutate(survey_year = 2018),
  d2000_adult %>% select(BMI, age) %>% mutate(survey_year = 2000)
) %>% 
  filter(age <= 60) %>% 
  mutate(
    obese = BMI >= 30
  )

GG_denhist(d_combo, "BMI", "survey_year", vline = NULL) +
  geom_vline(xintercept = 30, linetype = "dotted") +
  scale_x_continuous(breaks = seq(0, 100, by = 5)) +
  labs(
    title = "BMI distribution in 2000 and 2018",
    subtitle = "NHANES, American adults, aged 20-60",
    fill = "Survey year"
  )
## Warning in GG_denhist(d_combo, "BMI", "survey_year", vline = NULL): There were
## groups without any data. These were removed
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figs/BMI dist 2000 vs 2018.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#numbers
describe2(
  d_combo %>% select(-survey_year),
  d_combo$survey_year
)

Meta

write_sessioninfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 21.2
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggeffects_2.0.0       mgcv_1.8-39           nlme_3.1-155         
##  [4] haven_2.5.3           kirkegaard_2023-08-04 psych_2.3.9          
##  [7] assertthat_0.2.1      weights_1.0.4         Hmisc_5.1-1          
## [10] magrittr_2.0.3        lubridate_1.9.3       forcats_1.0.0        
## [13] stringr_1.5.0         dplyr_1.1.3           purrr_1.0.2          
## [16] readr_2.1.4           tidyr_1.3.0           tibble_3.2.1         
## [19] ggplot2_3.4.3         tidyverse_2.0.0      
## 
## loaded via a namespace (and not attached):
##  [1] insight_1.0.0     tools_4.1.2       backports_1.4.1   bslib_0.5.1      
##  [5] utf8_1.2.3        R6_2.5.1          rpart_4.1.16      colorspace_2.1-0 
##  [9] jomo_2.7-6        nnet_7.3-17       withr_2.5.1       tidyselect_1.2.0 
## [13] gridExtra_2.3     mnormt_2.1.1      compiler_4.1.2    textshaping_0.3.7
## [17] glmnet_4.1-8      cli_3.6.1         htmlTable_2.4.1   mice_3.16.0      
## [21] labeling_0.4.3    sass_0.4.7        scales_1.2.1      checkmate_2.2.0  
## [25] systemfonts_1.0.5 digest_0.6.33     foreign_0.8-82    minqa_1.2.6      
## [29] rmarkdown_2.25    base64enc_0.1-3   pkgconfig_2.0.3   htmltools_0.5.6.1
## [33] lme4_1.1-34       fastmap_1.1.1     htmlwidgets_1.6.2 rlang_1.1.1      
## [37] rstudioapi_0.15.0 farver_2.1.1      shape_1.4.6       jquerylib_0.1.4  
## [41] generics_0.1.3    jsonlite_1.8.7    gtools_3.9.4      Formula_1.2-5    
## [45] Matrix_1.4-0      Rcpp_1.0.11       munsell_0.5.0     fansi_1.0.5      
## [49] lifecycle_1.0.3   stringi_1.7.12    yaml_2.3.7        MASS_7.3-55      
## [53] plyr_1.8.9        grid_4.1.2        parallel_4.1.2    gdata_2.19.0     
## [57] mitml_0.4-5       lattice_0.20-45   splines_4.1.2     hms_1.1.3        
## [61] knitr_1.44        pillar_1.9.0      boot_1.3-28       codetools_0.2-18 
## [65] pan_1.9           glue_1.6.2        evaluate_0.22     data.table_1.14.8
## [69] vctrs_0.6.3       nloptr_2.0.3      tzdb_0.4.0        foreach_1.5.2    
## [73] gtable_0.3.4      datawizard_0.13.0 cachem_1.0.8      xfun_0.40        
## [77] broom_1.0.5       ragg_1.2.6        survival_3.2-13   iterators_1.0.14 
## [81] cluster_2.1.2     timechange_0.2.0