Init

library(kirkegaard)

## Loading required package: tidyverse

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
## 
## 
## Attaching package: 'magrittr'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## 
## Loading required package: weights
## 
## Loading required package: Hmisc
## 
## 
## Attaching package: 'Hmisc'
## 
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## 
## 
## Loading required package: assertthat
## 
## 
## Attaching package: 'assertthat'
## 
## 
## The following object is masked from 'package:tibble':
## 
##     has_name
## 
## 
## Loading required package: psych
## 
## 
## Attaching package: 'psych'
## 
## 
## The following object is masked from 'package:Hmisc':
## 
##     describe
## 
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## 
## 
## 
## Attaching package: 'kirkegaard'
## 
## 
## The following object is masked from 'package:psych':
## 
##     rescale
## 
## 
## The following object is masked from 'package:assertthat':
## 
##     are_equal
## 
## 
## The following object is masked from 'package:purrr':
## 
##     is_logical
## 
## 
## The following object is masked from 'package:base':
## 
##     +

load_packages(
  haven,
  mgcv,
  ggeffects
)

## Loading required package: nlme
## 
## Attaching package: 'nlme'
## 
## The following object is masked from 'package:dplyr':
## 
##     collapse
## 
## This is mgcv 1.8-39. For overview type 'help("mgcv-package")'.

#theme
theme_set(theme_bw())

Data

nhanes2018_files = dir("data/2018", full.names = T)
nhanes2018 = map(nhanes2018_files, read_xpt)

#which is the id variable?
nhanes2018[[1]]$SEQN %>% head()

## [1] 93703 93704 93705 93706 93707 93708

#join each dataset by id
d = reduce(nhanes2018, .f = full_join, by = "SEQN")

#variable list
d_vars = df_var_table(d)

2000

nhanes2000_files = dir("data/2000", full.names = T)
nhanes2000 = map(nhanes2000_files, read_xpt)

#which is the id variable?
nhanes2000[[1]]$SEQN %>% head()

## [1] 1 2 3 4 5 6

#join each dataset by id
d2000 = reduce(nhanes2000, .f = full_join, by = "SEQN")

#variable list
d2000_vars = df_var_table(d2000)

#BMI
d2000$BMI = d2000$BMXBMI %>% as.numeric()

#age
d2000$age = d2000$RIDAGEYR %>% as.numeric()

#subsets
d2000_adult = d2000 %>% filter(age >= 20)

Recode

#body fat
d$body_fat = (d$DXDTOFAT/1000 / d$BMXWT) %>% as.numeric()

#BMI
d$BMI = d$BMXBMI %>% as.numeric()

#weight
d$weight = d$BMXWT %>% as.numeric()

#height
d$height = d$BMXHT %>% as.numeric()

#alternative BMI's
BMI_scaling_factors = seq(1, 3, by = .1)
for (x in BMI_scaling_factors) {
  d[[str_glue("BMI_" + x) %>% str_legalize()]] = d$weight / (d$height/100)^x
}

#waist
d$waist = d$BMXWAIST %>% as.numeric()

#height to waist ratio
d$whr = d$waist / d$height

#sex
d$sex = d$RIAGENDR %>% mapvalues(1:2, c("male", "female")) %>% factor()

#age
d$age = d$RIDAGEYR %>% as.numeric()

#race
d$race = d$RIDRETH3 %>% case_match(
  1 ~ "Mexican",
  2 ~ "Other Hispanic",
  3 ~ "White",
  4 ~ "Black",
  6 ~ "Asian",
  7 ~ "Other_multiple"
)

#combined sex and race
d$sex_race = d$sex + " " + d$race

#adults only
d_adults = d %>% filter(age >= 20)

#white adults only
d_white_adults = d %>% filter(age >= 20, race == "White")

Analysis

BMI and weight

#weight
d_adults %>% 
  GG_denhist("weight", "sex") +
  scale_x_continuous("Weight (kg)") +
    labs(
      title = "Weight by sex",
      subtitle = "NHANES 2018, US adults"
    )

## Warning in GG_denhist(., "weight", "sex"): There were groups without any data.
## These were removed

## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figs/weight by sex.png")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#height
d_adults %>% 
  GG_denhist("height", "sex") +
  scale_x_continuous("Height (cm)") +
    labs(
      title = "Height by sex",
      subtitle = "NHANES 2018, US adults"
    )

## Warning in GG_denhist(., "height", "sex"): There were groups without any data.
## These were removed

## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figs/height by sex.png")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#numbers
describe2(
  d_adults$height,
  d_adults$sex
)

## New names:
## • `` -> `...1`

describe2(
  d_adults$height,
  d_adults$sex_race
)

## New names:
## • `` -> `...1`

SMD_matrix(d_adults$height, d_adults$sex_race)

##                       female Asian female Black female Mexican
## female Asian                    NA -0.860565854    -0.06929056
## female Black           -0.86056585           NA     0.79127530
## female Mexican         -0.06929056  0.791275296             NA
## female Other Hispanic  -0.18064197  0.679923882    -0.11135141
## female Other_multiple  -0.85804019  0.002525667    -0.78874963
## female White           -0.73902899  0.121536863    -0.66973843
## male Asian             -1.99783459 -1.137268735    -1.92854403
## male Black             -2.83701959 -1.976453734    -2.76772903
## male Mexican           -2.01398084 -1.153414985    -1.94469028
## male Other Hispanic    -1.91009684 -1.049530987    -1.84080628
## male Other_multiple    -2.90301670 -2.042450850    -2.83372615
## male White             -2.74905492 -1.888489064    -2.67976436
##                       female Other Hispanic female Other_multiple female White
## female Asian                     -0.1806420          -0.858040188   -0.7390290
## female Black                      0.6799239           0.002525667    0.1215369
## female Mexican                   -0.1113514          -0.788749629   -0.6697384
## female Other Hispanic                    NA          -0.677398216   -0.5583870
## female Other_multiple            -0.6773982                    NA    0.1190112
## female White                     -0.5583870           0.119011196           NA
## male Asian                       -1.8171926          -1.139794401   -1.2588056
## male Black                       -2.6563776          -1.978979400   -2.0979906
## male Mexican                     -1.8333389          -1.155940651   -1.2749518
## male Other Hispanic              -1.7294549          -1.052056654   -1.1710678
## male Other_multiple              -2.7223747          -2.044976517   -2.1639877
## male White                       -2.5684129          -1.891014730   -2.0100259
##                        male Asian  male Black male Mexican male Other Hispanic
## female Asian          -1.99783459 -2.83701959  -2.01398084         -1.91009684
## female Black          -1.13726873 -1.97645373  -1.15341498         -1.04953099
## female Mexican        -1.92854403 -2.76772903  -1.94469028         -1.84080628
## female Other Hispanic -1.81719262 -2.65637762  -1.83333887         -1.72945487
## female Other_multiple -1.13979440 -1.97897940  -1.15594065         -1.05205665
## female White          -1.25880560 -2.09799060  -1.27495185         -1.17106785
## male Asian                     NA -0.83918500  -0.01614625          0.08773775
## male Black            -0.83918500          NA   0.82303875          0.92692275
## male Mexican          -0.01614625  0.82303875           NA          0.10388400
## male Other Hispanic    0.08773775  0.92692275   0.10388400                  NA
## male Other_multiple   -0.90518212 -0.06599712  -0.88903587         -0.99291986
## male White            -0.75122033  0.08796467  -0.73507408         -0.83895808
##                       male Other_multiple  male White
## female Asian                  -2.90301670 -2.74905492
## female Black                  -2.04245085 -1.88848906
## female Mexican                -2.83372615 -2.67976436
## female Other Hispanic         -2.72237473 -2.56841295
## female Other_multiple         -2.04497652 -1.89101473
## female White                  -2.16398771 -2.01002593
## male Asian                    -0.90518212 -0.75122033
## male Black                    -0.06599712  0.08796467
## male Mexican                  -0.88903587 -0.73507408
## male Other Hispanic           -0.99291986 -0.83895808
## male Other_multiple                    NA  0.15396179
## male White                     0.15396179          NA

#BMI
d_adults %>% 
  GG_denhist("BMI", "sex") +
  scale_x_continuous("BMI (weight / height²)") +
    labs(
      title = "BMI by sex",
      subtitle = "NHANES 2018, US adults"
    )

## Warning in GG_denhist(., "BMI", "sex"): There were groups without any data.
## These were removed

## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figs/BMI by sex.png")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

describe2(
  d_adults$BMI,
  d_adults$sex_race
)

## New names:
## • `` -> `...1`

Spline residualization

#can we outperform the BMI with splines?
weight_fit = gam(weight ~ sex + s(height, by = sex), data = d_adults, na.action = "na.omit")
weight_fit %>% summary()

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## weight ~ sex + s(height, by = sex)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  83.9639     0.7936 105.796  < 2e-16 ***
## sexmale      -4.3422     0.9698  -4.478 7.71e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                      edf Ref.df      F p-value    
## s(height):sexfemale 3.58  4.464  80.23  <2e-16 ***
## s(height):sexmale   1.00  1.000 579.03  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.204   Deviance explained = 20.5%
## GCV = 419.02  Scale est. = 418.49    n = 5175

d_adults$weight_resid = NA
d_adults$weight_resid[-weight_fit$na.action] = resid(weight_fit)

plot(ggeffects::ggpredict(weight_fit, terms = c("height", "sex"))) +
  labs(
    title = "Predict body fat % from sex, height using splines",
    subtitle = "NHANES 2018, US adults",
    y = "Body fat % (x ray)"
  ) +
  scale_y_continuous(labels = scales::percent) +
  theme_bw()

## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.

#predict BF% from residuals
d_adults %>% 
  ggplot(aes(weight_resid, body_fat, color = sex)) +
  geom_point(alpha = .3) +
  geom_smooth() +
  scale_x_continuous(limits = c(NA, 80)) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "Spline residualization of weight from height, by sex",
    subtitle = "NHANES 2018, US adults",
    x = "Weight residual (kg)",
    y = "Body fat % (x ray)"
  )

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## Warning: Removed 3331 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 3331 rows containing missing values (`geom_point()`).

GG_save("figs/spline resid weight BF.png")

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## Warning: Removed 3331 rows containing non-finite values (`stat_smooth()`).
## Removed 3331 rows containing missing values (`geom_point()`).

#correlations
d_adults %>% 
  group_by(sex) %>% 
  summarise(r = cor(weight_resid, body_fat, use = "pair"))

Alternative BMIs

#compute correlations for each alternative BMI
BMI_alts = tibble(
  var = d %>% select(BMI_1:BMI_3) %>% names(),
  scaling_factor = BMI_scaling_factors
) %>% mutate(
  cor_BF_male = d_adults %>% filter(sex == "male") %>% select(body_fat, BMI_1:BMI_3) %>% cor(use = "pair") %>% extract(1, -1) %>% as.numeric(),
  cor_BF_female = d_adults %>% filter(sex == "female") %>% select(body_fat, BMI_1:BMI_3) %>% cor(use = "pair") %>% extract(1, -1) %>% as.numeric()
) %>% 
  pivot_longer(cols = cor_BF_male:cor_BF_female) %>% 
  mutate(
    sex = str_remove(name, "cor_BF_")
  ) 

BMI_alts %>% 
  ggplot(aes(scaling_factor, value, color = sex)) +
  geom_line() +
  labs(
    title = "Alternative BMI scaling factors",
    subtitle = "NHANES 2018, US adults",
    x = "Scaling factor",
    y = "Correlation with body fat %",
    color = "Sex"
  )

GG_save("figs/alt BMIs.png")

BMI_alts %>% 
  group_by(sex) %>% 
  slice_max(value, n = 1)

Body fat

#by by sex
GG_denhist(d_adults, "body_fat", "sex", auto_fraction_bounary = F) +
  scale_x_continuous("Body fat % (x-ray)", labels = scales::percent, limits = c(.1, NA)) +
  labs(
    title = "Body fat % by sex",
    subtitle = "NHANES 2018, US adults"
  )

## Warning in GG_denhist(d_adults, "body_fat", "sex", auto_fraction_bounary = F):
## There were groups without any data. These were removed

## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 2 rows containing missing values (`geom_bar()`).

GG_save("figs/body fat sex.png")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 2 rows containing missing values (`geom_bar()`).

#relationship to BMI by sex
d_adults %>% 
  ggplot(aes(BMI, body_fat, color = sex)) +
  geom_point(alpha = .1) +
  geom_smooth() +
  geom_smooth(method = lm, linetype = "dotted") +
  scale_y_continuous("Body fat % (x-ray)", labels = scales::percent) +
  scale_x_continuous(limits = c(NA, 60)) +
  labs(
    title = "Body fat % as predicted by BMI",
    subtitle = "NHANES 2018, US adults"
  )

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## Warning: Removed 3331 rows containing non-finite values (`stat_smooth()`).

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 3331 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 3331 rows containing missing values (`geom_point()`).

GG_save("figs/body fat from BMI sex.png")

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## Warning: Removed 3331 rows containing non-finite values (`stat_smooth()`).

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 3331 rows containing non-finite values (`stat_smooth()`).
## Removed 3331 rows containing missing values (`geom_point()`).

#correlations
d_adults %>% 
  group_by(sex) %>% 
  summarise(r = cor(BMI, body_fat, use = "pair"))

#ols regression
lm(body_fat ~ BMI * sex + age, data = d_adults) %>% summary()

## 
## Call:
## lm(formula = body_fat ~ BMI * sex + age, data = d_adults)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.180717 -0.026588  0.001356  0.027483  0.212488 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.694e-01  5.614e-03  30.175  < 2e-16 ***
## BMI          6.609e-03  1.616e-04  40.888  < 2e-16 ***
## sexmale     -1.387e-01  7.923e-03 -17.501  < 2e-16 ***
## age          7.082e-04  7.535e-05   9.399  < 2e-16 ***
## BMI:sexmale  9.446e-04  2.666e-04   3.543 0.000403 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04122 on 2235 degrees of freedom
##   (3329 observations deleted due to missingness)
## Multiple R-squared:  0.7674, Adjusted R-squared:  0.767 
## F-statistic:  1844 on 4 and 2235 DF,  p-value: < 2.2e-16

#gam
gam_fit = gam(body_fat ~ sex + s(BMI, by = sex) + s(age, by = sex), data = d_adults)
gam_fit %>% summary()

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## body_fat ~ sex + s(BMI, by = sex) + s(age, by = sex)
## 
## Parametric coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.391985   0.001145  342.43   <2e-16 ***
## sexmale     -0.114842   0.001666  -68.94   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                    edf Ref.df      F  p-value    
## s(BMI):sexfemale 4.994  6.108 347.05  < 2e-16 ***
## s(BMI):sexmale   3.391  4.269 340.50  < 2e-16 ***
## s(age):sexfemale 2.079  2.594  14.89 1.14e-07 ***
## s(age):sexmale   1.000  1.000  30.50  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.793   Deviance explained = 79.4%
## GCV = 0.0015167  Scale est. = 0.0015076  n = 2240

plot(ggeffects::ggpredict(gam_fit, terms = c("BMI", "sex"))) +
  labs(
    title = "Predict body fat % from age, sex, BMI using splines",
    subtitle = "NHANES 2018, US adults",
    y = "Body fat % (x ray)"
  ) +
  scale_y_continuous(labels = scales::percent) +
  theme_bw()

## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.

GG_save("figs/BF from BMI by sex gam.png")

#whr
d_adults %>% 
  ggplot(aes(whr, body_fat, color = sex)) +
  geom_point(alpha = .1) +
  geom_smooth() +
  geom_smooth(method = lm, linetype = "dotted") +
  scale_y_continuous("Body fat % (x-ray)", labels = scales::percent) +
  # scale_x_continuous() +
  labs(
    title = "Body fat % as predicted by WHR",
    subtitle = "NHANES 2018, US adults",
    x = "Waist-height ratio"
  )

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## Warning: Removed 3344 rows containing non-finite values (`stat_smooth()`).

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 3344 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 3344 rows containing missing values (`geom_point()`).

GG_save("figs/body fat from WHR sex.png")

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## Warning: Removed 3344 rows containing non-finite values (`stat_smooth()`).

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 3344 rows containing non-finite values (`stat_smooth()`).
## Removed 3344 rows containing missing values (`geom_point()`).

#correlations
d_adults %>% 
  group_by(sex) %>% 
  summarise(r = cor(whr, body_fat, use = "pair"))

#ols regression
lm(body_fat ~ whr * sex + age, data = d_adults) %>% summary()

## 
## Call:
## lm(formula = body_fat ~ whr * sex + age, data = d_adults)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.155130 -0.025113  0.000391  0.024315  0.187796 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.003e-01  6.662e-03  15.062  < 2e-16 ***
## whr          4.832e-01  1.057e-02  45.714  < 2e-16 ***
## sexmale     -1.585e-01  9.883e-03 -16.040  < 2e-16 ***
## age          7.838e-05  7.051e-05   1.112    0.266    
## whr:sexmale  9.655e-02  1.678e-02   5.753 9.95e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03767 on 2220 degrees of freedom
##   (3344 observations deleted due to missingness)
## Multiple R-squared:  0.8054, Adjusted R-squared:  0.805 
## F-statistic:  2296 on 4 and 2220 DF,  p-value: < 2.2e-16

#gam
gam_fit = gam(body_fat ~ sex + s(whr, by = sex) + s(age, by = sex), data = d_adults)
gam_fit %>% summary()

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## body_fat ~ sex + s(whr, by = sex) + s(age, by = sex)
## 
## Parametric coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.386650   0.001040  371.85   <2e-16 ***
## sexmale     -0.104128   0.001519  -68.57   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                    edf Ref.df       F p-value    
## s(whr):sexfemale 4.819  5.981 436.013 < 2e-16 ***
## s(whr):sexmale   5.320  6.389 347.318 < 2e-16 ***
## s(age):sexfemale 1.463  1.790   0.576 0.43163    
## s(age):sexmale   2.105  2.627   4.315 0.00893 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.831   Deviance explained = 83.2%
## GCV = 0.0012393  Scale est. = 0.0012305  n = 2225

plot(ggeffects::ggpredict(gam_fit, terms = c("whr", "sex"))) +
  labs(
    title = "Predict body fat % from age, sex, WHR using splines",
    subtitle = "NHANES 2018, US adults",
    y = "Body fat % (x ray)",
    x = "Waist-height ratio"
  ) +
  scale_y_continuous(labels = scales::percent) +
  theme_bw()

## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.

GG_save("figs/BF from WHR by sex gam.png")

Whites only

#whites only
d_white_adults %>% 
  ggplot(aes(BMI, body_fat, color = sex)) +
  geom_point(alpha = .1) +
  geom_smooth() +
  geom_smooth(method = lm, linetype = "dotted") +
  scale_y_continuous("Body fat % (x-ray)", labels = scales::percent) +
  scale_x_continuous(limits = c(NA, 60)) +
  labs(
    title = "Body fat % as predicted by BMI",
    subtitle = "NHANES 2018, US adults"
  )

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## Warning: Removed 1255 rows containing non-finite values (`stat_smooth()`).

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 1255 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 1255 rows containing missing values (`geom_point()`).

#correlations
d_white_adults %>% 
  group_by(sex) %>% 
  summarise(r = cor(BMI, body_fat, use = "pair"))

#ols regression
lm(body_fat ~ BMI * sex + age, data = d_white_adults) %>% summary()

## 
## Call:
## lm(formula = body_fat ~ BMI * sex + age, data = d_white_adults)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.177487 -0.026754  0.001141  0.027726  0.147798 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.1604735  0.0096218  16.678  < 2e-16 ***
## BMI          0.0067636  0.0002709  24.965  < 2e-16 ***
## sexmale     -0.1389525  0.0138791 -10.012  < 2e-16 ***
## age          0.0007441  0.0001379   5.395 9.46e-08 ***
## BMI:sexmale  0.0011454  0.0004644   2.466   0.0139 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04186 on 677 degrees of freedom
##   (1253 observations deleted due to missingness)
## Multiple R-squared:  0.7645, Adjusted R-squared:  0.7631 
## F-statistic: 549.4 on 4 and 677 DF,  p-value: < 2.2e-16

#gam
gam_fit = gam(body_fat ~ sex + s(BMI, k = 5, by = sex) + s(age, by = sex), data = d_white_adults)
gam_fit %>% summary()

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## body_fat ~ sex + s(BMI, k = 5, by = sex) + s(age, by = sex)
## 
## Parametric coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.389096   0.002018  192.85   <2e-16 ***
## sexmale     -0.108768   0.003035  -35.83   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                    edf Ref.df       F  p-value    
## s(BMI):sexfemale 3.540  3.885 208.241  < 2e-16 ***
## s(BMI):sexmale   2.103  2.567 200.156  < 2e-16 ***
## s(age):sexfemale 1.309  1.555   9.877 0.000261 ***
## s(age):sexmale   1.635  2.028   2.439 0.083678 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.795   Deviance explained = 79.8%
## GCV = 0.0015369  Scale est. = 0.001513  n = 682

plot(ggeffects::ggpredict(gam_fit, terms = c("BMI", "sex"))) +
  labs(
    title = "Predict body fat % from age, sex, BMI",
    subtitle = "NHANES 2018, US adults",
    y = "Body fat % (x ray)"
  ) +
  scale_y_continuous(labels = scales::percent)

## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.

Normal distribution origins

#simulate data with increasing number of independent causes
set.seed(1)
d_norm = map_dfr(c(1:11, 100), \(x) {
  tibble(
    rolls = x,
    mean = rbinom(1000, size = x, prob = 1/2) / x
  )
})

#plot hist with facets
d_norm %>% 
  ggplot(aes(mean)) + 
  geom_histogram(bins = 21) +
  facet_wrap("rolls", scales = c("free_y")) +
  labs(
    title = "From binomial distribution to normal distribution"
  )

GG_save("figs/binom to norm.png")

Moving BMI distribution and obesity rate

bmi_stats = describe2(
  d_adults$BMI,
  d_adults$sex
)

## New names:
## • `` -> `...1`

bmi_stats

#simulate norm data with moving mean
set.seed(1)
d_bmi = map_dfr(seq(22.5, 50, length.out = 9) %>% round(1), \(x) {
  tibble(
    mean_BMI = x %>% as.factor(),
    BMI = rnorm(1000, mean = x, sd = bmi_stats$sd[2]),
    obese = (BMI >= 30) %>% as.factor()
  )
})

#manually
d_bmi_dens <- d_bmi %>% 
  group_split(mean_BMI) %>% 
  map_dfr(\(dd) {
    dens = density(dd$BMI)
    
    density_df <- tibble(
      BMI = dens$x,
      density = dens$y
    ) %>% 
      mutate(
        obese = BMI >= 30,
        mean_BMI = dd$mean_BMI[1]
        )
  })

#facet plot
d_bmi_dens %>% 
  ggplot(aes(BMI, y = density, fill = obese)) +
  geom_area(aes(fill = obese), alpha = 0.7) +
  geom_vline(xintercept = 30, linetype = "dotted") +
  facet_wrap("mean_BMI") +
  labs(
    title = "Obesity rate as function of mean BMI changes"
  )

GG_save("figs/BMI means moving obesity rate.png")

#many distributions, but only summary stats
set.seed(1)
d_bmi2 = map_dfr(seq(18, 50, length.out = 100), \(x) {
  tibble(
    mean_BMI = x,
    obesity_rate = rnorm(1e5, mean = x, sd = bmi_stats$sd[2]) %>% is_weakly_greater_than(30) %>% mean()
  )
})

d_bmi2 %>% 
  ggplot(aes(mean_BMI, obesity_rate)) +
  geom_line() +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "Obesity rate as function of changes in mean BMI",
    x = str_glue("Mean BMI (SD = {round(bmi_stats$sd[2], 1)})"),
    y = "Obesity rate"
  )

GG_save("figs/BMI means moving obesity rate 2.png")

BMI dist 2000 vs. 2018

#compare 2000 vs 2018
d_combo = bind_rows(
  d_adults %>% select(BMI, age) %>% mutate(survey_year = 2018),
  d2000_adult %>% select(BMI, age) %>% mutate(survey_year = 2000)
) %>% 
  filter(age <= 60) %>% 
  mutate(
    obese = BMI >= 30
  )

GG_denhist(d_combo, "BMI", "survey_year", vline = NULL) +
  geom_vline(xintercept = 30, linetype = "dotted") +
  scale_x_continuous(breaks = seq(0, 100, by = 5)) +
  labs(
    title = "BMI distribution in 2000 and 2018",
    subtitle = "NHANES, American adults, aged 20-60",
    fill = "Survey year"
  )

## Warning in GG_denhist(d_combo, "BMI", "survey_year", vline = NULL): There were
## groups without any data. These were removed

## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figs/BMI dist 2000 vs 2018.png")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#numbers
describe2(
  d_combo %>% select(-survey_year),
  d_combo$survey_year
)

BMI, body weight, body fat, secular changes, etc.

Init

Data

2000

Recode

Analysis

BMI and weight

Spline residualization

Alternative BMIs

Body fat

Whites only

Normal distribution origins

Moving BMI distribution and obesity rate

BMI dist 2000 vs. 2018

Meta