Init
options(
digits = 3
)
library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.3 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## Loading required package: weights
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
## Loading required package: assertthat
##
## Attaching package: 'assertthat'
## The following object is masked from 'package:tibble':
##
## has_name
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
## Loading required package: psych
##
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
##
## describe
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## Loading required package: metafor
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loading the 'metafor' package (version 3.0-2). For an
## introduction to the package please type: help(metafor)
## Loading required package: rlang
##
## Attaching package: 'rlang'
## The following object is masked from 'package:magrittr':
##
## set_names
## The following object is masked from 'package:assertthat':
##
## has_name
## The following objects are masked from 'package:purrr':
##
## %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
## flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
## splice
##
## Attaching package: 'kirkegaard'
## The following object is masked from 'package:rlang':
##
## is_logical
## The following object is masked from 'package:psych':
##
## rescale
## The following object is masked from 'package:assertthat':
##
## are_equal
## The following objects are masked from 'package:purrr':
##
## is_logical, is_numeric
## The following object is masked from 'package:base':
##
## +
load_packages(
rms,
readxl,
haven
)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
##
## Attaching package: 'rms'
## The following object is masked from 'package:metafor':
##
## vif
theme_set(theme_bw())
Recoding
d %<>% mutate(
pubic_hair_normal = (GP01N01 == 1),
penis_abnormal = (GP01N02 == 2),
prostate_enlarged = (GP01N11 == 2),
testis_right = GP01N05,
testis_left = GP01N06,
testis_avg = testis_right + testis_left,
testis_avg_z = (standardize(testis_right) + standardize(testis_left)) %>% standardize(focal_group = race == "White"),
#day of examination
GP_date_day = GP01002B,
GP_date_month = GP01002A,
GP_date_year = GP01002C %>% mapvalues(from = 5, to = 85),
GP_date_year = 1900 + GP_date_year,
GP_date = str_glue("{GP_date_year}-{GP_date_month}-{GP_date_day}") %>% as.Date(),
#recode cm into cm³
testis_right_orig = GP01N05,
testis_left_orig = GP01N06,
testosterone = LD6861,
testosterone_z = testosterone %>% standardize(focal_group = race == "White"),
homosexual = (MM010069 == 1),
nonheterosexual = (MM010430 == 0),
height_z = height %>% standardize(focal_group = race == "White"),
weight_z = weight %>% standardize(focal_group = race == "White"),
BMI_z = BMI %>% standardize(focal_group = race == "White"),
#semen variables
pct_motile = semen$v__MOTILITY,
pct_progressive = semen$v__PROGRESSIVE,
pct_small = semen$MS02006,
pct_normal = semen$MS01006,
pct_large = semen$MS03006,
pct_tapered = semen$MS04006,
pct_amorph = semen$MS05006,
semen_concentration = semen$CONCENTRATION0,
semen_volume = semen$VOL,
semen_count = semen$COUNT,
)
#fix wrong dates
d$GP_date %<>% mapvalues(c("1985-01-22", "1985-02-19", "1985-04-19") %>% as.Date(),
c("1986-01-22", "1986-02-19", "1986-04-19") %>% as.Date())
#look at dates
ggplot(d, aes(GP_date)) +
geom_histogram() +
scale_x_date(date_breaks = "1 month", guide = guide_axis(n.dodge = 2), labels = {function(x) {str_replace(as.character(x), "\\-01$", "")}})
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Testis conversion
#before 1st Jan 1986
d$testis_in_cm = d$GP_date < as.Date("1986-01-01")
d$testis_in_cm %>% table2()
#browse distribution
d %>% group_by(testis_in_cm) %>% do(table2(.$testis_left_orig)) %>% print(n=999)
## # A tibble: 30 × 4
## # Groups: testis_in_cm [2]
## testis_in_cm Group Count Percent
## <lgl> <chr> <dbl> <dbl>
## 1 FALSE 15 587 25.6
## 2 FALSE 20 513 22.4
## 3 FALSE 12 414 18.1
## 4 FALSE 25 382 16.7
## 5 FALSE 10 202 8.82
## 6 FALSE 8 86 3.76
## 7 FALSE 6 42 1.83
## 8 FALSE 4 15 0.655
## 9 FALSE 5 10 0.437
## 10 FALSE 2 9 0.393
## 11 FALSE 3 9 0.393
## 12 FALSE <NA> 8 0.349
## 13 FALSE 1 5 0.218
## 14 FALSE 7 2 0.0874
## 15 FALSE 18 2 0.0874
## 16 FALSE 9 1 0.0437
## 17 FALSE 11 1 0.0437
## 18 FALSE 23 1 0.0437
## 19 TRUE 4 1234 56.8
## 20 TRUE 5 538 24.8
## 21 TRUE 3 222 10.2
## 22 TRUE 6 108 4.97
## 23 TRUE 7 24 1.10
## 24 TRUE 2 20 0.920
## 25 TRUE <NA> 14 0.644
## 26 TRUE 8 6 0.276
## 27 TRUE 1 4 0.184
## 28 TRUE 9 1 0.0460
## 29 TRUE 10 1 0.0460
## 30 TRUE 20 1 0.0460
#plot
ggplot(d, aes(testis_left_orig)) +
geom_bar() +
facet_wrap(facets = "testis_in_cm", scales = c("free_x"), nrow = 2) +
scale_x_continuous(breaks = seq(1, 25))
## Warning: Removed 22 rows containing non-finite values (stat_count).

#test date of men with left testis == 20
d %>% filter(testis_left_orig == 20) %>% pull(GP_date) %>% table2(sort_descending = NULL)
d %>% filter(testis_left_orig == 15) %>% pull(GP_date) %>% table2(sort_descending = NULL)
d %>% filter(testis_left_orig == 10) %>% pull(GP_date) %>% table2(sort_descending = NULL)
#function to fix errors
fix_testis = function(value, testis_in_cm) {
#impossible values
#recode to nearest possible value
#or just discard
value[value == 23] = NA
value[value == 11] = NA
value[value == 18] = NA
value[value == 17] = NA
#these are unclear
value[value == 9] = NA
#wrongly coded values
#these are volumes
testis_in_cm[value == 25] = F
testis_in_cm[value == 20] = F
testis_in_cm[value == 15] = F
testis_in_cm[value == 12] = F
testis_in_cm[value == 10] = F
#these are sizs
testis_in_cm[value == 1] = T
testis_in_cm[value == 7] = T
tibble(
value,
testis_in_cm
)
}
#apply
d[c("testis_left", "testis_in_cm_left")] = fix_testis(d$testis_left_orig, d$testis_in_cm)
d[c("testis_right", "testis_in_cm_right")] = fix_testis(d$testis_right_orig, d$testis_in_cm)
#browse distribution again
d %>% group_by(testis_in_cm_left) %>% do(table2(.$testis_left)) %>% print(n=999)
## # A tibble: 21 × 4
## # Groups: testis_in_cm_left [2]
## testis_in_cm_left Group Count Percent
## <lgl> <chr> <dbl> <dbl>
## 1 FALSE 15 587 25.7
## 2 FALSE 20 514 22.5
## 3 FALSE 12 414 18.1
## 4 FALSE 25 382 16.7
## 5 FALSE 10 203 8.89
## 6 FALSE 8 86 3.77
## 7 FALSE 6 42 1.84
## 8 FALSE 4 15 0.657
## 9 FALSE <NA> 13 0.569
## 10 FALSE 5 10 0.438
## 11 FALSE 2 9 0.394
## 12 FALSE 3 9 0.394
## 13 TRUE 4 1234 56.7
## 14 TRUE 5 538 24.7
## 15 TRUE 3 222 10.2
## 16 TRUE 6 108 4.96
## 17 TRUE 7 26 1.19
## 18 TRUE 2 20 0.918
## 19 TRUE <NA> 15 0.689
## 20 TRUE 1 9 0.413
## 21 TRUE 8 6 0.275
#plot
ggplot(d, aes(testis_left)) +
geom_bar() +
facet_wrap(facets = "testis_in_cm_left", scales = c("free_x"), nrow = 2) +
scale_x_continuous(breaks = seq(1, 25))
## Warning: Removed 28 rows containing non-finite values (stat_count).

#right testicle
d %>% group_by(testis_in_cm_right) %>% do(table2(.$testis_right)) %>% print(n=999)
## # A tibble: 21 × 4
## # Groups: testis_in_cm_right [2]
## testis_in_cm_right Group Count Percent
## <lgl> <chr> <dbl> <dbl>
## 1 FALSE 20 639 27.9
## 2 FALSE 25 585 25.6
## 3 FALSE 15 539 23.5
## 4 FALSE 12 261 11.4
## 5 FALSE 10 128 5.59
## 6 FALSE 8 60 2.62
## 7 FALSE 6 24 1.05
## 8 FALSE <NA> 16 0.699
## 9 FALSE 3 12 0.524
## 10 FALSE 4 12 0.524
## 11 FALSE 2 10 0.437
## 12 FALSE 5 3 0.131
## 13 TRUE 4 1142 52.6
## 14 TRUE 5 625 28.8
## 15 TRUE 3 216 9.94
## 16 TRUE 6 118 5.43
## 17 TRUE 7 29 1.33
## 18 TRUE 2 21 0.966
## 19 TRUE <NA> 11 0.506
## 20 TRUE 8 6 0.276
## 21 TRUE 1 5 0.230
ggplot(d, aes(testis_right)) +
geom_bar() +
facet_wrap(facets = "testis_in_cm_right", scales = c("free_x"), nrow = 2) +
scale_x_continuous(breaks = seq(1, 25))
## Warning: Removed 27 rows containing non-finite values (stat_count).

#recode both to z scores within method and combine
d$testis_left[d$testis_in_cm_left] = d$testis_left[d$testis_in_cm_left] %>% standardize()
d$testis_left[!d$testis_in_cm_left] = d$testis_left[!d$testis_in_cm_left] %>% standardize()
d$testis_right[d$testis_in_cm_right] = d$testis_right[d$testis_in_cm_right] %>% standardize()
d$testis_right[!d$testis_in_cm_right] = d$testis_right[!d$testis_in_cm_right] %>% standardize()
#did everything work?!
ggplot(d, aes(testis_right, fill = testis_in_cm_right)) +
geom_histogram(position = "dodge")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 27 rows containing non-finite values (stat_bin).

#compare ball sizes across sides before and after conversions
d %>% filter(testis_in_cm) %>% GG_scatter("testis_left_orig", "testis_right_orig")
## `geom_smooth()` using formula 'y ~ x'

d %>% filter(!testis_in_cm) %>% GG_scatter("testis_left_orig", "testis_right_orig")
## `geom_smooth()` using formula 'y ~ x'

#after joining all data to same scale
GG_scatter(d, "testis_left", "testis_right")
## `geom_smooth()` using formula 'y ~ x'

GG_save("figs/testis_left_right.png")
## `geom_smooth()` using formula 'y ~ x'
GG_scatter(d, "testis_left", "testis_right", color = "testis_in_cm")
## `geom_smooth()` using formula 'y ~ x'

#average them
d$testis_avg_z = rowMeans(d %>% select(testis_right, testis_left)) %>% standardize()
Results
Simple
#race counts
d$race %>% table2()
#descriptives
d %>%
select(
#not so interesting because these are Z scores
testis_right, testis_left, testis_avg_z,
testosterone,
# pubic_hair_normal, penis_abnormal,
age,
HEIGHTCM, WEIGHTKG, BMI
) %>%
describe2()
#by race
d %>%
select(
testis_avg_z,
testosterone
) %>%
describeBy(group = d$race, mat = T) %>%
select(-item, -vars, -range, -skew, -kurtosis, -trimmed)
#correlations
d %>%
select(
testis_right, testis_left, testis_avg_z,
testosterone,
age,
HEIGHTCM, WEIGHTKG, BMI
) %>%
cor_matrix(p_val = T)
## testis_right testis_left testis_avg_z testosterone age
## testis_right "1" "0.80***" "0.95***" "0.02" "0.00"
## testis_left "0.80***" "1" "0.95***" "0.03" "0.00"
## testis_avg_z "0.95***" "0.95***" "1" "0.03" "0.00"
## testosterone "0.02" "0.03" "0.03" "1" "-0.19***"
## age "0.00" "0.00" "0.00" "-0.19***" "1"
## HEIGHTCM "0.10***" "0.09***" "0.10***" "-0.02" "0.01"
## WEIGHTKG "0.09***" "0.09***" "0.09***" "-0.32***" "0.06***"
## BMI "0.05***" "0.05***" "0.06***" "-0.34***" "0.06***"
## HEIGHTCM WEIGHTKG BMI
## testis_right "0.10***" "0.09***" "0.05***"
## testis_left "0.09***" "0.09***" "0.05***"
## testis_avg_z "0.10***" "0.09***" "0.06***"
## testosterone "-0.02" "-0.32***" "-0.34***"
## age "0.01" "0.06***" "0.06***"
## HEIGHTCM "1" "0.43***" "0.04*"
## WEIGHTKG "0.43***" "1" "0.81***"
## BMI "0.04*" "0.81***" "1"
Testis size
GG_group_means(d, "testis_avg_z", "race", type = "boxplot")
## Missing values were removed.

SMD_matrix(d$testis_avg_z, group = d$race, extended_output = T)
## $d
## White Black Hispanic Asian Native
## White NA 0.247 0.0682 0.788 -0.0749
## Black 0.2474 NA -0.1792 0.540 -0.3223
## Hispanic 0.0682 -0.179 NA 0.720 -0.1431
## Asian 0.7879 0.540 0.7197 NA -0.8628
## Native -0.0749 -0.322 -0.1431 -0.863 NA
##
## $d_string
## White Black Hispanic
## White NA "0.25 [0.16 0.34]" "0.068 [-0.075 0.21]"
## Black "0.25 [0.16 0.34]" NA "-0.18 [-0.34 -0.015]"
## Hispanic "0.068 [-0.075 0.21]" "-0.18 [-0.34 -0.015]" NA
## Asian "0.79 [0.45 1.13]" "0.54 [0.19 0.89]" "0.72 [0.35 1.09]"
## Native "-0.075 [-0.36 0.21]" "-0.32 [-0.62 -0.026]" "-0.14 [-0.46 0.17]"
## Asian Native
## White "0.79 [0.45 1.13]" "-0.075 [-0.36 0.21]"
## Black "0.54 [0.19 0.89]" "-0.32 [-0.62 -0.026]"
## Hispanic "0.72 [0.35 1.09]" "-0.14 [-0.46 0.17]"
## Asian NA "-0.86 [-1.32 -0.40]"
## Native "-0.86 [-1.32 -0.40]" NA
##
## $CI_lower
## White Black Hispanic Asian Native
## White NA 0.155 -0.0752 0.450 -0.360
## Black 0.1552 NA -0.3435 0.192 -0.619
## Hispanic -0.0752 -0.344 NA 0.350 -0.459
## Asian 0.4497 0.192 0.3498 NA -1.322
## Native -0.3596 -0.619 -0.4588 -1.322 NA
##
## $CI_upper
## White Black Hispanic Asian Native
## White NA 0.3397 0.2116 1.126 0.210
## Black 0.340 NA -0.0149 0.889 -0.026
## Hispanic 0.212 -0.0149 NA 1.090 0.173
## Asian 1.126 0.8889 1.0896 NA -0.404
## Native 0.210 -0.0260 0.1727 -0.404 NA
##
## $se_z
## [1] 1.96
##
## $se
## White Black Hispanic Asian Native
## White NA 0.0471 0.0732 0.173 0.145
## Black 0.0471 NA 0.0838 0.178 0.151
## Hispanic 0.0732 0.0838 NA 0.189 0.161
## Asian 0.1725 0.1778 0.1887 NA 0.234
## Native 0.1453 0.1512 0.1611 0.234 NA
##
## $p
## White Black Hispanic Asian Native
## White NA 1.45e-07 0.351118 4.96e-06 0.606430
## Black 1.45e-07 NA 0.032546 2.37e-03 0.033020
## Hispanic 3.51e-01 3.25e-02 NA 1.37e-04 0.374467
## Asian 4.96e-06 2.37e-03 0.000137 NA 0.000231
## Native 6.06e-01 3.30e-02 0.374467 2.31e-04 NA
##
## $pairwise_n
## White Black Hispanic Asian Native
## White NA 4134 3813 3650 3664
## Black 4134 NA 715 552 566
## Hispanic 3813 715 NA 231 245
## Asian 3650 552 231 NA 82
## Native 3664 566 245 82 NA
#model
list(
ols(testis_avg_z ~ race, data = d),
ols(testis_avg_z ~ race + age + height_z + BMI_z, data = d)
) %>% summarize_models(asterisks_only = F)
Testosterone
#raw units
describeBy(d$testosterone, d$race, mat = T)
GG_group_means(d, "testosterone_z", "race", type = "boxplot")

SMD_matrix(d$testosterone_z, group = d$race, extended_output = T)
## $d
## White Black Hispanic Asian Native
## White NA -0.1086 -0.0203 -0.221 -0.00896
## Black -0.10860 NA 0.0883 -0.112 0.09965
## Hispanic -0.02027 0.0883 NA -0.200 0.01132
## Asian -0.22064 -0.1120 -0.2004 NA 0.21168
## Native -0.00896 0.0996 0.0113 0.212 NA
##
## $d_string
## White Black Hispanic
## White NA "-0.11 [-0.20 -0.017]" "-0.02 [-0.16 0.12]"
## Black "-0.11 [-0.20 -0.017]" NA "0.088 [-0.075 0.25]"
## Hispanic "-0.02 [-0.16 0.12]" "0.088 [-0.075 0.25]" NA
## Asian "-0.22 [-0.56 0.12]" "-0.11 [-0.46 0.23]" "-0.20 [-0.56 0.16]"
## Native "-0.009 [-0.29 0.27]" "0.10 [-0.19 0.39]" "0.011 [-0.30 0.32]"
## Asian Native
## White "-0.22 [-0.56 0.12]" "-0.009 [-0.29 0.27]"
## Black "-0.11 [-0.46 0.23]" "0.10 [-0.19 0.39]"
## Hispanic "-0.20 [-0.56 0.16]" "0.011 [-0.30 0.32]"
## Asian NA "0.21 [-0.23 0.65]"
## Native "0.21 [-0.23 0.65]" NA
##
## $CI_lower
## White Black Hispanic Asian Native
## White NA -0.2001 -0.1626 -0.558 -0.291
## Black -0.200 NA -0.0746 -0.459 -0.193
## Hispanic -0.163 -0.0746 NA -0.564 -0.301
## Asian -0.558 -0.4589 -0.5644 NA -0.227
## Native -0.291 -0.1932 -0.3011 -0.227 NA
##
## $CI_upper
## White Black Hispanic Asian Native
## White NA -0.0171 0.122 0.117 0.273
## Black -0.0171 NA 0.251 0.235 0.392
## Hispanic 0.1221 0.2513 NA 0.164 0.324
## Asian 0.1171 0.2349 0.164 NA 0.650
## Native 0.2729 0.3925 0.324 0.650 NA
##
## $se_z
## [1] 1.96
##
## $se
## White Black Hispanic Asian Native
## White NA 0.0467 0.0726 0.172 0.144
## Black 0.0467 NA 0.0831 0.177 0.149
## Hispanic 0.0726 0.0831 NA 0.186 0.159
## Asian 0.1723 0.1770 0.1857 NA 0.224
## Native 0.1438 0.1494 0.1594 0.224 NA
##
## $p
## White Black Hispanic Asian Native
## White NA 0.020 0.780 0.200 0.950
## Black 0.02 NA 0.288 0.527 0.505
## Hispanic 0.78 0.288 NA 0.281 0.943
## Asian 0.20 0.527 0.281 NA 0.344
## Native 0.95 0.505 0.943 0.344 NA
##
## $pairwise_n
## White Black Hispanic Asian Native
## White NA 4179 3854 3688 3703
## Black 4179 NA 725 559 574
## Hispanic 3854 725 NA 234 249
## Asian 3688 559 234 NA 83
## Native 3703 574 249 83 NA
#model T
list(
ols(testosterone_z ~ race, data = d),
ols(testosterone_z ~ race + age + height_z + BMI_z, data = d)
) %>% summarize_models(asterisks_only = F)
#raw units
list(
ols(testosterone ~ race, data = d),
ols(testosterone ~ race + age + height_z + BMI_z, data = d)
) %>% summarize_models(asterisks_only = F)
#in %
26.38 / 675.9603
## [1] 0.039
19.82 / 675.9603
## [1] 0.0293