Init

options(
  digits = 3
)

library(kirkegaard)

## Loading required package: tidyverse

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.3     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

## Loading required package: weights

## Loading required package: Hmisc

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:dplyr':
## 
##     src, summarize

## The following objects are masked from 'package:base':
## 
##     format.pval, units

## Loading required package: assertthat

## 
## Attaching package: 'assertthat'

## The following object is masked from 'package:tibble':
## 
##     has_name

## Loading required package: magrittr

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:purrr':
## 
##     set_names

## The following object is masked from 'package:tidyr':
## 
##     extract

## Loading required package: psych

## 
## Attaching package: 'psych'

## The following object is masked from 'package:Hmisc':
## 
##     describe

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

## Loading required package: metafor

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## 
## Loading the 'metafor' package (version 3.0-2). For an
## introduction to the package please type: help(metafor)

## Loading required package: rlang

## 
## Attaching package: 'rlang'

## The following object is masked from 'package:magrittr':
## 
##     set_names

## The following object is masked from 'package:assertthat':
## 
##     has_name

## The following objects are masked from 'package:purrr':
## 
##     %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
##     flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
##     splice

## 
## Attaching package: 'kirkegaard'

## The following object is masked from 'package:rlang':
## 
##     is_logical

## The following object is masked from 'package:psych':
## 
##     rescale

## The following object is masked from 'package:assertthat':
## 
##     are_equal

## The following objects are masked from 'package:purrr':
## 
##     is_logical, is_numeric

## The following object is masked from 'package:base':
## 
##     +

load_packages(
  rms,
  readxl,
  haven
)

## Loading required package: SparseM

## 
## Attaching package: 'SparseM'

## The following object is masked from 'package:base':
## 
##     backsolve

## 
## Attaching package: 'rms'

## The following object is masked from 'package:metafor':
## 
##     vif

theme_set(theme_bw())

Data

d = read_rds("data/VES merged.rds")
variables = read_csv("data/VES_dataset_variables.csv")

## Rows: 3341 Columns: 3

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): code, name
## dbl (1): number

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

#semen data
semen = read_sas("data/VEJOBIQ6_MMPI_Hormon.sas7bdat")

#same order
assert_that(all(semen$AOP_ID == d$AOP_ID))

## [1] TRUE

Recoding

d %<>% mutate(
  pubic_hair_normal = (GP01N01 == 1),
  penis_abnormal = (GP01N02 == 2),
  prostate_enlarged = (GP01N11 == 2),
  
  testis_right = GP01N05,
  testis_left = GP01N06,
  testis_avg = testis_right + testis_left,
  testis_avg_z = (standardize(testis_right) + standardize(testis_left)) %>% standardize(focal_group = race == "White"),
  
  #day of examination
  GP_date_day = GP01002B,
  GP_date_month = GP01002A,
  GP_date_year = GP01002C %>% mapvalues(from = 5, to = 85),
  GP_date_year = 1900 + GP_date_year,
  GP_date = str_glue("{GP_date_year}-{GP_date_month}-{GP_date_day}") %>% as.Date(),
  
  #recode cm into cm³
  testis_right_orig = GP01N05,
  testis_left_orig = GP01N06,
  
  testosterone = LD6861,
  testosterone_z = testosterone %>% standardize(focal_group = race == "White"),
  
  homosexual = (MM010069 == 1),
  nonheterosexual = (MM010430 == 0),
  
  height_z = height %>% standardize(focal_group = race == "White"),
  weight_z = weight %>% standardize(focal_group = race == "White"),
  BMI_z = BMI %>% standardize(focal_group = race == "White"),
  
  #semen variables
  pct_motile = semen$v__MOTILITY,
  pct_progressive = semen$v__PROGRESSIVE,
  pct_small = semen$MS02006,
  pct_normal = semen$MS01006,
  pct_large = semen$MS03006,
  pct_tapered = semen$MS04006,
  pct_amorph = semen$MS05006,
  semen_concentration = semen$CONCENTRATION0,
  semen_volume = semen$VOL,
  semen_count = semen$COUNT,
)

#fix wrong dates
d$GP_date %<>% mapvalues(c("1985-01-22", "1985-02-19", "1985-04-19") %>% as.Date(),
                         c("1986-01-22", "1986-02-19", "1986-04-19") %>% as.Date())

#look at dates
ggplot(d, aes(GP_date)) +
  geom_histogram() +
  scale_x_date(date_breaks = "1 month", guide = guide_axis(n.dodge = 2), labels = {function(x) {str_replace(as.character(x), "\\-01$", "")}})

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Testis conversion

#before 1st Jan 1986
d$testis_in_cm = d$GP_date < as.Date("1986-01-01")
d$testis_in_cm %>% table2()

#browse distribution
d %>% group_by(testis_in_cm) %>% do(table2(.$testis_left_orig)) %>% print(n=999)

## # A tibble: 30 × 4
## # Groups:   testis_in_cm [2]
##    testis_in_cm Group Count Percent
##    <lgl>        <chr> <dbl>   <dbl>
##  1 FALSE        15      587 25.6   
##  2 FALSE        20      513 22.4   
##  3 FALSE        12      414 18.1   
##  4 FALSE        25      382 16.7   
##  5 FALSE        10      202  8.82  
##  6 FALSE        8        86  3.76  
##  7 FALSE        6        42  1.83  
##  8 FALSE        4        15  0.655 
##  9 FALSE        5        10  0.437 
## 10 FALSE        2         9  0.393 
## 11 FALSE        3         9  0.393 
## 12 FALSE        <NA>      8  0.349 
## 13 FALSE        1         5  0.218 
## 14 FALSE        7         2  0.0874
## 15 FALSE        18        2  0.0874
## 16 FALSE        9         1  0.0437
## 17 FALSE        11        1  0.0437
## 18 FALSE        23        1  0.0437
## 19 TRUE         4      1234 56.8   
## 20 TRUE         5       538 24.8   
## 21 TRUE         3       222 10.2   
## 22 TRUE         6       108  4.97  
## 23 TRUE         7        24  1.10  
## 24 TRUE         2        20  0.920 
## 25 TRUE         <NA>     14  0.644 
## 26 TRUE         8         6  0.276 
## 27 TRUE         1         4  0.184 
## 28 TRUE         9         1  0.0460
## 29 TRUE         10        1  0.0460
## 30 TRUE         20        1  0.0460

#plot
ggplot(d, aes(testis_left_orig)) +
  geom_bar() +
  facet_wrap(facets = "testis_in_cm", scales = c("free_x"), nrow = 2) +
  scale_x_continuous(breaks = seq(1, 25))

## Warning: Removed 22 rows containing non-finite values (stat_count).

#test date of men with left testis == 20
d %>% filter(testis_left_orig == 20) %>% pull(GP_date) %>% table2(sort_descending = NULL)

d %>% filter(testis_left_orig == 15) %>% pull(GP_date) %>% table2(sort_descending = NULL)

d %>% filter(testis_left_orig == 10) %>% pull(GP_date) %>% table2(sort_descending = NULL)

#function to fix errors
fix_testis = function(value, testis_in_cm) {
  #impossible values
  #recode to nearest possible value
  #or just discard
  value[value == 23] = NA
  value[value == 11] = NA
  value[value == 18] = NA
  value[value == 17] = NA
  
  #these are unclear
  value[value == 9] = NA
  
  #wrongly coded values
  #these are volumes
  testis_in_cm[value == 25] = F
  testis_in_cm[value == 20] = F
  testis_in_cm[value == 15] = F
  testis_in_cm[value == 12] = F
  testis_in_cm[value == 10] = F
  
  #these are sizs
  testis_in_cm[value == 1] = T
  testis_in_cm[value == 7] = T
  
  tibble(
    value,
    testis_in_cm
  )
}

#apply
d[c("testis_left", "testis_in_cm_left")] = fix_testis(d$testis_left_orig, d$testis_in_cm)

d[c("testis_right", "testis_in_cm_right")] = fix_testis(d$testis_right_orig, d$testis_in_cm)

#browse distribution again
d %>% group_by(testis_in_cm_left) %>% do(table2(.$testis_left)) %>% print(n=999)

## # A tibble: 21 × 4
## # Groups:   testis_in_cm_left [2]
##    testis_in_cm_left Group Count Percent
##    <lgl>             <chr> <dbl>   <dbl>
##  1 FALSE             15      587  25.7  
##  2 FALSE             20      514  22.5  
##  3 FALSE             12      414  18.1  
##  4 FALSE             25      382  16.7  
##  5 FALSE             10      203   8.89 
##  6 FALSE             8        86   3.77 
##  7 FALSE             6        42   1.84 
##  8 FALSE             4        15   0.657
##  9 FALSE             <NA>     13   0.569
## 10 FALSE             5        10   0.438
## 11 FALSE             2         9   0.394
## 12 FALSE             3         9   0.394
## 13 TRUE              4      1234  56.7  
## 14 TRUE              5       538  24.7  
## 15 TRUE              3       222  10.2  
## 16 TRUE              6       108   4.96 
## 17 TRUE              7        26   1.19 
## 18 TRUE              2        20   0.918
## 19 TRUE              <NA>     15   0.689
## 20 TRUE              1         9   0.413
## 21 TRUE              8         6   0.275

#plot
ggplot(d, aes(testis_left)) +
  geom_bar() +
  facet_wrap(facets = "testis_in_cm_left", scales = c("free_x"), nrow = 2) +
  scale_x_continuous(breaks = seq(1, 25))

## Warning: Removed 28 rows containing non-finite values (stat_count).

#right testicle
d %>% group_by(testis_in_cm_right) %>% do(table2(.$testis_right)) %>% print(n=999)

## # A tibble: 21 × 4
## # Groups:   testis_in_cm_right [2]
##    testis_in_cm_right Group Count Percent
##    <lgl>              <chr> <dbl>   <dbl>
##  1 FALSE              20      639  27.9  
##  2 FALSE              25      585  25.6  
##  3 FALSE              15      539  23.5  
##  4 FALSE              12      261  11.4  
##  5 FALSE              10      128   5.59 
##  6 FALSE              8        60   2.62 
##  7 FALSE              6        24   1.05 
##  8 FALSE              <NA>     16   0.699
##  9 FALSE              3        12   0.524
## 10 FALSE              4        12   0.524
## 11 FALSE              2        10   0.437
## 12 FALSE              5         3   0.131
## 13 TRUE               4      1142  52.6  
## 14 TRUE               5       625  28.8  
## 15 TRUE               3       216   9.94 
## 16 TRUE               6       118   5.43 
## 17 TRUE               7        29   1.33 
## 18 TRUE               2        21   0.966
## 19 TRUE               <NA>     11   0.506
## 20 TRUE               8         6   0.276
## 21 TRUE               1         5   0.230

ggplot(d, aes(testis_right)) +
  geom_bar() +
  facet_wrap(facets = "testis_in_cm_right", scales = c("free_x"), nrow = 2) +
  scale_x_continuous(breaks = seq(1, 25))

## Warning: Removed 27 rows containing non-finite values (stat_count).

#recode both to z scores within method and combine
d$testis_left[d$testis_in_cm_left] = d$testis_left[d$testis_in_cm_left] %>% standardize()
d$testis_left[!d$testis_in_cm_left] = d$testis_left[!d$testis_in_cm_left] %>% standardize()

d$testis_right[d$testis_in_cm_right] = d$testis_right[d$testis_in_cm_right] %>% standardize()
d$testis_right[!d$testis_in_cm_right] = d$testis_right[!d$testis_in_cm_right] %>% standardize()

#did everything work?!
ggplot(d, aes(testis_right, fill = testis_in_cm_right)) +
  geom_histogram(position = "dodge")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 27 rows containing non-finite values (stat_bin).

#compare ball sizes across sides before and after conversions
d %>% filter(testis_in_cm) %>% GG_scatter("testis_left_orig", "testis_right_orig")

## `geom_smooth()` using formula 'y ~ x'

d %>% filter(!testis_in_cm) %>% GG_scatter("testis_left_orig", "testis_right_orig")

## `geom_smooth()` using formula 'y ~ x'

#after joining all data to same scale
GG_scatter(d, "testis_left", "testis_right")

## `geom_smooth()` using formula 'y ~ x'

GG_save("figs/testis_left_right.png")

## `geom_smooth()` using formula 'y ~ x'

GG_scatter(d, "testis_left", "testis_right", color = "testis_in_cm")

## `geom_smooth()` using formula 'y ~ x'

#average them
d$testis_avg_z = rowMeans(d %>% select(testis_right, testis_left)) %>% standardize()

Results

Simple

#race counts
d$race %>% table2()

#descriptives
d %>% 
  select(
    #not so interesting because these are Z scores
    testis_right, testis_left, testis_avg_z, 
    testosterone,
    # pubic_hair_normal, penis_abnormal,
    age,
    HEIGHTCM, WEIGHTKG, BMI
  ) %>% 
  describe2()

#by race
d %>% 
  select(
    testis_avg_z,
    testosterone
  ) %>% 
  describeBy(group = d$race, mat = T) %>% 
  select(-item, -vars, -range, -skew, -kurtosis, -trimmed)

#correlations
d %>% 
  select(
    testis_right, testis_left, testis_avg_z,
    testosterone,
    age,
    HEIGHTCM, WEIGHTKG, BMI
  ) %>% 
  cor_matrix(p_val = T)

##              testis_right testis_left testis_avg_z testosterone age       
## testis_right "1"          "0.80***"   "0.95***"    "0.02"       "0.00"    
## testis_left  "0.80***"    "1"         "0.95***"    "0.03"       "0.00"    
## testis_avg_z "0.95***"    "0.95***"   "1"          "0.03"       "0.00"    
## testosterone "0.02"       "0.03"      "0.03"       "1"          "-0.19***"
## age          "0.00"       "0.00"      "0.00"       "-0.19***"   "1"       
## HEIGHTCM     "0.10***"    "0.09***"   "0.10***"    "-0.02"      "0.01"    
## WEIGHTKG     "0.09***"    "0.09***"   "0.09***"    "-0.32***"   "0.06***" 
## BMI          "0.05***"    "0.05***"   "0.06***"    "-0.34***"   "0.06***" 
##              HEIGHTCM  WEIGHTKG   BMI       
## testis_right "0.10***" "0.09***"  "0.05***" 
## testis_left  "0.09***" "0.09***"  "0.05***" 
## testis_avg_z "0.10***" "0.09***"  "0.06***" 
## testosterone "-0.02"   "-0.32***" "-0.34***"
## age          "0.01"    "0.06***"  "0.06***" 
## HEIGHTCM     "1"       "0.43***"  "0.04*"   
## WEIGHTKG     "0.43***" "1"        "0.81***" 
## BMI          "0.04*"   "0.81***"  "1"

Testis size

GG_group_means(d, "testis_avg_z", "race", type = "boxplot")

## Missing values were removed.

SMD_matrix(d$testis_avg_z, group = d$race, extended_output = T)

## $d
##            White  Black Hispanic  Asian  Native
## White         NA  0.247   0.0682  0.788 -0.0749
## Black     0.2474     NA  -0.1792  0.540 -0.3223
## Hispanic  0.0682 -0.179       NA  0.720 -0.1431
## Asian     0.7879  0.540   0.7197     NA -0.8628
## Native   -0.0749 -0.322  -0.1431 -0.863      NA
## 
## $d_string
##          White                 Black                  Hispanic              
## White    NA                    "0.25 [0.16 0.34]"     "0.068 [-0.075 0.21]" 
## Black    "0.25 [0.16 0.34]"    NA                     "-0.18 [-0.34 -0.015]"
## Hispanic "0.068 [-0.075 0.21]" "-0.18 [-0.34 -0.015]" NA                    
## Asian    "0.79 [0.45 1.13]"    "0.54 [0.19 0.89]"     "0.72 [0.35 1.09]"    
## Native   "-0.075 [-0.36 0.21]" "-0.32 [-0.62 -0.026]" "-0.14 [-0.46 0.17]"  
##          Asian                 Native                
## White    "0.79 [0.45 1.13]"    "-0.075 [-0.36 0.21]" 
## Black    "0.54 [0.19 0.89]"    "-0.32 [-0.62 -0.026]"
## Hispanic "0.72 [0.35 1.09]"    "-0.14 [-0.46 0.17]"  
## Asian    NA                    "-0.86 [-1.32 -0.40]" 
## Native   "-0.86 [-1.32 -0.40]" NA                    
## 
## $CI_lower
##            White  Black Hispanic  Asian Native
## White         NA  0.155  -0.0752  0.450 -0.360
## Black     0.1552     NA  -0.3435  0.192 -0.619
## Hispanic -0.0752 -0.344       NA  0.350 -0.459
## Asian     0.4497  0.192   0.3498     NA -1.322
## Native   -0.3596 -0.619  -0.4588 -1.322     NA
## 
## $CI_upper
##          White   Black Hispanic  Asian Native
## White       NA  0.3397   0.2116  1.126  0.210
## Black    0.340      NA  -0.0149  0.889 -0.026
## Hispanic 0.212 -0.0149       NA  1.090  0.173
## Asian    1.126  0.8889   1.0896     NA -0.404
## Native   0.210 -0.0260   0.1727 -0.404     NA
## 
## $se_z
## [1] 1.96
## 
## $se
##           White  Black Hispanic Asian Native
## White        NA 0.0471   0.0732 0.173  0.145
## Black    0.0471     NA   0.0838 0.178  0.151
## Hispanic 0.0732 0.0838       NA 0.189  0.161
## Asian    0.1725 0.1778   0.1887    NA  0.234
## Native   0.1453 0.1512   0.1611 0.234     NA
## 
## $p
##             White    Black Hispanic    Asian   Native
## White          NA 1.45e-07 0.351118 4.96e-06 0.606430
## Black    1.45e-07       NA 0.032546 2.37e-03 0.033020
## Hispanic 3.51e-01 3.25e-02       NA 1.37e-04 0.374467
## Asian    4.96e-06 2.37e-03 0.000137       NA 0.000231
## Native   6.06e-01 3.30e-02 0.374467 2.31e-04       NA
## 
## $pairwise_n
##          White Black Hispanic Asian Native
## White       NA  4134     3813  3650   3664
## Black     4134    NA      715   552    566
## Hispanic  3813   715       NA   231    245
## Asian     3650   552      231    NA     82
## Native    3664   566      245    82     NA

#model
list(
  ols(testis_avg_z ~ race, data = d),
  ols(testis_avg_z ~ race + age + height_z + BMI_z, data = d)
) %>% summarize_models(asterisks_only = F)

Testosterone

#raw units
describeBy(d$testosterone, d$race, mat = T)

GG_group_means(d, "testosterone_z", "race", type = "boxplot")

SMD_matrix(d$testosterone_z, group = d$race, extended_output = T)

## $d
##             White   Black Hispanic  Asian   Native
## White          NA -0.1086  -0.0203 -0.221 -0.00896
## Black    -0.10860      NA   0.0883 -0.112  0.09965
## Hispanic -0.02027  0.0883       NA -0.200  0.01132
## Asian    -0.22064 -0.1120  -0.2004     NA  0.21168
## Native   -0.00896  0.0996   0.0113  0.212       NA
## 
## $d_string
##          White                  Black                  Hispanic             
## White    NA                     "-0.11 [-0.20 -0.017]" "-0.02 [-0.16 0.12]" 
## Black    "-0.11 [-0.20 -0.017]" NA                     "0.088 [-0.075 0.25]"
## Hispanic "-0.02 [-0.16 0.12]"   "0.088 [-0.075 0.25]"  NA                   
## Asian    "-0.22 [-0.56 0.12]"   "-0.11 [-0.46 0.23]"   "-0.20 [-0.56 0.16]" 
## Native   "-0.009 [-0.29 0.27]"  "0.10 [-0.19 0.39]"    "0.011 [-0.30 0.32]" 
##          Asian                Native               
## White    "-0.22 [-0.56 0.12]" "-0.009 [-0.29 0.27]"
## Black    "-0.11 [-0.46 0.23]" "0.10 [-0.19 0.39]"  
## Hispanic "-0.20 [-0.56 0.16]" "0.011 [-0.30 0.32]" 
## Asian    NA                   "0.21 [-0.23 0.65]"  
## Native   "0.21 [-0.23 0.65]"  NA                   
## 
## $CI_lower
##           White   Black Hispanic  Asian Native
## White        NA -0.2001  -0.1626 -0.558 -0.291
## Black    -0.200      NA  -0.0746 -0.459 -0.193
## Hispanic -0.163 -0.0746       NA -0.564 -0.301
## Asian    -0.558 -0.4589  -0.5644     NA -0.227
## Native   -0.291 -0.1932  -0.3011 -0.227     NA
## 
## $CI_upper
##            White   Black Hispanic Asian Native
## White         NA -0.0171    0.122 0.117  0.273
## Black    -0.0171      NA    0.251 0.235  0.392
## Hispanic  0.1221  0.2513       NA 0.164  0.324
## Asian     0.1171  0.2349    0.164    NA  0.650
## Native    0.2729  0.3925    0.324 0.650     NA
## 
## $se_z
## [1] 1.96
## 
## $se
##           White  Black Hispanic Asian Native
## White        NA 0.0467   0.0726 0.172  0.144
## Black    0.0467     NA   0.0831 0.177  0.149
## Hispanic 0.0726 0.0831       NA 0.186  0.159
## Asian    0.1723 0.1770   0.1857    NA  0.224
## Native   0.1438 0.1494   0.1594 0.224     NA
## 
## $p
##          White Black Hispanic Asian Native
## White       NA 0.020    0.780 0.200  0.950
## Black     0.02    NA    0.288 0.527  0.505
## Hispanic  0.78 0.288       NA 0.281  0.943
## Asian     0.20 0.527    0.281    NA  0.344
## Native    0.95 0.505    0.943 0.344     NA
## 
## $pairwise_n
##          White Black Hispanic Asian Native
## White       NA  4179     3854  3688   3703
## Black     4179    NA      725   559    574
## Hispanic  3854   725       NA   234    249
## Asian     3688   559      234    NA     83
## Native    3703   574      249    83     NA

#model T
list(
  ols(testosterone_z ~ race, data = d),
  ols(testosterone_z ~ race + age + height_z + BMI_z, data = d)
) %>% summarize_models(asterisks_only = F)

#raw units
list(
  ols(testosterone ~ race, data = d),
  ols(testosterone ~ race + age + height_z + BMI_z, data = d)
) %>% summarize_models(asterisks_only = F)

#in %
26.38 / 675.9603

## [1] 0.039

19.82 / 675.9603

## [1] 0.0293

Black-white difference in testicle size and testosterone (VES)

20 September, 2021

Init

Data

Recoding

Testis conversion

Results

Simple

Testis size

Testosterone

Meta