ELSA dissertation

w1_core <- read_dta("raw data/RAW_data_stata/wave_1_core_data_v3.dta")
w1_derived <- read_dta("raw data/RAW_data_stata/wave_1_ifs_derived_variables.dta")

w1 <- left_join(w1_core, w1_derived, by = "idauniq")

#load waves 2 to 9
w2_core  <- read_dta("raw data/RAW_data_stata/wave_2_core_data_v4.dta")
w3_core  <- read_dta("raw data/RAW_data_stata/wave_3_elsa_data_v4.dta")
w4_core  <- read_dta("raw data/RAW_data_stata/wave_4_elsa_data_v3.dta")
w5_core  <- read_dta("raw data/RAW_data_stata/wave_5_elsa_data_v4.dta")
w6_core  <- read_dta("raw data/RAW_data_stata/wave_6_elsa_data_v2.dta")
w7_core  <- read_dta("raw data/RAW_data_stata/wave_7_elsa_data.dta")
w8_core  <- read_dta("raw data/RAW_data_stata/wave_8_elsa_data_eul_v2.dta")
w9_core  <- read_dta("raw data/RAW_data_stata/wave_9_elsa_data_eul_v2.dta")

Table 1 baseline characteristics NO PARTICIPANTS EXCLUDED

# Demographic / social variables
names(w1)[grepl("age|ager|dob", names(w1), ignore.case = TRUE)]

##  [1] "dhdobyr"   "dhager"    "didob"     "heage"     "wprage"    "indobyr.x"
##  [7] "indager"   "aagemab"   "aagepab"   "aageangi"  "aagehart"  "aagestro" 
## [13] "aagedi"    "age"       "age_p"     "indobyr.y" "indobyr_p" "agebuhead"
## [19] "agebusp"   "agehoh"    "agehhch1"  "agehhch2"  "agehhch3"  "agehhch4" 
## [25] "agehhch5"  "agehhch6"  "agehhch7"  "agehhch8"  "agebuch1"  "agebuch2" 
## [31] "agebuch3"  "agebuch4"  "agebuch5"  "agebuch6"  "agebuch7"  "agebuch8" 
## [37] "chage1"    "chage2"    "chage3"    "chage4"    "chage5"    "chage6"   
## [43] "chage7"    "chage8"    "chage9"    "chage10"   "chage11"   "chage12"  
## [49] "chage13"   "chage14"   "chage15"   "chage16"   "ageg5"     "ageg5_bu" 
## [55] "ageg7"     "ageg7_bu"  "ageg10"    "ageg10_bu" "ageg3"     "ageg3_bu" 
## [61] "ageg3_spa" "spage"     "spage_bu"  "agehhldr1" "agehhldr2" "agehhldr3"
## [67] "agehhldr4" "mothage"   "magedied"  "fathage"   "fagedied"

names(w1)[grepl("sex", names(w1), ignore.case = TRUE)]

##  [1] "dhsex"     "disex"     "indsex"    "asex"      "sex"       "sex_p"    
##  [7] "sexbuhead" "sexhoh"    "chsex1"    "chsex2"    "chsex3"    "chsex4"   
## [13] "chsex5"    "chsex6"    "chsex7"    "chsex8"    "chsex9"    "chsex10"  
## [19] "chsex11"   "chsex12"   "chsex13"   "chsex14"   "chsex15"   "chsex16"

names(w1)[grepl("educ|qual|school", names(w1), ignore.case = TRUE)]

##  [1] "fqqual1"  "fqqual2"  "fqqual3"  "edqual.x" "aqual"    "aeducend"
##  [7] "edqual.y" "qual2"    "qual3"    "qual2_p"  "qual3_p"

names(w1)[grepl("ethnic|ethn|race", names(w1), ignore.case = TRUE)]

## [1] "fqethnr"  "aethnicr"

names(w1)[grepl("mar|partner|spouse|widow|single", names(w1), ignore.case = TRUE)]

## [1] "dimar"   "wpamar"  "partner" "marstat"

names(w1)[grepl("employ|work|job|retir", names(w1), ignore.case = TRUE)]

##  [1] "difjob"         "wpjob"          "wpjobl"         "wpsjoby"       
##  [5] "wpsjobm"        "wpcjob"         "wphjob"         "iawork"        
##  [9] "hojob"          "aeverjob"       "aemploye"       "astwork"       
## [13] "hhgriddhwork"   "hhgriddhwork_p" "worktime"       "everwork"      
## [17] "exwork"         "exworkb"        "exwork55"       "exwork55b"     
## [21] "exwork60"       "exwork60b"      "exwork65"       "exwork65b"     
## [25] "difjobm"

# Lifestyle variables
names(w1)[grepl("smok|cig", names(w1), ignore.case = TRUE)]

## [1] "hecig"      "smoker"     "smokerstat"

names(w1)[grepl("alcohol|drink", names(w1), ignore.case = TRUE)]

## character(0)

# BMI / body size
names(w1)[grepl("bmi|height|weight", names(w1), ignore.case = TRUE)]

## character(0)

# Mood / depression
names(w1)[grepl("depress|cesd|mood", names(w1), ignore.case = TRUE)]

## [1] "cesd_sc" "cesd_na"

# Disease-history blocks
names(w1)[grepl("^hedia", names(w1), ignore.case = TRUE)]

##  [1] "hedia01" "hedia02" "hedia03" "hedia04" "hedia05" "hedia06" "hedia07"
##  [8] "hedia08" "hedia09" "hedia10"

names(w1)[grepl("^hedib", names(w1), ignore.case = TRUE)]

##  [1] "hedib01" "hedib02" "hedib03" "hedib04" "hedib05" "hedib06" "hedib07"
##  [8] "hedib08" "hedib09" "hedib10"

# age
attr(w1$dhager, "label")

## [1] "Age variable from HH grid collapsed at 90 plus"

table(w1$dhager, useNA = "ifany")[1:10]

## 
## 20 30 31 32 33 34 35 36 37 38 
##  1  2  1  1  6  3  4  6  8 15

# sex
attr(w1$dhsex, "label")

## [1] "ASK OR CODE RESPONDENT~S SEX"

table(w1$dhsex, useNA = "ifany")

## 
##    1    2 
## 5335 6764

# education
attr(w1$edqual.x, "label")

## [1] "(D) Highest Educational Qualification at ELSA W1"

table(w1$edqual.x, useNA = "ifany")

## 
##   -9   -8   -1    1    2    3    4    5    6    7 
##    6   11   18 1388 1333  764 1974  582 1015 5008

# ethnicity
attr(w1$fqethnr, "label")

## [1] "ELSA ethnic group collapsed into White and Non-white to avoid disclosure"

table(w1$fqethnr, useNA = "ifany")

## 
##   -9   -8   -1    1    2 
##   12    2 6810 5111  164

# marital status
attr(w1$marstat, "label")

## [1] "marital status - couple1 combined with dimar"

table(w1$marstat, useNA = "ifany")

## 
##    1    2    3    4    5    6 
## 8035  504  575 1951  823  211

# smoking
attr(w1$smokerstat, "label")

## [1] "smoker status (past or present)"

table(w1$smokerstat, useNA = "ifany")

## 
##   -9   -8   -2    0    1    2    3    4 
##    5   11  175 4286  674 4342  445 2161

# depression
attr(w1$cesd_sc, "label")

## [1] "number of cesd questions answered yes"

summary(w1$cesd_sc)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -2.000   0.000   1.000   1.504   2.000   8.000

attr(w1$cesd_na, "label")

## [1] "number of cesd questions answered"

table(w1$cesd_na, useNA = "ifany")

## 
##    -2    -1     0     3     5     6     7     8 
##   136   175    67     2     5    15    87 11612

# alcohol - broader search
names(w1)[grepl("alco|drink|beer|wine|spirit|units", names(w1), ignore.case = TRUE)]

## character(0)

# BMI / body size - broader search
names(w1)[grepl("bmi|body|mass|height|weight|wt|ht", names(w1), ignore.case = TRUE)]

##  [1] "wpwtx"       "wpwtx2"      "wpwtx3"      "iashty1"     "iashty2"    
##  [6] "iashty3"     "iashty4"     "hhtot"       "refreshtype" "chtype1"    
## [11] "chtype2"     "chtype3"     "chtype4"     "chtype5"     "chtype6"    
## [16] "chtype7"     "chtype8"     "chtype9"     "chtype10"    "chtype11"   
## [21] "chtype12"    "chtype13"    "chtype14"    "chtype15"    "chtype16"   
## [26] "nright"

# employment - inspect strongest candidates
attr(w1$worktime, "label")

## [1] "Working full time or part time"

table(w1$worktime, useNA = "ifany")

## 
##   -8   -1    1    2 
##  118 7476 2741 1764

attr(w1$aemploye, "label")

## [1] "HSE Feed Forward: Are you …{an employee or self-employed}"

table(w1$aemploye, useNA = "ifany")

## 
##    -1     1     2 
##   563 10101  1435

attr(w1$everwork, "label")

## [1] "ever worked"

table(w1$everwork, useNA = "ifany")

## 
##    -9    -8    -2     0     1 
##     3     1    27   217 11851

# smoking labels
attributes(w1$smokerstat)

## $label
## [1] "smoker status (past or present)"
## 
## $format.stata
## [1] "%8.0g"
## 
## $labels
##                refused             don't know              not asked 
##                     -9                     -8                     -2 
##           never smoked ex smoker - occasional    ex smoker - regular 
##                      0                      1                      2 
##    ex smoker - DK freq         current smoker 
##                      3                      4 
## 
## $class
## [1] "haven_labelled" "vctrs_vctr"     "double"

# education labels
attributes(w1$edqual.x)

## $label
## [1] "(D) Highest Educational Qualification at ELSA W1"
## 
## $format.stata
## [1] "%8.0g"
## 
## $labels
##                    Refusal                 Don't know 
##                         -9                         -8 
##             Not applicable  NVQ4/NVQ5/Degree or equiv 
##                         -1                          1 
##     Higher ed below degree     NVQ3/GCE A Level equiv 
##                          2                          3 
##     NVQ2/GCE O Level equiv NVQ1/CSE other grade equiv 
##                          4                          5 
##              Foreign/other           No qualification 
##                          6                          7 
## 
## $class
## [1] "haven_labelled" "vctrs_vctr"     "double"

# ethnicity labels
attributes(w1$fqethnr)

## $label
## [1] "ELSA ethnic group collapsed into White and Non-white to avoid disclosure"
## 
## $format.stata
## [1] "%8.0g"
## 
## $labels
##        Refusal     Don't know Not applicable          White      Non-white 
##             -9             -8             -1              1              2 
## 
## $class
## [1] "haven_labelled" "vctrs_vctr"     "double"

# marital status labels
attributes(w1$marstat)

## $label
## [1] "marital status - couple1 combined with dimar"
## 
## $format.stata
## [1] "%8.0g"
## 
## $labels
## married (inc civ pship 06 onwards)                         cohabiting 
##                                  1                                  2 
##              single, never married                            widowed 
##                                  3                                  4 
##                           divorced                          separated 
##                                  5                                  6 
## 
## $class
## [1] "haven_labelled" "vctrs_vctr"     "double"

# alcohol - wider search
names(w1)[grepl("drnk|drink|alc|wine|beer|spirit|unit|pub", names(w1), ignore.case = TRUE)]

## [1] "healc"

# possible nurse / anthropometry style names for BMI
names(w1)[grepl("bm", names(w1), ignore.case = TRUE)]

##  [1] "wpsjobm" "iabm11"  "iabm12"  "iabm13"  "iabm14"  "iabm15"  "iabm16" 
##  [8] "iabm17"  "iabm18"  "iabm19"  "iabm20"  "iabm31"  "iabm32"  "iabm33" 
## [15] "iabm34"  "iabm35"  "iabm36"  "iabm37"  "iabm38"  "iabm39"  "iabm40" 
## [22] "iabm48"  "iabm49"  "iabm50"  "iabm51"  "iabm52"  "iabm53"  "iabm54" 
## [29] "iabm62"  "iabm63"  "iabm64"  "iabm65"  "iabm66"  "iabm67"  "iabm68" 
## [36] "iadebm"  "hobml"   "hobmu"   "hobme"   "hobmr"   "hohbm1"  "hohbm2" 
## [43] "hohbm3"  "difjobm"

names(w1)[grepl("ht", names(w1), ignore.case = TRUE)]

##  [1] "iashty1"     "iashty2"     "iashty3"     "iashty4"     "hhtot"      
##  [6] "refreshtype" "chtype1"     "chtype2"     "chtype3"     "chtype4"    
## [11] "chtype5"     "chtype6"     "chtype7"     "chtype8"     "chtype9"    
## [16] "chtype10"    "chtype11"    "chtype12"    "chtype13"    "chtype14"   
## [21] "chtype15"    "chtype16"    "nright"

names(w1)[grepl("wt", names(w1), ignore.case = TRUE)]

## [1] "wpwtx"  "wpwtx2" "wpwtx3"

# inspect the most plausible body-size candidates already found
attr(w1$wpwtx, "label")

## [1] "Is this before or after tax?"

table(w1$wpwtx, useNA = "ifany")

## 
##    -9    -8    -1     1     2 
##    22    34 11235   172   636

attr(w1$wpwtx2, "label")

## [1] "Is this before or after tax?"

table(w1$wpwtx2, useNA = "ifany")

## 
##    -9    -8    -1     1     2 
##     3     2 12025    16    53

attr(w1$wpwtx3, "label")

## [1] "Is this before or after tax?"

table(w1$wpwtx3, useNA = "ifany")

## 
##    -1     2 
## 12092     7

# alcohol
attr(w1$healc, "label")

## [1] "Do you now drink …? {a lot more..}"

table(w1$healc, useNA = "ifany")

## 
##    -8    -1     1     2     3     4 
##     3 10442    41   349   582   682

attributes(w1$healc)

## $label
## [1] "Do you now drink …? {a lot more..}"
## 
## $format.stata
## [1] "%8.0g"
## 
## $labels
##         Refusal      Don't Know  Not applicable ... a lot more,     a bit more, 
##              -9              -8              -1               1               2 
##     a bit less, or, a lot less? 
##               3               4 
## 
## $class
## [1] "haven_labelled" "vctrs_vctr"     "double"

# better search for height/weight/BMI
names(w1)[grepl("hei|highm|cm|metre|meter", names(w1), ignore.case = TRUE)]

##  [1] "heill"       "heins"       "heiqa"       "heiqb"       "heiqc"      
##  [6] "heiqd"       "heiqe"       "heiqf"       "heiqg"       "heiqh"      
## [11] "heiqi"       "heiqj"       "heiqk"       "heiql"       "heiqm"      
## [16] "heiqn"       "heiqo"       "heiqp"       "heiqq"       "heinct"     
## [21] "wplrcm"      "wplrcm2"     "hoincm1"     "hoincm2"     "hoincm3"    
## [26] "hoincm4"     "horpcm"      "gaselecmeth" "elecmeth"    "rentincme"

names(w1)[grepl("weigh|kilo|kg|stone|pound", names(w1), ignore.case = TRUE)]

## character(0)

names(w1)[grepl("bmi", names(w1), ignore.case = TRUE)]

## character(0)

# create disease indicator variables for Table 1
w1 <- w1 %>%
  mutate(
    htn_w1 = if_else(
      hedia01 == 1 | hedia02 == 1 | hedia03 == 1 | hedia04 == 1 | hedia05 == 1 |
      hedia06 == 1 | hedia07 == 1 | hedia08 == 1 | hedia09 == 1 | hedia10 == 1,
      1, 0
    ),
    hf_w1 = if_else(
      hedia01 == 4 | hedia02 == 4 | hedia03 == 4 | hedia04 == 4 | hedia05 == 4 |
      hedia06 == 4 | hedia07 == 4 | hedia08 == 4 | hedia09 == 4 | hedia10 == 4,
      1, 0
    ),
    arrhythmia_w1 = if_else(
      hedia01 == 6 | hedia02 == 6 | hedia03 == 6 | hedia04 == 6 | hedia05 == 6 |
      hedia06 == 6 | hedia07 == 6 | hedia08 == 6 | hedia09 == 6 | hedia10 == 6,
      1, 0
    ),
    diabetes_w1 = if_else(
      hedia01 == 7 | hedia02 == 7 | hedia03 == 7 | hedia04 == 7 | hedia05 == 7 |
      hedia06 == 7 | hedia07 == 7 | hedia08 == 7 | hedia09 == 7 | hedia10 == 7,
      1, 0
    ),
    stroke_w1 = if_else(
      hedia01 == 8 | hedia02 == 8 | hedia03 == 8 | hedia04 == 8 | hedia05 == 8 |
      hedia06 == 8 | hedia07 == 8 | hedia08 == 8 | hedia09 == 8 | hedia10 == 8,
      1, 0
    ),
    alz_w1 = if_else(
      hedib01 == 8 | hedib02 == 8 | hedib03 == 8 | hedib04 == 8 | hedib05 == 8 |
      hedib06 == 8 | hedib07 == 8 | hedib08 == 8 | hedib09 == 8 | hedib10 == 8,
      1, 0
    ),
    dementia_w1 = if_else(
      hedib01 == 9 | hedib02 == 9 | hedib03 == 9 | hedib04 == 9 | hedib05 == 9 |
      hedib06 == 9 | hedib07 == 9 | hedib08 == 9 | hedib09 == 9 | hedib10 == 9,
      1, 0
    )
  )

# check the new disease variables
table(w1$htn_w1, useNA = "ifany")

## 
##    0    1 
## 7627 4472

table(w1$hf_w1, useNA = "ifany")

## 
##     0     1 
## 12017    82

table(w1$arrhythmia_w1, useNA = "ifany")

## 
##     0     1 
## 11367   732

table(w1$diabetes_w1, useNA = "ifany")

## 
##     0     1 
## 11233   866

table(w1$stroke_w1, useNA = "ifany")

## 
##     0     1 
## 11588   511

table(w1$alz_w1, useNA = "ifany")

## 
##     0     1 
## 12085    14

table(w1$dementia_w1, useNA = "ifany")

## 
##     0     1 
## 12035    64

# keep a clean draft Table 1 dataset - no exclusions
table1_w1 <- w1 %>%
  transmute(
    idauniq,
    age = dhager,
    sex = as_factor(dhsex),
    education = as_factor(edqual.x),
    ethnicity = as_factor(fqethnr),
    marital_status = as_factor(marstat),
    employment = as_factor(worktime),
    smoking = as_factor(smokerstat),
    depression_score = cesd_sc,
    hypertension = htn_w1,
    heart_failure = hf_w1,
    abnormal_heart_rhythm = arrhythmia_w1,
    diabetes = diabetes_w1,
    stroke = stroke_w1,
    baseline_alzheimers = alz_w1,
    baseline_dementia = dementia_w1
  )
# quick checks
dim(table1_w1)

## [1] 12099    16

summary(table1_w1$age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20.00   55.00   63.00   64.19   72.00   99.00

table(table1_w1$sex, useNA = "ifany")

## 
##        Refusal     Don't Know Not applicable           Male         Female 
##              0              0              0           5335           6764

table(table1_w1$education, useNA = "ifany")

## 
##                    Refusal                 Don't know 
##                          6                         11 
##             Not applicable  NVQ4/NVQ5/Degree or equiv 
##                         18                       1388 
##     Higher ed below degree     NVQ3/GCE A Level equiv 
##                       1333                        764 
##     NVQ2/GCE O Level equiv NVQ1/CSE other grade equiv 
##                       1974                        582 
##              Foreign/other           No qualification 
##                       1015                       5008

table(table1_w1$ethnicity, useNA = "ifany")

## 
##        Refusal     Don't know Not applicable          White      Non-white 
##             12              2           6810           5111            164

table(table1_w1$marital_status, useNA = "ifany")

## 
## married (inc civ pship 06 onwards)                         cohabiting 
##                               8035                                504 
##              single, never married                            widowed 
##                                575                               1951 
##                           divorced                          separated 
##                                823                                211

table(table1_w1$employment, useNA = "ifany")

## 
##          unknown      Not working Full time (>=35)        Part time 
##              118             7476             2741             1764

table(table1_w1$smoking, useNA = "ifany")

## 
##                refused             don't know              not asked 
##                      5                     11                    175 
##           never smoked ex smoker - occasional    ex smoker - regular 
##                   4286                    674                   4342 
##    ex smoker - DK freq         current smoker 
##                    445                   2161

summary(table1_w1$depression_score)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -2.000   0.000   1.000   1.504   2.000   8.000

table(table1_w1$hypertension, useNA = "ifany")

## 
##    0    1 
## 7627 4472

table(table1_w1$heart_failure, useNA = "ifany")

## 
##     0     1 
## 12017    82

table(table1_w1$abnormal_heart_rhythm, useNA = "ifany")

## 
##     0     1 
## 11367   732

table(table1_w1$diabetes, useNA = "ifany")

## 
##     0     1 
## 11233   866

table(table1_w1$stroke, useNA = "ifany")

## 
##     0     1 
## 11588   511

table(table1_w1$baseline_alzheimers, useNA = "ifany")

## 
##     0     1 
## 12085    14

table(table1_w1$baseline_dementia, useNA = "ifany")

## 
##     0     1 
## 12035    64

w1 <- w1 %>%
  mutate(
    w1_palevel = case_when(
      heacta %in% c(1, 2) ~ "High",
      !heacta %in% c(1, 2) & heactb %in% c(1, 2) ~ "Moderate",
      !heacta %in% c(1, 2) & !heactb %in% c(1, 2) & heactc %in% c(1, 2) ~ "Low",
      heacta %in% c(3, 4) & heactb %in% c(3, 4) & heactc %in% c(3, 4) ~ "Sedentary",
      TRUE ~ NA_character_
    )
  )

table(w1$w1_palevel, useNA = "ifany")

## 
##      High       Low  Moderate Sedentary      <NA> 
##      3302      1756      5607      1240       194

# add physical activity group to the draft Table 1 dataset
table1_w1 <- w1 %>%
  transmute(
    idauniq,
    pa_level = w1_palevel,
    age = dhager,
    sex = as_factor(dhsex),
    education = as_factor(edqual.x),
    ethnicity = as_factor(fqethnr),
    marital_status = as_factor(marstat),
    employment = as_factor(worktime),
    smoking = as_factor(smokerstat),
    depression_score = cesd_sc,
    hypertension = htn_w1,
    heart_failure = hf_w1,
    abnormal_heart_rhythm = arrhythmia_w1,
    diabetes = diabetes_w1,
    stroke = stroke_w1,
    baseline_alzheimers = alz_w1,
    baseline_dementia = dementia_w1
  )
# check PA distribution with no exclusions
table(table1_w1$pa_level, useNA = "ifany")

## 
##      High       Low  Moderate Sedentary      <NA> 
##      3302      1756      5607      1240       194

# mean age by PA level
table1_w1 %>%
  group_by(pa_level) %>%
  summarise(
    n = n(),
    mean_age = mean(age, na.rm = TRUE),
    sd_age = sd(age, na.rm = TRUE)
  )

## # A tibble: 5 × 4
##   pa_level      n mean_age sd_age
##   <chr>     <int>    <dbl>  <dbl>
## 1 High       3302     60.6   9.22
## 2 Low        1756     67.4  12.0 
## 3 Moderate   5607     63.6  10.4 
## 4 Sedentary  1240     70.8  12.2 
## 5 <NA>        194     69.3  15.0

# sex by PA level
table(table1_w1$pa_level, table1_w1$sex, useNA = "ifany")

##            
##             Refusal Don't Know Not applicable Male Female
##   High            0          0              0 1593   1709
##   Low             0          0              0  552   1204
##   Moderate        0          0              0 2507   3100
##   Sedentary       0          0              0  590    650
##   <NA>            0          0              0   93    101

# smoking by PA level
table(table1_w1$pa_level, table1_w1$smoking, useNA = "ifany")

##            
##             refused don't know not asked never smoked ex smoker - occasional
##   High            0          0         0         1309                    194
##   Low             0          1         0          587                     89
##   Moderate        0          1         0         1991                    343
##   Sedentary       0          0         0          395                     48
##   <NA>            5          9       175            4                      0
##            
##             ex smoker - regular ex smoker - DK freq current smoker
##   High                     1200                 147            452
##   Low                       621                  62            396
##   Moderate                 2022                 182           1068
##   Sedentary                 499                  54            244
##   <NA>                        0                   0              1

# disease variables by PA level
table(table1_w1$pa_level, table1_w1$hypertension, useNA = "ifany")

##            
##                0    1
##   High      2338  964
##   Low        960  796
##   Moderate  3533 2074
##   Sedentary  662  578
##   <NA>       134   60

table(table1_w1$pa_level, table1_w1$diabetes, useNA = "ifany")

##            
##                0    1
##   High      3172  130
##   Low       1572  184
##   Moderate  5240  367
##   Sedentary 1074  166
##   <NA>       175   19

table(table1_w1$pa_level, table1_w1$stroke, useNA = "ifany")

##            
##                0    1
##   High      3247   55
##   Low       1655  101
##   Moderate  5434  173
##   Sedentary 1088  152
##   <NA>       164   30

table(table1_w1$pa_level, table1_w1$abnormal_heart_rhythm, useNA = "ifany")

##            
##                0    1
##   High      3142  160
##   Low       1634  122
##   Moderate  5286  321
##   Sedentary 1128  112
##   <NA>       177   17

table(table1_w1$pa_level, table1_w1$heart_failure, useNA = "ifany")

##            
##                0    1
##   High      3296    6
##   Low       1730   26
##   Moderate  5587   20
##   Sedentary 1215   25
##   <NA>       189    5

table(table1_w1$pa_level, table1_w1$baseline_alzheimers, useNA = "ifany")

##            
##                0    1
##   High      3302    0
##   Low       1756    0
##   Moderate  5606    1
##   Sedentary 1233    7
##   <NA>       188    6

table(table1_w1$pa_level, table1_w1$baseline_dementia, useNA = "ifany")

##            
##                0    1
##   High      3297    5
##   Low       1748    8
##   Moderate  5595   12
##   Sedentary 1221   19
##   <NA>       174   20

table1_w1_clean <- table1_w1 %>%
  mutate(
    pa_level = factor(pa_level, levels = c("High", "Moderate", "Low", "Sedentary")),

    sex = fct_drop(sex),

    smoking3 = case_when(
      smoking == "never smoked" ~ "Never",
      smoking %in% c("ex smoker - occasional", "ex smoker - regular", "ex smoker - DK freq") ~ "Ex-smoker",
      smoking == "current smoker" ~ "Current",
      TRUE ~ NA_character_
    ),

    ethnicity2 = case_when(
      ethnicity == "White" ~ "White",
      ethnicity == "Non-white" ~ "Non-white",
      TRUE ~ NA_character_
    ),

    depression_score = if_else(depression_score < 0, NA_real_, as.numeric(depression_score))
  )

# check cleaned variables
table(table1_w1_clean$pa_level, useNA = "ifany")

## 
##      High  Moderate       Low Sedentary      <NA> 
##      3302      5607      1756      1240       194

table(table1_w1_clean$smoking3, useNA = "ifany")

## 
##   Current Ex-smoker     Never      <NA> 
##      2161      5461      4286       191

table(table1_w1_clean$ethnicity2, useNA = "ifany")

## 
## Non-white     White      <NA> 
##       164      5111      6824

summary(table1_w1_clean$depression_score)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   0.000   1.000   1.582   2.000   8.000     311

table1_summary <- table1_w1_clean %>%
  mutate(
    pa_level = forcats::fct_explicit_na(pa_level, na_level = "Missing PA")
  ) %>%
  group_by(pa_level, .drop = FALSE) %>%
  summarise(
    N = n(),
    `Age, mean (SD)` = sprintf("%.1f (%.1f)", mean(age, na.rm = TRUE), sd(age, na.rm = TRUE)),
    `Female, n (%)` = sprintf("%d (%.1f%%)",
                              sum(sex == "Female", na.rm = TRUE),
                              100 * sum(sex == "Female", na.rm = TRUE) / n()),
    `No qualification, n (%)` = sprintf("%d (%.1f%%)",
                                        sum(education == "No qualification", na.rm = TRUE),
                                        100 * sum(education == "No qualification", na.rm = TRUE) / n()),
    `Married/cohabiting, n (%)` = sprintf("%d (%.1f%%)",
                                          sum(marital_status %in% c("married (inc civ pship 06 onwards)", "cohabiting"), na.rm = TRUE),
                                          100 * sum(marital_status %in% c("married (inc civ pship 06 onwards)", "cohabiting"), na.rm = TRUE) / n()),
    `Working full/part time, n (%)` = sprintf("%d (%.1f%%)",
                                              sum(employment %in% c("Full time (>=35)", "Part time"), na.rm = TRUE),
                                              100 * sum(employment %in% c("Full time (>=35)", "Part time"), na.rm = TRUE) / n()),
    `Current smoker, n (%)` = sprintf("%d (%.1f%%)",
                                      sum(smoking3 == "Current", na.rm = TRUE),
                                      100 * sum(smoking3 == "Current", na.rm = TRUE) / n()),
    `Depression score, mean (SD)` = sprintf("%.2f (%.2f)",
                                            mean(depression_score, na.rm = TRUE),
                                            sd(depression_score, na.rm = TRUE)),
    `Hypertension, n (%)` = sprintf("%d (%.1f%%)",
                                    sum(hypertension == 1, na.rm = TRUE),
                                    100 * sum(hypertension == 1, na.rm = TRUE) / n()),
    `Diabetes, n (%)` = sprintf("%d (%.1f%%)",
                                sum(diabetes == 1, na.rm = TRUE),
                                100 * sum(diabetes == 1, na.rm = TRUE) / n()),
    `Stroke, n (%)` = sprintf("%d (%.1f%%)",
                              sum(stroke == 1, na.rm = TRUE),
                              100 * sum(stroke == 1, na.rm = TRUE) / n()),
    `Abnormal heart rhythm, n (%)` = sprintf("%d (%.1f%%)",
                                             sum(abnormal_heart_rhythm == 1, na.rm = TRUE),
                                             100 * sum(abnormal_heart_rhythm == 1, na.rm = TRUE) / n()),
    `Heart failure, n (%)` = sprintf("%d (%.1f%%)",
                                     sum(heart_failure == 1, na.rm = TRUE),
                                     100 * sum(heart_failure == 1, na.rm = TRUE) / n()),
    `Baseline Alzheimer’s, n (%)` = sprintf("%d (%.1f%%)",
                                            sum(baseline_alzheimers == 1, na.rm = TRUE),
                                            100 * sum(baseline_alzheimers == 1, na.rm = TRUE) / n()),
    `Baseline dementia, n (%)` = sprintf("%d (%.1f%%)",
                                         sum(baseline_dementia == 1, na.rm = TRUE),
                                         100 * sum(baseline_dementia == 1, na.rm = TRUE) / n())
  )

knitr::kable(
  table1_summary,
  caption = "Draft Table 1. Baseline characteristics by physical activity group, wave 1."
)

Draft Table 1. Baseline characteristics by physical activity group, wave 1.
pa_level	N	Age, mean (SD)	Female, n (%)	No qualification, n (%)	Married/cohabiting, n (%)	Working full/part time, n (%)	Current smoker, n (%)	Depression score, mean (SD)	Hypertension, n (%)	Diabetes, n (%)	Stroke, n (%)	Abnormal heart rhythm, n (%)	Heart failure, n (%)	Baseline Alzheimer’s, n (%)	Baseline dementia, n (%)
High	3302	60.6 (9.2)	1709 (51.8%)	935 (28.3%)	2582 (78.2%)	1753 (53.1%)	452 (13.7%)	1.06 (1.61)	964 (29.2%)	130 (3.9%)	55 (1.7%)	160 (4.8%)	6 (0.2%)	0 (0.0%)	5 (0.2%)
Moderate	5607	63.6 (10.4)	3100 (55.3%)	2159 (38.5%)	4067 (72.5%)	2200 (39.2%)	1068 (19.0%)	1.41 (1.88)	2074 (37.0%)	367 (6.5%)	173 (3.1%)	321 (5.7%)	20 (0.4%)	1 (0.0%)	12 (0.2%)
Low	1756	67.4 (12.0)	1204 (68.6%)	1051 (59.9%)	1049 (59.7%)	374 (21.3%)	396 (22.6%)	2.32 (2.22)	796 (45.3%)	184 (10.5%)	101 (5.8%)	122 (6.9%)	26 (1.5%)	0 (0.0%)	8 (0.5%)
Sedentary	1240	70.8 (12.2)	650 (52.4%)	766 (61.8%)	705 (56.9%)	176 (14.2%)	244 (19.7%)	2.73 (2.32)	578 (46.6%)	166 (13.4%)	152 (12.3%)	112 (9.0%)	25 (2.0%)	7 (0.6%)	19 (1.5%)
Missing PA	194	69.3 (15.0)	101 (52.1%)	97 (50.0%)	136 (70.1%)	2 (1.0%)	1 (0.5%)	0.25 (0.62)	60 (30.9%)	19 (9.8%)	30 (15.5%)	17 (8.8%)	5 (2.6%)	6 (3.1%)	20 (10.3%)

table1_summary_final <- table1_summary %>%
  mutate(N = as.integer(N))

knitr::kable(
  table1_summary_final,
  caption = "Table 1. Baseline characteristics by physical activity group at wave 1. No participants were excluded at this stage."
)

Table 1. Baseline characteristics by physical activity group at wave 1. No participants were excluded at this stage.
pa_level	N	Age, mean (SD)	Female, n (%)	No qualification, n (%)	Married/cohabiting, n (%)	Working full/part time, n (%)	Current smoker, n (%)	Depression score, mean (SD)	Hypertension, n (%)	Diabetes, n (%)	Stroke, n (%)	Abnormal heart rhythm, n (%)	Heart failure, n (%)	Baseline Alzheimer’s, n (%)	Baseline dementia, n (%)
High	3302	60.6 (9.2)	1709 (51.8%)	935 (28.3%)	2582 (78.2%)	1753 (53.1%)	452 (13.7%)	1.06 (1.61)	964 (29.2%)	130 (3.9%)	55 (1.7%)	160 (4.8%)	6 (0.2%)	0 (0.0%)	5 (0.2%)
Moderate	5607	63.6 (10.4)	3100 (55.3%)	2159 (38.5%)	4067 (72.5%)	2200 (39.2%)	1068 (19.0%)	1.41 (1.88)	2074 (37.0%)	367 (6.5%)	173 (3.1%)	321 (5.7%)	20 (0.4%)	1 (0.0%)	12 (0.2%)
Low	1756	67.4 (12.0)	1204 (68.6%)	1051 (59.9%)	1049 (59.7%)	374 (21.3%)	396 (22.6%)	2.32 (2.22)	796 (45.3%)	184 (10.5%)	101 (5.8%)	122 (6.9%)	26 (1.5%)	0 (0.0%)	8 (0.5%)
Sedentary	1240	70.8 (12.2)	650 (52.4%)	766 (61.8%)	705 (56.9%)	176 (14.2%)	244 (19.7%)	2.73 (2.32)	578 (46.6%)	166 (13.4%)	152 (12.3%)	112 (9.0%)	25 (2.0%)	7 (0.6%)	19 (1.5%)
Missing PA	194	69.3 (15.0)	101 (52.1%)	97 (50.0%)	136 (70.1%)	2 (1.0%)	1 (0.5%)	0.25 (0.62)	60 (30.9%)	19 (9.8%)	30 (15.5%)	17 (8.8%)	5 (2.6%)	6 (3.1%)	20 (10.3%)

table1_summary_wide <- table1_summary_final %>%
  mutate(N = as.character(N)) %>%
  pivot_longer(
    cols = -pa_level,
    names_to = "Characteristic",
    values_to = "Value"
  ) %>%
  pivot_wider(
    names_from = pa_level,
    values_from = Value
  ) %>%
  select(
    Characteristic,
    High,
    Moderate,
    Low,
    Sedentary,
    `Missing PA`
  )

knitr::kable(
  table1_summary_wide,
  caption = "Table 1. Baseline characteristics at wave 1 by physical activity category. No participants were excluded at this stage."
)

Table 1. Baseline characteristics at wave 1 by physical activity category. No participants were excluded at this stage.
Characteristic	High	Moderate	Low	Sedentary	Missing PA
N	3302	5607	1756	1240	194
Age, mean (SD)	60.6 (9.2)	63.6 (10.4)	67.4 (12.0)	70.8 (12.2)	69.3 (15.0)
Female, n (%)	1709 (51.8%)	3100 (55.3%)	1204 (68.6%)	650 (52.4%)	101 (52.1%)
No qualification, n (%)	935 (28.3%)	2159 (38.5%)	1051 (59.9%)	766 (61.8%)	97 (50.0%)
Married/cohabiting, n (%)	2582 (78.2%)	4067 (72.5%)	1049 (59.7%)	705 (56.9%)	136 (70.1%)
Working full/part time, n (%)	1753 (53.1%)	2200 (39.2%)	374 (21.3%)	176 (14.2%)	2 (1.0%)
Current smoker, n (%)	452 (13.7%)	1068 (19.0%)	396 (22.6%)	244 (19.7%)	1 (0.5%)
Depression score, mean (SD)	1.06 (1.61)	1.41 (1.88)	2.32 (2.22)	2.73 (2.32)	0.25 (0.62)
Hypertension, n (%)	964 (29.2%)	2074 (37.0%)	796 (45.3%)	578 (46.6%)	60 (30.9%)
Diabetes, n (%)	130 (3.9%)	367 (6.5%)	184 (10.5%)	166 (13.4%)	19 (9.8%)
Stroke, n (%)	55 (1.7%)	173 (3.1%)	101 (5.8%)	152 (12.3%)	30 (15.5%)
Abnormal heart rhythm, n (%)	160 (4.8%)	321 (5.7%)	122 (6.9%)	112 (9.0%)	17 (8.8%)
Heart failure, n (%)	6 (0.2%)	20 (0.4%)	26 (1.5%)	25 (2.0%)	5 (2.6%)
Baseline Alzheimer’s, n (%)	0 (0.0%)	1 (0.0%)	0 (0.0%)	7 (0.6%)	6 (3.1%)
Baseline dementia, n (%)	5 (0.2%)	12 (0.2%)	8 (0.5%)	19 (1.5%)	20 (10.3%)

#numbers of alzheimers and dementia in wave 1
w1 %>%
  summarise(
    alz_n = sum(alz_w1 == 1, na.rm = TRUE),
    dem_n = sum(dementia_w1 == 1, na.rm = TRUE),
    both_n = sum(alz_w1 == 1 & dementia_w1 == 1, na.rm = TRUE),
    either_n = sum(alz_w1 == 1 | dementia_w1 == 1, na.rm = TRUE)
  )

## # A tibble: 1 × 4
##   alz_n dem_n both_n either_n
##   <int> <int>  <int>    <int>
## 1    14    64      3       75

# people with both baseline Alzheimer's and baseline dementia wave1
both_alz_dem <- table1_w1_clean %>%
  filter(baseline_alzheimers == 1, baseline_dementia == 1)

# how many participants are there with both alzheimer's and dementia at baseline wave 1?
nrow(both_alz_dem)

## [1] 3

#what category of PA level are the participants who are categorised as both alzheimer's and dementia in?
both_alz_dem %>%
  count(pa_level, .drop = FALSE)

## # A tibble: 5 × 2
##   pa_level      n
##   <fct>     <int>
## 1 High          0
## 2 Moderate      0
## 3 Low           0
## 4 Sedentary     2
## 5 <NA>          1

# which IDs are the participants with both alzheimers and dementa at baseline wave 1?
both_alz_dem %>%
  select(idauniq, pa_level, baseline_alzheimers, baseline_dementia)

## # A tibble: 3 × 4
##   idauniq pa_level  baseline_alzheimers baseline_dementia
##     <dbl> <fct>                   <dbl>             <dbl>
## 1  106735 <NA>                        1                 1
## 2  108547 Sedentary                   1                 1
## 3  119099 Sedentary                   1                 1

#dementia follow up coding chunk
w2_dem <- w2_core %>%
  transmute(
    idauniq,
    dem_w2 = if_else(
      hedib01 == 9 | hedib02 == 9 | hedib03 == 9 | hedib04 == 9,
      1, 0
    )
  )

w3_dem <- w3_core %>%
  transmute(
    idauniq,
    dem_w3 = if_else(hedibde == 3, 1, 0)
  )

w4_dem <- w4_core %>% transmute(idauniq, dem_w4 = if_else(hedibde == 1, 1, 0))
w5_dem <- w5_core %>% transmute(idauniq, dem_w5 = if_else(hedibde == 1, 1, 0))
w6_dem <- w6_core %>% transmute(idauniq, dem_w6 = if_else(hedibde == 1, 1, 0))
w7_dem <- w7_core %>% transmute(idauniq, dem_w7 = if_else(hedibde == 1, 1, 0))
w8_dem <- w8_core %>% transmute(idauniq, dem_w8 = if_else(hedibde == 1, 1, 0))
w9_dem <- w9_core %>% transmute(idauniq, dem_w9 = if_else(hedibde == 1, 1, 0))

#follow up merge chunk
followup_w1 <- table1_w1_clean %>%
  mutate(
    baseline_alzheimers = as.numeric(baseline_alzheimers),
    baseline_dementia = as.numeric(baseline_dementia)
  ) %>%
  left_join(w2_dem, by = "idauniq") %>%
  left_join(w3_dem, by = "idauniq") %>%
  left_join(w4_dem, by = "idauniq") %>%
  left_join(w5_dem, by = "idauniq") %>%
  left_join(w6_dem, by = "idauniq") %>%
  left_join(w7_dem, by = "idauniq") %>%
  left_join(w8_dem, by = "idauniq") %>%
  left_join(w9_dem, by = "idauniq") %>%
  mutate(
    across(starts_with("dem_w"), ~replace_na(., 0))
  )

dim(followup_w1)

## [1] 12099    27

#analysis dataset chunk
analysis_w1 <- followup_w1 %>%
  mutate(
    prevalent_dem_alz_w1 = if_else(baseline_alzheimers == 1 | baseline_dementia == 1, 1, 0)
  ) %>%
  filter(
    prevalent_dem_alz_w1 == 0,
    !is.na(pa_level)
  ) %>%
  mutate(
    first_dem_wave = case_when(
      dem_w2 == 1 ~ 2,
      dem_w3 == 1 ~ 3,
      dem_w4 == 1 ~ 4,
      dem_w5 == 1 ~ 5,
      dem_w6 == 1 ~ 6,
      dem_w7 == 1 ~ 7,
      dem_w8 == 1 ~ 8,
      dem_w9 == 1 ~ 9,
      TRUE ~ NA_real_
    ),
    event_dementia = if_else(!is.na(first_dem_wave), 1, 0),
    time_to_event_waves = if_else(event_dementia == 1, first_dem_wave - 1, 8),
    pa_level = factor(pa_level, levels = c("High", "Moderate", "Low", "Sedentary")),
    current_smoker = if_else(smoking3 == "Current", 1, 0, missing = NA_real_),
    depression_binary = if_else(depression_score >= 4, 1, 0, missing = NA_real_)
  )

dim(analysis_w1)

## [1] 11855    33

table(analysis_w1$event_dementia, useNA = "ifany")

## 
##     0     1 
## 11366   489

table(analysis_w1$pa_level, useNA = "ifany")

## 
##      High  Moderate       Low Sedentary 
##      3297      5594      1748      1216

#cox model chunk
cox_unadjusted <- coxph(
  Surv(time_to_event_waves, event_dementia) ~ pa_level,
  data = analysis_w1
)

cox_adjusted <- coxph(
  Surv(time_to_event_waves, event_dementia) ~ pa_level + age + sex +
    current_smoker + hypertension + diabetes + stroke +
    abnormal_heart_rhythm + heart_failure + depression_binary,
  data = analysis_w1
)

#output chunk for document 
summary(cox_unadjusted)

## Call:
## coxph(formula = Surv(time_to_event_waves, event_dementia) ~ pa_level, 
##     data = analysis_w1)
## 
##   n= 11855, number of events= 489 
## 
##                     coef exp(coef) se(coef)     z Pr(>|z|)    
## pa_levelModerate  0.5030    1.6536   0.1265 3.976 7.01e-05 ***
## pa_levelLow       0.7862    2.1950   0.1486 5.291 1.21e-07 ***
## pa_levelSedentary 0.8453    2.3287   0.1608 5.258 1.46e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                   exp(coef) exp(-coef) lower .95 upper .95
## pa_levelModerate      1.654     0.6047     1.291     2.119
## pa_levelLow           2.195     0.4556     1.640     2.937
## pa_levelSedentary     2.329     0.4294     1.699     3.191
## 
## Concordance= 0.577  (se = 0.012 )
## Likelihood ratio test= 39.91  on 3 df,   p=1e-08
## Wald test            = 37.44  on 3 df,   p=4e-08
## Score (logrank) test = 38.81  on 3 df,   p=2e-08

summary(cox_adjusted)

## Call:
## coxph(formula = Surv(time_to_event_waves, event_dementia) ~ pa_level + 
##     age + sex + current_smoker + hypertension + diabetes + stroke + 
##     abnormal_heart_rhythm + heart_failure + depression_binary, 
##     data = analysis_w1)
## 
##   n= 11728, number of events= 485 
##    (127 observations deleted due to missingness)
## 
##                           coef exp(coef) se(coef)      z Pr(>|z|)    
## pa_levelModerate       0.24344   1.27563  0.12921  1.884   0.0595 .  
## pa_levelLow            0.18022   1.19748  0.15806  1.140   0.2542    
## pa_levelSedentary     -0.01925   0.98094  0.17645 -0.109   0.9131    
## age                    0.06162   1.06356  0.00423 14.566   <2e-16 ***
## sexFemale              0.19684   1.21754  0.09505  2.071   0.0384 *  
## current_smoker        -0.13331   0.87520  0.14047 -0.949   0.3426    
## hypertension           0.09580   1.10054  0.09312  1.029   0.3036    
## diabetes               0.16264   1.17661  0.15589  1.043   0.2968    
## stroke                 0.23524   1.26521  0.17619  1.335   0.1818    
## abnormal_heart_rhythm  0.30517   1.35685  0.15522  1.966   0.0493 *  
## heart_failure         -0.70697   0.49313  0.58253 -1.214   0.2249    
## depression_binary      0.24828   1.28182  0.11251  2.207   0.0273 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                       exp(coef) exp(-coef) lower .95 upper .95
## pa_levelModerate         1.2756     0.7839    0.9903     1.643
## pa_levelLow              1.1975     0.8351    0.8785     1.632
## pa_levelSedentary        0.9809     1.0194    0.6941     1.386
## age                      1.0636     0.9402    1.0548     1.072
## sexFemale                1.2175     0.8213    1.0106     1.467
## current_smoker           0.8752     1.1426    0.6646     1.153
## hypertension             1.1005     0.9086    0.9169     1.321
## diabetes                 1.1766     0.8499    0.8668     1.597
## stroke                   1.2652     0.7904    0.8957     1.787
## abnormal_heart_rhythm    1.3569     0.7370    1.0009     1.839
## heart_failure            0.4931     2.0278    0.1574     1.545
## depression_binary        1.2818     0.7801    1.0282     1.598
## 
## Concordance= 0.74  (se = 0.01 )
## Likelihood ratio test= 302  on 12 df,   p=<2e-16
## Wald test            = 310.1  on 12 df,   p=<2e-16
## Score (logrank) test = 330  on 12 df,   p=<2e-16

Wave 2 Table 1

w2_derived <- read_dta("raw data/RAW_data_stata/wave_2_derived_variables.dta")
w2 <- left_join(w2_core, w2_derived, by = "idauniq")

w2 <- w2 %>%
  mutate(
    htn_w2 = if_else(
      hedia01 == 1 | hedia02 == 1 | hedia03 == 1 | hedia04 == 1,
      1, 0
    ),
    hf_w2 = if_else(
      hedia01 == 4 | hedia02 == 4 | hedia03 == 4 | hedia04 == 4,
      1, 0
    ),
    arrhythmia_w2 = if_else(
      hedia01 == 6 | hedia02 == 6 | hedia03 == 6 | hedia04 == 6,
      1, 0
    ),
    diabetes_w2 = if_else(
      hedia01 == 7 | hedia02 == 7 | hedia03 == 7 | hedia04 == 7,
      1, 0
    ),
    stroke_w2 = if_else(
      hedia01 == 8 | hedia02 == 8 | hedia03 == 8 | hedia04 == 8,
      1, 0
    ),
    alz_w2 = if_else(
      hedib01 == 8 | hedib02 == 8 | hedib03 == 8 | hedib04 == 8,
      1, 0
    ),
    dementia_w2 = if_else(
      hedib01 == 9 | hedib02 == 9 | hedib03 == 9 | hedib04 == 9,
      1, 0
    ),
    pa_level = case_when(
      palevel == 3 ~ "High",
      palevel == 2 ~ "Moderate",
      palevel == 1 ~ "Low",
      palevel == 0 ~ "Sedentary",
      TRUE ~ NA_character_
    )
  )

w2_table1 <- w2 %>%
  transmute(
    idauniq,
    pa_level = factor(pa_level, levels = c("High", "Moderate", "Low", "Sedentary")),
    age = dhager,
    sex = as_factor(DhSex),
    hypertension = htn_w2,
    heart_failure = hf_w2,
    abnormal_heart_rhythm = arrhythmia_w2,
    diabetes = diabetes_w2,
    stroke = stroke_w2,
    baseline_alzheimers = alz_w2,
    baseline_dementia = dementia_w2
  )

w2_table1_summary <- w2_table1 %>%
  mutate(
    pa_level = forcats::fct_explicit_na(pa_level, na_level = "Missing PA")
  ) %>%
  group_by(pa_level, .drop = FALSE) %>%
  summarise(
    N = n(),
    `Age, mean (SD)` = sprintf("%.1f (%.1f)", mean(age, na.rm = TRUE), sd(age, na.rm = TRUE)),
    `Female, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(sex == "Female", na.rm = TRUE),
      100 * sum(sex == "Female", na.rm = TRUE) / n()
    ),
    `Hypertension, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(hypertension == 1, na.rm = TRUE),
      100 * sum(hypertension == 1, na.rm = TRUE) / n()
    ),
    `Diabetes, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(diabetes == 1, na.rm = TRUE),
      100 * sum(diabetes == 1, na.rm = TRUE) / n()
    ),
    `Stroke, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(stroke == 1, na.rm = TRUE),
      100 * sum(stroke == 1, na.rm = TRUE) / n()
    ),
    `Abnormal heart rhythm, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(abnormal_heart_rhythm == 1, na.rm = TRUE),
      100 * sum(abnormal_heart_rhythm == 1, na.rm = TRUE) / n()
    ),
    `Heart failure, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(heart_failure == 1, na.rm = TRUE),
      100 * sum(heart_failure == 1, na.rm = TRUE) / n()
    ),
    `Baseline Alzheimer’s, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(baseline_alzheimers == 1, na.rm = TRUE),
      100 * sum(baseline_alzheimers == 1, na.rm = TRUE) / n()
    ),
    `Baseline dementia, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(baseline_dementia == 1, na.rm = TRUE),
      100 * sum(baseline_dementia == 1, na.rm = TRUE) / n()
    )
  )

w2_table1_summary_wide <- w2_table1_summary %>%
  mutate(N = as.character(N)) %>%
  pivot_longer(
    cols = -pa_level,
    names_to = "Characteristic",
    values_to = "Value"
  ) %>%
  pivot_wider(
    names_from = pa_level,
    values_from = Value
  ) %>%
  select(
    Characteristic,
    High,
    Moderate,
    Low,
    Sedentary,
    `Missing PA`
  )

knitr::kable(
  w2_table1_summary_wide,
  caption = "Table 1. Baseline characteristics at wave 2 by physical activity category."
)

Table 1. Baseline characteristics at wave 2 by physical activity category.
Characteristic	High	Moderate	Low	Sedentary	Missing PA
N	1744	4684	2309	556	139
Age, mean (SD)	61.8 (8.7)	64.5 (9.8)	68.8 (11.2)	75.2 (11.6)	70.6 (15.8)
Female, n (%)	835 (47.9%)	2615 (55.8%)	1465 (63.4%)	325 (58.5%)	66 (47.5%)
Hypertension, n (%)	253 (14.5%)	784 (16.7%)	487 (21.1%)	156 (28.1%)	26 (18.7%)
Diabetes, n (%)	28 (1.6%)	151 (3.2%)	139 (6.0%)	55 (9.9%)	10 (7.2%)
Stroke, n (%)	3 (0.2%)	49 (1.0%)	59 (2.6%)	39 (7.0%)	13 (9.4%)
Abnormal heart rhythm, n (%)	48 (2.8%)	138 (2.9%)	79 (3.4%)	44 (7.9%)	5 (3.6%)
Heart failure, n (%)	0 (0.0%)	8 (0.2%)	8 (0.3%)	4 (0.7%)	0 (0.0%)
Baseline Alzheimer’s, n (%)	1 (0.1%)	2 (0.0%)	2 (0.1%)	1 (0.2%)	5 (3.6%)
Baseline dementia, n (%)	2 (0.1%)	11 (0.2%)	9 (0.4%)	7 (1.3%)	14 (10.1%)

## Wave 3 table 1

w3_table1_raw <- w3_core %>%
  mutate(
    hypertension = if_else(hediabp == 1, 1, 0, missing = 0),
    heart_failure = if_else(hediahf == 1, 1, 0, missing = 0),
    abnormal_heart_rhythm = if_else(hediaar == 1, 1, 0, missing = 0),
    diabetes = if_else(hediadi == 1, 1, 0, missing = 0),
    stroke = if_else(hediast == 1, 1, 0, missing = 0),

    baseline_alzheimers = if_else(hedibad %in% c(1, 2, 3), 1, 0, missing = 0),
    baseline_dementia   = if_else(hedibde %in% c(1, 2, 3), 1, 0, missing = 0),

    pa_level = case_when(
      palevel == 3 ~ "High",
      palevel == 2 ~ "Moderate",
      palevel == 1 ~ "Low",
      palevel == 0 ~ "Sedentary",
      TRUE ~ NA_character_
    )
  )

w3_table1 <- w3_table1_raw %>%
  transmute(
    idauniq,
    pa_level = factor(pa_level, levels = c("High", "Moderate", "Low", "Sedentary")),
    age = dhager,
    sex = as_factor(dhsex),
    hypertension,
    heart_failure,
    abnormal_heart_rhythm,
    diabetes,
    stroke,
    baseline_alzheimers,
    baseline_dementia
  )

w3_table1_summary <- w3_table1 %>%
  mutate(
    pa_level = forcats::fct_explicit_na(pa_level, na_level = "Missing PA")
  ) %>%
  group_by(pa_level, .drop = FALSE) %>%
  summarise(
    N = n(),
    `Age, mean (SD)` = sprintf("%.1f (%.1f)", mean(age, na.rm = TRUE), sd(age, na.rm = TRUE)),
    `Female, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(sex == "Female", na.rm = TRUE),
      100 * sum(sex == "Female", na.rm = TRUE) / n()
    ),
    `Hypertension, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(hypertension == 1, na.rm = TRUE),
      100 * sum(hypertension == 1, na.rm = TRUE) / n()
    ),
    `Diabetes, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(diabetes == 1, na.rm = TRUE),
      100 * sum(diabetes == 1, na.rm = TRUE) / n()
    ),
    `Stroke, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(stroke == 1, na.rm = TRUE),
      100 * sum(stroke == 1, na.rm = TRUE) / n()
    ),
    `Abnormal heart rhythm, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(abnormal_heart_rhythm == 1, na.rm = TRUE),
      100 * sum(abnormal_heart_rhythm == 1, na.rm = TRUE) / n()
    ),
    `Heart failure, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(heart_failure == 1, na.rm = TRUE),
      100 * sum(heart_failure == 1, na.rm = TRUE) / n()
    ),
    `Baseline Alzheimer’s, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(baseline_alzheimers == 1, na.rm = TRUE),
      100 * sum(baseline_alzheimers == 1, na.rm = TRUE) / n()
    ),
    `Baseline dementia, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(baseline_dementia == 1, na.rm = TRUE),
      100 * sum(baseline_dementia == 1, na.rm = TRUE) / n()
    )
  )

w3_table1_summary_wide <- w3_table1_summary %>%
  mutate(N = as.character(N)) %>%
  pivot_longer(
    cols = -pa_level,
    names_to = "Characteristic",
    values_to = "Value"
  ) %>%
  pivot_wider(
    names_from = pa_level,
    values_from = Value
  ) %>%
  select(
    Characteristic,
    High,
    Moderate,
    Low,
    Sedentary,
    `Missing PA`
  )

knitr::kable(
  w3_table1_summary_wide,
  caption = "Table 1. Baseline characteristics at wave 3 by physical activity category."
)

Table 1. Baseline characteristics at wave 3 by physical activity category.
Characteristic	High	Moderate	Low	Sedentary	Missing PA
N	1969	4838	2263	686	15
Age, mean (SD)	59.8 (8.9)	63.3 (10.5)	68.1 (12.0)	74.7 (14.2)	61.6 (12.2)
Female, n (%)	939 (47.7%)	2669 (55.2%)	1458 (64.4%)	399 (58.2%)	11 (73.3%)
Hypertension, n (%)	174 (8.8%)	473 (9.8%)	221 (9.8%)	59 (8.6%)	0 (0.0%)
Diabetes, n (%)	36 (1.8%)	115 (2.4%)	75 (3.3%)	30 (4.4%)	0 (0.0%)
Stroke, n (%)	15 (0.8%)	31 (0.6%)	30 (1.3%)	31 (4.5%)	0 (0.0%)
Abnormal heart rhythm, n (%)	44 (2.2%)	90 (1.9%)	62 (2.7%)	21 (3.1%)	0 (0.0%)
Heart failure, n (%)	1 (0.1%)	1 (0.0%)	8 (0.4%)	4 (0.6%)	0 (0.0%)
Baseline Alzheimer’s, n (%)	1 (0.1%)	5 (0.1%)	9 (0.4%)	17 (2.5%)	0 (0.0%)
Baseline dementia, n (%)	3 (0.2%)	20 (0.4%)	33 (1.5%)	56 (8.2%)	0 (0.0%)

wave 4 table 1

w4_table1_raw <- w4_core %>%
  mutate(
    hypertension = if_else(hediabp == 1, 1, 0, missing = 0),
    heart_failure = if_else(hediahf == 1, 1, 0, missing = 0),
    abnormal_heart_rhythm = if_else(hediaar == 1, 1, 0, missing = 0),
    diabetes = if_else(hediadi == 1, 1, 0, missing = 0),
    stroke = if_else(hediast == 1, 1, 0, missing = 0),

    baseline_alzheimers = if_else(hedibad == 1, 1, 0, missing = 0),
    baseline_dementia   = if_else(hedibde == 1, 1, 0, missing = 0),

    pa_level = case_when(
      palevel == 3 ~ "High",
      palevel == 2 ~ "Moderate",
      palevel == 1 ~ "Low",
      palevel == 0 ~ "Sedentary",
      TRUE ~ NA_character_
    )
  )

w4_table1 <- w4_table1_raw %>%
  transmute(
    idauniq,
    pa_level = factor(pa_level, levels = c("High", "Moderate", "Low", "Sedentary")),
    age = indager,
    sex = as_factor(dhsex),
    hypertension,
    heart_failure,
    abnormal_heart_rhythm,
    diabetes,
    stroke,
    baseline_alzheimers,
    baseline_dementia
  )

w4_table1_summary <- w4_table1 %>%
  mutate(
    pa_level = forcats::fct_explicit_na(pa_level, na_level = "Missing PA")
  ) %>%
  group_by(pa_level, .drop = FALSE) %>%
  summarise(
    N = n(),
    `Age, mean (SD)` = sprintf("%.1f (%.1f)", mean(age, na.rm = TRUE), sd(age, na.rm = TRUE)),
    `Female, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(sex == "Female", na.rm = TRUE),
      100 * sum(sex == "Female", na.rm = TRUE) / n()
    ),
    `Hypertension, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(hypertension == 1, na.rm = TRUE),
      100 * sum(hypertension == 1, na.rm = TRUE) / n()
    ),
    `Diabetes, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(diabetes == 1, na.rm = TRUE),
      100 * sum(diabetes == 1, na.rm = TRUE) / n()
    ),
    `Stroke, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(stroke == 1, na.rm = TRUE),
      100 * sum(stroke == 1, na.rm = TRUE) / n()
    ),
    `Abnormal heart rhythm, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(abnormal_heart_rhythm == 1, na.rm = TRUE),
      100 * sum(abnormal_heart_rhythm == 1, na.rm = TRUE) / n()
    ),
    `Heart failure, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(heart_failure == 1, na.rm = TRUE),
      100 * sum(heart_failure == 1, na.rm = TRUE) / n()
    ),
    `Baseline Alzheimer’s, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(baseline_alzheimers == 1, na.rm = TRUE),
      100 * sum(baseline_alzheimers == 1, na.rm = TRUE) / n()
    ),
    `Baseline dementia, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(baseline_dementia == 1, na.rm = TRUE),
      100 * sum(baseline_dementia == 1, na.rm = TRUE) / n()
    )
  )

w4_table1_summary_wide <- w4_table1_summary %>%
  mutate(N = as.character(N)) %>%
  pivot_longer(
    cols = -pa_level,
    names_to = "Characteristic",
    values_to = "Value"
  ) %>%
  pivot_wider(
    names_from = pa_level,
    values_from = Value
  ) %>%
  select(
    Characteristic,
    High,
    Moderate,
    Low,
    Sedentary,
    `Missing PA`
  )

knitr::kable(
  w4_table1_summary_wide,
  caption = "Table 1. Baseline characteristics at wave 4 by physical activity category."
)

Table 1. Baseline characteristics at wave 4 by physical activity category.
Characteristic	High	Moderate	Low	Sedentary	Missing PA
N	2254	5384	2562	835	15
Age, mean (SD)	61.7 (8.4)	63.9 (9.5)	68.2 (11.3)	74.2 (12.5)	59.1 (9.2)
Female, n (%)	1070 (47.5%)	2929 (54.4%)	1649 (64.4%)	472 (56.5%)	5 (33.3%)
Hypertension, n (%)	201 (8.9%)	603 (11.2%)	350 (13.7%)	135 (16.2%)	0 (0.0%)
Diabetes, n (%)	43 (1.9%)	149 (2.8%)	125 (4.9%)	61 (7.3%)	1 (6.7%)
Stroke, n (%)	7 (0.3%)	55 (1.0%)	46 (1.8%)	68 (8.1%)	0 (0.0%)
Abnormal heart rhythm, n (%)	32 (1.4%)	111 (2.1%)	88 (3.4%)	45 (5.4%)	0 (0.0%)
Heart failure, n (%)	1 (0.0%)	3 (0.1%)	6 (0.2%)	15 (1.8%)	0 (0.0%)
Baseline Alzheimer’s, n (%)	0 (0.0%)	5 (0.1%)	3 (0.1%)	29 (3.5%)	0 (0.0%)
Baseline dementia, n (%)	1 (0.0%)	17 (0.3%)	19 (0.7%)	60 (7.2%)	0 (0.0%)

wave 5 table 1

## Wave 5 table 1
w5_table1_raw <- w5_core %>%
  mutate(
    hypertension = if_else(hediabp == 1, 1, 0, missing = 0),
    heart_failure = if_else(hediahf == 1, 1, 0, missing = 0),
    abnormal_heart_rhythm = if_else(hediaar == 1, 1, 0, missing = 0),
    diabetes = if_else(hediadi == 1, 1, 0, missing = 0),
    stroke = if_else(hediast == 1, 1, 0, missing = 0),

    baseline_alzheimers = if_else(hedibad == 1, 1, 0, missing = 0),
    baseline_dementia   = if_else(hedibde == 1, 1, 0, missing = 0),

    pa_level = case_when(
      palevel == 3 ~ "High",
      palevel == 2 ~ "Moderate",
      palevel == 1 ~ "Low",
      palevel == 0 ~ "Sedentary",
      TRUE ~ NA_character_
    )
  )
# ORIGINAL VERSION OF W5 TABLE 1 PERFORMED FIRST - SEE BELOW FOR AMENDED FINAL VERSION
# Original version produced implausible age summaries because wave 5 #..indager contains negative
# ..special missing codes, which distorted the mean and SD if not #..recoded.
# Kept here as part of the analytic process to include in methods

w5_table1 <- w5_table1_raw %>%
  transmute(
    idauniq,
    pa_level = factor(pa_level, levels = c("High", "Moderate", "Low", "Sedentary")),
    age = indager,
    sex = as_factor(dhsex),
    hypertension,
    heart_failure,
    abnormal_heart_rhythm,
    diabetes,
    stroke,
    baseline_alzheimers,
    baseline_dementia
  )

# Wave 5 table 1 CORRECTED VERSION
# Wave 5 indager includes negative values representing special missing codes.
# These were recoded to NA before summary statistics were generated for wave 5 table 1 amended final version

w5_table1 <- w5_table1_raw %>%
  transmute(
    idauniq,
    pa_level = factor(pa_level, levels = c("High", "Moderate", "Low", "Sedentary")),
    age = if_else(indager < 0, NA_real_, as.numeric(indager)),
    sex = as_factor(dhsex),
    hypertension,
    heart_failure,
    abnormal_heart_rhythm,
    diabetes,
    stroke,
    baseline_alzheimers,
    baseline_dementia
  )

w5_table1_summary <- w5_table1 %>%
  mutate(
    pa_level = forcats::fct_explicit_na(pa_level, na_level = "Missing PA")
  ) %>%
  group_by(pa_level, .drop = FALSE) %>%
  summarise(
    N = n(),
    `Age, mean (SD)` = sprintf("%.1f (%.1f)", mean(age, na.rm = TRUE), sd(age, na.rm = TRUE)),
    `Female, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(sex == "Female", na.rm = TRUE),
      100 * sum(sex == "Female", na.rm = TRUE) / n()
    ),
    `Hypertension, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(hypertension == 1, na.rm = TRUE),
      100 * sum(hypertension == 1, na.rm = TRUE) / n()
    ),
    `Diabetes, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(diabetes == 1, na.rm = TRUE),
      100 * sum(diabetes == 1, na.rm = TRUE) / n()
    ),
    `Stroke, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(stroke == 1, na.rm = TRUE),
      100 * sum(stroke == 1, na.rm = TRUE) / n()
    ),
    `Abnormal heart rhythm, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(abnormal_heart_rhythm == 1, na.rm = TRUE),
      100 * sum(abnormal_heart_rhythm == 1, na.rm = TRUE) / n()
    ),
    `Heart failure, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(heart_failure == 1, na.rm = TRUE),
      100 * sum(heart_failure == 1, na.rm = TRUE) / n()
    ),
    `Baseline Alzheimer’s, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(baseline_alzheimers == 1, na.rm = TRUE),
      100 * sum(baseline_alzheimers == 1, na.rm = TRUE) / n()
    ),
    `Baseline dementia, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(baseline_dementia == 1, na.rm = TRUE),
      100 * sum(baseline_dementia == 1, na.rm = TRUE) / n()
    )
  )

w5_table1_summary_wide <- w5_table1_summary %>%
  mutate(N = as.character(N)) %>%
  pivot_longer(
    cols = -pa_level,
    names_to = "Characteristic",
    values_to = "Value"
  ) %>%
  pivot_wider(
    names_from = pa_level,
    values_from = Value
  ) %>%
  select(
    Characteristic,
    High,
    Moderate,
    Low,
    Sedentary,
    `Missing PA`
  )

knitr::kable(
  w5_table1_summary_wide,
  caption = "Table 1. Baseline characteristics at wave 5 by physical activity category."
)

Table 1. Baseline characteristics at wave 5 by physical activity category.
Characteristic	High	Moderate	Low	Sedentary	Missing PA
N	1981	4936	2432	801	124
Age, mean (SD)	62.9 (7.8)	65.5 (8.9)	68.9 (10.0)	73.0 (10.2)	60.7 (5.5)
Female, n (%)	934 (47.1%)	2736 (55.4%)	1507 (62.0%)	472 (58.9%)	56 (45.2%)
Hypertension, n (%)	141 (7.1%)	335 (6.8%)	202 (8.3%)	90 (11.2%)	8 (6.5%)
Diabetes, n (%)	33 (1.7%)	112 (2.3%)	93 (3.8%)	49 (6.1%)	1 (0.8%)
Stroke, n (%)	10 (0.5%)	32 (0.6%)	43 (1.8%)	64 (8.0%)	0 (0.0%)
Abnormal heart rhythm, n (%)	25 (1.3%)	99 (2.0%)	78 (3.2%)	52 (6.5%)	2 (1.6%)
Heart failure, n (%)	1 (0.1%)	4 (0.1%)	8 (0.3%)	11 (1.4%)	0 (0.0%)
Baseline Alzheimer’s, n (%)	0 (0.0%)	7 (0.1%)	6 (0.2%)	33 (4.1%)	0 (0.0%)
Baseline dementia, n (%)	0 (0.0%)	11 (0.2%)	26 (1.1%)	73 (9.1%)	0 (0.0%)

wave 6 table 1

w6_table1_raw <- w6_core %>%
  mutate(
    hypertension = if_else(hediabp == 1, 1, 0, missing = 0),
    heart_failure = if_else(hediahf == 1, 1, 0, missing = 0),
    abnormal_heart_rhythm = if_else(hediaar == 1, 1, 0, missing = 0),
    diabetes = if_else(hediadi == 1, 1, 0, missing = 0),
    stroke = if_else(hediast == 1, 1, 0, missing = 0),

    baseline_alzheimers = if_else(hedibad == 1, 1, 0, missing = 0),
    baseline_dementia   = if_else(hedibde == 1, 1, 0, missing = 0),

    pa_level = case_when(
      HeActa %in% c(1, 2) ~ "High",
      !HeActa %in% c(1, 2) & HeActb %in% c(1, 2) ~ "Moderate",
      !HeActa %in% c(1, 2) & !HeActb %in% c(1, 2) & HeActc %in% c(1, 2) ~ "Low",
      HeActa %in% c(3, 4) & HeActb %in% c(3, 4) & HeActc %in% c(3, 4) ~ "Sedentary",
      TRUE ~ NA_character_
    )
  )

# ORIGINAL VERSION OF W6 AGE HANDLING (kept as methodological note)
# age = indager
# This is not used in final summaries because negative indager values are special missing codes.

# wave 6 table 1 CORRECTED VERSION
w6_table1 <- w6_table1_raw %>%
  transmute(
    idauniq,
    pa_level = factor(pa_level, levels = c("High", "Moderate", "Low", "Sedentary")),
    age = if_else(indager < 0, NA_real_, as.numeric(indager)),
    sex = as_factor(DhSex),
    hypertension,
    heart_failure,
    abnormal_heart_rhythm,
    diabetes,
    stroke,
    baseline_alzheimers,
    baseline_dementia
  )

# diagnostic age check
w6_table1 %>%
  group_by(pa_level) %>%
  summarise(
    min_age = min(age, na.rm = TRUE),
    q1_age = quantile(age, 0.25, na.rm = TRUE),
    median_age = median(age, na.rm = TRUE),
    mean_age = mean(age, na.rm = TRUE),
    q3_age = quantile(age, 0.75, na.rm = TRUE),
    max_age = max(age, na.rm = TRUE),
    sd_age = sd(age, na.rm = TRUE),
    n_missing_age = sum(is.na(age))
  )

## # A tibble: 5 × 9
##   pa_level  min_age q1_age median_age mean_age q3_age max_age sd_age
##   <fct>       <dbl>  <dbl>      <dbl>    <dbl>  <dbl>   <dbl>  <dbl>
## 1 High           28   57           63     63.3   69        89   8.44
## 2 Moderate       31   59           65     65.8   73        89   9.35
## 3 Low            41   61           69     69.4   78        89  10.5 
## 4 Sedentary      40   62           72     71.1   80        89  10.8 
## 5 <NA>           63   66.8         71     73     77.2      87  10.4 
## # ℹ 1 more variable: n_missing_age <int>

w6_table1_summary <- w6_table1 %>%
  mutate(
    pa_level = forcats::fct_explicit_na(pa_level, na_level = "Missing PA")
  ) %>%
  group_by(pa_level, .drop = FALSE) %>%
  summarise(
    N = n(),
    `Age, mean (SD)` = sprintf("%.1f (%.1f)", mean(age, na.rm = TRUE), sd(age, na.rm = TRUE)),
    `Female, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(sex == "Female", na.rm = TRUE),
      100 * sum(sex == "Female", na.rm = TRUE) / n()
    ),
    `Hypertension, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(hypertension == 1, na.rm = TRUE),
      100 * sum(hypertension == 1, na.rm = TRUE) / n()
    ),
    `Diabetes, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(diabetes == 1, na.rm = TRUE),
      100 * sum(diabetes == 1, na.rm = TRUE) / n()
    ),
    `Stroke, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(stroke == 1, na.rm = TRUE),
      100 * sum(stroke == 1, na.rm = TRUE) / n()
    ),
    `Abnormal heart rhythm, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(abnormal_heart_rhythm == 1, na.rm = TRUE),
      100 * sum(abnormal_heart_rhythm == 1, na.rm = TRUE) / n()
    ),
    `Heart failure, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(heart_failure == 1, na.rm = TRUE),
      100 * sum(heart_failure == 1, na.rm = TRUE) / n()
    ),
    `Baseline Alzheimer’s, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(baseline_alzheimers == 1, na.rm = TRUE),
      100 * sum(baseline_alzheimers == 1, na.rm = TRUE) / n()
    ),
    `Baseline dementia, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(baseline_dementia == 1, na.rm = TRUE),
      100 * sum(baseline_dementia == 1, na.rm = TRUE) / n()
    )
  )

w6_table1_summary_wide <- w6_table1_summary %>%
  mutate(N = as.character(N)) %>%
  pivot_longer(
    cols = -pa_level,
    names_to = "Characteristic",
    values_to = "Value"
  ) %>%
  pivot_wider(
    names_from = pa_level,
    values_from = Value
  ) %>%
  select(
    Characteristic,
    High,
    Moderate,
    Low,
    Sedentary,
    `Missing PA`
  )

knitr::kable(
  w6_table1_summary_wide,
  caption = "Table 1. Baseline characteristics at wave 6 by physical activity category."
)

Table 1. Baseline characteristics at wave 6 by physical activity category.
Characteristic	High	Moderate	Low	Sedentary	Missing PA
N	3156	4810	1626	1005	4
Age, mean (SD)	63.3 (8.4)	65.8 (9.4)	69.4 (10.5)	71.1 (10.8)	73.0 (10.4)
Female, n (%)	1526 (48.4%)	2713 (56.4%)	1087 (66.9%)	527 (52.4%)	4 (100.0%)
Hypertension, n (%)	217 (6.9%)	350 (7.3%)	117 (7.2%)	138 (13.7%)	0 (0.0%)
Diabetes, n (%)	57 (1.8%)	130 (2.7%)	50 (3.1%)	79 (7.9%)	0 (0.0%)
Stroke, n (%)	10 (0.3%)	47 (1.0%)	44 (2.7%)	54 (5.4%)	0 (0.0%)
Abnormal heart rhythm, n (%)	69 (2.2%)	107 (2.2%)	72 (4.4%)	38 (3.8%)	0 (0.0%)
Heart failure, n (%)	2 (0.1%)	6 (0.1%)	4 (0.2%)	13 (1.3%)	0 (0.0%)
Baseline Alzheimer’s, n (%)	2 (0.1%)	7 (0.1%)	4 (0.2%)	37 (3.7%)	0 (0.0%)
Baseline dementia, n (%)	2 (0.1%)	15 (0.3%)	18 (1.1%)	77 (7.7%)	0 (0.0%)

wave 7 table 1

## Wave 7 table 1

w7_table1_raw <- w7_core %>%
  mutate(
    hypertension = if_else(hediabp == 1, 1, 0, missing = 0),
    heart_failure = if_else(hediahf == 1, 1, 0, missing = 0),
    abnormal_heart_rhythm = if_else(hediaar == 1, 1, 0, missing = 0),
    diabetes = if_else(hediadi == 1, 1, 0, missing = 0),
    stroke = if_else(hediast == 1, 1, 0, missing = 0),

    baseline_alzheimers = if_else(hedibad == 1, 1, 0, missing = 0),
    baseline_dementia   = if_else(hedibde == 1, 1, 0, missing = 0),

    pa_level = case_when(
      HeActa %in% c(1, 2) ~ "High",
      !HeActa %in% c(1, 2) & HeActb %in% c(1, 2) ~ "Moderate",
      !HeActa %in% c(1, 2) & !HeActb %in% c(1, 2) & HeActc %in% c(1, 2) ~ "Low",
      HeActa %in% c(3, 4) & HeActb %in% c(3, 4) & HeActc %in% c(3, 4) ~ "Sedentary",
      TRUE ~ NA_character_
    )
  )

# ORIGINAL VERSION OF W7 AGE HANDLING (kept as methodological note)
# age = indager
# This is not used in final summaries because negative indager values are special missing codes.

# Wave 7 corrected version
w7_table1 <- w7_table1_raw %>%
  transmute(
    idauniq,
    pa_level = factor(pa_level, levels = c("High", "Moderate", "Low", "Sedentary")),
    age = if_else(indager < 0, NA_real_, as.numeric(indager)),
    sex = as_factor(DhSex),
    hypertension,
    heart_failure,
    abnormal_heart_rhythm,
    diabetes,
    stroke,
    baseline_alzheimers,
    baseline_dementia
  )

# diagnostic age check
w7_table1 %>%
  group_by(pa_level) %>%
  summarise(
    min_age = min(age, na.rm = TRUE),
    q1_age = quantile(age, 0.25, na.rm = TRUE),
    median_age = median(age, na.rm = TRUE),
    mean_age = mean(age, na.rm = TRUE),
    q3_age = quantile(age, 0.75, na.rm = TRUE),
    max_age = max(age, na.rm = TRUE),
    sd_age = sd(age, na.rm = TRUE),
    n_missing_age = sum(is.na(age))
  )

## # A tibble: 5 × 9
##   pa_level  min_age q1_age median_age mean_age q3_age max_age sd_age
##   <fct>       <dbl>  <dbl>      <dbl>    <dbl>  <dbl>   <dbl>  <dbl>
## 1 High           29   58           64     63.9     69      89   8.68
## 2 Moderate       33   60           66     66.7     73      89   9.21
## 3 Low            39   62           70     70.1     78      89  10.3 
## 4 Sedentary      38   64           74     72.3     81      89  10.6 
## 5 <NA>           63   68.5         74     73       78      82   9.54
## # ℹ 1 more variable: n_missing_age <int>

w7_table1_summary <- w7_table1 %>%
  mutate(
    pa_level = forcats::fct_explicit_na(pa_level, na_level = "Missing PA")
  ) %>%
  group_by(pa_level, .drop = FALSE) %>%
  summarise(
    N = n(),
    `Age, mean (SD)` = sprintf("%.1f (%.1f)", mean(age, na.rm = TRUE), sd(age, na.rm = TRUE)),
    `Female, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(sex == "Female", na.rm = TRUE),
      100 * sum(sex == "Female", na.rm = TRUE) / n()
    ),
    `Hypertension, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(hypertension == 1, na.rm = TRUE),
      100 * sum(hypertension == 1, na.rm = TRUE) / n()
    ),
    `Diabetes, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(diabetes == 1, na.rm = TRUE),
      100 * sum(diabetes == 1, na.rm = TRUE) / n()
    ),
    `Stroke, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(stroke == 1, na.rm = TRUE),
      100 * sum(stroke == 1, na.rm = TRUE) / n()
    ),
    `Abnormal heart rhythm, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(abnormal_heart_rhythm == 1, na.rm = TRUE),
      100 * sum(abnormal_heart_rhythm == 1, na.rm = TRUE) / n()
    ),
    `Heart failure, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(heart_failure == 1, na.rm = TRUE),
      100 * sum(heart_failure == 1, na.rm = TRUE) / n()
    ),
    `Baseline Alzheimer’s, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(baseline_alzheimers == 1, na.rm = TRUE),
      100 * sum(baseline_alzheimers == 1, na.rm = TRUE) / n()
    ),
    `Baseline dementia, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(baseline_dementia == 1, na.rm = TRUE),
      100 * sum(baseline_dementia == 1, na.rm = TRUE) / n()
    )
  )

w7_table1_summary_wide <- w7_table1_summary %>%
  mutate(N = as.character(N)) %>%
  pivot_longer(
    cols = -pa_level,
    names_to = "Characteristic",
    values_to = "Value"
  ) %>%
  pivot_wider(
    names_from = pa_level,
    values_from = Value
  ) %>%
  select(
    Characteristic,
    High,
    Moderate,
    Low,
    Sedentary,
    `Missing PA`
  )

knitr::kable(
  w7_table1_summary_wide,
  caption = "Table 1. Baseline characteristics at wave 7 by physical activity category."
)

Table 1. Baseline characteristics at wave 7 by physical activity category.
Characteristic	High	Moderate	Low	Sedentary	Missing PA
N	2848	4426	1483	906	3
Age, mean (SD)	63.9 (8.7)	66.7 (9.2)	70.1 (10.3)	72.3 (10.6)	73.0 (9.5)
Female, n (%)	1400 (49.2%)	2529 (57.1%)	963 (64.9%)	474 (52.3%)	2 (66.7%)
Hypertension, n (%)	141 (5.0%)	291 (6.6%)	111 (7.5%)	111 (12.3%)	0 (0.0%)
Diabetes, n (%)	45 (1.6%)	110 (2.5%)	60 (4.0%)	64 (7.1%)	0 (0.0%)
Stroke, n (%)	13 (0.5%)	39 (0.9%)	35 (2.4%)	49 (5.4%)	1 (33.3%)
Abnormal heart rhythm, n (%)	55 (1.9%)	116 (2.6%)	54 (3.6%)	49 (5.4%)	0 (0.0%)
Heart failure, n (%)	4 (0.1%)	6 (0.1%)	11 (0.7%)	12 (1.3%)	0 (0.0%)
Baseline Alzheimer’s, n (%)	0 (0.0%)	7 (0.2%)	7 (0.5%)	29 (3.2%)	0 (0.0%)
Baseline dementia, n (%)	3 (0.1%)	22 (0.5%)	13 (0.9%)	76 (8.4%)	0 (0.0%)

wave 8 table 1

w8_table1_raw <- w8_core %>%
  mutate(
    hypertension = if_else(hediabp == 1, 1, 0, missing = 0),
    heart_failure = if_else(hediahf == 1, 1, 0, missing = 0),
    abnormal_heart_rhythm = if_else(hediaar == 1, 1, 0, missing = 0),
    diabetes = if_else(hediadi == 1, 1, 0, missing = 0),
    stroke = if_else(hediast == 1, 1, 0, missing = 0),

    baseline_alzheimers = if_else(hedibad == 1, 1, 0, missing = 0),
    baseline_dementia   = if_else(hedibde == 1, 1, 0, missing = 0),

    pa_level = case_when(
      heacta %in% c(1, 2) ~ "High",
      !heacta %in% c(1, 2) & heactb %in% c(1, 2) ~ "Moderate",
      !heacta %in% c(1, 2) & !heactb %in% c(1, 2) & heactc %in% c(1, 2) ~ "Low",
      heacta %in% c(3, 4) & heactb %in% c(3, 4) & heactc %in% c(3, 4) ~ "Sedentary",
      TRUE ~ NA_character_
    )
  )

# ORIGINAL VERSION OF W8 AGE HANDLING (kept as methodological note)
# age = indager
# This is not used in final summaries because negative indager values are special missing codes.

# Wave 8 corrected version
w8_table1 <- w8_table1_raw %>%
  transmute(
    idauniq,
    pa_level = factor(pa_level, levels = c("High", "Moderate", "Low", "Sedentary")),
    age = if_else(indager < 0, NA_real_, as.numeric(indager)),
    sex = as_factor(indsex),
    hypertension,
    heart_failure,
    abnormal_heart_rhythm,
    diabetes,
    stroke,
    baseline_alzheimers,
    baseline_dementia
  )

# diagnostic age check
w8_table1 %>%
  group_by(pa_level) %>%
  summarise(
    min_age = min(age, na.rm = TRUE),
    q1_age = quantile(age, 0.25, na.rm = TRUE),
    median_age = median(age, na.rm = TRUE),
    mean_age = mean(age, na.rm = TRUE),
    q3_age = quantile(age, 0.75, na.rm = TRUE),
    max_age = max(age, na.rm = TRUE),
    sd_age = sd(age, na.rm = TRUE),
    n_missing_age = sum(is.na(age))
  )

## # A tibble: 5 × 9
##   pa_level  min_age q1_age median_age mean_age q3_age max_age sd_age
##   <fct>       <dbl>  <dbl>      <dbl>    <dbl>  <dbl>   <dbl>  <dbl>
## 1 High           31   60           65     65.5   71        89   8.14
## 2 Moderate       34   62           68     68.4   75        89   8.84
## 3 Low            40   64           71     71.3   80        89   9.74
## 4 Sedentary      40   66           74     73.4   81        89   9.61
## 5 <NA>           66   67.5         69     70.3   72.5      76   5.13
## # ℹ 1 more variable: n_missing_age <int>

w8_table1_summary <- w8_table1 %>%
  mutate(
    pa_level = forcats::fct_explicit_na(pa_level, na_level = "Missing PA")
  ) %>%
  group_by(pa_level, .drop = FALSE) %>%
  summarise(
    N = n(),
    `Age, mean (SD)` = sprintf("%.1f (%.1f)", mean(age, na.rm = TRUE), sd(age, na.rm = TRUE)),
    `Female, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(sex == "Female", na.rm = TRUE),
      100 * sum(sex == "Female", na.rm = TRUE) / n()
    ),
    `Hypertension, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(hypertension == 1, na.rm = TRUE),
      100 * sum(hypertension == 1, na.rm = TRUE) / n()
    ),
    `Diabetes, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(diabetes == 1, na.rm = TRUE),
      100 * sum(diabetes == 1, na.rm = TRUE) / n()
    ),
    `Stroke, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(stroke == 1, na.rm = TRUE),
      100 * sum(stroke == 1, na.rm = TRUE) / n()
    ),
    `Abnormal heart rhythm, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(abnormal_heart_rhythm == 1, na.rm = TRUE),
      100 * sum(abnormal_heart_rhythm == 1, na.rm = TRUE) / n()
    ),
    `Heart failure, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(heart_failure == 1, na.rm = TRUE),
      100 * sum(heart_failure == 1, na.rm = TRUE) / n()
    ),
    `Baseline Alzheimer’s, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(baseline_alzheimers == 1, na.rm = TRUE),
      100 * sum(baseline_alzheimers == 1, na.rm = TRUE) / n()
    ),
    `Baseline dementia, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(baseline_dementia == 1, na.rm = TRUE),
      100 * sum(baseline_dementia == 1, na.rm = TRUE) / n()
    )
  )

w8_table1_summary_wide <- w8_table1_summary %>%
  mutate(N = as.character(N)) %>%
  pivot_longer(
    cols = -pa_level,
    names_to = "Characteristic",
    values_to = "Value"
  ) %>%
  pivot_wider(
    names_from = pa_level,
    values_from = Value
  ) %>%
  select(
    Characteristic,
    High,
    Moderate,
    Low,
    Sedentary,
    `Missing PA`
  )

knitr::kable(
  w8_table1_summary_wide,
  caption = "Table 1. Baseline characteristics at wave 8 by physical activity category."
)

Table 1. Baseline characteristics at wave 8 by physical activity category.
Characteristic	High	Moderate	Low	Sedentary	Missing PA
N	2443	3932	1281	786	3
Age, mean (SD)	65.5 (8.1)	68.4 (8.8)	71.3 (9.7)	73.4 (9.6)	70.3 (5.1)
Female, n (%)	1204 (49.3%)	2253 (57.3%)	848 (66.2%)	390 (49.6%)	0 (0.0%)
Hypertension, n (%)	111 (4.5%)	204 (5.2%)	92 (7.2%)	78 (9.9%)	0 (0.0%)
Diabetes, n (%)	35 (1.4%)	73 (1.9%)	43 (3.4%)	48 (6.1%)	0 (0.0%)
Stroke, n (%)	14 (0.6%)	42 (1.1%)	30 (2.3%)	62 (7.9%)	0 (0.0%)
Abnormal heart rhythm, n (%)	49 (2.0%)	99 (2.5%)	49 (3.8%)	50 (6.4%)	0 (0.0%)
Heart failure, n (%)	4 (0.2%)	14 (0.4%)	7 (0.5%)	18 (2.3%)	0 (0.0%)
Baseline Alzheimer’s, n (%)	2 (0.1%)	12 (0.3%)	12 (0.9%)	29 (3.7%)	0 (0.0%)
Baseline dementia, n (%)	7 (0.3%)	18 (0.5%)	25 (2.0%)	76 (9.7%)	0 (0.0%)

Wave summary numbers

### Total participant numbers by wave

wave_total_participant_numbers <- tibble(
  wave = c("Wave 1", "Wave 2", "Wave 3", "Wave 4", "Wave 5", "Wave 6", "Wave 7", "Wave 8", "Wave 9"),
  total_n = c(
    nrow(w1),
    nrow(w2_core),
    nrow(w3_core),
    nrow(w4_core),
    nrow(w5_core),
    nrow(w6_core),
    nrow(w7_core),
    nrow(w8_core),
    nrow(w9_core)
  )
)

knitr::kable(
  wave_total_participant_numbers,
  caption = "Total participant numbers by wave."
)

Total participant numbers by wave.
wave	total_n
Wave 1	12099
Wave 2	9432
Wave 3	9771
Wave 4	11050
Wave 5	10274
Wave 6	10601
Wave 7	9666
Wave 8	8445
Wave 9	8736

### Diagnosis summary numbers by wave

# Wave 1 diagnosis summary numbers
wave1_diagnosis_summary_numbers <- w1 %>%
  summarise(
    wave = "Wave 1",
    alzheimers_n = sum(alz_w1 == 1, na.rm = TRUE),
    dementia_n = sum(dementia_w1 == 1, na.rm = TRUE),
    both_n = sum(alz_w1 == 1 & dementia_w1 == 1, na.rm = TRUE),
    either_n = sum(alz_w1 == 1 | dementia_w1 == 1, na.rm = TRUE)
  )

# Wave 2 diagnosis summary numbers
wave2_diagnosis_summary_numbers <- w2_core %>%
  summarise(
    wave = "Wave 2",
    alzheimers_n = sum(hedib01 == 8 | hedib02 == 8 | hedib03 == 8 | hedib04 == 8, na.rm = TRUE),
    dementia_n   = sum(hedib01 == 9 | hedib02 == 9 | hedib03 == 9 | hedib04 == 9, na.rm = TRUE),
    both_n = sum(
      (hedib01 %in% c(8, 9)) +
      (hedib02 %in% c(8, 9)) +
      (hedib03 %in% c(8, 9)) +
      (hedib04 %in% c(8, 9)) > 1,
      na.rm = TRUE
    ),
    either_n = sum(
      hedib01 %in% c(8, 9) | hedib02 %in% c(8, 9) |
      hedib03 %in% c(8, 9) | hedib04 %in% c(8, 9),
      na.rm = TRUE
    )
  )

# Wave 3 diagnosis summary numbers
wave3_diagnosis_summary_numbers <- w3_core %>%
  summarise(
    wave = "Wave 3",
    alzheimers_n = sum(hedibad %in% c(1, 2, 3), na.rm = TRUE),
    dementia_n   = sum(hedibde %in% c(1, 2, 3), na.rm = TRUE),
    both_n       = sum(hedibad %in% c(1, 2, 3) & hedibde %in% c(1, 2, 3), na.rm = TRUE),
    either_n     = sum(hedibad %in% c(1, 2, 3) | hedibde %in% c(1, 2, 3), na.rm = TRUE)
  )

# Waves 4 to 9 diagnosis summary numbers
wave4_diagnosis_summary_numbers <- w4_core %>%
  summarise(
    wave = "Wave 4",
    alzheimers_n = sum(hedibad == 1, na.rm = TRUE),
    dementia_n   = sum(hedibde == 1, na.rm = TRUE),
    both_n       = sum(hedibad == 1 & hedibde == 1, na.rm = TRUE),
    either_n     = sum(hedibad == 1 | hedibde == 1, na.rm = TRUE)
  )

wave5_diagnosis_summary_numbers <- w5_core %>%
  summarise(
    wave = "Wave 5",
    alzheimers_n = sum(hedibad == 1, na.rm = TRUE),
    dementia_n   = sum(hedibde == 1, na.rm = TRUE),
    both_n       = sum(hedibad == 1 & hedibde == 1, na.rm = TRUE),
    either_n     = sum(hedibad == 1 | hedibde == 1, na.rm = TRUE)
  )

wave6_diagnosis_summary_numbers <- w6_core %>%
  summarise(
    wave = "Wave 6",
    alzheimers_n = sum(hedibad == 1, na.rm = TRUE),
    dementia_n   = sum(hedibde == 1, na.rm = TRUE),
    both_n       = sum(hedibad == 1 & hedibde == 1, na.rm = TRUE),
    either_n     = sum(hedibad == 1 | hedibde == 1, na.rm = TRUE)
  )

wave7_diagnosis_summary_numbers <- w7_core %>%
  summarise(
    wave = "Wave 7",
    alzheimers_n = sum(hedibad == 1, na.rm = TRUE),
    dementia_n   = sum(hedibde == 1, na.rm = TRUE),
    both_n       = sum(hedibad == 1 & hedibde == 1, na.rm = TRUE),
    either_n     = sum(hedibad == 1 | hedibde == 1, na.rm = TRUE)
  )

wave8_diagnosis_summary_numbers <- w8_core %>%
  summarise(
    wave = "Wave 8",
    alzheimers_n = sum(hedibad == 1, na.rm = TRUE),
    dementia_n   = sum(hedibde == 1, na.rm = TRUE),
    both_n       = sum(hedibad == 1 & hedibde == 1, na.rm = TRUE),
    either_n     = sum(hedibad == 1 | hedibde == 1, na.rm = TRUE)
  )

wave9_diagnosis_summary_numbers <- w9_core %>%
  summarise(
    wave = "Wave 9",
    alzheimers_n = sum(hedibad == 1, na.rm = TRUE),
    dementia_n   = sum(hedibde == 1, na.rm = TRUE),
    both_n       = sum(hedibad == 1 & hedibde == 1, na.rm = TRUE),
    either_n     = sum(hedibad == 1 | hedibde == 1, na.rm = TRUE)
  )

wave_diagnosis_summary_numbers <- bind_rows(
  wave1_diagnosis_summary_numbers,
  wave2_diagnosis_summary_numbers,
  wave3_diagnosis_summary_numbers,
  wave4_diagnosis_summary_numbers,
  wave5_diagnosis_summary_numbers,
  wave6_diagnosis_summary_numbers,
  wave7_diagnosis_summary_numbers,
  wave8_diagnosis_summary_numbers,
  wave9_diagnosis_summary_numbers
)

knitr::kable(
  wave_diagnosis_summary_numbers,
  caption = "Diagnosis summary numbers by wave."
)

Diagnosis summary numbers by wave.
wave	alzheimers_n	dementia_n	both_n	either_n
Wave 1	14	64	3	75
Wave 2	11	43	2	52
Wave 3	32	112	11	133
Wave 4	37	97	13	121
Wave 5	46	110	19	137
Wave 6	50	112	17	145
Wave 7	43	114	19	138
Wave 8	55	126	23	158
Wave 9	47	127	21	153

### Missing physical activity summary numbers by wave

# Wave 1 missing PA
wave1_missing_pa_summary_numbers <- table1_w1_clean %>%
  summarise(
    wave = "Wave 1",
    missing_pa_n = sum(is.na(pa_level))
  )

# Wave 2 missing PA
w2_derived <- read_dta("raw data/RAW_data_stata/wave_2_derived_variables.dta")
w2 <- left_join(w2_core, w2_derived, by = "idauniq")

wave2_missing_pa_summary_numbers <- w2 %>%
  mutate(pa_level = if_else(palevel %in% c(0, 1, 2, 3), palevel, NA_real_)) %>%
  summarise(
    wave = "Wave 2",
    missing_pa_n = sum(is.na(pa_level))
  )
#waves 3 to 9 MISSING PA FOR NOW -> 'NA' AS PLACEHOLDER AS HAVEN'T STANDARDISED PA VARIABLES FOR WAVES 3 - 9 AS YET
# Placeholder for waves 3 to 9
wave3_missing_pa_summary_numbers <- tibble(wave = "Wave 3", missing_pa_n = NA_integer_)
wave4_missing_pa_summary_numbers <- tibble(wave = "Wave 4", missing_pa_n = NA_integer_)
wave5_missing_pa_summary_numbers <- tibble(wave = "Wave 5", missing_pa_n = NA_integer_)
wave6_missing_pa_summary_numbers <- tibble(wave = "Wave 6", missing_pa_n = NA_integer_)
wave7_missing_pa_summary_numbers <- tibble(wave = "Wave 7", missing_pa_n = NA_integer_)
wave8_missing_pa_summary_numbers <- tibble(wave = "Wave 8", missing_pa_n = NA_integer_)
wave9_missing_pa_summary_numbers <- tibble(wave = "Wave 9", missing_pa_n = NA_integer_)

wave_missing_pa_summary_numbers <- bind_rows(
  wave1_missing_pa_summary_numbers,
  wave2_missing_pa_summary_numbers,
  wave3_missing_pa_summary_numbers,
  wave4_missing_pa_summary_numbers,
  wave5_missing_pa_summary_numbers,
  wave6_missing_pa_summary_numbers,
  wave7_missing_pa_summary_numbers,
  wave8_missing_pa_summary_numbers,
  wave9_missing_pa_summary_numbers
)

knitr::kable(
  wave_missing_pa_summary_numbers,
  caption = "Missing physical activity summary numbers by wave."
)

Missing physical activity summary numbers by wave.
wave	missing_pa_n
Wave 1	194
Wave 2	139
Wave 3	NA
Wave 4	NA
Wave 5	NA
Wave 6	NA
Wave 7	NA
Wave 8	NA
Wave 9	NA

### Combined wave summary numbers

wave_summary_numbers_table <- wave_total_participant_numbers %>%
  left_join(wave_diagnosis_summary_numbers, by = "wave") %>%
  left_join(wave_missing_pa_summary_numbers, by = "wave") %>%
  mutate(
    final_analysis_n = case_when(
      wave == "Wave 1" ~ nrow(analysis_w1),
      TRUE ~ NA_integer_
    )
  )

knitr::kable(
  wave_summary_numbers_table,
  caption = "Combined wave summary numbers."
)

Combined wave summary numbers.
wave	total_n	alzheimers_n	dementia_n	both_n	either_n	missing_pa_n	final_analysis_n
Wave 1	12099	14	64	3	75	194	11855
Wave 2	9432	11	43	2	52	139	NA
Wave 3	9771	32	112	11	133	NA	NA
Wave 4	11050	37	97	13	121	NA	NA
Wave 5	10274	46	110	19	137	NA	NA
Wave 6	10601	50	112	17	145	NA	NA
Wave 7	9666	43	114	19	138	NA	NA
Wave 8	8445	55	126	23	158	NA	NA
Wave 9	8736	47	127	21	153	NA	NA

Structure of the portfolio

Abstract

Provide the reader with a succinct summary of your work

Introduction

Provide an introduction to you portfolio to reader.

Method

covering data access requirements, ethics, metadata and all methodological aspects of your project

Results

Use this section to showcase the results of your data manipulation that will contribute to the project

###Table 1 shows the baseline characteristics at wave 1, organised around physical activity (PA) categories: high, moderate, low and sedentary. As mean age increases the proportion of participants in low and sedentary PA categories increases. Furthermore the low and sedentary categories have higher prevalence of comorbidities that include hypertension, diabetes, stroke, abnormal heart rhythms, and heart failure.

Conclusion

Summaries your findings,discuss them in the context of other similar work or questions and suggestions for future work. Conclude your portfolio with what started your data exploration and what have the data contributed in the decisions for patient care or health service delivery.

In text elements

##Some examples of having in-text elements as you develop your portfolio are provided here.

#* “# Headings” #* “## Subheading 1” #* “### subheading 2” #* “#### subheading 3”

Hyperlinks

#Healthcare Data Science

Notice box

#::: {.infobox .caution data-latex=“{caution}”}

#The format to add boxes to your portfolio #:::

Tables

In-text table

#A template format to add a table into #the document you can use the following md code structure.

#|Data table | Coverage |Area | #|————–|————–|————————-| #|Health survey | 2015 |Self-reported outcomes | #|EHR | 2000 onwards |Electronic health records|

Data tables

defining a dataframe

CREL <- data.frame(
  Data = c("Health survey","EHR"), 
  Coverage=c(
    "2015","2000 onwards"
  ),
  Area=c(
    "Self-reported outcomes",
    "Electronic Health Records"
  ))

kable(CREL)

Data	Coverage	Area
Health survey	2015	Self-reported outcomes
EHR	2000 onwards	Electronic Health Records

Interactive data elements

DT::datatable(
  CREL,
  extensions = 'Buttons',
  options = list(
              paging = TRUE,
              searching = TRUE,
              fixedColumns = TRUE,
              autoWidth = TRUE,
              ordering = TRUE,
              dom = 'tB',
              buttons = c('copy', 'excel')
              ),
              class = "display"
)