ELSA dissertation

# Packages whose functions are called unqualified in this script
# (read_dta/as_factor, the dplyr verbs and %>%, pivot_*, fct_drop).
# knitr is always called as knitr::kable(), so it is not attached here.
library(haven)
library(dplyr)
library(tidyr)
library(forcats)

getwd()

# Every raw ELSA Stata file lives in this folder (relative to the working
# directory printed above).
raw_dir <- "raw data/RAW_data_stata"

# Read one raw .dta file from `raw_dir`.
# Stops with the full path and the current working directory when the file is
# missing, instead of printing TRUE/FALSE and failing later inside read_dta().
read_wave <- function(file_name) {
  path <- file.path(raw_dir, file_name)
  if (!file.exists(path)) {
    stop(
      "ELSA data file not found: ", path,
      " (working directory: ", getwd(), ")",
      call. = FALSE
    )
  }
  read_dta(path)
}

w1_core <- read_wave("wave_1_core_data_v3.dta")
w1_derived <- read_wave("wave_1_ifs_derived_variables.dta")

# The join is meant to add derived columns to each core row, so the derived
# file must not repeat an idauniq (that would duplicate participants).
stopifnot(anyDuplicated(w1_derived$idauniq) == 0)

# Columns present in both files get the default ".x" (core) / ".y" (derived)
# suffixes, e.g. edqual.x / edqual.y; later code relies on those names.
w1 <- left_join(w1_core, w1_derived, by = "idauniq")

#load waves 2 to 10
w2_core  <- read_wave("wave_2_core_data_v4.dta")
w3_core  <- read_wave("wave_3_elsa_data_v4.dta")
w4_core  <- read_wave("wave_4_elsa_data_v3.dta")
w5_core  <- read_wave("wave_5_elsa_data_v4.dta")
w6_core  <- read_wave("wave_6_elsa_data_v2.dta")
w7_core  <- read_wave("wave_7_elsa_data.dta")
w8_core  <- read_wave("wave_8_elsa_data_eul_v2.dta")
w9_core  <- read_wave("wave_9_elsa_data_eul_v2.dta")
w10_core <- read_wave("wave_10_elsa_data_eul_v4.dta")

Table 1: baseline characteristics (no participants excluded)

# Candidate variables for Table 1, found by case-insensitive regex search over
# the column names of w1. The `##` lines are the console output from each search.

# Demographic / social variables
grep("age|ager|dob", names(w1), ignore.case = TRUE, value = TRUE)

##  [1] "dhdobyr"   "dhager"    "didob"     "heage"     "wprage"    "indobyr.x"
##  [7] "indager"   "aagemab"   "aagepab"   "aageangi"  "aagehart"  "aagestro" 
## [13] "aagedi"    "age"       "age_p"     "indobyr.y" "indobyr_p" "agebuhead"
## [19] "agebusp"   "agehoh"    "agehhch1"  "agehhch2"  "agehhch3"  "agehhch4" 
## [25] "agehhch5"  "agehhch6"  "agehhch7"  "agehhch8"  "agebuch1"  "agebuch2" 
## [31] "agebuch3"  "agebuch4"  "agebuch5"  "agebuch6"  "agebuch7"  "agebuch8" 
## [37] "chage1"    "chage2"    "chage3"    "chage4"    "chage5"    "chage6"   
## [43] "chage7"    "chage8"    "chage9"    "chage10"   "chage11"   "chage12"  
## [49] "chage13"   "chage14"   "chage15"   "chage16"   "ageg5"     "ageg5_bu" 
## [55] "ageg7"     "ageg7_bu"  "ageg10"    "ageg10_bu" "ageg3"     "ageg3_bu" 
## [61] "ageg3_spa" "spage"     "spage_bu"  "agehhldr1" "agehhldr2" "agehhldr3"
## [67] "agehhldr4" "mothage"   "magedied"  "fathage"   "fagedied"

grep("sex", names(w1), ignore.case = TRUE, value = TRUE)

##  [1] "dhsex"     "disex"     "indsex"    "asex"      "sex"       "sex_p"    
##  [7] "sexbuhead" "sexhoh"    "chsex1"    "chsex2"    "chsex3"    "chsex4"   
## [13] "chsex5"    "chsex6"    "chsex7"    "chsex8"    "chsex9"    "chsex10"  
## [19] "chsex11"   "chsex12"   "chsex13"   "chsex14"   "chsex15"   "chsex16"

grep("educ|qual|school", names(w1), ignore.case = TRUE, value = TRUE)

##  [1] "fqqual1"  "fqqual2"  "fqqual3"  "edqual.x" "aqual"    "aeducend"
##  [7] "edqual.y" "qual2"    "qual3"    "qual2_p"  "qual3_p"

grep("ethnic|ethn|race", names(w1), ignore.case = TRUE, value = TRUE)

## [1] "fqethnr"  "aethnicr"

grep("mar|partner|spouse|widow|single", names(w1), ignore.case = TRUE, value = TRUE)

## [1] "dimar"   "wpamar"  "partner" "marstat"

grep("employ|work|job|retir", names(w1), ignore.case = TRUE, value = TRUE)

##  [1] "difjob"         "wpjob"          "wpjobl"         "wpsjoby"       
##  [5] "wpsjobm"        "wpcjob"         "wphjob"         "iawork"        
##  [9] "hojob"          "aeverjob"       "aemploye"       "astwork"       
## [13] "hhgriddhwork"   "hhgriddhwork_p" "worktime"       "everwork"      
## [17] "exwork"         "exworkb"        "exwork55"       "exwork55b"     
## [21] "exwork60"       "exwork60b"      "exwork65"       "exwork65b"     
## [25] "difjobm"

# Lifestyle variables
grep("smok|cig", names(w1), ignore.case = TRUE, value = TRUE)

## [1] "hecig"      "smoker"     "smokerstat"

grep("alcohol|drink", names(w1), ignore.case = TRUE, value = TRUE)

## character(0)

# BMI / body size
grep("bmi|height|weight", names(w1), ignore.case = TRUE, value = TRUE)

## character(0)

# Mood / depression
grep("depress|cesd|mood", names(w1), ignore.case = TRUE, value = TRUE)

## [1] "cesd_sc" "cesd_na"

# Disease-history blocks
grep("^hedia", names(w1), ignore.case = TRUE, value = TRUE)

##  [1] "hedia01" "hedia02" "hedia03" "hedia04" "hedia05" "hedia06" "hedia07"
##  [8] "hedia08" "hedia09" "hedia10"

grep("^hedib", names(w1), ignore.case = TRUE, value = TRUE)

##  [1] "hedib01" "hedib02" "hedib03" "hedib04" "hedib05" "hedib06" "hedib07"
##  [8] "hedib08" "hedib09" "hedib10"

# age
attr(w1$dhager, "label")

## [1] "Age variable from HH grid collapsed at 90 plus"

table(w1$dhager, useNA = "ifany")[1:10]

## 
## 20 30 31 32 33 34 35 36 37 38 
##  1  2  1  1  6  3  4  6  8 15

# sex
attr(w1$dhsex, "label")

## [1] "ASK OR CODE RESPONDENT~S SEX"

table(w1$dhsex, useNA = "ifany")

## 
##    1    2 
## 5335 6764

# education
attr(w1$edqual.x, "label")

## [1] "(D) Highest Educational Qualification at ELSA W1"

table(w1$edqual.x, useNA = "ifany")

## 
##   -9   -8   -1    1    2    3    4    5    6    7 
##    6   11   18 1388 1333  764 1974  582 1015 5008

# ethnicity
attr(w1$fqethnr, "label")

## [1] "ELSA ethnic group collapsed into White and Non-white to avoid disclosure"

table(w1$fqethnr, useNA = "ifany")

## 
##   -9   -8   -1    1    2 
##   12    2 6810 5111  164

# marital status
attr(w1$marstat, "label")

## [1] "marital status - couple1 combined with dimar"

table(w1$marstat, useNA = "ifany")

## 
##    1    2    3    4    5    6 
## 8035  504  575 1951  823  211

# smoking
attr(w1$smokerstat, "label")

## [1] "smoker status (past or present)"

table(w1$smokerstat, useNA = "ifany")

## 
##   -9   -8   -2    0    1    2    3    4 
##    5   11  175 4286  674 4342  445 2161

# depression
attr(w1$cesd_sc, "label")

## [1] "number of cesd questions answered yes"

summary(w1$cesd_sc)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -2.000   0.000   1.000   1.504   2.000   8.000

attr(w1$cesd_na, "label")

## [1] "number of cesd questions answered"

table(w1$cesd_na, useNA = "ifany")

## 
##    -2    -1     0     3     5     6     7     8 
##   136   175    67     2     5    15    87 11612

# alcohol - broader search
names(w1)[grepl("alco|drink|beer|wine|spirit|units", names(w1), ignore.case = TRUE)]

## character(0)

# BMI / body size - broader search
names(w1)[grepl("bmi|body|mass|height|weight|wt|ht", names(w1), ignore.case = TRUE)]

##  [1] "wpwtx"       "wpwtx2"      "wpwtx3"      "iashty1"     "iashty2"    
##  [6] "iashty3"     "iashty4"     "hhtot"       "refreshtype" "chtype1"    
## [11] "chtype2"     "chtype3"     "chtype4"     "chtype5"     "chtype6"    
## [16] "chtype7"     "chtype8"     "chtype9"     "chtype10"    "chtype11"   
## [21] "chtype12"    "chtype13"    "chtype14"    "chtype15"    "chtype16"   
## [26] "nright"

# employment - inspect strongest candidates
attr(w1$worktime, "label")

## [1] "Working full time or part time"

table(w1$worktime, useNA = "ifany")

## 
##   -8   -1    1    2 
##  118 7476 2741 1764

attr(w1$aemploye, "label")

## [1] "HSE Feed Forward: Are you …{an employee or self-employed}"

table(w1$aemploye, useNA = "ifany")

## 
##    -1     1     2 
##   563 10101  1435

attr(w1$everwork, "label")

## [1] "ever worked"

table(w1$everwork, useNA = "ifany")

## 
##    -9    -8    -2     0     1 
##     3     1    27   217 11851

# smoking labels
attributes(w1$smokerstat)

## $label
## [1] "smoker status (past or present)"
## 
## $format.stata
## [1] "%8.0g"
## 
## $labels
##                refused             don't know              not asked 
##                     -9                     -8                     -2 
##           never smoked ex smoker - occasional    ex smoker - regular 
##                      0                      1                      2 
##    ex smoker - DK freq         current smoker 
##                      3                      4 
## 
## $class
## [1] "haven_labelled" "vctrs_vctr"     "double"

# education labels
attributes(w1$edqual.x)

## $label
## [1] "(D) Highest Educational Qualification at ELSA W1"
## 
## $format.stata
## [1] "%8.0g"
## 
## $labels
##                    Refusal                 Don't know 
##                         -9                         -8 
##             Not applicable  NVQ4/NVQ5/Degree or equiv 
##                         -1                          1 
##     Higher ed below degree     NVQ3/GCE A Level equiv 
##                          2                          3 
##     NVQ2/GCE O Level equiv NVQ1/CSE other grade equiv 
##                          4                          5 
##              Foreign/other           No qualification 
##                          6                          7 
## 
## $class
## [1] "haven_labelled" "vctrs_vctr"     "double"

# ethnicity labels
attributes(w1$fqethnr)

## $label
## [1] "ELSA ethnic group collapsed into White and Non-white to avoid disclosure"
## 
## $format.stata
## [1] "%8.0g"
## 
## $labels
##        Refusal     Don't know Not applicable          White      Non-white 
##             -9             -8             -1              1              2 
## 
## $class
## [1] "haven_labelled" "vctrs_vctr"     "double"

# marital status labels
attributes(w1$marstat)

## $label
## [1] "marital status - couple1 combined with dimar"
## 
## $format.stata
## [1] "%8.0g"
## 
## $labels
## married (inc civ pship 06 onwards)                         cohabiting 
##                                  1                                  2 
##              single, never married                            widowed 
##                                  3                                  4 
##                           divorced                          separated 
##                                  5                                  6 
## 
## $class
## [1] "haven_labelled" "vctrs_vctr"     "double"

# alcohol - wider search
names(w1)[grepl("drnk|drink|alc|wine|beer|spirit|unit|pub", names(w1), ignore.case = TRUE)]

## [1] "healc"

# possible nurse / anthropometry style names for BMI
names(w1)[grepl("bm", names(w1), ignore.case = TRUE)]

##  [1] "wpsjobm" "iabm11"  "iabm12"  "iabm13"  "iabm14"  "iabm15"  "iabm16" 
##  [8] "iabm17"  "iabm18"  "iabm19"  "iabm20"  "iabm31"  "iabm32"  "iabm33" 
## [15] "iabm34"  "iabm35"  "iabm36"  "iabm37"  "iabm38"  "iabm39"  "iabm40" 
## [22] "iabm48"  "iabm49"  "iabm50"  "iabm51"  "iabm52"  "iabm53"  "iabm54" 
## [29] "iabm62"  "iabm63"  "iabm64"  "iabm65"  "iabm66"  "iabm67"  "iabm68" 
## [36] "iadebm"  "hobml"   "hobmu"   "hobme"   "hobmr"   "hohbm1"  "hohbm2" 
## [43] "hohbm3"  "difjobm"

names(w1)[grepl("ht", names(w1), ignore.case = TRUE)]

##  [1] "iashty1"     "iashty2"     "iashty3"     "iashty4"     "hhtot"      
##  [6] "refreshtype" "chtype1"     "chtype2"     "chtype3"     "chtype4"    
## [11] "chtype5"     "chtype6"     "chtype7"     "chtype8"     "chtype9"    
## [16] "chtype10"    "chtype11"    "chtype12"    "chtype13"    "chtype14"   
## [21] "chtype15"    "chtype16"    "nright"

names(w1)[grepl("wt", names(w1), ignore.case = TRUE)]

## [1] "wpwtx"  "wpwtx2" "wpwtx3"

# inspect the most plausible body-size candidates already found
attr(w1$wpwtx, "label")

## [1] "Is this before or after tax?"

table(w1$wpwtx, useNA = "ifany")

## 
##    -9    -8    -1     1     2 
##    22    34 11235   172   636

attr(w1$wpwtx2, "label")

## [1] "Is this before or after tax?"

table(w1$wpwtx2, useNA = "ifany")

## 
##    -9    -8    -1     1     2 
##     3     2 12025    16    53

attr(w1$wpwtx3, "label")

## [1] "Is this before or after tax?"

table(w1$wpwtx3, useNA = "ifany")

## 
##    -1     2 
## 12092     7

# alcohol
# healc is the only name matched by the alcohol searches. Its variable label and
# value labels (a lot more / a bit more / a bit less / a lot less) describe a
# change in drinking, and 10442 rows are coded -1 in the table below, so it does
# not look usable as a baseline alcohol-consumption measure.
# TODO confirm against the wave 1 questionnaire.
attr(w1$healc, "label")

## [1] "Do you now drink …? {a lot more..}"

table(w1$healc, useNA = "ifany")

## 
##    -8    -1     1     2     3     4 
##     3 10442    41   349   582   682

# Full attribute dump to see the value labels behind the codes tabulated above.
attributes(w1$healc)

## $label
## [1] "Do you now drink …? {a lot more..}"
## 
## $format.stata
## [1] "%8.0g"
## 
## $labels
##         Refusal      Don't Know  Not applicable ... a lot more,     a bit more, 
##              -9              -8              -1               1               2 
##     a bit less, or, a lot less? 
##               3               4 
## 
## $class
## [1] "haven_labelled" "vctrs_vctr"     "double"

# better search for height/weight/BMI
names(w1)[grepl("hei|highm|cm|metre|meter", names(w1), ignore.case = TRUE)]

##  [1] "heill"       "heins"       "heiqa"       "heiqb"       "heiqc"      
##  [6] "heiqd"       "heiqe"       "heiqf"       "heiqg"       "heiqh"      
## [11] "heiqi"       "heiqj"       "heiqk"       "heiql"       "heiqm"      
## [16] "heiqn"       "heiqo"       "heiqp"       "heiqq"       "heinct"     
## [21] "wplrcm"      "wplrcm2"     "hoincm1"     "hoincm2"     "hoincm3"    
## [26] "hoincm4"     "horpcm"      "gaselecmeth" "elecmeth"    "rentincme"

names(w1)[grepl("weigh|kilo|kg|stone|pound", names(w1), ignore.case = TRUE)]

## character(0)

names(w1)[grepl("bmi", names(w1), ignore.case = TRUE)]

## character(0)

# create disease indicator variables for Table 1
#
# Each diagnosis is recorded as a code in one of ten multi-response slots
# (hedia01-hedia10 and hedib01-hedib10). A participant is flagged 1 when the
# diagnosis code appears in any slot of the relevant block, otherwise 0.
# NOTE(review): any non-matching value, including negative codes if the slots
# carry them, is counted as 0 (diagnosis absent), not as missing. Confirm that
# is the intended handling.
hedia_slots <- sprintf("hedia%02d", 1:10)
hedib_slots <- sprintf("hedib%02d", 1:10)

# Return 1 where `code` appears in any of the `slots` columns of `data`, else 0.
# Slots are combined with `|`, so a row is NA only when no slot matches and at
# least one slot is NA (same semantics as chaining `==` comparisons with `|`).
flag_diagnosis <- function(data, slots, code) {
  mentioned <- Reduce(`|`, lapply(data[slots], function(slot) slot == code))
  if_else(mentioned, 1, 0)
}

w1 <- w1 %>%
  mutate(
    # hedia block
    htn_w1        = flag_diagnosis(., hedia_slots, 1),
    hf_w1         = flag_diagnosis(., hedia_slots, 4),
    arrhythmia_w1 = flag_diagnosis(., hedia_slots, 6),
    diabetes_w1   = flag_diagnosis(., hedia_slots, 7),
    stroke_w1     = flag_diagnosis(., hedia_slots, 8),
    # hedib block
    parkinsons_w1 = flag_diagnosis(., hedib_slots, 6),
    alz_w1        = flag_diagnosis(., hedib_slots, 8),
    dementia_w1   = flag_diagnosis(., hedib_slots, 9)
  )
# check the new disease variables
table(w1$htn_w1, useNA = "ifany")

## 
##    0    1 
## 7627 4472

table(w1$hf_w1, useNA = "ifany")

## 
##     0     1 
## 12017    82

table(w1$arrhythmia_w1, useNA = "ifany")

## 
##     0     1 
## 11367   732

table(w1$diabetes_w1, useNA = "ifany")

## 
##     0     1 
## 11233   866

table(w1$stroke_w1, useNA = "ifany")

## 
##     0     1 
## 11588   511

table(w1$parkinsons_w1, useNA = "ifany")

## 
##     0     1 
## 12043    56

table(w1$alz_w1, useNA = "ifany")

## 
##     0     1 
## 12085    14

table(w1$dementia_w1, useNA = "ifany")

## 
##     0     1 
## 12035    64

# keep a clean draft Table 1 dataset - no exclusions
# One row per w1 row; only the columns listed here are kept (transmute drops
# the rest). This first draft is rebuilt further down, once the physical
# activity level and IQCODE variables exist.
table1_w1 <- w1 %>%
  transmute(
    idauniq,
    # dhager is labelled "collapsed at 90 plus" and its summary maximum is 99,
    # so means/SDs of age treat the top code as a literal age.
    # TODO confirm how 90+ should be handled.
    age = dhager,
    # as_factor() turns the Stata value labels into factor levels; negative
    # codes (Refusal / Don't know / Not applicable) stay as ordinary levels.
    sex = as_factor(dhsex),
    education = as_factor(edqual.x),
    ethnicity = as_factor(fqethnr),
    marital_status = as_factor(marstat),
    employment = as_factor(worktime),
    smoking = as_factor(smokerstat),
    # Raw score: still contains negative codes at this point (summary min is -2).
    depression_score = cesd_sc,
    # 0/1 disease indicators created above.
    hypertension = htn_w1,
    heart_failure = hf_w1,
    abnormal_heart_rhythm = arrhythmia_w1,
    diabetes = diabetes_w1,
    stroke = stroke_w1,
    baseline_parkinsons = parkinsons_w1,
    baseline_alzheimers = alz_w1,
    baseline_dementia = dementia_w1
  )
# quick checks
dim(table1_w1)

## [1] 12099    17

summary(table1_w1$age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20.00   55.00   63.00   64.19   72.00   99.00

table(table1_w1$sex, useNA = "ifany")

## 
##        Refusal     Don't Know Not applicable           Male         Female 
##              0              0              0           5335           6764

table(table1_w1$education, useNA = "ifany")

## 
##                    Refusal                 Don't know 
##                          6                         11 
##             Not applicable  NVQ4/NVQ5/Degree or equiv 
##                         18                       1388 
##     Higher ed below degree     NVQ3/GCE A Level equiv 
##                       1333                        764 
##     NVQ2/GCE O Level equiv NVQ1/CSE other grade equiv 
##                       1974                        582 
##              Foreign/other           No qualification 
##                       1015                       5008

table(table1_w1$ethnicity, useNA = "ifany")

## 
##        Refusal     Don't know Not applicable          White      Non-white 
##             12              2           6810           5111            164

table(table1_w1$marital_status, useNA = "ifany")

## 
## married (inc civ pship 06 onwards)                         cohabiting 
##                               8035                                504 
##              single, never married                            widowed 
##                                575                               1951 
##                           divorced                          separated 
##                                823                                211

table(table1_w1$employment, useNA = "ifany")

## 
##          unknown      Not working Full time (>=35)        Part time 
##              118             7476             2741             1764

table(table1_w1$smoking, useNA = "ifany")

## 
##                refused             don't know              not asked 
##                      5                     11                    175 
##           never smoked ex smoker - occasional    ex smoker - regular 
##                   4286                    674                   4342 
##    ex smoker - DK freq         current smoker 
##                    445                   2161

summary(table1_w1$depression_score)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -2.000   0.000   1.000   1.504   2.000   8.000

table(table1_w1$hypertension, useNA = "ifany")

## 
##    0    1 
## 7627 4472

table(table1_w1$heart_failure, useNA = "ifany")

## 
##     0     1 
## 12017    82

table(table1_w1$abnormal_heart_rhythm, useNA = "ifany")

## 
##     0     1 
## 11367   732

table(table1_w1$diabetes, useNA = "ifany")

## 
##     0     1 
## 11233   866

table(table1_w1$stroke, useNA = "ifany")

## 
##     0     1 
## 11588   511

table(table1_w1$baseline_alzheimers, useNA = "ifany")

## 
##     0     1 
## 12085    14

table(table1_w1$baseline_dementia, useNA = "ifany")

## 
##     0     1 
## 12035    64

# Physical activity level at wave 1, derived from heacta / heactb / heactc.
# case_when() returns the first matching rule, so each rule is only reached
# when the rules above it did not match:
#   High      - heacta is 1 or 2
#   Moderate  - otherwise, heactb is 1 or 2
#   Low       - otherwise, heactc is 1 or 2
#   Sedentary - all three items are 3 or 4
#   NA        - anything else
# NOTE(review): presumably heacta/b/c are the vigorous/moderate/mild activity
# items and 1-2 mean at least weekly; verify against the questionnaire.
w1 <- mutate(
  w1,
  w1_palevel = case_when(
    heacta %in% c(1, 2) ~ "High",
    heactb %in% c(1, 2) ~ "Moderate",
    heactc %in% c(1, 2) ~ "Low",
    heacta %in% c(3, 4) & heactb %in% c(3, 4) & heactc %in% c(3, 4) ~ "Sedentary",
    TRUE ~ NA_character_
  )
)

table(w1$w1_palevel, useNA = "ifany")

## 
##      High       Low  Moderate Sedentary      <NA> 
##      3302      1756      5607      1240       194

# WAVE 1 IQCODE CREATION
# The 16 items used are heiqa ... heiqp.
iqcode_vars <- paste0("heiq", letters[1:16])

# Step 1: negative codes become NA and the items become plain numerics.
w1 <- w1 %>%
  mutate(
    across(all_of(iqcode_vars), ~ if_else(.x < 0, NA_real_, as.numeric(.x)))
  )

# Step 2: score all rows at once on a numeric matrix of the items
# (vectorised replacement for a per-row rowwise()/c_across() calculation).
iqcode_items <- as.matrix(w1[iqcode_vars])

w1 <- w1 %>%
  mutate(
    # Number of items with a valid (non-negative) answer.
    iqcode_n_answered = as.integer(rowSums(!is.na(iqcode_items))),
    # Mean of the answered items, only when at least 12 of 16 were answered.
    iqcode_mean = if_else(
      iqcode_n_answered >= 12,
      rowMeans(iqcode_items, na.rm = TRUE),
      NA_real_
    ),
    # 1 when a valid IQCODE mean is >= 3.38, otherwise 0.
    # The condition is never NA, so rows without a valid score (11927 NAs in
    # the summary of iqcode_mean) are coded 0, not NA.
    # NOTE(review): confirm 0 (rather than NA) is intended for participants
    # with no valid IQCODE.
    baseline_pathological_cognitive_decline = if_else(
      !is.na(iqcode_mean) & iqcode_mean >= 3.38,
      1, 0
    )
  )

rm(iqcode_items)

summary(w1$iqcode_mean)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   2.875   3.000   3.261   3.606   4.750   11927

table(w1$baseline_pathological_cognitive_decline, useNA = "ifany")

## 
##     0     1 
## 12047    52

table(w1$iqcode_n_answered, useNA = "ifany")

## 
##     0     1     6     8    14    15    16 
## 11924     1     1     1     5     4   163

# add physical activity group to the draft Table 1 dataset
# Rebuilds table1_w1 from w1: pick and rename the analysis columns first, then
# convert the labelled categorical columns to factors in one pass.
table1_w1 <- w1 %>%
  select(
    idauniq,
    pa_level = w1_palevel,
    age = dhager,
    sex = dhsex,
    education = edqual.x,
    ethnicity = fqethnr,
    marital_status = marstat,
    employment = worktime,
    smoking = smokerstat,
    depression_score = cesd_sc,
    hypertension = htn_w1,
    heart_failure = hf_w1,
    abnormal_heart_rhythm = arrhythmia_w1,
    diabetes = diabetes_w1,
    stroke = stroke_w1,
    baseline_alzheimers = alz_w1,
    baseline_dementia = dementia_w1,
    baseline_parkinsons = parkinsons_w1,
    iqcode_mean,
    baseline_pathological_cognitive_decline
  ) %>%
  mutate(
    across(
      c(sex, education, ethnicity, marital_status, employment, smoking),
      as_factor
    )
  )

# check PA distribution with no exclusions
table(table1_w1$pa_level, useNA = "ifany")

## 
##      High       Low  Moderate Sedentary      <NA> 
##      3302      1756      5607      1240       194

table(table1_w1$baseline_parkinsons, useNA = "ifany")

## 
##     0     1 
## 12043    56

# mean age by PA level
table1_w1 %>%
  group_by(pa_level) %>%
  summarise(
    n = n(),
    mean_age = mean(age, na.rm = TRUE),
    sd_age = sd(age, na.rm = TRUE)
  )

## # A tibble: 5 × 4
##   pa_level      n mean_age sd_age
##   <chr>     <int>    <dbl>  <dbl>
## 1 High       3302     60.6   9.22
## 2 Low        1756     67.4  12.0 
## 3 Moderate   5607     63.6  10.4 
## 4 Sedentary  1240     70.8  12.2 
## 5 <NA>        194     69.3  15.0

# sex by PA level
table(table1_w1$pa_level, table1_w1$sex, useNA = "ifany")

##            
##             Refusal Don't Know Not applicable Male Female
##   High            0          0              0 1593   1709
##   Low             0          0              0  552   1204
##   Moderate        0          0              0 2507   3100
##   Sedentary       0          0              0  590    650
##   <NA>            0          0              0   93    101

# smoking by PA level
table(table1_w1$pa_level, table1_w1$smoking, useNA = "ifany")

##            
##             refused don't know not asked never smoked ex smoker - occasional
##   High            0          0         0         1309                    194
##   Low             0          1         0          587                     89
##   Moderate        0          1         0         1991                    343
##   Sedentary       0          0         0          395                     48
##   <NA>            5          9       175            4                      0
##            
##             ex smoker - regular ex smoker - DK freq current smoker
##   High                     1200                 147            452
##   Low                       621                  62            396
##   Moderate                 2022                 182           1068
##   Sedentary                 499                  54            244
##   <NA>                        0                   0              1

# disease variables by PA level
table(table1_w1$pa_level, table1_w1$hypertension, useNA = "ifany")

##            
##                0    1
##   High      2338  964
##   Low        960  796
##   Moderate  3533 2074
##   Sedentary  662  578
##   <NA>       134   60

table(table1_w1$pa_level, table1_w1$diabetes, useNA = "ifany")

##            
##                0    1
##   High      3172  130
##   Low       1572  184
##   Moderate  5240  367
##   Sedentary 1074  166
##   <NA>       175   19

table(table1_w1$pa_level, table1_w1$stroke, useNA = "ifany")

##            
##                0    1
##   High      3247   55
##   Low       1655  101
##   Moderate  5434  173
##   Sedentary 1088  152
##   <NA>       164   30

table(table1_w1$pa_level, table1_w1$abnormal_heart_rhythm, useNA = "ifany")

##            
##                0    1
##   High      3142  160
##   Low       1634  122
##   Moderate  5286  321
##   Sedentary 1128  112
##   <NA>       177   17

table(table1_w1$pa_level, table1_w1$heart_failure, useNA = "ifany")

##            
##                0    1
##   High      3296    6
##   Low       1730   26
##   Moderate  5587   20
##   Sedentary 1215   25
##   <NA>       189    5

table(table1_w1$pa_level, table1_w1$baseline_alzheimers, useNA = "ifany")

##            
##                0    1
##   High      3302    0
##   Low       1756    0
##   Moderate  5606    1
##   Sedentary 1233    7
##   <NA>       188    6

table(table1_w1$pa_level, table1_w1$baseline_dementia, useNA = "ifany")

##            
##                0    1
##   High      3297    5
##   Low       1748    8
##   Moderate  5595   12
##   Sedentary 1221   19
##   <NA>       174   20

# Cleaned copy of the Table 1 data:
#   - pa_level becomes a factor ordered High > Moderate > Low > Sedentary
#   - unused sex levels are dropped
#   - smoking is collapsed to Never / Ex-smoker / Current (anything else -> NA)
#   - ethnicity is reduced to White / Non-white (anything else -> NA)
#   - negative depression scores become NA
# The recodes index a named lookup vector by the factor label; labels that are
# not in the lookup (and NA) come back as NA.
table1_w1_clean <- table1_w1 %>%
  mutate(
    pa_level = factor(pa_level, levels = c("High", "Moderate", "Low", "Sedentary")),

    sex = fct_drop(sex),

    smoking3 = unname(
      c(
        "never smoked" = "Never",
        "ex smoker - occasional" = "Ex-smoker",
        "ex smoker - regular" = "Ex-smoker",
        "ex smoker - DK freq" = "Ex-smoker",
        "current smoker" = "Current"
      )[as.character(smoking)]
    ),

    ethnicity2 = unname(
      c("White" = "White", "Non-white" = "Non-white")[as.character(ethnicity)]
    ),

    depression_score = replace(
      as.numeric(depression_score),
      which(depression_score < 0),
      NA_real_
    )
  )

# check cleaned variables
table(table1_w1_clean$pa_level, useNA = "ifany")

## 
##      High  Moderate       Low Sedentary      <NA> 
##      3302      5607      1756      1240       194

table(table1_w1_clean$smoking3, useNA = "ifany")

## 
##   Current Ex-smoker     Never      <NA> 
##      2161      5461      4286       191

table(table1_w1_clean$ethnicity2, useNA = "ifany")

## 
## Non-white     White      <NA> 
##       164      5111      6824

summary(table1_w1_clean$depression_score)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   0.000   1.000   1.582   2.000   8.000     311

# Format a logical vector as "n (p%)": n is the number of TRUE values and the
# denominator is the full length of the vector, so rows where the flag is NA
# stay in the denominator.
fmt_n_pct <- function(flag) {
  hits <- sum(flag, na.rm = TRUE)
  sprintf("%d (%.1f%%)", hits, 100 * hits / length(flag))
}

# Format a numeric vector as "mean (SD)" with `digits` decimals, ignoring NAs.
fmt_mean_sd <- function(x, digits = 1) {
  template <- sprintf("%%.%df (%%.%df)", digits, digits)
  sprintf(template, mean(x, na.rm = TRUE), sd(x, na.rm = TRUE))
}

# One row per physical activity group (missing PA shown as its own
# "Missing PA" group), one formatted text column per characteristic.
# Percentages use the whole group size as denominator, including participants
# whose value for that characteristic is missing.
# NOTE(review): forcats::fct_explicit_na() is deprecated from forcats 1.0.0 in
# favour of fct_na_value_to_level(); kept as-is here to avoid requiring a newer
# forcats.
table1_summary <- table1_w1_clean %>%
  mutate(
    pa_level = forcats::fct_explicit_na(pa_level, na_level = "Missing PA")
  ) %>%
  group_by(pa_level, .drop = FALSE) %>%
  summarise(
    N = n(),
    `Age, mean (SD)` = fmt_mean_sd(age, digits = 1),
    `Female, n (%)` = fmt_n_pct(sex == "Female"),
    `No qualification, n (%)` = fmt_n_pct(education == "No qualification"),
    `Married/cohabiting, n (%)` = fmt_n_pct(
      marital_status %in% c("married (inc civ pship 06 onwards)", "cohabiting")
    ),
    `Working full/part time, n (%)` = fmt_n_pct(
      employment %in% c("Full time (>=35)", "Part time")
    ),
    `Current smoker, n (%)` = fmt_n_pct(smoking3 == "Current"),
    `Depression score, mean (SD)` = fmt_mean_sd(depression_score, digits = 2),
    `Hypertension, n (%)` = fmt_n_pct(hypertension == 1),
    `Diabetes, n (%)` = fmt_n_pct(diabetes == 1),
    `Stroke, n (%)` = fmt_n_pct(stroke == 1),
    `Abnormal heart rhythm, n (%)` = fmt_n_pct(abnormal_heart_rhythm == 1),
    `Heart failure, n (%)` = fmt_n_pct(heart_failure == 1),
    `Baseline Parkinson's disease, n (%)` = fmt_n_pct(baseline_parkinsons == 1),
    `Baseline Alzheimer’s, n (%)` = fmt_n_pct(baseline_alzheimers == 1),
    `Baseline dementia, n (%)` = fmt_n_pct(baseline_dementia == 1)
  )

knitr::kable(
  table1_summary,
  caption = "Draft Table 1. Baseline characteristics by physical activity group, wave 1."
)

Draft Table 1. Baseline characteristics by physical activity group, wave 1.
pa_level	N	Age, mean (SD)	Female, n (%)	No qualification, n (%)	Married/cohabiting, n (%)	Working full/part time, n (%)	Current smoker, n (%)	Depression score, mean (SD)	Hypertension, n (%)	Diabetes, n (%)	Stroke, n (%)	Abnormal heart rhythm, n (%)	Heart failure, n (%)	Baseline Parkinson’s disease, n (%)	Baseline Alzheimer’s, n (%)	Baseline dementia, n (%)
High	3302	60.6 (9.2)	1709 (51.8%)	935 (28.3%)	2582 (78.2%)	1753 (53.1%)	452 (13.7%)	1.06 (1.61)	964 (29.2%)	130 (3.9%)	55 (1.7%)	160 (4.8%)	6 (0.2%)	10 (0.3%)	0 (0.0%)	5 (0.2%)
Moderate	5607	63.6 (10.4)	3100 (55.3%)	2159 (38.5%)	4067 (72.5%)	2200 (39.2%)	1068 (19.0%)	1.41 (1.88)	2074 (37.0%)	367 (6.5%)	173 (3.1%)	321 (5.7%)	20 (0.4%)	15 (0.3%)	1 (0.0%)	12 (0.2%)
Low	1756	67.4 (12.0)	1204 (68.6%)	1051 (59.9%)	1049 (59.7%)	374 (21.3%)	396 (22.6%)	2.32 (2.22)	796 (45.3%)	184 (10.5%)	101 (5.8%)	122 (6.9%)	26 (1.5%)	15 (0.9%)	0 (0.0%)	8 (0.5%)
Sedentary	1240	70.8 (12.2)	650 (52.4%)	766 (61.8%)	705 (56.9%)	176 (14.2%)	244 (19.7%)	2.73 (2.32)	578 (46.6%)	166 (13.4%)	152 (12.3%)	112 (9.0%)	25 (2.0%)	12 (1.0%)	7 (0.6%)	19 (1.5%)
Missing PA	194	69.3 (15.0)	101 (52.1%)	97 (50.0%)	136 (70.1%)	2 (1.0%)	1 (0.5%)	0.25 (0.62)	60 (30.9%)	19 (9.8%)	30 (15.5%)	17 (8.8%)	5 (2.6%)	4 (2.1%)	6 (3.1%)	20 (10.3%)

# Final long-format Table 1 (one row per PA group). N is coerced to integer so
# it prints without decimals; it comes from n() in the summary step.
table1_summary_final <- table1_summary %>%
  mutate(N = as.integer(N))

# Render the table with its final caption.
knitr::kable(
  table1_summary_final,
  caption = "Table 1. Baseline characteristics by physical activity group at wave 1. No participants were excluded at this stage."
)

Table 1. Baseline characteristics by physical activity group at wave 1. No participants were excluded at this stage.
pa_level	N	Age, mean (SD)	Female, n (%)	No qualification, n (%)	Married/cohabiting, n (%)	Working full/part time, n (%)	Current smoker, n (%)	Depression score, mean (SD)	Hypertension, n (%)	Diabetes, n (%)	Stroke, n (%)	Abnormal heart rhythm, n (%)	Heart failure, n (%)	Baseline Parkinson’s disease, n (%)	Baseline Alzheimer’s, n (%)	Baseline dementia, n (%)
High	3302	60.6 (9.2)	1709 (51.8%)	935 (28.3%)	2582 (78.2%)	1753 (53.1%)	452 (13.7%)	1.06 (1.61)	964 (29.2%)	130 (3.9%)	55 (1.7%)	160 (4.8%)	6 (0.2%)	10 (0.3%)	0 (0.0%)	5 (0.2%)
Moderate	5607	63.6 (10.4)	3100 (55.3%)	2159 (38.5%)	4067 (72.5%)	2200 (39.2%)	1068 (19.0%)	1.41 (1.88)	2074 (37.0%)	367 (6.5%)	173 (3.1%)	321 (5.7%)	20 (0.4%)	15 (0.3%)	1 (0.0%)	12 (0.2%)
Low	1756	67.4 (12.0)	1204 (68.6%)	1051 (59.9%)	1049 (59.7%)	374 (21.3%)	396 (22.6%)	2.32 (2.22)	796 (45.3%)	184 (10.5%)	101 (5.8%)	122 (6.9%)	26 (1.5%)	15 (0.9%)	0 (0.0%)	8 (0.5%)
Sedentary	1240	70.8 (12.2)	650 (52.4%)	766 (61.8%)	705 (56.9%)	176 (14.2%)	244 (19.7%)	2.73 (2.32)	578 (46.6%)	166 (13.4%)	152 (12.3%)	112 (9.0%)	25 (2.0%)	12 (1.0%)	7 (0.6%)	19 (1.5%)
Missing PA	194	69.3 (15.0)	101 (52.1%)	97 (50.0%)	136 (70.1%)	2 (1.0%)	1 (0.5%)	0.25 (0.62)	60 (30.9%)	19 (9.8%)	30 (15.5%)	17 (8.8%)	5 (2.6%)	4 (2.1%)	6 (3.1%)	20 (10.3%)

# Transpose Table 1 so that characteristics are rows and PA groups are columns.
table1_summary_wide <- table1_summary_final %>%
  # N must be character so it can share one value column with the formatted
  # "n (%)" / "mean (SD)" strings when pivoting longer.
  mutate(N = as.character(N)) %>%
  # Long form: one row per (PA group, characteristic).
  pivot_longer(
    cols = -pa_level,
    names_to = "Characteristic",
    values_to = "Value"
  ) %>%
  # Wide form: one column per PA group.
  pivot_wider(
    names_from = pa_level,
    values_from = Value
  ) %>%
  # Fix the column order explicitly.
  select(
    Characteristic,
    High,
    Moderate,
    Low,
    Sedentary,
    `Missing PA`
  )

knitr::kable(
  table1_summary_wide,
  caption = "Table 1. Baseline characteristics at wave 1 by physical activity category. No participants were excluded at this stage."
)

Table 1. Baseline characteristics at wave 1 by physical activity category. No participants were excluded at this stage.
Characteristic	High	Moderate	Low	Sedentary	Missing PA
N	3302	5607	1756	1240	194
Age, mean (SD)	60.6 (9.2)	63.6 (10.4)	67.4 (12.0)	70.8 (12.2)	69.3 (15.0)
Female, n (%)	1709 (51.8%)	3100 (55.3%)	1204 (68.6%)	650 (52.4%)	101 (52.1%)
No qualification, n (%)	935 (28.3%)	2159 (38.5%)	1051 (59.9%)	766 (61.8%)	97 (50.0%)
Married/cohabiting, n (%)	2582 (78.2%)	4067 (72.5%)	1049 (59.7%)	705 (56.9%)	136 (70.1%)
Working full/part time, n (%)	1753 (53.1%)	2200 (39.2%)	374 (21.3%)	176 (14.2%)	2 (1.0%)
Current smoker, n (%)	452 (13.7%)	1068 (19.0%)	396 (22.6%)	244 (19.7%)	1 (0.5%)
Depression score, mean (SD)	1.06 (1.61)	1.41 (1.88)	2.32 (2.22)	2.73 (2.32)	0.25 (0.62)
Hypertension, n (%)	964 (29.2%)	2074 (37.0%)	796 (45.3%)	578 (46.6%)	60 (30.9%)
Diabetes, n (%)	130 (3.9%)	367 (6.5%)	184 (10.5%)	166 (13.4%)	19 (9.8%)
Stroke, n (%)	55 (1.7%)	173 (3.1%)	101 (5.8%)	152 (12.3%)	30 (15.5%)
Abnormal heart rhythm, n (%)	160 (4.8%)	321 (5.7%)	122 (6.9%)	112 (9.0%)	17 (8.8%)
Heart failure, n (%)	6 (0.2%)	20 (0.4%)	26 (1.5%)	25 (2.0%)	5 (2.6%)
Baseline Parkinson’s disease, n (%)	10 (0.3%)	15 (0.3%)	15 (0.9%)	12 (1.0%)	4 (2.1%)
Baseline Alzheimer’s, n (%)	0 (0.0%)	1 (0.0%)	0 (0.0%)	7 (0.6%)	6 (3.1%)
Baseline dementia, n (%)	5 (0.2%)	12 (0.2%)	8 (0.5%)	19 (1.5%)	20 (10.3%)

# Wave 1 counts of the Alzheimer's and dementia flags: each on its own,
# both together, and at least one of the two.
w1 %>%
  mutate(
    has_alz = alz_w1 == 1,
    has_dem = dementia_w1 == 1
  ) %>%
  summarise(
    alz_n = sum(has_alz, na.rm = TRUE),
    dem_n = sum(has_dem, na.rm = TRUE),
    both_n = sum(has_alz & has_dem, na.rm = TRUE),
    either_n = sum(has_alz | has_dem, na.rm = TRUE)
  )

## # A tibble: 1 × 4
##   alz_n dem_n both_n either_n
##   <int> <int>  <int>    <int>
## 1    14    64      3       75

# Participants flagged with both Alzheimer's disease and dementia at wave 1
both_alz_dem <- filter(
  table1_w1_clean,
  baseline_alzheimers == 1 & baseline_dementia == 1
)

# Number of participants carrying both baseline flags
nrow(both_alz_dem)

## [1] 3

# Physical activity category of the participants with both flags;
# .drop = FALSE keeps categories with zero participants in the output.
count(both_alz_dem, pa_level, .drop = FALSE)

## # A tibble: 5 × 2
##   pa_level      n
##   <fct>     <int>
## 1 High          0
## 2 Moderate      0
## 3 Low           0
## 4 Sedentary     2
## 5 <NA>          1

# IDs of the participants with both Alzheimer's and dementia at baseline (wave 1)
select(
  both_alz_dem,
  idauniq, pa_level, baseline_alzheimers, baseline_dementia
)

## # A tibble: 3 × 4
##   idauniq pa_level  baseline_alzheimers baseline_dementia
##     <dbl> <fct>                   <dbl>             <dbl>
## 1  106735 <NA>                        1                 1
## 2  108547 Sedentary                   1                 1
## 3  119099 Sedentary                   1                 1

# Dementia follow-up coding: one two-column table (idauniq, dem_wN) per wave.
# Each flag is 1 when the wave's diagnosis item equals the dementia code,
# 0 for any other value, and NA when the comparison itself is NA.
# No flag is built for wave 10 in this section.

# Wave 2: code 9 in any of the four hedib01-hedib04 items
w2_dem <- w2_core %>%
  mutate(
    dem_w2 = as.numeric(
      hedib01 == 9 | hedib02 == 9 | hedib03 == 9 | hedib04 == 9
    )
  ) %>%
  select(idauniq, dem_w2)

# Wave 3: hedibde equal to 3
# NOTE(review): the wave 3 Table 1 code in this file counts hedibde values
# 1, 2 and 3 as dementia, whereas only 3 is used here - confirm which coding
# is intended against the wave 3 data dictionary.
w3_dem <- w3_core %>%
  mutate(dem_w3 = as.numeric(hedibde == 3)) %>%
  select(idauniq, dem_w3)

# Waves 4-9: hedibde equal to 1
w4_dem <- w4_core %>%
  mutate(dem_w4 = as.numeric(hedibde == 1)) %>%
  select(idauniq, dem_w4)
w5_dem <- w5_core %>%
  mutate(dem_w5 = as.numeric(hedibde == 1)) %>%
  select(idauniq, dem_w5)
w6_dem <- w6_core %>%
  mutate(dem_w6 = as.numeric(hedibde == 1)) %>%
  select(idauniq, dem_w6)
w7_dem <- w7_core %>%
  mutate(dem_w7 = as.numeric(hedibde == 1)) %>%
  select(idauniq, dem_w7)
w8_dem <- w8_core %>%
  mutate(dem_w8 = as.numeric(hedibde == 1)) %>%
  select(idauniq, dem_w8)
w9_dem <- w9_core %>%
  mutate(dem_w9 = as.numeric(hedibde == 1)) %>%
  select(idauniq, dem_w9)

# Follow-up merge: attach the wave 2-9 dementia flags to the wave 1 baseline
# table. Left joins keep every baseline row whether or not the participant
# appears in a later wave.
followup_w1 <- table1_w1_clean %>%
  mutate(
    across(
      c(
        baseline_alzheimers,
        baseline_dementia,
        baseline_parkinsons,
        iqcode_mean,
        baseline_pathological_cognitive_decline
      ),
      as.numeric
    )
  ) %>%
  left_join(w2_dem, by = "idauniq") %>%
  left_join(w3_dem, by = "idauniq") %>%
  left_join(w4_dem, by = "idauniq") %>%
  left_join(w5_dem, by = "idauniq") %>%
  left_join(w6_dem, by = "idauniq") %>%
  left_join(w7_dem, by = "idauniq") %>%
  left_join(w8_dem, by = "idauniq") %>%
  left_join(w9_dem, by = "idauniq") %>%
  # An NA flag (no matching row in that wave, or an NA comparison when the
  # flag was built) is set to 0, i.e. treated as no dementia report.
  mutate(across(starts_with("dem_w"), function(flag) replace_na(flag, 0)))

# Rows and columns of the merged follow-up table
dim(followup_w1)

## [1] 12099    30

# Number of wave 1 participants who met the IQCODE pathological decline threshold.
# count() already reports NA as its own row, so no extra argument is needed.
# useNA = "ifany" is an argument of base table(), not of count(); passing it
# made count() group by an extra constant column named useNA.
w1 %>%
  count(baseline_pathological_cognitive_decline)

## # A tibble: 2 × 3
##   baseline_pathological_cognitive_decline useNA     n
##                                     <dbl> <chr> <int>
## 1                                       0 ifany 12047
## 2                                       1 ifany    52

# After the other exclusions (prevalent Alzheimer's/dementia, Parkinson's,
# missing PA level), are any IQCODE pathological-decline participants left?
followup_w1 %>%
  mutate(
    prevalent_dem_alz_w1 = if_else(baseline_alzheimers == 1 | baseline_dementia == 1, 1, 0)
  ) %>%
  filter(
    prevalent_dem_alz_w1 == 0,
    baseline_parkinsons == 0,
    !is.na(pa_level)
  ) %>%
  # count() reports NA as its own row by default. useNA = "ifany" belongs to
  # base table(); inside count() it only added a constant column named useNA.
  count(baseline_pathological_cognitive_decline)

## # A tibble: 1 × 3
##   baseline_pathological_cognitive_decline useNA     n
##                                     <dbl> <chr> <int>
## 1                                       0 ifany 11805

# Analysis dataset: drop prevalent Alzheimer's/dementia, Parkinson's, IQCODE
# pathological decline and missing PA level, then derive the survival outcome
# and the binary covariates used in the Cox models.
analysis_w1 <- followup_w1 %>%
  mutate(
    prevalent_dem_alz_w1 = as.numeric(
      baseline_alzheimers == 1 | baseline_dementia == 1
    )
  ) %>%
  filter(
    prevalent_dem_alz_w1 == 0 &
      baseline_parkinsons == 0 &
      baseline_pathological_cognitive_decline == 0 &
      !is.na(pa_level)
  ) %>%
  mutate(
    # Earliest wave (2-9) with a dementia flag; NA when no wave is flagged
    first_dem_wave = coalesce(
      if_else(dem_w2 == 1, 2, NA_real_),
      if_else(dem_w3 == 1, 3, NA_real_),
      if_else(dem_w4 == 1, 4, NA_real_),
      if_else(dem_w5 == 1, 5, NA_real_),
      if_else(dem_w6 == 1, 6, NA_real_),
      if_else(dem_w7 == 1, 7, NA_real_),
      if_else(dem_w8 == 1, 8, NA_real_),
      if_else(dem_w9 == 1, 9, NA_real_)
    ),
    event_dementia = as.numeric(!is.na(first_dem_wave)),
    # Time is counted in waves since wave 1; participants without an event
    # are all assigned 8 waves
    time_to_event_waves = if_else(
      is.na(first_dem_wave),
      8,
      first_dem_wave - 1
    ),
    pa_level = factor(
      pa_level,
      levels = c("High", "Moderate", "Low", "Sedentary")
    ),
    # Both indicators stay NA when the underlying variable is NA
    current_smoker = as.numeric(smoking3 == "Current"),
    depression_binary = as.numeric(depression_score >= 4)
  )

# Rows and columns of the analysis dataset
dim(analysis_w1)

## [1] 11805    36

# Events (1) versus non-events (0) in the analysis dataset, with NA shown if present
table(analysis_w1$event_dementia, useNA = "ifany")

## 
##     0     1 
## 11325   480

# Participants per physical activity category in the analysis dataset
table(analysis_w1$pa_level, useNA = "ifany")

## 
##      High  Moderate       Low Sedentary 
##      3287      5580      1733      1205

# Cox model chunk
# Outcome: dementia event, with time measured in waves since wave 1.
# pa_level is a factor whose first level is "High", so the reported
# coefficients compare Moderate, Low and Sedentary against High.

# Unadjusted model: physical activity category only
cox_unadjusted <- coxph(
  Surv(time_to_event_waves, event_dementia) ~ pa_level,
  data = analysis_w1
)

# Adjusted model: adds age, sex, smoking, cardiovascular/metabolic conditions
# and the binary depression indicator. Rows with a missing covariate are
# dropped by coxph (reported as "deleted due to missingness" in the summary).
cox_adjusted <- coxph(
  Surv(time_to_event_waves, event_dementia) ~ pa_level + age + sex +
    current_smoker + hypertension + diabetes + stroke +
    abnormal_heart_rhythm + heart_failure + depression_binary,
  data = analysis_w1
)

# Output chunk for the document
summary(cox_unadjusted)

## Call:
## coxph(formula = Surv(time_to_event_waves, event_dementia) ~ pa_level, 
##     data = analysis_w1)
## 
##   n= 11805, number of events= 480 
## 
##                     coef exp(coef) se(coef)     z Pr(>|z|)    
## pa_levelModerate  0.5470    1.7281   0.1288 4.246 2.18e-05 ***
## pa_levelLow       0.8084    2.2444   0.1516 5.332 9.69e-08 ***
## pa_levelSedentary 0.8857    2.4246   0.1632 5.427 5.72e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                   exp(coef) exp(-coef) lower .95 upper .95
## pa_levelModerate      1.728     0.5787     1.342     2.225
## pa_levelLow           2.244     0.4456     1.667     3.021
## pa_levelSedentary     2.425     0.4124     1.761     3.339
## 
## Concordance= 0.579  (se = 0.012 )
## Likelihood ratio test= 41.82  on 3 df,   p=4e-09
## Wald test            = 38.66  on 3 df,   p=2e-08
## Score (logrank) test = 40.21  on 3 df,   p=1e-08

# Coefficients, hazard ratios and 95% confidence intervals for the adjusted model
summary(cox_adjusted)

## Call:
## coxph(formula = Surv(time_to_event_waves, event_dementia) ~ pa_level + 
##     age + sex + current_smoker + hypertension + diabetes + stroke + 
##     abnormal_heart_rhythm + heart_failure + depression_binary, 
##     data = analysis_w1)
## 
##   n= 11679, number of events= 476 
##    (126 observations deleted due to missingness)
## 
##                            coef exp(coef)  se(coef)      z Pr(>|z|)    
## pa_levelModerate       0.288026  1.333792  0.131591  2.189   0.0286 *  
## pa_levelLow            0.204592  1.227024  0.161119  1.270   0.2041    
## pa_levelSedentary      0.020806  1.021024  0.178992  0.116   0.9075    
## age                    0.061431  1.063357  0.004261 14.418   <2e-16 ***
## sexFemale              0.208323  1.231611  0.096090  2.168   0.0302 *  
## current_smoker        -0.117040  0.889550  0.140726 -0.832   0.4056    
## hypertension           0.085720  1.089501  0.094001  0.912   0.3618    
## diabetes               0.160271  1.173829  0.157535  1.017   0.3090    
## stroke                 0.218415  1.244104  0.178580  1.223   0.2213    
## abnormal_heart_rhythm  0.329790  1.390676  0.155400  2.122   0.0338 *  
## heart_failure         -0.696122  0.498515  0.582639 -1.195   0.2322    
## depression_binary      0.258923  1.295534  0.113208  2.287   0.0222 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                       exp(coef) exp(-coef) lower .95 upper .95
## pa_levelModerate         1.3338     0.7497    1.0306     1.726
## pa_levelLow              1.2270     0.8150    0.8948     1.683
## pa_levelSedentary        1.0210     0.9794    0.7189     1.450
## age                      1.0634     0.9404    1.0545     1.072
## sexFemale                1.2316     0.8119    1.0202     1.487
## current_smoker           0.8895     1.1242    0.6751     1.172
## hypertension             1.0895     0.9179    0.9062     1.310
## diabetes                 1.1738     0.8519    0.8620     1.598
## stroke                   1.2441     0.8038    0.8767     1.765
## abnormal_heart_rhythm    1.3907     0.7191    1.0255     1.886
## heart_failure            0.4985     2.0060    0.1591     1.562
## depression_binary        1.2955     0.7719    1.0377     1.617
## 
## Concordance= 0.74  (se = 0.01 )
## Likelihood ratio test= 298.4  on 12 df,   p=<2e-16
## Wald test            = 306.5  on 12 df,   p=<2e-16
## Score (logrank) test = 326.3  on 12 df,   p=<2e-16

Sensitivity analysis NO IQCODE

# Sensitivity dataset: same derivation as analysis_w1 but without the IQCODE
# pathological-decline exclusion. Only prevalent Alzheimer's/dementia,
# Parkinson's and missing PA level are removed.
analysis_w1_no_iqcode <- followup_w1 %>%
  mutate(
    prevalent_dem_alz_w1 = as.numeric(
      baseline_alzheimers == 1 | baseline_dementia == 1
    )
  ) %>%
  filter(
    prevalent_dem_alz_w1 == 0 &
      baseline_parkinsons == 0 &
      !is.na(pa_level)
  ) %>%
  mutate(
    # Earliest wave (2-9) with a dementia flag; NA when no wave is flagged
    first_dem_wave = coalesce(
      if_else(dem_w2 == 1, 2, NA_real_),
      if_else(dem_w3 == 1, 3, NA_real_),
      if_else(dem_w4 == 1, 4, NA_real_),
      if_else(dem_w5 == 1, 5, NA_real_),
      if_else(dem_w6 == 1, 6, NA_real_),
      if_else(dem_w7 == 1, 7, NA_real_),
      if_else(dem_w8 == 1, 8, NA_real_),
      if_else(dem_w9 == 1, 9, NA_real_)
    ),
    event_dementia = as.numeric(!is.na(first_dem_wave)),
    # Time is counted in waves since wave 1; participants without an event
    # are all assigned 8 waves
    time_to_event_waves = if_else(
      is.na(first_dem_wave),
      8,
      first_dem_wave - 1
    ),
    pa_level = factor(
      pa_level,
      levels = c("High", "Moderate", "Low", "Sedentary")
    ),
    # Both indicators stay NA when the underlying variable is NA
    current_smoker = as.numeric(smoking3 == "Current"),
    depression_binary = as.numeric(depression_score >= 4)
  )

# Rows and columns of the sensitivity dataset
dim(analysis_w1_no_iqcode)

## [1] 11805    36

# Events (1) versus non-events (0) in the sensitivity dataset, with NA shown if present
table(analysis_w1_no_iqcode$event_dementia, useNA = "ifany")

## 
##     0     1 
## 11325   480

# Participants per physical activity category in the sensitivity dataset
table(analysis_w1_no_iqcode$pa_level, useNA = "ifany")

## 
##      High  Moderate       Low Sedentary 
##      3287      5580      1733      1205

2. Cox models: NO IQCODE exclusion

# Sensitivity Cox models fitted on the dataset without the IQCODE exclusion.
# Same formulas as the main models; only the data argument differs.

# Unadjusted model: physical activity category only
cox_unadjusted_no_iqcode <- coxph(
  Surv(time_to_event_waves, event_dementia) ~ pa_level,
  data = analysis_w1_no_iqcode
)

# Adjusted model: same covariate set as cox_adjusted
cox_adjusted_no_iqcode <- coxph(
  Surv(time_to_event_waves, event_dementia) ~ pa_level + age + sex +
    current_smoker + hypertension + diabetes + stroke +
    abnormal_heart_rhythm + heart_failure + depression_binary,
  data = analysis_w1_no_iqcode
)

# Output for the document
summary(cox_unadjusted_no_iqcode)

## Call:
## coxph(formula = Surv(time_to_event_waves, event_dementia) ~ pa_level, 
##     data = analysis_w1_no_iqcode)
## 
##   n= 11805, number of events= 480 
## 
##                     coef exp(coef) se(coef)     z Pr(>|z|)    
## pa_levelModerate  0.5470    1.7281   0.1288 4.246 2.18e-05 ***
## pa_levelLow       0.8084    2.2444   0.1516 5.332 9.69e-08 ***
## pa_levelSedentary 0.8857    2.4246   0.1632 5.427 5.72e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                   exp(coef) exp(-coef) lower .95 upper .95
## pa_levelModerate      1.728     0.5787     1.342     2.225
## pa_levelLow           2.244     0.4456     1.667     3.021
## pa_levelSedentary     2.425     0.4124     1.761     3.339
## 
## Concordance= 0.579  (se = 0.012 )
## Likelihood ratio test= 41.82  on 3 df,   p=4e-09
## Wald test            = 38.66  on 3 df,   p=2e-08
## Score (logrank) test = 40.21  on 3 df,   p=1e-08

# Coefficients, hazard ratios and 95% confidence intervals for the adjusted sensitivity model
summary(cox_adjusted_no_iqcode)

## Call:
## coxph(formula = Surv(time_to_event_waves, event_dementia) ~ pa_level + 
##     age + sex + current_smoker + hypertension + diabetes + stroke + 
##     abnormal_heart_rhythm + heart_failure + depression_binary, 
##     data = analysis_w1_no_iqcode)
## 
##   n= 11679, number of events= 476 
##    (126 observations deleted due to missingness)
## 
##                            coef exp(coef)  se(coef)      z Pr(>|z|)    
## pa_levelModerate       0.288026  1.333792  0.131591  2.189   0.0286 *  
## pa_levelLow            0.204592  1.227024  0.161119  1.270   0.2041    
## pa_levelSedentary      0.020806  1.021024  0.178992  0.116   0.9075    
## age                    0.061431  1.063357  0.004261 14.418   <2e-16 ***
## sexFemale              0.208323  1.231611  0.096090  2.168   0.0302 *  
## current_smoker        -0.117040  0.889550  0.140726 -0.832   0.4056    
## hypertension           0.085720  1.089501  0.094001  0.912   0.3618    
## diabetes               0.160271  1.173829  0.157535  1.017   0.3090    
## stroke                 0.218415  1.244104  0.178580  1.223   0.2213    
## abnormal_heart_rhythm  0.329790  1.390676  0.155400  2.122   0.0338 *  
## heart_failure         -0.696122  0.498515  0.582639 -1.195   0.2322    
## depression_binary      0.258923  1.295534  0.113208  2.287   0.0222 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                       exp(coef) exp(-coef) lower .95 upper .95
## pa_levelModerate         1.3338     0.7497    1.0306     1.726
## pa_levelLow              1.2270     0.8150    0.8948     1.683
## pa_levelSedentary        1.0210     0.9794    0.7189     1.450
## age                      1.0634     0.9404    1.0545     1.072
## sexFemale                1.2316     0.8119    1.0202     1.487
## current_smoker           0.8895     1.1242    0.6751     1.172
## hypertension             1.0895     0.9179    0.9062     1.310
## diabetes                 1.1738     0.8519    0.8620     1.598
## stroke                   1.2441     0.8038    0.8767     1.765
## abnormal_heart_rhythm    1.3907     0.7191    1.0255     1.886
## heart_failure            0.4985     2.0060    0.1591     1.562
## depression_binary        1.2955     0.7719    1.0377     1.617
## 
## Concordance= 0.74  (se = 0.01 )
## Likelihood ratio test= 298.4  on 12 df,   p=<2e-16
## Wald test            = 306.5  on 12 df,   p=<2e-16
## Score (logrank) test = 326.3  on 12 df,   p=<2e-16

participants WITH IQCODE PATHOLOGICAL COGNITIVE DECLINE

# Count participants by IQCODE pathological-decline status after removing
# prevalent Alzheimer's/dementia, Parkinson's and missing PA level.
followup_w1 %>%
  mutate(
    prevalent_dem_alz_w1 = if_else(baseline_alzheimers == 1 | baseline_dementia == 1, 1, 0)
  ) %>%
  filter(
    prevalent_dem_alz_w1 == 0,
    baseline_parkinsons == 0,
    !is.na(pa_level)
  ) %>%
  # count() reports NA as its own row by default. useNA = "ifany" belongs to
  # base table(); inside count() it only added a constant column named useNA.
  count(baseline_pathological_cognitive_decline)

Wave 2 Table 1

# Wave 2: join the derived variables onto the core file by participant id
w2_derived <- read_dta("raw data/RAW_data_stata/wave_2_derived_variables.dta")
w2 <- w2_core %>%
  left_join(w2_derived, by = "idauniq")

# Condition flags: 1 when any of the four hedia01-hedia04 (or hedib01-hedib04)
# items holds the given code, 0 otherwise, NA when the comparison is NA.
w2 <- w2 %>%
  mutate(
    htn_w2 = as.numeric(
      hedia01 == 1 | hedia02 == 1 | hedia03 == 1 | hedia04 == 1
    ),
    hf_w2 = as.numeric(
      hedia01 == 4 | hedia02 == 4 | hedia03 == 4 | hedia04 == 4
    ),
    arrhythmia_w2 = as.numeric(
      hedia01 == 6 | hedia02 == 6 | hedia03 == 6 | hedia04 == 6
    ),
    diabetes_w2 = as.numeric(
      hedia01 == 7 | hedia02 == 7 | hedia03 == 7 | hedia04 == 7
    ),
    stroke_w2 = as.numeric(
      hedia01 == 8 | hedia02 == 8 | hedia03 == 8 | hedia04 == 8
    ),
    alz_w2 = as.numeric(
      hedib01 == 8 | hedib02 == 8 | hedib03 == 8 | hedib04 == 8
    ),
    dementia_w2 = as.numeric(
      hedib01 == 9 | hedib02 == 9 | hedib03 == 9 | hedib04 == 9
    ),
    # Text label for palevel 0-3; any other value (including NA) becomes NA
    pa_level = case_when(
      palevel == 0 ~ "Sedentary",
      palevel == 1 ~ "Low",
      palevel == 2 ~ "Moderate",
      palevel == 3 ~ "High",
      TRUE ~ NA_character_
    )
  )

# Analysis columns for the wave 2 table, renamed to the common Table 1 names.
# NOTE(review): dhager is used as-is; the wave 5/6 sections of this file
# recode negative age values to NA - confirm whether dhager needs the same.
w2_table1 <- w2 %>%
  transmute(
    idauniq,
    pa_level = factor(
      pa_level,
      levels = c("High", "Moderate", "Low", "Sedentary")
    ),
    age = dhager,
    sex = as_factor(DhSex),
    hypertension = htn_w2,
    heart_failure = hf_w2,
    abnormal_heart_rhythm = arrhythmia_w2,
    diabetes = diabetes_w2,
    stroke = stroke_w2,
    baseline_alzheimers = alz_w2,
    baseline_dementia = dementia_w2
  )

# Wave 2 Table 1: one row of formatted summaries per physical activity
# category, with missing PA shown as its own "Missing PA" category.
# NOTE(review): forcats::fct_explicit_na() is deprecated from forcats 1.0.0
# in favour of fct_na_value_to_level(); kept to match the installed version.
w2_table1_summary <- local({
  # Format "n (pct%)": n counts TRUE values (NA ignored) and pct uses the
  # full group size as the denominator.
  n_pct <- function(hit, group_size) {
    hits <- sum(hit, na.rm = TRUE)
    sprintf("%d (%.1f%%)", hits, 100 * hits / group_size)
  }

  w2_table1 %>%
    mutate(
      pa_level = forcats::fct_explicit_na(pa_level, na_level = "Missing PA")
    ) %>%
    group_by(pa_level, .drop = FALSE) %>%
    summarise(
      N = n(),
      `Age, mean (SD)` = sprintf(
        "%.1f (%.1f)",
        mean(age, na.rm = TRUE),
        sd(age, na.rm = TRUE)
      ),
      `Female, n (%)` = n_pct(sex == "Female", n()),
      `Hypertension, n (%)` = n_pct(hypertension == 1, n()),
      `Diabetes, n (%)` = n_pct(diabetes == 1, n()),
      `Stroke, n (%)` = n_pct(stroke == 1, n()),
      `Abnormal heart rhythm, n (%)` = n_pct(abnormal_heart_rhythm == 1, n()),
      `Heart failure, n (%)` = n_pct(heart_failure == 1, n()),
      `Baseline Alzheimer’s, n (%)` = n_pct(baseline_alzheimers == 1, n()),
      `Baseline dementia, n (%)` = n_pct(baseline_dementia == 1, n())
    )
})

# Characteristics as rows, PA categories as columns. N is converted to text
# so it can share one value column with the formatted strings.
w2_table1_summary_wide <- w2_table1_summary %>%
  mutate(across(N, as.character)) %>%
  pivot_longer(
    cols = !pa_level,
    names_to = "Characteristic",
    values_to = "Value"
  ) %>%
  pivot_wider(names_from = pa_level, values_from = Value) %>%
  select(
    Characteristic,
    all_of(c("High", "Moderate", "Low", "Sedentary", "Missing PA"))
  )

knitr::kable(
  x = w2_table1_summary_wide,
  caption = "Table 1. Baseline characteristics at wave 2 by physical activity category."
)

Table 1. Baseline characteristics at wave 2 by physical activity category.
Characteristic	High	Moderate	Low	Sedentary	Missing PA
N	1744	4684	2309	556	139
Age, mean (SD)	61.8 (8.7)	64.5 (9.8)	68.8 (11.2)	75.2 (11.6)	70.6 (15.8)
Female, n (%)	835 (47.9%)	2615 (55.8%)	1465 (63.4%)	325 (58.5%)	66 (47.5%)
Hypertension, n (%)	253 (14.5%)	784 (16.7%)	487 (21.1%)	156 (28.1%)	26 (18.7%)
Diabetes, n (%)	28 (1.6%)	151 (3.2%)	139 (6.0%)	55 (9.9%)	10 (7.2%)
Stroke, n (%)	3 (0.2%)	49 (1.0%)	59 (2.6%)	39 (7.0%)	13 (9.4%)
Abnormal heart rhythm, n (%)	48 (2.8%)	138 (2.9%)	79 (3.4%)	44 (7.9%)	5 (3.6%)
Heart failure, n (%)	0 (0.0%)	8 (0.2%)	8 (0.3%)	4 (0.7%)	0 (0.0%)
Baseline Alzheimer’s, n (%)	1 (0.1%)	2 (0.0%)	2 (0.1%)	1 (0.2%)	5 (3.6%)
Baseline dementia, n (%)	2 (0.1%)	11 (0.2%)	9 (0.4%)	7 (1.3%)	14 (10.1%)

## Wave 3 table 1

# Wave 3 flags: 1 when the item equals 1, 0 for any other value or NA
w3_table1_raw <- w3_core %>%
  mutate(
    hypertension = coalesce(as.numeric(hediabp == 1), 0),
    heart_failure = coalesce(as.numeric(hediahf == 1), 0),
    abnormal_heart_rhythm = coalesce(as.numeric(hediaar == 1), 0),
    diabetes = coalesce(as.numeric(hediadi == 1), 0),
    stroke = coalesce(as.numeric(hediast == 1), 0),

    # Values 1, 2 and 3 all count as a diagnosis in this wave.
    # NOTE(review): the dementia follow-up flag for wave 3 in this file uses
    # hedibde == 3 only - confirm which coding is intended.
    baseline_alzheimers = as.numeric(hedibad %in% c(1, 2, 3)),
    baseline_dementia = as.numeric(hedibde %in% c(1, 2, 3)),

    # Text label for palevel 0-3; any other value (including NA) becomes NA
    pa_level = case_when(
      palevel == 0 ~ "Sedentary",
      palevel == 1 ~ "Low",
      palevel == 2 ~ "Moderate",
      palevel == 3 ~ "High",
      TRUE ~ NA_character_
    )
  )

# Analysis columns for the wave 3 table.
# NOTE(review): dhager is used as-is; the wave 5/6 sections of this file
# recode negative age values to NA - confirm whether dhager needs the same.
w3_table1 <- w3_table1_raw %>%
  transmute(
    idauniq,
    pa_level = factor(
      pa_level,
      levels = c("High", "Moderate", "Low", "Sedentary")
    ),
    age = dhager,
    sex = as_factor(dhsex),
    hypertension,
    heart_failure,
    abnormal_heart_rhythm,
    diabetes,
    stroke,
    baseline_alzheimers,
    baseline_dementia
  )

# Wave 3 Table 1: one row of formatted summaries per physical activity
# category, with missing PA shown as its own "Missing PA" category.
# NOTE(review): forcats::fct_explicit_na() is deprecated from forcats 1.0.0
# in favour of fct_na_value_to_level(); kept to match the installed version.
w3_table1_summary <- local({
  # Format "n (pct%)": n counts TRUE values (NA ignored) and pct uses the
  # full group size as the denominator.
  n_pct <- function(hit, group_size) {
    hits <- sum(hit, na.rm = TRUE)
    sprintf("%d (%.1f%%)", hits, 100 * hits / group_size)
  }

  w3_table1 %>%
    mutate(
      pa_level = forcats::fct_explicit_na(pa_level, na_level = "Missing PA")
    ) %>%
    group_by(pa_level, .drop = FALSE) %>%
    summarise(
      N = n(),
      `Age, mean (SD)` = sprintf(
        "%.1f (%.1f)",
        mean(age, na.rm = TRUE),
        sd(age, na.rm = TRUE)
      ),
      `Female, n (%)` = n_pct(sex == "Female", n()),
      `Hypertension, n (%)` = n_pct(hypertension == 1, n()),
      `Diabetes, n (%)` = n_pct(diabetes == 1, n()),
      `Stroke, n (%)` = n_pct(stroke == 1, n()),
      `Abnormal heart rhythm, n (%)` = n_pct(abnormal_heart_rhythm == 1, n()),
      `Heart failure, n (%)` = n_pct(heart_failure == 1, n()),
      `Baseline Alzheimer’s, n (%)` = n_pct(baseline_alzheimers == 1, n()),
      `Baseline dementia, n (%)` = n_pct(baseline_dementia == 1, n())
    )
})

# Characteristics as rows, PA categories as columns. N is converted to text
# so it can share one value column with the formatted strings.
w3_table1_summary_wide <- w3_table1_summary %>%
  mutate(across(N, as.character)) %>%
  pivot_longer(
    cols = !pa_level,
    names_to = "Characteristic",
    values_to = "Value"
  ) %>%
  pivot_wider(names_from = pa_level, values_from = Value) %>%
  select(
    Characteristic,
    all_of(c("High", "Moderate", "Low", "Sedentary", "Missing PA"))
  )

knitr::kable(
  x = w3_table1_summary_wide,
  caption = "Table 1. Baseline characteristics at wave 3 by physical activity category."
)

Table 1. Baseline characteristics at wave 3 by physical activity category.
Characteristic	High	Moderate	Low	Sedentary	Missing PA
N	1969	4838	2263	686	15
Age, mean (SD)	59.8 (8.9)	63.3 (10.5)	68.1 (12.0)	74.7 (14.2)	61.6 (12.2)
Female, n (%)	939 (47.7%)	2669 (55.2%)	1458 (64.4%)	399 (58.2%)	11 (73.3%)
Hypertension, n (%)	174 (8.8%)	473 (9.8%)	221 (9.8%)	59 (8.6%)	0 (0.0%)
Diabetes, n (%)	36 (1.8%)	115 (2.4%)	75 (3.3%)	30 (4.4%)	0 (0.0%)
Stroke, n (%)	15 (0.8%)	31 (0.6%)	30 (1.3%)	31 (4.5%)	0 (0.0%)
Abnormal heart rhythm, n (%)	44 (2.2%)	90 (1.9%)	62 (2.7%)	21 (3.1%)	0 (0.0%)
Heart failure, n (%)	1 (0.1%)	1 (0.0%)	8 (0.4%)	4 (0.6%)	0 (0.0%)
Baseline Alzheimer’s, n (%)	1 (0.1%)	5 (0.1%)	9 (0.4%)	17 (2.5%)	0 (0.0%)
Baseline dementia, n (%)	3 (0.2%)	20 (0.4%)	33 (1.5%)	56 (8.2%)	0 (0.0%)

wave 4 table 1

# Wave 4 flags: 1 when the item equals 1, 0 for any other value or NA
w4_table1_raw <- w4_core %>%
  mutate(
    hypertension = coalesce(as.numeric(hediabp == 1), 0),
    heart_failure = coalesce(as.numeric(hediahf == 1), 0),
    abnormal_heart_rhythm = coalesce(as.numeric(hediaar == 1), 0),
    diabetes = coalesce(as.numeric(hediadi == 1), 0),
    stroke = coalesce(as.numeric(hediast == 1), 0),

    baseline_alzheimers = coalesce(as.numeric(hedibad == 1), 0),
    baseline_dementia = coalesce(as.numeric(hedibde == 1), 0),

    # Text label for palevel 0-3; any other value (including NA) becomes NA
    pa_level = case_when(
      palevel == 0 ~ "Sedentary",
      palevel == 1 ~ "Low",
      palevel == 2 ~ "Moderate",
      palevel == 3 ~ "High",
      TRUE ~ NA_character_
    )
  )

# Analysis columns for the wave 4 table.
w4_table1 <- w4_table1_raw %>%
  transmute(
    idauniq,
    pa_level = factor(pa_level, levels = c("High", "Moderate", "Low", "Sedentary")),
    # Negative indager values are special missing codes (see the wave 5 and
    # wave 6 sections of this file), so they are set to NA here as well
    # rather than entering the age mean and SD.
    age = if_else(indager < 0, NA_real_, as.numeric(indager)),
    sex = as_factor(dhsex),
    hypertension,
    heart_failure,
    abnormal_heart_rhythm,
    diabetes,
    stroke,
    baseline_alzheimers,
    baseline_dementia
  )

# Wave 4 Table 1: one row of formatted summaries per physical activity
# category, with missing PA shown as its own "Missing PA" category.
# NOTE(review): forcats::fct_explicit_na() is deprecated from forcats 1.0.0
# in favour of fct_na_value_to_level(); kept to match the installed version.
w4_table1_summary <- local({
  # Format "n (pct%)": n counts TRUE values (NA ignored) and pct uses the
  # full group size as the denominator.
  n_pct <- function(hit, group_size) {
    hits <- sum(hit, na.rm = TRUE)
    sprintf("%d (%.1f%%)", hits, 100 * hits / group_size)
  }

  w4_table1 %>%
    mutate(
      pa_level = forcats::fct_explicit_na(pa_level, na_level = "Missing PA")
    ) %>%
    group_by(pa_level, .drop = FALSE) %>%
    summarise(
      N = n(),
      `Age, mean (SD)` = sprintf(
        "%.1f (%.1f)",
        mean(age, na.rm = TRUE),
        sd(age, na.rm = TRUE)
      ),
      `Female, n (%)` = n_pct(sex == "Female", n()),
      `Hypertension, n (%)` = n_pct(hypertension == 1, n()),
      `Diabetes, n (%)` = n_pct(diabetes == 1, n()),
      `Stroke, n (%)` = n_pct(stroke == 1, n()),
      `Abnormal heart rhythm, n (%)` = n_pct(abnormal_heart_rhythm == 1, n()),
      `Heart failure, n (%)` = n_pct(heart_failure == 1, n()),
      `Baseline Alzheimer’s, n (%)` = n_pct(baseline_alzheimers == 1, n()),
      `Baseline dementia, n (%)` = n_pct(baseline_dementia == 1, n())
    )
})

# Characteristics as rows, PA categories as columns. N is converted to text
# so it can share one value column with the formatted strings.
w4_table1_summary_wide <- w4_table1_summary %>%
  mutate(across(N, as.character)) %>%
  pivot_longer(
    cols = !pa_level,
    names_to = "Characteristic",
    values_to = "Value"
  ) %>%
  pivot_wider(names_from = pa_level, values_from = Value) %>%
  select(
    Characteristic,
    all_of(c("High", "Moderate", "Low", "Sedentary", "Missing PA"))
  )

knitr::kable(
  x = w4_table1_summary_wide,
  caption = "Table 1. Baseline characteristics at wave 4 by physical activity category."
)

Table 1. Baseline characteristics at wave 4 by physical activity category.
Characteristic	High	Moderate	Low	Sedentary	Missing PA
N	2254	5384	2562	835	15
Age, mean (SD)	61.7 (8.4)	63.9 (9.5)	68.2 (11.3)	74.2 (12.5)	59.1 (9.2)
Female, n (%)	1070 (47.5%)	2929 (54.4%)	1649 (64.4%)	472 (56.5%)	5 (33.3%)
Hypertension, n (%)	201 (8.9%)	603 (11.2%)	350 (13.7%)	135 (16.2%)	0 (0.0%)
Diabetes, n (%)	43 (1.9%)	149 (2.8%)	125 (4.9%)	61 (7.3%)	1 (6.7%)
Stroke, n (%)	7 (0.3%)	55 (1.0%)	46 (1.8%)	68 (8.1%)	0 (0.0%)
Abnormal heart rhythm, n (%)	32 (1.4%)	111 (2.1%)	88 (3.4%)	45 (5.4%)	0 (0.0%)
Heart failure, n (%)	1 (0.0%)	3 (0.1%)	6 (0.2%)	15 (1.8%)	0 (0.0%)
Baseline Alzheimer’s, n (%)	0 (0.0%)	5 (0.1%)	3 (0.1%)	29 (3.5%)	0 (0.0%)
Baseline dementia, n (%)	1 (0.0%)	17 (0.3%)	19 (0.7%)	60 (7.2%)	0 (0.0%)

wave 5 table 1

## Wave 5 table 1
# Wave 5 flags: 1 when the item equals 1, 0 for any other value or NA
w5_table1_raw <- w5_core %>%
  mutate(
    hypertension = coalesce(as.numeric(hediabp == 1), 0),
    heart_failure = coalesce(as.numeric(hediahf == 1), 0),
    abnormal_heart_rhythm = coalesce(as.numeric(hediaar == 1), 0),
    diabetes = coalesce(as.numeric(hediadi == 1), 0),
    stroke = coalesce(as.numeric(hediast == 1), 0),

    baseline_alzheimers = coalesce(as.numeric(hedibad == 1), 0),
    baseline_dementia = coalesce(as.numeric(hedibde == 1), 0),

    # Text label for palevel 0-3; any other value (including NA) becomes NA
    pa_level = case_when(
      palevel == 0 ~ "Sedentary",
      palevel == 1 ~ "Low",
      palevel == 2 ~ "Moderate",
      palevel == 3 ~ "High",
      TRUE ~ NA_character_
    )
  )
# ORIGINAL VERSION OF W5 TABLE 1, PERFORMED FIRST - SEE BELOW FOR THE AMENDED FINAL VERSION
# The original version produced implausible age summaries because wave 5
# indager contains negative special missing codes, which distorted the mean
# and SD when they were not recoded.
# Kept here as part of the analytic process to include in the methods.

# Superseded: w5_table1 is assigned again by the corrected version that
# follows, so this assignment does not feed the final wave 5 table. Here age
# is taken from indager without recoding negative values.
w5_table1 <- w5_table1_raw %>%
  transmute(
    idauniq,
    pa_level = factor(pa_level, levels = c("High", "Moderate", "Low", "Sedentary")),
    age = indager,
    sex = as_factor(dhsex),
    hypertension,
    heart_failure,
    abnormal_heart_rhythm,
    diabetes,
    stroke,
    baseline_alzheimers,
    baseline_dementia
  )

# Wave 5 Table 1, corrected (final) version.
# Negative indager values are special missing codes, so they are set to NA
# before any age summary is computed.
w5_table1 <- w5_table1_raw %>%
  transmute(
    idauniq,
    pa_level = factor(
      pa_level,
      levels = c("High", "Moderate", "Low", "Sedentary")
    ),
    age = case_when(
      indager < 0 ~ NA_real_,
      TRUE ~ as.numeric(indager)
    ),
    sex = as_factor(dhsex),
    hypertension,
    heart_failure,
    abnormal_heart_rhythm,
    diabetes,
    stroke,
    baseline_alzheimers,
    baseline_dementia
  )

# Wave 5 Table 1: one row of formatted summaries per physical activity
# category, with missing PA shown as its own "Missing PA" category.
# NOTE(review): forcats::fct_explicit_na() is deprecated from forcats 1.0.0
# in favour of fct_na_value_to_level(); kept to match the installed version.
w5_table1_summary <- local({
  # Format "n (pct%)": n counts TRUE values (NA ignored) and pct uses the
  # full group size as the denominator.
  n_pct <- function(hit, group_size) {
    hits <- sum(hit, na.rm = TRUE)
    sprintf("%d (%.1f%%)", hits, 100 * hits / group_size)
  }

  w5_table1 %>%
    mutate(
      pa_level = forcats::fct_explicit_na(pa_level, na_level = "Missing PA")
    ) %>%
    group_by(pa_level, .drop = FALSE) %>%
    summarise(
      N = n(),
      `Age, mean (SD)` = sprintf(
        "%.1f (%.1f)",
        mean(age, na.rm = TRUE),
        sd(age, na.rm = TRUE)
      ),
      `Female, n (%)` = n_pct(sex == "Female", n()),
      `Hypertension, n (%)` = n_pct(hypertension == 1, n()),
      `Diabetes, n (%)` = n_pct(diabetes == 1, n()),
      `Stroke, n (%)` = n_pct(stroke == 1, n()),
      `Abnormal heart rhythm, n (%)` = n_pct(abnormal_heart_rhythm == 1, n()),
      `Heart failure, n (%)` = n_pct(heart_failure == 1, n()),
      `Baseline Alzheimer’s, n (%)` = n_pct(baseline_alzheimers == 1, n()),
      `Baseline dementia, n (%)` = n_pct(baseline_dementia == 1, n())
    )
})

# Characteristics as rows, PA categories as columns. N is converted to text
# so it can share one value column with the formatted strings.
w5_table1_summary_wide <- w5_table1_summary %>%
  mutate(across(N, as.character)) %>%
  pivot_longer(
    cols = !pa_level,
    names_to = "Characteristic",
    values_to = "Value"
  ) %>%
  pivot_wider(names_from = pa_level, values_from = Value) %>%
  select(
    Characteristic,
    all_of(c("High", "Moderate", "Low", "Sedentary", "Missing PA"))
  )

knitr::kable(
  x = w5_table1_summary_wide,
  caption = "Table 1. Baseline characteristics at wave 5 by physical activity category."
)

Table 1. Baseline characteristics at wave 5 by physical activity category.
Characteristic	High	Moderate	Low	Sedentary	Missing PA
N	1981	4936	2432	801	124
Age, mean (SD)	62.9 (7.8)	65.5 (8.9)	68.9 (10.0)	73.0 (10.2)	60.7 (5.5)
Female, n (%)	934 (47.1%)	2736 (55.4%)	1507 (62.0%)	472 (58.9%)	56 (45.2%)
Hypertension, n (%)	141 (7.1%)	335 (6.8%)	202 (8.3%)	90 (11.2%)	8 (6.5%)
Diabetes, n (%)	33 (1.7%)	112 (2.3%)	93 (3.8%)	49 (6.1%)	1 (0.8%)
Stroke, n (%)	10 (0.5%)	32 (0.6%)	43 (1.8%)	64 (8.0%)	0 (0.0%)
Abnormal heart rhythm, n (%)	25 (1.3%)	99 (2.0%)	78 (3.2%)	52 (6.5%)	2 (1.6%)
Heart failure, n (%)	1 (0.1%)	4 (0.1%)	8 (0.3%)	11 (1.4%)	0 (0.0%)
Baseline Alzheimer’s, n (%)	0 (0.0%)	7 (0.1%)	6 (0.2%)	33 (4.1%)	0 (0.0%)
Baseline dementia, n (%)	0 (0.0%)	11 (0.2%)	26 (1.1%)	73 (9.1%)	0 (0.0%)

wave 6 table 1

# Wave 6 flags: 1 when the item equals 1, 0 for any other value or NA
w6_table1_raw <- w6_core %>%
  mutate(
    hypertension = coalesce(as.numeric(hediabp == 1), 0),
    heart_failure = coalesce(as.numeric(hediahf == 1), 0),
    abnormal_heart_rhythm = coalesce(as.numeric(hediaar == 1), 0),
    diabetes = coalesce(as.numeric(hediadi == 1), 0),
    stroke = coalesce(as.numeric(hediast == 1), 0),

    baseline_alzheimers = coalesce(as.numeric(hedibad == 1), 0),
    baseline_dementia = coalesce(as.numeric(hedibde == 1), 0),

    # PA level is built from HeActa/HeActb/HeActc in this wave rather than
    # from palevel. case_when() takes the first matching branch, so each
    # branch is only reached when the earlier ones did not match:
    # High if HeActa is 1 or 2, otherwise Moderate if HeActb is 1 or 2,
    # otherwise Low if HeActc is 1 or 2, and Sedentary only when all three
    # items are 3 or 4. Anything else stays NA.
    pa_level = case_when(
      HeActa %in% c(1, 2) ~ "High",
      HeActb %in% c(1, 2) ~ "Moderate",
      HeActc %in% c(1, 2) ~ "Low",
      HeActa %in% c(3, 4) & HeActb %in% c(3, 4) & HeActc %in% c(3, 4) ~
        "Sedentary",
      TRUE ~ NA_character_
    )
  )

# Wave 6 analysis table (corrected version): participant id, PA category,
# age, sex and the diagnosis flags.
# Methodological note: the original version used `age = indager` directly.
# It is not used in final summaries because negative indager values are
# special missing codes; they are set to NA here instead.
# NOTE(review): the age check below tops out at 89, which suggests ages of
# 90+ may be among the negative codes and so drop out of the age summaries -
# confirm against the ELSA documentation.
w6_table1 <- w6_table1_raw %>%
  mutate(
    pa_level = factor(pa_level, levels = c("High", "Moderate", "Low", "Sedentary")),
    # Non-negative ages are kept; everything else falls through to NA.
    age = case_when(indager >= 0 ~ as.numeric(indager)),
    sex = as_factor(DhSex)
  ) %>%
  select(
    idauniq, pa_level, age, sex,
    hypertension, heart_failure, abnormal_heart_rhythm, diabetes, stroke,
    baseline_alzheimers, baseline_dementia
  )

# Diagnostic age check: distribution of the recoded age and number of missing
# ages within each PA category (rows without a PA category form an NA group).
w6_table1 %>%
  group_by(pa_level) %>%
  summarise(
    across(
      age,
      list(
        min = ~ min(.x, na.rm = TRUE),
        q1 = ~ quantile(.x, 0.25, na.rm = TRUE),
        median = ~ median(.x, na.rm = TRUE),
        mean = ~ mean(.x, na.rm = TRUE),
        q3 = ~ quantile(.x, 0.75, na.rm = TRUE),
        max = ~ max(.x, na.rm = TRUE),
        sd = ~ sd(.x, na.rm = TRUE)
      ),
      # Yields min_age, q1_age, ..., sd_age.
      .names = "{.fn}_{.col}"
    ),
    n_missing_age = sum(is.na(age))
  )

## # A tibble: 5 × 9
##   pa_level  min_age q1_age median_age mean_age q3_age max_age sd_age
##   <fct>       <dbl>  <dbl>      <dbl>    <dbl>  <dbl>   <dbl>  <dbl>
## 1 High           28   57           63     63.3   69        89   8.44
## 2 Moderate       31   59           65     65.8   73        89   9.35
## 3 Low            41   61           69     69.4   78        89  10.5 
## 4 Sedentary      40   62           72     71.1   80        89  10.8 
## 5 <NA>           63   66.8         71     73     77.2      87  10.4 
## # ℹ 1 more variable: n_missing_age <int>

# Wave 6 Table 1 summary: one row per PA category (plus "Missing PA"), with
# formatted text cells for each baseline characteristic.
w6_table1_summary <- w6_table1 %>%
  mutate(
    # Build the "Missing PA" level explicitly rather than with the deprecated
    # forcats::fct_explicit_na(), which only adds the level when NAs are
    # present. Declaring it up front lets `.drop = FALSE` keep a "Missing PA"
    # row (and hence column in the wide table) even if nobody is missing; such
    # an empty group reports N = 0 with NaN/NA statistics.
    pa_level = factor(
      if_else(is.na(pa_level), "Missing PA", as.character(pa_level)),
      levels = c(levels(pa_level), "Missing PA")
    )
  ) %>%
  group_by(pa_level, .drop = FALSE) %>%
  summarise(
    N = n(),
    # sd() is NA when a group has fewer than two non-missing ages.
    `Age, mean (SD)` = sprintf("%.1f (%.1f)", mean(age, na.rm = TRUE), sd(age, na.rm = TRUE)),
    # Percentages use the full group size n() as the denominator.
    `Female, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(sex == "Female", na.rm = TRUE),
      100 * sum(sex == "Female", na.rm = TRUE) / n()
    ),
    `Hypertension, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(hypertension == 1, na.rm = TRUE),
      100 * sum(hypertension == 1, na.rm = TRUE) / n()
    ),
    `Diabetes, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(diabetes == 1, na.rm = TRUE),
      100 * sum(diabetes == 1, na.rm = TRUE) / n()
    ),
    `Stroke, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(stroke == 1, na.rm = TRUE),
      100 * sum(stroke == 1, na.rm = TRUE) / n()
    ),
    `Abnormal heart rhythm, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(abnormal_heart_rhythm == 1, na.rm = TRUE),
      100 * sum(abnormal_heart_rhythm == 1, na.rm = TRUE) / n()
    ),
    `Heart failure, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(heart_failure == 1, na.rm = TRUE),
      100 * sum(heart_failure == 1, na.rm = TRUE) / n()
    ),
    `Baseline Alzheimer’s, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(baseline_alzheimers == 1, na.rm = TRUE),
      100 * sum(baseline_alzheimers == 1, na.rm = TRUE) / n()
    ),
    `Baseline dementia, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(baseline_dementia == 1, na.rm = TRUE),
      100 * sum(baseline_dementia == 1, na.rm = TRUE) / n()
    ),
    .groups = "drop"
  )

# Transpose the wave 6 summary so that each characteristic is a row and each
# PA category is a column, then render it.
w6_table1_summary_wide <- w6_table1_summary %>%
  # N is an integer; cast it so it can be stacked with the text cells.
  mutate(across(N, as.character)) %>%
  pivot_longer(
    cols = !pa_level,
    names_to = "Characteristic",
    values_to = "Value"
  ) %>%
  pivot_wider(names_from = pa_level, values_from = Value) %>%
  # Fix the column order for presentation.
  select(all_of(c(
    "Characteristic", "High", "Moderate", "Low", "Sedentary", "Missing PA"
  )))

w6_table1_summary_wide %>%
  knitr::kable(
    caption = "Table 1. Baseline characteristics at wave 6 by physical activity category."
  )

Table 1. Baseline characteristics at wave 6 by physical activity category.
Characteristic	High	Moderate	Low	Sedentary	Missing PA
N	3156	4810	1626	1005	4
Age, mean (SD)	63.3 (8.4)	65.8 (9.4)	69.4 (10.5)	71.1 (10.8)	73.0 (10.4)
Female, n (%)	1526 (48.4%)	2713 (56.4%)	1087 (66.9%)	527 (52.4%)	4 (100.0%)
Hypertension, n (%)	217 (6.9%)	350 (7.3%)	117 (7.2%)	138 (13.7%)	0 (0.0%)
Diabetes, n (%)	57 (1.8%)	130 (2.7%)	50 (3.1%)	79 (7.9%)	0 (0.0%)
Stroke, n (%)	10 (0.3%)	47 (1.0%)	44 (2.7%)	54 (5.4%)	0 (0.0%)
Abnormal heart rhythm, n (%)	69 (2.2%)	107 (2.2%)	72 (4.4%)	38 (3.8%)	0 (0.0%)
Heart failure, n (%)	2 (0.1%)	6 (0.1%)	4 (0.2%)	13 (1.3%)	0 (0.0%)
Baseline Alzheimer’s, n (%)	2 (0.1%)	7 (0.1%)	4 (0.2%)	37 (3.7%)	0 (0.0%)
Baseline dementia, n (%)	2 (0.1%)	15 (0.3%)	18 (1.1%)	77 (7.7%)	0 (0.0%)

wave 7 table 1

## Wave 7 table 1

# Wave 7: add binary diagnosis flags and a physical activity (PA) category to
# the core file. All original columns are kept.
w7_table1_raw <- w7_core %>%
  mutate(
    # Flag is 1 when the item equals 1; any other value, including NA, is 0.
    hypertension = coalesce(as.numeric(hediabp == 1), 0),
    heart_failure = coalesce(as.numeric(hediahf == 1), 0),
    abnormal_heart_rhythm = coalesce(as.numeric(hediaar == 1), 0),
    diabetes = coalesce(as.numeric(hediadi == 1), 0),
    stroke = coalesce(as.numeric(hediast == 1), 0),

    baseline_alzheimers = coalesce(as.numeric(hedibad == 1), 0),
    baseline_dementia = coalesce(as.numeric(hedibde == 1), 0),

    # case_when() stops at the first match, so "Moderate" is only reached when
    # HeActa is not 1/2, and "Low" only when neither HeActa nor HeActb is 1/2.
    # Rows matching none of the rules get NA.
    pa_level = case_when(
      HeActa %in% c(1, 2) ~ "High",
      HeActb %in% c(1, 2) ~ "Moderate",
      HeActc %in% c(1, 2) ~ "Low",
      HeActa %in% c(3, 4) & HeActb %in% c(3, 4) & HeActc %in% c(3, 4) ~
        "Sedentary",
      TRUE ~ NA_character_
    )
  )

# Wave 7 analysis table (corrected version): participant id, PA category,
# age, sex and the diagnosis flags.
# Methodological note: the original version used `age = indager` directly.
# It is not used in final summaries because negative indager values are
# special missing codes; they are set to NA here instead.
# NOTE(review): the age check below tops out at 89, which suggests ages of
# 90+ may be among the negative codes and so drop out of the age summaries -
# confirm against the ELSA documentation.
w7_table1 <- w7_table1_raw %>%
  mutate(
    pa_level = factor(pa_level, levels = c("High", "Moderate", "Low", "Sedentary")),
    # Non-negative ages are kept; everything else falls through to NA.
    age = case_when(indager >= 0 ~ as.numeric(indager)),
    sex = as_factor(DhSex)
  ) %>%
  select(
    idauniq, pa_level, age, sex,
    hypertension, heart_failure, abnormal_heart_rhythm, diabetes, stroke,
    baseline_alzheimers, baseline_dementia
  )

# Diagnostic age check: distribution of the recoded age and number of missing
# ages within each PA category (rows without a PA category form an NA group).
w7_table1 %>%
  group_by(pa_level) %>%
  summarise(
    across(
      age,
      list(
        min = ~ min(.x, na.rm = TRUE),
        q1 = ~ quantile(.x, 0.25, na.rm = TRUE),
        median = ~ median(.x, na.rm = TRUE),
        mean = ~ mean(.x, na.rm = TRUE),
        q3 = ~ quantile(.x, 0.75, na.rm = TRUE),
        max = ~ max(.x, na.rm = TRUE),
        sd = ~ sd(.x, na.rm = TRUE)
      ),
      # Yields min_age, q1_age, ..., sd_age.
      .names = "{.fn}_{.col}"
    ),
    n_missing_age = sum(is.na(age))
  )

## # A tibble: 5 × 9
##   pa_level  min_age q1_age median_age mean_age q3_age max_age sd_age
##   <fct>       <dbl>  <dbl>      <dbl>    <dbl>  <dbl>   <dbl>  <dbl>
## 1 High           29   58           64     63.9     69      89   8.68
## 2 Moderate       33   60           66     66.7     73      89   9.21
## 3 Low            39   62           70     70.1     78      89  10.3 
## 4 Sedentary      38   64           74     72.3     81      89  10.6 
## 5 <NA>           63   68.5         74     73       78      82   9.54
## # ℹ 1 more variable: n_missing_age <int>

# Wave 7 Table 1 summary: one row per PA category (plus "Missing PA"), with
# formatted text cells for each baseline characteristic.
w7_table1_summary <- w7_table1 %>%
  mutate(
    # Build the "Missing PA" level explicitly rather than with the deprecated
    # forcats::fct_explicit_na(), which only adds the level when NAs are
    # present. Declaring it up front lets `.drop = FALSE` keep a "Missing PA"
    # row (and hence column in the wide table) even if nobody is missing; such
    # an empty group reports N = 0 with NaN/NA statistics.
    pa_level = factor(
      if_else(is.na(pa_level), "Missing PA", as.character(pa_level)),
      levels = c(levels(pa_level), "Missing PA")
    )
  ) %>%
  group_by(pa_level, .drop = FALSE) %>%
  summarise(
    N = n(),
    # sd() is NA when a group has fewer than two non-missing ages.
    `Age, mean (SD)` = sprintf("%.1f (%.1f)", mean(age, na.rm = TRUE), sd(age, na.rm = TRUE)),
    # Percentages use the full group size n() as the denominator.
    `Female, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(sex == "Female", na.rm = TRUE),
      100 * sum(sex == "Female", na.rm = TRUE) / n()
    ),
    `Hypertension, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(hypertension == 1, na.rm = TRUE),
      100 * sum(hypertension == 1, na.rm = TRUE) / n()
    ),
    `Diabetes, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(diabetes == 1, na.rm = TRUE),
      100 * sum(diabetes == 1, na.rm = TRUE) / n()
    ),
    `Stroke, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(stroke == 1, na.rm = TRUE),
      100 * sum(stroke == 1, na.rm = TRUE) / n()
    ),
    `Abnormal heart rhythm, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(abnormal_heart_rhythm == 1, na.rm = TRUE),
      100 * sum(abnormal_heart_rhythm == 1, na.rm = TRUE) / n()
    ),
    `Heart failure, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(heart_failure == 1, na.rm = TRUE),
      100 * sum(heart_failure == 1, na.rm = TRUE) / n()
    ),
    `Baseline Alzheimer’s, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(baseline_alzheimers == 1, na.rm = TRUE),
      100 * sum(baseline_alzheimers == 1, na.rm = TRUE) / n()
    ),
    `Baseline dementia, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(baseline_dementia == 1, na.rm = TRUE),
      100 * sum(baseline_dementia == 1, na.rm = TRUE) / n()
    ),
    .groups = "drop"
  )

# Transpose the wave 7 summary so that each characteristic is a row and each
# PA category is a column, then render it.
w7_table1_summary_wide <- w7_table1_summary %>%
  # N is an integer; cast it so it can be stacked with the text cells.
  mutate(across(N, as.character)) %>%
  pivot_longer(
    cols = !pa_level,
    names_to = "Characteristic",
    values_to = "Value"
  ) %>%
  pivot_wider(names_from = pa_level, values_from = Value) %>%
  # Fix the column order for presentation.
  select(all_of(c(
    "Characteristic", "High", "Moderate", "Low", "Sedentary", "Missing PA"
  )))

w7_table1_summary_wide %>%
  knitr::kable(
    caption = "Table 1. Baseline characteristics at wave 7 by physical activity category."
  )

Table 1. Baseline characteristics at wave 7 by physical activity category.
Characteristic	High	Moderate	Low	Sedentary	Missing PA
N	2848	4426	1483	906	3
Age, mean (SD)	63.9 (8.7)	66.7 (9.2)	70.1 (10.3)	72.3 (10.6)	73.0 (9.5)
Female, n (%)	1400 (49.2%)	2529 (57.1%)	963 (64.9%)	474 (52.3%)	2 (66.7%)
Hypertension, n (%)	141 (5.0%)	291 (6.6%)	111 (7.5%)	111 (12.3%)	0 (0.0%)
Diabetes, n (%)	45 (1.6%)	110 (2.5%)	60 (4.0%)	64 (7.1%)	0 (0.0%)
Stroke, n (%)	13 (0.5%)	39 (0.9%)	35 (2.4%)	49 (5.4%)	1 (33.3%)
Abnormal heart rhythm, n (%)	55 (1.9%)	116 (2.6%)	54 (3.6%)	49 (5.4%)	0 (0.0%)
Heart failure, n (%)	4 (0.1%)	6 (0.1%)	11 (0.7%)	12 (1.3%)	0 (0.0%)
Baseline Alzheimer’s, n (%)	0 (0.0%)	7 (0.2%)	7 (0.5%)	29 (3.2%)	0 (0.0%)
Baseline dementia, n (%)	3 (0.1%)	22 (0.5%)	13 (0.9%)	76 (8.4%)	0 (0.0%)

wave 8 table 1

# Wave 8: add binary diagnosis flags and a physical activity (PA) category to
# the core file. All original columns are kept. The activity items are
# lower-case (heacta/heactb/heactc) in this wave's file.
w8_table1_raw <- w8_core %>%
  mutate(
    # Flag is 1 when the item equals 1; any other value, including NA, is 0.
    hypertension = coalesce(as.numeric(hediabp == 1), 0),
    heart_failure = coalesce(as.numeric(hediahf == 1), 0),
    abnormal_heart_rhythm = coalesce(as.numeric(hediaar == 1), 0),
    diabetes = coalesce(as.numeric(hediadi == 1), 0),
    stroke = coalesce(as.numeric(hediast == 1), 0),

    baseline_alzheimers = coalesce(as.numeric(hedibad == 1), 0),
    baseline_dementia = coalesce(as.numeric(hedibde == 1), 0),

    # case_when() stops at the first match, so "Moderate" is only reached when
    # heacta is not 1/2, and "Low" only when neither heacta nor heactb is 1/2.
    # Rows matching none of the rules get NA.
    pa_level = case_when(
      heacta %in% c(1, 2) ~ "High",
      heactb %in% c(1, 2) ~ "Moderate",
      heactc %in% c(1, 2) ~ "Low",
      heacta %in% c(3, 4) & heactb %in% c(3, 4) & heactc %in% c(3, 4) ~
        "Sedentary",
      TRUE ~ NA_character_
    )
  )

# Wave 8 analysis table (corrected version): participant id, PA category,
# age, sex (from indsex in this wave) and the diagnosis flags.
# Methodological note: the original version used `age = indager` directly.
# It is not used in final summaries because negative indager values are
# special missing codes; they are set to NA here instead.
# NOTE(review): the age check below tops out at 89, which suggests ages of
# 90+ may be among the negative codes and so drop out of the age summaries -
# confirm against the ELSA documentation.
w8_table1 <- w8_table1_raw %>%
  mutate(
    pa_level = factor(pa_level, levels = c("High", "Moderate", "Low", "Sedentary")),
    # Non-negative ages are kept; everything else falls through to NA.
    age = case_when(indager >= 0 ~ as.numeric(indager)),
    sex = as_factor(indsex)
  ) %>%
  select(
    idauniq, pa_level, age, sex,
    hypertension, heart_failure, abnormal_heart_rhythm, diabetes, stroke,
    baseline_alzheimers, baseline_dementia
  )

# Diagnostic age check: distribution of the recoded age and number of missing
# ages within each PA category (rows without a PA category form an NA group).
w8_table1 %>%
  group_by(pa_level) %>%
  summarise(
    across(
      age,
      list(
        min = ~ min(.x, na.rm = TRUE),
        q1 = ~ quantile(.x, 0.25, na.rm = TRUE),
        median = ~ median(.x, na.rm = TRUE),
        mean = ~ mean(.x, na.rm = TRUE),
        q3 = ~ quantile(.x, 0.75, na.rm = TRUE),
        max = ~ max(.x, na.rm = TRUE),
        sd = ~ sd(.x, na.rm = TRUE)
      ),
      # Yields min_age, q1_age, ..., sd_age.
      .names = "{.fn}_{.col}"
    ),
    n_missing_age = sum(is.na(age))
  )

## # A tibble: 5 × 9
##   pa_level  min_age q1_age median_age mean_age q3_age max_age sd_age
##   <fct>       <dbl>  <dbl>      <dbl>    <dbl>  <dbl>   <dbl>  <dbl>
## 1 High           31   60           65     65.5   71        89   8.14
## 2 Moderate       34   62           68     68.4   75        89   8.84
## 3 Low            40   64           71     71.3   80        89   9.74
## 4 Sedentary      40   66           74     73.4   81        89   9.61
## 5 <NA>           66   67.5         69     70.3   72.5      76   5.13
## # ℹ 1 more variable: n_missing_age <int>

# Wave 8 Table 1 summary: one row per PA category (plus "Missing PA"), with
# formatted text cells for each baseline characteristic.
w8_table1_summary <- w8_table1 %>%
  mutate(
    # Build the "Missing PA" level explicitly rather than with the deprecated
    # forcats::fct_explicit_na(), which only adds the level when NAs are
    # present. Declaring it up front lets `.drop = FALSE` keep a "Missing PA"
    # row (and hence column in the wide table) even if nobody is missing; such
    # an empty group reports N = 0 with NaN/NA statistics.
    pa_level = factor(
      if_else(is.na(pa_level), "Missing PA", as.character(pa_level)),
      levels = c(levels(pa_level), "Missing PA")
    )
  ) %>%
  group_by(pa_level, .drop = FALSE) %>%
  summarise(
    N = n(),
    # sd() is NA when a group has fewer than two non-missing ages.
    `Age, mean (SD)` = sprintf("%.1f (%.1f)", mean(age, na.rm = TRUE), sd(age, na.rm = TRUE)),
    # Percentages use the full group size n() as the denominator.
    `Female, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(sex == "Female", na.rm = TRUE),
      100 * sum(sex == "Female", na.rm = TRUE) / n()
    ),
    `Hypertension, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(hypertension == 1, na.rm = TRUE),
      100 * sum(hypertension == 1, na.rm = TRUE) / n()
    ),
    `Diabetes, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(diabetes == 1, na.rm = TRUE),
      100 * sum(diabetes == 1, na.rm = TRUE) / n()
    ),
    `Stroke, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(stroke == 1, na.rm = TRUE),
      100 * sum(stroke == 1, na.rm = TRUE) / n()
    ),
    `Abnormal heart rhythm, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(abnormal_heart_rhythm == 1, na.rm = TRUE),
      100 * sum(abnormal_heart_rhythm == 1, na.rm = TRUE) / n()
    ),
    `Heart failure, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(heart_failure == 1, na.rm = TRUE),
      100 * sum(heart_failure == 1, na.rm = TRUE) / n()
    ),
    `Baseline Alzheimer’s, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(baseline_alzheimers == 1, na.rm = TRUE),
      100 * sum(baseline_alzheimers == 1, na.rm = TRUE) / n()
    ),
    `Baseline dementia, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(baseline_dementia == 1, na.rm = TRUE),
      100 * sum(baseline_dementia == 1, na.rm = TRUE) / n()
    ),
    .groups = "drop"
  )

# Transpose the wave 8 summary so that each characteristic is a row and each
# PA category is a column, then render it.
w8_table1_summary_wide <- w8_table1_summary %>%
  # N is an integer; cast it so it can be stacked with the text cells.
  mutate(across(N, as.character)) %>%
  pivot_longer(
    cols = !pa_level,
    names_to = "Characteristic",
    values_to = "Value"
  ) %>%
  pivot_wider(names_from = pa_level, values_from = Value) %>%
  # Fix the column order for presentation.
  select(all_of(c(
    "Characteristic", "High", "Moderate", "Low", "Sedentary", "Missing PA"
  )))

w8_table1_summary_wide %>%
  knitr::kable(
    caption = "Table 1. Baseline characteristics at wave 8 by physical activity category."
  )

Table 1. Baseline characteristics at wave 8 by physical activity category.
Characteristic	High	Moderate	Low	Sedentary	Missing PA
N	2443	3932	1281	786	3
Age, mean (SD)	65.5 (8.1)	68.4 (8.8)	71.3 (9.7)	73.4 (9.6)	70.3 (5.1)
Female, n (%)	1204 (49.3%)	2253 (57.3%)	848 (66.2%)	390 (49.6%)	0 (0.0%)
Hypertension, n (%)	111 (4.5%)	204 (5.2%)	92 (7.2%)	78 (9.9%)	0 (0.0%)
Diabetes, n (%)	35 (1.4%)	73 (1.9%)	43 (3.4%)	48 (6.1%)	0 (0.0%)
Stroke, n (%)	14 (0.6%)	42 (1.1%)	30 (2.3%)	62 (7.9%)	0 (0.0%)
Abnormal heart rhythm, n (%)	49 (2.0%)	99 (2.5%)	49 (3.8%)	50 (6.4%)	0 (0.0%)
Heart failure, n (%)	4 (0.2%)	14 (0.4%)	7 (0.5%)	18 (2.3%)	0 (0.0%)
Baseline Alzheimer’s, n (%)	2 (0.1%)	12 (0.3%)	12 (0.9%)	29 (3.7%)	0 (0.0%)
Baseline dementia, n (%)	7 (0.3%)	18 (0.5%)	25 (2.0%)	76 (9.7%)	0 (0.0%)

wave 9 table 1

## Wave 9 table 1

# Wave 9: add binary diagnosis flags and a physical activity (PA) category to
# the core file. All original columns are kept. The activity items are
# lower-case (heacta/heactb/heactc) in this wave's file.
w9_table1_raw <- w9_core %>%
  mutate(
    # Flag is 1 when the item equals 1; any other value, including NA, is 0.
    hypertension = coalesce(as.numeric(hediabp == 1), 0),
    heart_failure = coalesce(as.numeric(hediahf == 1), 0),
    abnormal_heart_rhythm = coalesce(as.numeric(hediaar == 1), 0),
    diabetes = coalesce(as.numeric(hediadi == 1), 0),
    stroke = coalesce(as.numeric(hediast == 1), 0),

    baseline_alzheimers = coalesce(as.numeric(hedibad == 1), 0),
    baseline_dementia = coalesce(as.numeric(hedibde == 1), 0),

    # case_when() stops at the first match, so "Moderate" is only reached when
    # heacta is not 1/2, and "Low" only when neither heacta nor heactb is 1/2.
    # Rows matching none of the rules get NA.
    pa_level = case_when(
      heacta %in% c(1, 2) ~ "High",
      heactb %in% c(1, 2) ~ "Moderate",
      heactc %in% c(1, 2) ~ "Low",
      heacta %in% c(3, 4) & heactb %in% c(3, 4) & heactc %in% c(3, 4) ~
        "Sedentary",
      TRUE ~ NA_character_
    )
  )

# Wave 9 analysis table (corrected version): participant id, PA category,
# age, sex (from indsex in this wave) and the diagnosis flags.
# Methodological note: the original version used `age = indager` directly.
# It is not used in final summaries because negative indager values are
# special missing codes; they are set to NA here instead.
w9_table1 <- w9_table1_raw %>%
  mutate(
    pa_level = factor(pa_level, levels = c("High", "Moderate", "Low", "Sedentary")),
    # Non-negative ages are kept; everything else falls through to NA.
    age = case_when(indager >= 0 ~ as.numeric(indager)),
    sex = as_factor(indsex)
  ) %>%
  select(
    idauniq, pa_level, age, sex,
    hypertension, heart_failure, abnormal_heart_rhythm, diabetes, stroke,
    baseline_alzheimers, baseline_dementia
  )

# Diagnostic age check: distribution of the recoded age and number of missing
# ages within each PA category (rows without a PA category form an NA group).
w9_table1 %>%
  group_by(pa_level) %>%
  summarise(
    across(
      age,
      list(
        min = ~ min(.x, na.rm = TRUE),
        q1 = ~ quantile(.x, 0.25, na.rm = TRUE),
        median = ~ median(.x, na.rm = TRUE),
        mean = ~ mean(.x, na.rm = TRUE),
        q3 = ~ quantile(.x, 0.75, na.rm = TRUE),
        max = ~ max(.x, na.rm = TRUE),
        sd = ~ sd(.x, na.rm = TRUE)
      ),
      # Yields min_age, q1_age, ..., sd_age.
      .names = "{.fn}_{.col}"
    ),
    n_missing_age = sum(is.na(age))
  )

## # A tibble: 5 × 9
##   pa_level  min_age q1_age median_age mean_age q3_age max_age sd_age
##   <fct>       <dbl>  <dbl>      <dbl>    <dbl>  <dbl>   <dbl>  <dbl>
## 1 High           33     56         64     63.9   71        89   9.20
## 2 Moderate       33     61         68     67.7   75        90   9.95
## 3 Low            30     64         72     71.2   80.5      90  10.9 
## 4 Sedentary      42     65         74     73.4   83        90  10.9 
## 5 <NA>           56     56         56     56     56        56  NA   
## # ℹ 1 more variable: n_missing_age <int>

# Wave 9 Table 1 summary: one row per PA category (plus "Missing PA"), with
# formatted text cells for each baseline characteristic.
w9_table1_summary <- w9_table1 %>%
  mutate(
    # Build the "Missing PA" level explicitly rather than with the deprecated
    # forcats::fct_explicit_na(), which only adds the level when NAs are
    # present. Declaring it up front lets `.drop = FALSE` keep a "Missing PA"
    # row (and hence column in the wide table) even if nobody is missing; such
    # an empty group reports N = 0 with NaN/NA statistics.
    pa_level = factor(
      if_else(is.na(pa_level), "Missing PA", as.character(pa_level)),
      levels = c(levels(pa_level), "Missing PA")
    )
  ) %>%
  group_by(pa_level, .drop = FALSE) %>%
  summarise(
    N = n(),
    # sd() is NA when a group has fewer than two non-missing ages.
    `Age, mean (SD)` = sprintf("%.1f (%.1f)", mean(age, na.rm = TRUE), sd(age, na.rm = TRUE)),
    # Percentages use the full group size n() as the denominator.
    `Female, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(sex == "Female", na.rm = TRUE),
      100 * sum(sex == "Female", na.rm = TRUE) / n()
    ),
    `Hypertension, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(hypertension == 1, na.rm = TRUE),
      100 * sum(hypertension == 1, na.rm = TRUE) / n()
    ),
    `Diabetes, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(diabetes == 1, na.rm = TRUE),
      100 * sum(diabetes == 1, na.rm = TRUE) / n()
    ),
    `Stroke, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(stroke == 1, na.rm = TRUE),
      100 * sum(stroke == 1, na.rm = TRUE) / n()
    ),
    `Abnormal heart rhythm, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(abnormal_heart_rhythm == 1, na.rm = TRUE),
      100 * sum(abnormal_heart_rhythm == 1, na.rm = TRUE) / n()
    ),
    `Heart failure, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(heart_failure == 1, na.rm = TRUE),
      100 * sum(heart_failure == 1, na.rm = TRUE) / n()
    ),
    `Baseline Alzheimer’s, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(baseline_alzheimers == 1, na.rm = TRUE),
      100 * sum(baseline_alzheimers == 1, na.rm = TRUE) / n()
    ),
    `Baseline dementia, n (%)` = sprintf(
      "%d (%.1f%%)",
      sum(baseline_dementia == 1, na.rm = TRUE),
      100 * sum(baseline_dementia == 1, na.rm = TRUE) / n()
    ),
    .groups = "drop"
  )

# Transpose the wave 9 summary so that each characteristic is a row and each
# PA category is a column, then render it.
w9_table1_summary_wide <- w9_table1_summary %>%
  # N is an integer; cast it so it can be stacked with the text cells.
  mutate(across(N, as.character)) %>%
  pivot_longer(
    cols = !pa_level,
    names_to = "Characteristic",
    values_to = "Value"
  ) %>%
  pivot_wider(names_from = pa_level, values_from = Value) %>%
  # Fix the column order for presentation.
  select(all_of(c(
    "Characteristic", "High", "Moderate", "Low", "Sedentary", "Missing PA"
  )))

w9_table1_summary_wide %>%
  knitr::kable(
    caption = "Table 1. Baseline characteristics at wave 9 by physical activity category."
  )

Table 1. Baseline characteristics at wave 9 by physical activity category.
Characteristic	High	Moderate	Low	Sedentary	Missing PA
N	2741	3876	1337	780	2
Age, mean (SD)	63.9 (9.2)	67.7 (9.9)	71.2 (10.9)	73.4 (10.9)	56.0 (NA)
Female, n (%)	1384 (50.5%)	2203 (56.8%)	892 (66.7%)	402 (51.5%)	1 (50.0%)
Hypertension, n (%)	198 (7.2%)	289 (7.5%)	102 (7.6%)	113 (14.5%)	0 (0.0%)
Diabetes, n (%)	88 (3.2%)	139 (3.6%)	60 (4.5%)	61 (7.8%)	0 (0.0%)
Stroke, n (%)	14 (0.5%)	39 (1.0%)	28 (2.1%)	39 (5.0%)	0 (0.0%)
Abnormal heart rhythm, n (%)	68 (2.5%)	168 (4.3%)	64 (4.8%)	59 (7.6%)	0 (0.0%)
Heart failure, n (%)	2 (0.1%)	8 (0.2%)	18 (1.3%)	16 (2.1%)	0 (0.0%)
Baseline Alzheimer’s, n (%)	1 (0.0%)	9 (0.2%)	9 (0.7%)	28 (3.6%)	0 (0.0%)
Baseline dementia, n (%)	4 (0.1%)	23 (0.6%)	23 (1.7%)	76 (9.7%)	1 (50.0%)

Wave summary numbers

### Total participant numbers by wave

# Row count of each wave's data frame, in wave order. Wave 1 uses the merged
# core + derived data (`w1`); waves 2 to 9 use the core files.
wave_total_participant_numbers <- tibble(
  wave = paste("Wave", 1:9),
  total_n = vapply(
    list(
      w1, w2_core, w3_core, w4_core, w5_core,
      w6_core, w7_core, w8_core, w9_core
    ),
    nrow,
    integer(1)
  )
)

wave_total_participant_numbers %>%
  knitr::kable(caption = "Total participant numbers by wave.")

Total participant numbers by wave.
wave	total_n
Wave 1	12099
Wave 2	9432
Wave 3	9771
Wave 4	11050
Wave 5	10274
Wave 6	10601
Wave 7	9666
Wave 8	8445
Wave 9	8736

### Diagnosis summary numbers by wave

# Wave 1 diagnosis summary numbers
# Counts rows where the wave 1 Alzheimer's / dementia indicators equal 1.
wave1_diagnosis_summary_numbers <- w1 %>%
  transmute(
    has_alz = alz_w1 == 1,
    has_dem = dementia_w1 == 1
  ) %>%
  summarise(
    wave = "Wave 1",
    alzheimers_n = sum(has_alz, na.rm = TRUE),
    dementia_n = sum(has_dem, na.rm = TRUE),
    both_n = sum(has_alz & has_dem, na.rm = TRUE),
    either_n = sum(has_alz | has_dem, na.rm = TRUE)
  )

# Wave 2 diagnosis summary numbers
# Diagnoses are read from the four slots hedib01-hedib04: code 8 is counted
# as Alzheimer's and code 9 as dementia.
wave2_diagnosis_summary_numbers <- w2_core %>%
  mutate(
    has_alz = hedib01 == 8 | hedib02 == 8 | hedib03 == 8 | hedib04 == 8,
    has_dem = hedib01 == 9 | hedib02 == 9 | hedib03 == 9 | hedib04 == 9,
    # Number of slots holding either code (never NA, because %in% is not).
    n_dx_slots = (hedib01 %in% c(8, 9)) + (hedib02 %in% c(8, 9)) +
      (hedib03 %in% c(8, 9)) + (hedib04 %in% c(8, 9))
  ) %>%
  summarise(
    wave = "Wave 2",
    alzheimers_n = sum(has_alz, na.rm = TRUE),
    dementia_n = sum(has_dem, na.rm = TRUE),
    # "Both" = more than one slot carries an 8 or a 9.
    both_n = sum(n_dx_slots > 1, na.rm = TRUE),
    either_n = sum(n_dx_slots >= 1, na.rm = TRUE)
  )

# Wave 3 diagnosis summary numbers
# In this wave, values 1, 2 and 3 of hedibad / hedibde all count as a report.
wave3_diagnosis_summary_numbers <- w3_core %>%
  transmute(
    has_alz = hedibad %in% c(1, 2, 3),
    has_dem = hedibde %in% c(1, 2, 3)
  ) %>%
  summarise(
    wave = "Wave 3",
    alzheimers_n = sum(has_alz, na.rm = TRUE),
    dementia_n = sum(has_dem, na.rm = TRUE),
    both_n = sum(has_alz & has_dem, na.rm = TRUE),
    either_n = sum(has_alz | has_dem, na.rm = TRUE)
  )

# Waves 4 to 9 diagnosis summary numbers
# These waves use the same items (hedibad, hedibde) with the same rule
# (value 1 counts as a report), so one helper replaces six copy-pasted
# summaries.

#' Count Alzheimer's / dementia reports in one wave's core file.
#'
#' @param wave_data Data frame containing `hedibad` and `hedibde`.
#' @param wave_label Label stored in the `wave` column, e.g. "Wave 4".
#' @return One-row data frame with columns wave, alzheimers_n, dementia_n,
#'   both_n and either_n.
count_wave_diagnoses <- function(wave_data, wave_label) {
  wave_data %>%
    summarise(
      # .env$ makes sure the function argument is used even if the data
      # happens to contain a column called `wave_label`.
      wave = .env$wave_label,
      alzheimers_n = sum(hedibad == 1, na.rm = TRUE),
      dementia_n   = sum(hedibde == 1, na.rm = TRUE),
      both_n       = sum(hedibad == 1 & hedibde == 1, na.rm = TRUE),
      either_n     = sum(hedibad == 1 | hedibde == 1, na.rm = TRUE)
    )
}

wave4_diagnosis_summary_numbers <- count_wave_diagnoses(w4_core, "Wave 4")
wave5_diagnosis_summary_numbers <- count_wave_diagnoses(w5_core, "Wave 5")
wave6_diagnosis_summary_numbers <- count_wave_diagnoses(w6_core, "Wave 6")
wave7_diagnosis_summary_numbers <- count_wave_diagnoses(w7_core, "Wave 7")
wave8_diagnosis_summary_numbers <- count_wave_diagnoses(w8_core, "Wave 8")
wave9_diagnosis_summary_numbers <- count_wave_diagnoses(w9_core, "Wave 9")

# Stack the per-wave one-row summaries into a single table (wave order) and
# render it.
wave_diagnosis_summary_numbers <- list(
  wave1_diagnosis_summary_numbers,
  wave2_diagnosis_summary_numbers,
  wave3_diagnosis_summary_numbers,
  wave4_diagnosis_summary_numbers,
  wave5_diagnosis_summary_numbers,
  wave6_diagnosis_summary_numbers,
  wave7_diagnosis_summary_numbers,
  wave8_diagnosis_summary_numbers,
  wave9_diagnosis_summary_numbers
) %>%
  bind_rows()

wave_diagnosis_summary_numbers %>%
  knitr::kable(caption = "Diagnosis summary numbers by wave.")

Diagnosis summary numbers by wave.
wave	alzheimers_n	dementia_n	both_n	either_n
Wave 1	14	64	3	75
Wave 2	11	43	2	52
Wave 3	32	112	11	133
Wave 4	37	97	13	121
Wave 5	46	110	19	137
Wave 6	50	112	17	145
Wave 7	43	114	19	138
Wave 8	55	126	23	158
Wave 9	47	127	21	153

### Missing physical activity summary numbers by wave

# Wave 1 missing PA
# Number of rows in the cleaned wave 1 table whose PA category is NA.
wave1_missing_pa_summary_numbers <- summarise(
  table1_w1_clean,
  wave = "Wave 1",
  missing_pa_n = sum(is.na(pa_level))
)

# Wave 2 missing PA
# Merge the wave 2 derived-variables file onto the core file by participant id
# (presumably the source of `palevel` - confirm against the data dictionary).
w2_derived <- read_dta("raw data/RAW_data_stata/wave_2_derived_variables.dta")
w2 <- w2_core %>%
  left_join(w2_derived, by = "idauniq")

# Any palevel outside 0-3 (including NA) counts as missing PA.
wave2_missing_pa_summary_numbers <- w2 %>%
  summarise(
    wave = "Wave 2",
    missing_pa_n = sum(!palevel %in% c(0, 1, 2, 3))
  )
# Waves 3 to 9: the physical-activity variables have not been standardised
# yet, so each wave gets a one-row placeholder with missing_pa_n set to NA.
make_missing_pa_placeholder <- function(wave_label) {
  tibble(wave = wave_label, missing_pa_n = NA_integer_)
}

wave3_missing_pa_summary_numbers <- make_missing_pa_placeholder("Wave 3")
wave4_missing_pa_summary_numbers <- make_missing_pa_placeholder("Wave 4")
wave5_missing_pa_summary_numbers <- make_missing_pa_placeholder("Wave 5")
wave6_missing_pa_summary_numbers <- make_missing_pa_placeholder("Wave 6")
wave7_missing_pa_summary_numbers <- make_missing_pa_placeholder("Wave 7")
wave8_missing_pa_summary_numbers <- make_missing_pa_placeholder("Wave 8")
wave9_missing_pa_summary_numbers <- make_missing_pa_placeholder("Wave 9")

# Stack the per-wave missing-PA summaries (waves 1 to 9, in wave order) into
# a single table, then render it with a caption.
wave_missing_pa_summary_numbers <- bind_rows(
  list(
    wave1_missing_pa_summary_numbers,
    wave2_missing_pa_summary_numbers,
    wave3_missing_pa_summary_numbers,
    wave4_missing_pa_summary_numbers,
    wave5_missing_pa_summary_numbers,
    wave6_missing_pa_summary_numbers,
    wave7_missing_pa_summary_numbers,
    wave8_missing_pa_summary_numbers,
    wave9_missing_pa_summary_numbers
  )
)

wave_missing_pa_summary_numbers %>%
  knitr::kable(caption = "Missing physical activity summary numbers by wave.")

Missing physical activity summary numbers by wave.
wave	missing_pa_n
Wave 1	194
Wave 2	139
Wave 3	NA
Wave 4	NA
Wave 5	NA
Wave 6	NA
Wave 7	NA
Wave 8	NA
Wave 9	NA

## correction for PA missing
# Rebuilds wave_missing_pa_summary_numbers with a real count for every wave:
# waves 1 and 3-9 count NA values of pa_level in their table data frames;
# wave 2 first recodes palevel values other than 0, 1, 2 or 3 to NA.
wave_missing_pa_summary_numbers <- bind_rows(
  list(
    table1_w1_clean %>%
      summarise(wave = "Wave 1", missing_pa_n = sum(is.na(pa_level))),
    w2 %>%
      mutate(
        pa_level = if_else(palevel %in% c(0, 1, 2, 3), palevel, NA_real_)
      ) %>%
      summarise(wave = "Wave 2", missing_pa_n = sum(is.na(pa_level))),
    w3_table1 %>%
      summarise(wave = "Wave 3", missing_pa_n = sum(is.na(pa_level))),
    w4_table1 %>%
      summarise(wave = "Wave 4", missing_pa_n = sum(is.na(pa_level))),
    w5_table1 %>%
      summarise(wave = "Wave 5", missing_pa_n = sum(is.na(pa_level))),
    w6_table1 %>%
      summarise(wave = "Wave 6", missing_pa_n = sum(is.na(pa_level))),
    w7_table1 %>%
      summarise(wave = "Wave 7", missing_pa_n = sum(is.na(pa_level))),
    w8_table1 %>%
      summarise(wave = "Wave 8", missing_pa_n = sum(is.na(pa_level))),
    w9_table1 %>%
      summarise(wave = "Wave 9", missing_pa_n = sum(is.na(pa_level)))
  )
)

### Combined wave summary numbers
# Join participant totals, diagnosis counts and missing-PA counts on the
# wave label. final_analysis_n is filled only for Wave 1 (the row count of
# analysis_w1); every other wave is left as NA.
wave_summary_numbers_table <- wave_total_participant_numbers %>%
  left_join(wave_diagnosis_summary_numbers, by = "wave") %>%
  left_join(wave_missing_pa_summary_numbers, by = "wave") %>%
  mutate(
    final_analysis_n = if_else(
      wave == "Wave 1",
      nrow(analysis_w1),
      NA_integer_
    )
  )

wave_summary_numbers_table %>%
  knitr::kable(caption = "Combined wave summary numbers.")

Combined wave summary numbers.
wave	total_n	alzheimers_n	dementia_n	both_n	either_n	missing_pa_n	final_analysis_n
Wave 1	12099	14	64	3	75	194	11805
Wave 2	9432	11	43	2	52	139	NA
Wave 3	9771	32	112	11	133	15	NA
Wave 4	11050	37	97	13	121	15	NA
Wave 5	10274	46	110	19	137	124	NA
Wave 6	10601	50	112	17	145	4	NA
Wave 7	9666	43	114	19	138	3	NA
Wave 8	8445	55	126	23	158	3	NA
Wave 9	8736	47	127	21	153	2	NA

Part 2: Data Analysis

2.1 Aims and objectives

The principal aim of the following data analysis is to investigate whether there is any directional association between PA and incident dementia risk in adults over fifty years old across a ten-year follow-up period in the longitudinal dataset provided by ELSA. ELSA is a repository of data from the population residing in private domiciles in England.

The ELSA data includes social demographic variables, in addition to lifestyle and health characteristics. The original sample was based on participants who responded between 1998 to 2001 to the Health Survey for England (HSE), following which the original respondents participated in interviews every two years and were organised into ‘waves’. Interviews included the same questions being inquired about health and lifestyle every two-years. The participant pool was supplemented across certain waves. The above details make the ELSA data-set suitable to meet the primary aim of the analysis by providing a long follow-up period from a representative population whereby new diagnoses of dementia, and baseline levels of physical activity can be matched to assess for any present association.(33)

An objective of the data analysis was to complement the literature review rather than replicate any particular study that formed part of it. Additionally other objectives included the derivation of a baseline cohort from ELSA wave 1 that depicted different levels of PA, dementia, alzheimer’s dementia, alongside age, gender, vascular disease, and other comorbidities to elucidate the raw dataset at inception. Secondly, to categorise the raw dataset based on PA level to assess whether a pattern emerged between different PA levels and different ages and comorbidities. Thirdly the exclusion of participants who have at baseline any dementia or pathological cognitive state for the purpose of minimising reverse causation and finally to utilise Cox proportional hazard to assess for any association between PA and incident dementia.

The final objective was to see if the results could be practically relevant to population health and inform discussions between clinicians and patients.

2.2 Design and Methods

2.2.1 Study design

The details of the data source, ELSA, used to perform this data analysis is documented in section 2.1 (Aims and Objectives).

In order to appreciate any emergent patterns at baseline prior to longitudinal analysis, the wave 1 core dataset was joined using the distinct participant ID (‘idauniq’) with the wave 1 derived variables which led to the creation of a unique wave 1 (‘w1’) dataset that combined the variables of concern from the core dataset (dementia, Alzheimer’s, PA, and IQCODE) with variables from the ‘wave 1 ifs derived variables’ dataset such as smoking status, education, employment and depression score means. This process is part of data stewardship by establishing whether the data is likely to be accurate by enabling expected patterns to be readily visualised, and similarly to highlight any peculiarities within the data. The same process was performed for subsequent waves, for the additional purpose of producing descriptive summaries for each wave.

For the longitudinal analysis, the wave 1 baseline dataset with dementia, Alzheimer’s, Parkinson’s disease and pathological cognitive decline excluded (termed ‘analysis_w1’), was followed up across subsequent waves to observe for cases of incident dementia via the participant ID.

2.2.2 Physical activity

In ELSA PA was gathered via self-reported questionnaires in which participants indicated the type of activity that they engaged with, which was then apportioned into distinct categories to reflect the intensity of activity (vigorous, moderate and mild). The frequency with which they engaged in activity was also gathered. In certain studies that conducted research on the ELSA dataset, including one in the literature review, both vigorous and moderate categories were combined, potentially reducing the granularity of the information and losing the ability to adequately discriminate between various PA groups.(24) As such a methodological decision was taken to produce in this analysis four categories: ‘high’, ‘moderate’, ‘low’ and ‘sedentary’, where ‘high’ equated to ELSA’s vigorous (coded as: heacta), ‘moderate’ to ELSA’s moderate (coded as: heactb), ‘low’ to ELSA’s mild (coded as: heactc), and ‘sedentary’ for participants who did not meet the thresholds for ‘high’, ‘moderate’, or ‘low’ (namely ELSA’s vigorous, moderate or mild respectively).

w1_palevel = case_when( heacta %in% c(1, 2) ~ "High", !heacta %in% c(1, 2) & heactb %in% c(1, 2) ~ "Moderate", !heacta %in% c(1, 2) & !heactb %in% c(1, 2) & heactc %in% c(1, 2) ~ "Low", heacta %in% c(3, 4) & heactb %in% c(3, 4) & heactc %in% c(3, 4) ~ "Sedentary",

Each PA level created in the data analysis related to participating at the PA level for a frequency of at least once per week. If a PA level(s) was established less than once per week they were classed as ‘sedentary’. As per ELSA’s description, ‘high’ (ELSA’s vigorous) equated to activities that included digging, aerobics and cycling; ‘moderate’ (ELSA’s moderate) related to activities such as a moderately paced walk, dancing or cleaning a car; ‘low’ (ELSA’s mild) was housework to include laundry. To qualify as any of ‘high’, ‘moderate’ or ‘low’, to establish a unique cohort for each, the participants could not be able to be included in any of the other PA levels.

Choosing to delineate PA into these distinct categories could prove more useful when providing patients with information about types of activities that could prove profitable to reducing the risk of dementia, by creating specific PA levels that relate to particular types of activities rather than diluting the nuances between PA levels by combining disparate levels together, and therefore not being able to decipher what type of activity is driving any change produced. Furthermore commercial fitness based opportunities where everyday activities are promoted to encourage against sedentary lifestyles could be leveraged to improve healthier lifestyles in a stepwise moderated way.

2.2.3 Incident dementia

Dementia was identified in ELSA by a participant’s positive response to the following question: ‘Has a doctor ever told you that you have (or have had) any of the conditions on this card (with option ’9’ responding to ‘Dementia, organic brain syndrome, senility or any other serious memory impairment’)?’.(34 pg.24) A distinction should be made regarding the use of IQCODE which could be used to denote dementia, but instead was used to establish pathological cognitive impairment in this data analysis. Dementia was solely appropriated to participants identified by the above self-assessment question measure streamlining the process in which dementia was identified in ELSA, whilst IQCODE was used to eliminate pathological cognitive decline participants in an effort to curb reverse causality and as part of sensitivity analysis.

Furthermore the interviewer query that relates to dementia is not reserved only for ‘dementia’ but also senility and other named syndromes and impairments that impact cognition resulting in a heterogenous compilation of diagnostic states as per the design of the ELSA study. Secondly in relation to the ELSA design, Alzheimer’s disease was captured as a separate disease state, and not combined with ‘dementia’, and so as to maintain integrity to the ELSA original construct dementia cases were identified by the aforementioned ‘dementia’ question solely.

In order to locate cases of incident dementia in waves subsequent to wave 1, a longitudinal analysis dataset was created in which dementia indicators from waves 2 to 9 were joined to eligible participants via a unique identifier. By this method the first wave in which any case of incident dementia was found could be isolated.

2.2.4 Covariates, data stewardship and data quality

Variables that could influence the dementia outcome variable were chosen to include modifiable, nonmodifiable, pathological and lifestyle factors, in order to assist with discussions with diverse patient populations about how these findings in conjunction with other research and national guidance could be used to live healthier lives in context of dementia risk. Modifiable covariates chosen included smoking and hypertension. Smoking has been noted to increase dementia risk by between fifty to eighty percent (35 pg.466) and as such was noted as important to include here.

The smoking variable provided an example of how data stewardship and quality were approached within the analysis. The value labels within wave 1 for smoking (‘smokerstat’) were: never smoked, current smoker, three ex-smoker categories (ex-smoker occasional, ex-smoker regular, ex-smoker DK-frequency) and negative codes that included ‘refusal’, ‘didn’t know’, and ‘wasn’t asked’. These value labels were then recoded into three categories: Never; Ex-smoker; Current, in a newly created ‘smoking3’ variable, and in the Cox model a binary variable was created:

current_smoker = if_else(smoking3 == "Current", 1, 0, missing = NA_real_)

By performing the above the aim was the responsible management of data by assessing the original coding variables, handling negative categories explicitly and producing a current smoker indicator appropriate for analysis.

In England current national guidance refers to cardiovascular disease as a risk factor for dementia, and states that the early prevention of cardiovascular and stroke states are thought to attenuate both vascular and mixed dementias. The inclusion of hypertension, diabetes, abnormal heart rhythms and heart failure were chosen as variables that could affect our outcome variable. Similarly certain publications have noted a significant risk of stroke for incident dementia, which led to the methodological choice of the inclusion of stroke. (36, 37). Age and sex are notable confounding factors and were therefore added to our variable model to assess how these demographic factors may influence dementia.

2.2.5 Reverse causation and sensitivity analysis

To reduce the risk that participants who already had signs of cognitive decline which could impact their ability to engage with PA, participants with an IQCODE of 3.38 and above were excluded from the cox analysis. A score of 3.38 was observed by authors in the field as suitably equating to pathological cognitive decline whilst maintaining both sensitivity and specificity (38). The above was a method used to reduce reverse causation by excluding participants who possibly had prodromal features of dementia and who if included could increase the chance of adding participants that did not engage with PA due to early cognitive states.

Excluding participants who may have dementia in the context of other disease states (namely Parkinson’s disease) was a methodological choice, as parkinson’s disease is known to impact cognitive abilities to varying degrees. Those with Alzheimer’s disease were also excluded for similar reasons. Furthermore, by choosing a follow-up period of ten years, this increased the chance that dementia which is known to have a long prodromal period was more likely to occur after the PA exposure helping to limit issues of reverse causation additionally.

Sensitivity analysis was conducted comparing models with and without IQCODE exclusion to assess if excluding IQCODE-determined participants with pathological cognitive decline impacted the outcome in any way. This was considered methodologically pertinent in order to analyse the assumption that the presence of those with informant-informed functional cognitive decline may change the outcome by reflecting reduced PA due to prodromal disease, thus inadvertently weakening any cautious causal interpretation, and impacting temporality by attenuating the likelihood that any reduced PA was reflecting ability outside of pre-existing cognitive deterioration, thus creating ambiguity as to whether low PA or sedentary levels preceded any already existing signs of cognitive impairment.

2.2.6 Statistical analyses

Descriptive statistical analyses were initiated on the baseline wave 1 dataset prior to any exclusions to illustrate any trends in age, gender, socio-demographic, and medical factors according to PA levels. Standard deviations were produced for continuous variables and categorical variables were represented with percentages. Presentation of the findings in a baseline table provided a transparent account of the dataset, allowing readily accessible deductions to be made, a broad appreciation of whether the trends presented were expected, and unexpected findings to be highlighted early for further inspection.

Cox proportional hazard regression models were used to assess for associations between PA levels and incident dementia. Two models were produced: unadjusted for covariates, and adjusted. Moderate, low and sedentary PA levels were compared with high PA levels in both models and the results were articulated through hazard ratios, 95% confidence intervals and p-values.

2.3 Results

2.3.1 Descriptive analyses and baseline cohort

Prior to any exclusions the baseline wave 1 dataset was comprised of a total of 12,099 participants. Following the exclusion of prevalent dementia, Alzheimer’s disease, Parkinson’s and pathological cognitive decline the number decreased to 11,805. Table 1 shows the baseline characteristics of the original 12,099. Table 1 evidences that when high PA and sedentary groups were compared, the sedentary group was ten years older (mean ages of 60.6 and 70.8 respectively) and the sedentary group had the highest percentages of hypertension, diabetes, stroke, abnormal heart rhythm, and heart failure compared to all the other PA levels, showing that the sedentary group were older, and burdened with more comorbidities than the high PA and moderate PA groups especially.

The unadjusted Cox modelling was performed on the 11,805 participants following the above exclusions and upon follow-up there were 480 cases of incident dementia. Missing covariate information led to a further loss of participants such that a total of 11,679 participants were analysed in the adjusted model, with 476 incident dementia occurrences.

2.3.2 Unadjusted Cox model

There was an inverse relationship between PA level and the risk of incident dementia: lower PA levels were associated with increased risk. This relationship followed a dose-response pattern. Hazard ratios were calculated with the high PA level as the reference group. Moderate PA had a hazard ratio of 1.73 with CI: 1.34 to 2.23 and $p = 2.18 \times 10^{-5}$; low PA hazard ratio: 2.24 with CI: 1.67 to 3.02 and $p = 9.69 \times 10^{-8}$; sedentary activity hazard ratio: 2.42 with CI: 1.76 to 3.34 and $p = 5.72 \times 10^{-8}$.

2.3.3 Adjusted Cox model

The following covariates were accounted for in the adjusted model: age, gender, current smoker, hypertension, diabetes, stroke, abnormal heart rhythms, heart failure and depression. The hazard ratios for moderate, and low activity in comparison to the high PA level remained illustrative of exercise producing a protective effect with hazard ratios of 1.33 and 1.23 respectively, whilst the sedentary group with a HR 1.02 demonstrates no clear difference between sedentary and high PA groups.

The CIs and p-values provide further information with Moderate PA HR 1.33 CI: 1.03 to 1.73, p = 0.0286; low PA HR 1.23 CI: 0.89 to 1.68, p = 0.204; sedentary HR: 1.02 CI: 0.72 to 1.45, p = 0.91. Moderate PA exhibiting a 95% CI that does not intersect 1, and a p value below 0.05 is a significant result, whilst low PA and sedentary groups have a smaller impact when the above covariates are taken into account, and no association in both low PA and sedentary groups can not be ruled out given the CIs for these groups.

Age, with a HR of 1.06 (CI: 1.05 to 1.07) and a p-value of $< 2 \times 10^{-16}$, was the most highly significant covariate for dementia incidence. Female gender, abnormal heart rhythms and depression were comparable with moderate PA in terms of significance for their respective hazard towards dementia incidence. Their CIs and p-values are as follows.

Female gender: HR 1.23 (CI: 1.02 to 1.49); p = 0.03 Abnormal heart rhythm: HR 1.39 (CI: 1.03 to 1.9); p = 0.03 Depression: HR 1.3 (CI: 1.04 to 1.62); p = 0.02

2.3.4 Sensitivity analysis

Sensitivity analysis was performed both with and without pathological cognitive decline as per an IQCODE score of >3.38, to assess for any change to the results. The unadjusted and adjusted Cox analyses without the IQCODE exclusion produced the same results as the analyses with it. When the above result was investigated for data quality by assessing the counts of participants with IQCODE >3.38 prior to the exclusions of dementia, Alzheimer’s and Parkinson’s disease, the expected number of 52 was generated. Once the count of participants with IQCODE >3.38 was performed following the above exclusions, the count became ‘0’.

The output of the above is represented below.

# Sensitivity models fitted on analysis_w1_no_iqcode.
# Unadjusted: physical-activity level is the only predictor.
cox_unadjusted_no_iqcode <- coxph(
  Surv(time_to_event_waves, event_dementia) ~ pa_level,
  data = analysis_w1_no_iqcode
)

# Adjusted: physical-activity level plus the demographic, lifestyle and
# comorbidity covariates listed in the formula.
cox_adjusted_no_iqcode <- coxph(
  Surv(time_to_event_waves, event_dementia) ~ pa_level + age + sex +
    current_smoker + hypertension + diabetes + stroke +
    abnormal_heart_rhythm + heart_failure + depression_binary,
  data = analysis_w1_no_iqcode
)

# One call per line: two calls on the same line without a separator do not
# parse in R.
summary(cox_unadjusted_no_iqcode)
summary(cox_adjusted_no_iqcode)

Call:
coxph(formula = Surv(time_to_event_waves, event_dementia) ~ pa_level, 
    data = analysis_w1_no_iqcode)

  n= 11805, number of events= 480 

                    coef exp(coef) se(coef)     z Pr(>|z|)    
pa_levelModerate  0.5470    1.7281   0.1288 4.246 2.18e-05 ***
pa_levelLow       0.8084    2.2444   0.1516 5.332 9.69e-08 ***
pa_levelSedentary 0.8857    2.4246   0.1632 5.427 5.72e-08 ***
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

                  exp(coef) exp(-coef) lower .95 upper .95
pa_levelModerate      1.728     0.5787     1.342     2.225
pa_levelLow           2.244     0.4456     1.667     3.021
pa_levelSedentary     2.425     0.4124     1.761     3.339

Concordance= 0.579  (se = 0.012 )
Likelihood ratio test= 41.82  on 3 df,   p=4e-09
Wald test            = 38.66  on 3 df,   p=2e-08
Score (logrank) test = 40.21  on 3 df,   p=1e-08

Call:
coxph(formula = Surv(time_to_event_waves, event_dementia) ~ pa_level + 
    age + sex + current_smoker + hypertension + diabetes + stroke + 
    abnormal_heart_rhythm + heart_failure + depression_binary, 
    data = analysis_w1_no_iqcode)

  n= 11679, number of events= 476 
   (126 observations deleted due to missingness)
   
                              coef exp(coef)  se(coef)      z Pr(>|z|)    
pa_levelModerate       0.288026  1.333792  0.131591  2.189   0.0286 *  
pa_levelLow            0.204592  1.227024  0.161119  1.270   0.2041    
pa_levelSedentary      0.020806  1.021024  0.178992  0.116   0.9075    
age                    0.061431  1.063357  0.004261 14.418   <2e-16 ***
sexFemale              0.208323  1.231611  0.096090  2.168   0.0302 *  
current_smoker        -0.117040  0.889550  0.140726 -0.832   0.4056    
hypertension           0.085720  1.089501  0.094001  0.912   0.3618    
diabetes               0.160271  1.173829  0.157535  1.017   0.3090    
stroke                 0.218415  1.244104  0.178580  1.223   0.2213    
abnormal_heart_rhythm  0.329790  1.390676  0.155400  2.122   0.0338 *  
heart_failure         -0.696122  0.498515  0.582639 -1.195   0.2322    
depression_binary      0.258923  1.295534  0.113208  2.287   0.0222 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

                      exp(coef) exp(-coef) lower .95 upper .95
pa_levelModerate         1.3338     0.7497    1.0306     1.726
pa_levelLow              1.2270     0.8150    0.8948     1.683
pa_levelSedentary        1.0210     0.9794    0.7189     1.450
age                      1.0634     0.9404    1.0545     1.072
sexFemale                1.2316     0.8119    1.0202     1.487
current_smoker           0.8895     1.1242    0.6751     1.172
hypertension             1.0895     0.9179    0.9062     1.310
diabetes                 1.1738     0.8519    0.8620     1.598
stroke                   1.2441     0.8038    0.8767     1.765
abnormal_heart_rhythm    1.3907     0.7191    1.0255     1.886
heart_failure            0.4985     2.0060    0.1591     1.562
depression_binary        1.2955     0.7719    1.0377     1.617

Concordance= 0.74  (se = 0.01 )
Likelihood ratio test= 298.4  on 12 df,   p=<2e-16
Wald test            = 306.5  on 12 df,   p=<2e-16
Score (logrank) test = 326.3  on 12 df,   p=<2e-16


Sensitivity analysis: comparing counts of baseline pathological cognitive decline before and after exclusions

# Tabulate baseline pathological cognitive decline in the full wave 1 data.
# dplyr::count() has no useNA argument (that belongs to base::table()); an
# extra named argument is treated as another grouping expression and adds a
# constant column called useNA. count() already reports NA as its own group,
# so nothing further is needed to see missing values.
w1 %>%
  count(baseline_pathological_cognitive_decline)
  
  A tibble:2 × 3
baseline_pathological_cognitive_decline
<dbl>
useNA
<chr>
n
<int>
0   ifany   12047       
1   ifany   52      
2 rows

# Tabulate baseline pathological cognitive decline among followup_w1 rows
# that remain after dropping baseline Alzheimer's/dementia, baseline
# Parkinson's and rows with missing pa_level.
followup_w1 %>%
  mutate(
    # 1 when either baseline flag equals 1, otherwise 0 (NA if undetermined)
    prevalent_dem_alz_w1 = if_else(
      baseline_alzheimers == 1 | baseline_dementia == 1, 1, 0
    )
  ) %>%
  filter(
    prevalent_dem_alz_w1 == 0,
    baseline_parkinsons == 0,
    !is.na(pa_level)
  ) %>%
  # dplyr::count() has no useNA argument (that belongs to base::table()); an
  # extra named argument would be treated as a grouping expression and add a
  # constant column called useNA. count() already reports NA as its own group.
  count(baseline_pathological_cognitive_decline)
  
  baseline_pathological_cognitive_decline
<dbl>
useNA
<chr>
n
<int>
0   ifany   11805   

#### 2.4 Discussion

The principal aim of the data analysis was to assess for any association between PA and incident dementia in adults above fifty years old from a UK dataset (ELSA).  The other associated objectives acted to serve the above aim by creating stepwise goals to describe the data, assess for patterns and perform formal statistical analyses to answer whether different PA levels could exert an impact on dementia.  The data analysis transparently shows that increased levels of PA have a protective effect on incident dementia; however, this association was weakened when covariates were adjusted for.  These findings are more cautious than the trends discussed in the literature review studies which included studies with different cognitive baselines (MCI), whilst the overall conclusion is the same, which also is in tandem with wider research on this subject.

The data analysis is informative when compared to the literature review and as such complements it, in that it included 'abnormal heart rhythm' as a distinct covariate, rather than compiling all cardiovascular diseases, and medical comorbidities into one grouping thus losing granularity.  By choosing abnormal heart rhythm as a sole covariate it showed the potential importance of abnormal heart rhythms as a unique comorbidity towards dementia risk (HR 1.39, p value of 0.03).  Abnormal cardiac rhythms have the most direct plausible implications for vascular dementia sufferers by damaging arterial vessels that provide blood flow to the brain.  If the transport of vital nutrients to the brain is compromised, this can lead to deleterious effects on cognitive function, and PA such as walking has been noted to have a beneficial impact on mortality and wider CVD.(39, 40)

The above findings are useful in considering that the benefits of PA are most potent when viewed as part of a protective lifestyle choice whose advantages are not only relevant to dementia, but other risk factors that can contribute towards incident dementia as documented in a wide variety of clinical and academic sources.  The risk factors that are repeatedly associated with dementia and cardiovascular comorbidities include the covariates analysed in this dissertation, which aligns with the proposed pathophysiology of dementia which encompasses oxidative stress and inflammation.  Furthermore PA is also proposed as a management strategy to manage depression, depression being a covariate that in the adjustment model was a significant hazard for dementia.  The reasons for depression having an impact on incident dementia could be wide ranging, to include social isolation leading to reduced social engagement and therefore limiting broader cognitive functioning.

The statistical analyses support the almost universal finding that advanced age is the greatest risk factor for dementia.  Older participants in this analysis were also evidenced to have the greatest burden of comorbidities, which may assist in explaining the smaller impact of PA in the adjustment model in that low and sedentary groups were older, and were more likely to have hypertension, diabetes and depression as examples.  Indeed the baseline characteristics evidenced an average age of 60 in the high PA group and 70 years old in the sedentary group, which is akin to comparing the non elderly (less than 65 years old) with the elderly (more than 65 years old).  Attempts to mitigate reverse causation were used to limit participants with undiagnosed cognitive decline that could impact their ability to engage in physical activity.  Additionally, whilst the reasoning to approach the sensitivity analysis by examining the impact of pathological cognitive decline was logically sound (to avoid cognitively impaired inclusions that were not formally screened out by other means), these participants were already subsumed in either Alzheimer's, Parkinson's or prevalent dementia and thus were already accounted for.

#### 2.5 Limitations

The following limitations in the data analysis should be stated.  The initial limitation involves the description of 'dementia' within ELSA.  The dementia outcome measured includes 'organic brain syndrome, senility or any other serious memory impairment' in addition to 'dementia'.  This creates heterogeneity in the outcome analysed, and furthermore the definition of 'dementia' itself is lacking, so the degree to which mixed-dementias, vascular dementia and other dementias are included in this is unknown.  This may also partly explain why pathological cognitive impairment as expressed through IQCODE produced the same exact output in the sensitivity analysis.  

Self-reporting of dementia and PA was exclusively relied upon, to demonstrate the dementia counts and PA levels.  This was chosen to examine the findings from the self-reporting process as one clear source.  Self reporting is prone to recall bias, possibly resulting in misclassification which could have hampered the results.  

Dementia incidence was based on the wave at which dementia was diagnosed, as opposed to by a diagnosis date, making the timing of incident dementia less precise. Additionally, as is typical for longitudinal studies, confounding despite adjustment modelling can not be entirely excluded, and similarly actions to do so as performed in this analysis can reduce the possibility of observing pathways of interest.  An example of this is that adjusting for vascular conditions, which the majority of covariates occupy (such as hypertension, diabetes, stroke), may obscure more nuanced physical activity findings, as physical activity also produces some of its benefits through vascular processes (impacting oxidative stress and inflammation).

Detailed sub-analysis on the characteristics of the group with missing PA information (a total of 194 individuals) may have produced a more rounded analysis in that this sub-group was older (mean age 69 years old), had, after the sedentary group, the highest percentage of participants with abnormal heart rhythm, and had the highest percentage of participants with Alzheimer's, Parkinson's and prevalent dementia at baseline.

#### 2.6 Conclusion

The findings of the data analysis resonate with everyday clinical primary care in that physical activity is rarely discussed in terms of being the sole solution to any one condition, but instead physical activity advice is provided in the context of a suite of broad positive lifestyle factors that include healthy eating and appropriate hydration, stress management, appropriate sleep, and avoidance of hazardous habits such as smoking, illicit drug use, and excessive alcohol consumption.  The above factors could limit a range of causes of cognitive impairment that may or may not progress to dementia, in addition to dementia itself, in the context of healthy ageing conversations with individual patients and populations.  The adjusted Cox hazard model findings provide a cautious estimation of the impact that PA as a solitary factor may have on incident dementia, and as additional comorbidities and advancing age can also influence dementia occurrence, the promotion of only physical activity to those with functional symptoms of cognitive impairment may limit the breadth of useful advice that could be provided to assist in supporting protective lifestyles against dementia.

Furthermore, physical activity is additionally relevant in preventing a host of other conditions to include depression, hypertension, diabetes and stroke, which are also associated with dementia, making physical activity as an exposure an important factor to discuss both in primary care and public health settings.  
Using physical activity as an exposure as opposed to exercise is useful as it enables the discussion of everyday activities that could if done mindfully make the concept of movement more accessible to a variety of patients who may due to frailty be less able to engage in more formal activities such as walking at pace, jogging or aerobics.  The above data analysis informs examples of moderate activity such as cleaning a car, and laundry (low activity) that can be provided to patients.  Guidance can be tailored to their ability, different days of the week, and can be built on over time.

The data analysis also suggests the impact of other factors on incident dementia, to include the impact of gender with women having increased hazards, and depression.  The above analysis can be similarly tailored to the diversity of patients seen in primary care, by finding out more about, for example, a woman's daily activities, and creating systems of how what they already do can be modified to exert increased METs acknowledging that a leisurely walk could have a different result to walking at pace, as could higher BMI.  The data analysis is therefore particularly useful in informing how the information can be communicated to patients, making it accessible to their lives. 

Future work of interest to further the above analysis would include exploration of the other lifestyle factors within the ELSA data for their possible association to dementia, to produce a lifestyle factors package which could further the advice provided to patients and the public in the actions that can be taken to reduce their risk of both the factors that contribute to dementia and possibly dementia itself.



## Summarise your findings, discuss them in the context of other similar work or questions and suggestions for future work. Conclude your portfolio with what started your data exploration and what the data have contributed to decisions for patient care or health service delivery.


### covering data access requirements, ethics, metadata and all methodological aspects of your project

### Results
### Use this section to showcase the results of your data manipulation that will contribute to the project
###Table 1 shows the baseline characteristics at wave 1, organised around physical activity (PA) categories: high, moderate, low and sedentary.  As mean age increases the proportion of participants in low and sedentary PA categories increases.  Furthermore the low and sedentary categories have higher prevalence of comorbidities that include hypertension, diabetes, stroke, abnormal heart rhythms, and heart failure.

### Discussion

### Conclusion
## Summarise your findings, discuss them in the context of other similar work or questions and suggestions for future work. Conclude your portfolio with what started your data exploration and what the data have contributed to decisions for patient care or health service delivery.  


# In text elements
##Some examples of having in-text elements as you develop your portfolio are provided here.

#* "# Headings"
#* "## Subheading 1"
#* "### subheading 2"
#* "#### subheading 3"

## Hyperlinks

#[Healthcare Data Science](https://github.com/CambridgeICE-HDS/MSt-Healthcare-Data-Science)

## Notice box 

#::: {.infobox .caution data-latex="{caution}"}

#The format to add boxes to your portfolio
#:::


## Tables

### In-text table

#A template format to add a table into 
#the document you can use the following md code structure. 

#|Data table    | Coverage     |Area                     |
#|--------------|--------------|-------------------------|
#|Health survey | 2015         |Self-reported outcomes   |
#|EHR           | 2000 onwards |Electronic health records|

# Data tables

## defining a dataframe 


``` r
# Small example data frame used by the table demonstrations in this section
CREL <- data.frame(
  Data = c("Health survey", "EHR"),
  Coverage = c("2015", "2000 onwards"),
  Area = c("Self-reported outcomes", "Electronic Health Records")
)

# Namespace-qualified, matching the knitr::kable() calls used elsewhere in
# this file, so the call does not depend on knitr being attached.
knitr::kable(CREL)

Data	Coverage	Area
Health survey	2015	Self-reported outcomes
EHR	2000 onwards	Electronic Health Records

Interactive data elements

# Render CREL as an interactive DT widget using the Buttons extension, with
# "copy" and "excel" buttons configured.
DT::datatable(
  CREL,
  class = "display",
  extensions = "Buttons",
  options = list(
    paging = TRUE,
    searching = TRUE,
    fixedColumns = TRUE,
    autoWidth = TRUE,
    ordering = TRUE,
    dom = "tB",
    buttons = c("copy", "excel")
  )
)