knitr::opts_chunk$set(echo = TRUE)
library(rmarkdown)
library(kableExtra)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:kableExtra':
## 
##     group_rows
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
library(tidyr)
library(lubridate)  
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

#Clear the environment

# Clear the environment
rm(list = ls())

1 Load the dataset

# Load the steps_subset dataset
 data_12mths <- read_csv('/Users/jamesoguta/Documents/James Oguta/My PhD Folder-2023-2025/Trainings/KenyaCVDModel/inst/rmarkdown/templates/empowerhealthsummaries/skeleton/data_12mths_clean.csv')
## Rows: 60968 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (12): country, facility, diagnosis, sex, county, enrollment_status, enr...
## dbl  (22): patient_id, age, latest_bmi, baseline_systolic, baseline_diastoli...
## date  (4): enrollment_date, baseline_assessment_date, anchor_date, followup_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

2 Create a subset of patients with followup BP

# Create a subset of patients with followup BP
medtronic_subset <- data_12mths %>%
  filter(!is.na(followup_systolic)) 
# Display the first few rows of the subset
head(medtronic_subset)
## # A tibble: 6 × 38
##   patient_id country   age facility     diagnosis sex   county enrollment_status
##        <dbl> <chr>   <dbl> <chr>        <chr>     <chr> <chr>  <chr>            
## 1          1 Kenya      65 Matiliku su… "{Hypert… Fema… Makue… Direct Enrollment
## 2          5 Kenya      42 Tawa Sub-co… "{Hypert… Fema… Makue… Direct Enrollment
## 3          9 Kenya      43 Kibwezi Sub… "{Hypert… Fema… Makue… Direct Enrollment
## 4         11 Kenya      69 Kibwezi Sub… "{Hypert… Male  Makue… Direct Enrollment
## 5         15 Kenya      65 Kilungu sub… "{Hypert… Fema… Makue… Direct Enrollment
## 6         29 Kenya      70 Kilungu sub… "{Hypert… Fema… Makue… Direct Enrollment
## # ℹ 30 more variables: enrollment_bmi <chr>, latest_bmi <dbl>,
## #   baseline_systolic <dbl>, baseline_diastolic <dbl>,
## #   baseline_control_status <chr>, baseline_grade <dbl>,
## #   enrollment_date <date>, baseline_assessment_date <date>,
## #   anchor_date <date>, followup_systolic <dbl>, followup_diastolic <dbl>,
## #   followup_control_status <chr>, followup_grade <dbl>, followup_date <date>,
## #   days_between_assessment_and_anchor <dbl>, …
str(medtronic_subset)
## spc_tbl_ [17,409 × 38] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ patient_id                          : num [1:17409] 1 5 9 11 15 29 45 54 57 62 ...
##  $ country                             : chr [1:17409] "Kenya" "Kenya" "Kenya" "Kenya" ...
##  $ age                                 : num [1:17409] 65 42 43 69 65 70 50 68 65 59 ...
##  $ facility                            : chr [1:17409] "Matiliku sub county hospital" "Tawa Sub-county Hospital" "Kibwezi Sub-county Hospital" "Kibwezi Sub-county Hospital" ...
##  $ diagnosis                           : chr [1:17409] "{Hypertension,\"Diabetes Mellitus Type 2\"}" "{Hypertension}" "{Hypertension}" "{Hypertension}" ...
##  $ sex                                 : chr [1:17409] "Female" "Female" "Female" "Male" ...
##  $ county                              : chr [1:17409] "Makueni" "Makueni" "Makueni" "Makueni" ...
##  $ enrollment_status                   : chr [1:17409] "Direct Enrollment" "Direct Enrollment" "Direct Enrollment" "Direct Enrollment" ...
##  $ enrollment_bmi                      : chr [1:17409] "25.4" "37.29" "33.83" "23.24" ...
##  $ latest_bmi                          : num [1:17409] 25.4 37.3 33.8 23.2 23.6 ...
##  $ baseline_systolic                   : num [1:17409] 165 211 151 125 146 117 140 87 156 157 ...
##  $ baseline_diastolic                  : num [1:17409] 74 124 87 81 72 70 90 59 91 85 ...
##  $ baseline_control_status             : chr [1:17409] "Uncontrolled" "Uncontrolled" "Uncontrolled" "Controlled" ...
##  $ baseline_grade                      : num [1:17409] 2 3 1 0 1 0 1 0 1 1 ...
##  $ enrollment_date                     : Date[1:17409], format: "2020-12-14" "2021-11-17" ...
##  $ baseline_assessment_date            : Date[1:17409], format: "2022-02-28" "2021-11-17" ...
##  $ anchor_date                         : Date[1:17409], format: "2023-02-28" "2022-11-17" ...
##  $ followup_systolic                   : num [1:17409] 137 136 146 119 113 123 136 109 149 120 ...
##  $ followup_diastolic                  : num [1:17409] 56 65 78 70 73 77 76 64 75 72 ...
##  $ followup_control_status             : chr [1:17409] "Controlled" "Controlled" "Uncontrolled" "Controlled" ...
##  $ followup_grade                      : num [1:17409] 0 0 1 0 0 0 0 0 1 0 ...
##  $ followup_date                       : Date[1:17409], format: "2023-02-14" "2022-11-03" ...
##  $ days_between_assessment_and_anchor  : num [1:17409] 14 14 6 27 23 19 6 60 40 3 ...
##  $ days_between_assessment_and_baseline: num [1:17409] 351 351 371 392 342 346 359 305 405 368 ...
##  $ number_followup_assessments         : num [1:17409] 3 1 0 3 0 11 4 3 1 5 ...
##  $ total_followup_assessments          : num [1:17409] 4 8 7 3 25 12 11 3 3 5 ...
##  $ number_followup_by_hs               : num [1:17409] 2 0 0 2 1 1 0 2 0 2 ...
##  $ number_medical_reviews              : num [1:17409] 0 0 0 0 0 0 0 0 0 0 ...
##  $ controlled_followup_days            : num [1:17409] 351 351 NA -1 342 -1 359 -1 NA 368 ...
##  $ number_uncontrolled_followups       : num [1:17409] 1 0 1 -1 0 -1 2 -1 2 2 ...
##  $ number_community_assessments        : num [1:17409] 0 0 0 0 0 1 0 0 0 2 ...
##  $ number_facility_assessments         : num [1:17409] 1 0 0 1 1 4 2 3 2 1 ...
##  $ number_feeder_assessments           : num [1:17409] 0 0 0 0 1 0 0 0 0 0 ...
##  $ change_in_systolic                  : num [1:17409] -28 -75 -5 -6 -33 6 -4 22 -7 -37 ...
##  $ change_in_diastolic                 : num [1:17409] -18 -59 -9 -11 1 7 -14 5 -16 -13 ...
##  $ filter_col_str                      : chr [1:17409] "Yes" "Yes" "Yes" "Yes" ...
##  $ diabetes                            : chr [1:17409] "Yes" "No" "No" "No" ...
##  $ age_group                           : chr [1:17409] "65-69" "40-44" "40-44" "65-69" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   patient_id = col_double(),
##   ..   country = col_character(),
##   ..   age = col_double(),
##   ..   facility = col_character(),
##   ..   diagnosis = col_character(),
##   ..   sex = col_character(),
##   ..   county = col_character(),
##   ..   enrollment_status = col_character(),
##   ..   enrollment_bmi = col_character(),
##   ..   latest_bmi = col_double(),
##   ..   baseline_systolic = col_double(),
##   ..   baseline_diastolic = col_double(),
##   ..   baseline_control_status = col_character(),
##   ..   baseline_grade = col_double(),
##   ..   enrollment_date = col_date(format = ""),
##   ..   baseline_assessment_date = col_date(format = ""),
##   ..   anchor_date = col_date(format = ""),
##   ..   followup_systolic = col_double(),
##   ..   followup_diastolic = col_double(),
##   ..   followup_control_status = col_character(),
##   ..   followup_grade = col_double(),
##   ..   followup_date = col_date(format = ""),
##   ..   days_between_assessment_and_anchor = col_double(),
##   ..   days_between_assessment_and_baseline = col_double(),
##   ..   number_followup_assessments = col_double(),
##   ..   total_followup_assessments = col_double(),
##   ..   number_followup_by_hs = col_double(),
##   ..   number_medical_reviews = col_double(),
##   ..   controlled_followup_days = col_double(),
##   ..   number_uncontrolled_followups = col_double(),
##   ..   number_community_assessments = col_double(),
##   ..   number_facility_assessments = col_double(),
##   ..   number_feeder_assessments = col_double(),
##   ..   change_in_systolic = col_double(),
##   ..   change_in_diastolic = col_double(),
##   ..   filter_col_str = col_character(),
##   ..   diabetes = col_character(),
##   ..   age_group = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

3 Summary of the dataset

# Summary of the dataset
medtronic_subset$enrollment_bmi <- as.numeric(medtronic_subset$enrollment_bmi)
## Warning: NAs introduced by coercion
summary(medtronic_subset)
##    patient_id       country               age           facility        
##  Min.   :     1   Length:17409       Min.   :  5.00   Length:17409      
##  1st Qu.: 47964   Class :character   1st Qu.: 54.00   Class :character  
##  Median :329310   Mode  :character   Median : 64.00   Mode  :character  
##  Mean   :241930                      Mean   : 62.72                     
##  3rd Qu.:370978                      3rd Qu.: 72.00                     
##  Max.   :536853                      Max.   :100.00                     
##                                                                         
##   diagnosis             sex               county          enrollment_status 
##  Length:17409       Length:17409       Length:17409       Length:17409      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  enrollment_bmi      latest_bmi     baseline_systolic baseline_diastolic
##  Min.   :   5.30   Min.   :  5.13   Min.   : 65.0     Min.   : 36.00    
##  1st Qu.:  23.06   1st Qu.: 23.03   1st Qu.:128.0     1st Qu.: 75.00    
##  Median :  26.03   Median : 26.02   Median :140.0     Median : 82.00    
##  Mean   :  29.09   Mean   : 28.83   Mean   :142.4     Mean   : 82.99    
##  3rd Qu.:  29.76   3rd Qu.: 29.87   3rd Qu.:155.0     3rd Qu.: 90.00    
##  Max.   :1440.44   Max.   :632.00   Max.   :285.0     Max.   :163.00    
##  NA's   :140       NA's   :1                                            
##  baseline_control_status baseline_grade   enrollment_date     
##  Length:17409            Min.   :0.0000   Min.   :2018-04-18  
##  Class :character        1st Qu.:0.0000   1st Qu.:2021-11-26  
##  Mode  :character        Median :1.0000   Median :2022-04-19  
##                          Mean   :0.8534   Mean   :2022-02-19  
##                          3rd Qu.:1.0000   3rd Qu.:2022-09-02  
##                          Max.   :3.0000   Max.   :2023-09-20  
##                                                               
##  baseline_assessment_date  anchor_date         followup_systolic
##  Min.   :2018-04-19       Min.   :2019-04-19   Min.   : 70.0    
##  1st Qu.:2022-02-22       1st Qu.:2023-02-22   1st Qu.:124.0    
##  Median :2022-07-20       Median :2023-07-20   Median :134.0    
##  Mean   :2022-06-01       Mean   :2023-06-01   Mean   :136.3    
##  3rd Qu.:2022-12-02       3rd Qu.:2023-12-02   3rd Qu.:146.0    
##  Max.   :2023-09-26       Max.   :2024-09-26   Max.   :240.0    
##                                                                 
##  followup_diastolic followup_control_status followup_grade  
##  Min.   : 30.00     Length:17409            Min.   :0.0000  
##  1st Qu.: 73.00     Class :character        1st Qu.:0.0000  
##  Median : 80.00     Mode  :character        Median :0.0000  
##  Mean   : 80.49                             Mean   :0.5977  
##  3rd Qu.: 87.00                             3rd Qu.:1.0000  
##  Max.   :153.00                             Max.   :3.0000  
##                                                             
##  followup_date        days_between_assessment_and_anchor
##  Min.   :2019-03-18   Min.   : 0.00                     
##  1st Qu.:2023-02-21   1st Qu.:10.00                     
##  Median :2023-07-19   Median :23.00                     
##  Mean   :2023-05-31   Mean   :25.66                     
##  3rd Qu.:2023-12-05   3rd Qu.:40.00                     
##  Max.   :2024-07-29   Max.   :62.00                     
##                                                         
##  days_between_assessment_and_baseline number_followup_assessments
##  Min.   :303.0                        Min.   : 0.000             
##  1st Qu.:340.0                        1st Qu.: 1.000             
##  Median :364.0                        Median : 2.000             
##  Mean   :363.8                        Mean   : 2.902             
##  3rd Qu.:386.0                        3rd Qu.: 4.000             
##  Max.   :427.0                        Max.   :32.000             
##                                                                  
##  total_followup_assessments number_followup_by_hs number_medical_reviews
##  Min.   : 1.000             Min.   : 0.000        Min.   :0.00000       
##  1st Qu.: 3.000             1st Qu.: 0.000        1st Qu.:0.00000       
##  Median : 4.000             Median : 1.000        Median :0.00000       
##  Mean   : 5.751             Mean   : 1.305        Mean   :0.03797       
##  3rd Qu.: 8.000             3rd Qu.: 2.000        3rd Qu.:0.00000       
##  Max.   :79.000             Max.   :28.000        Max.   :4.00000       
##                                                                         
##  controlled_followup_days number_uncontrolled_followups
##  Min.   : -1.0            Min.   :-1.0000              
##  1st Qu.: -1.0            1st Qu.:-1.0000              
##  Median : -1.0            Median : 0.0000              
##  Mean   :139.4            Mean   : 0.4923              
##  3rd Qu.:351.0            3rd Qu.: 1.0000              
##  Max.   :427.0            Max.   :18.0000              
##  NA's   :4962                                          
##  number_community_assessments number_facility_assessments
##  Min.   : 0.000               Min.   : 0.00              
##  1st Qu.: 0.000               1st Qu.: 0.00              
##  Median : 0.000               Median : 1.00              
##  Mean   : 1.001               Mean   : 1.37              
##  3rd Qu.: 1.000               3rd Qu.: 2.00              
##  Max.   :21.000               Max.   :19.00              
##                                                          
##  number_feeder_assessments change_in_systolic change_in_diastolic
##  Min.   : 0.00000          Min.   :-146.00    Min.   :-88.000    
##  1st Qu.: 0.00000          1st Qu.: -20.00    1st Qu.:-11.000    
##  Median : 0.00000          Median :  -5.00    Median : -2.000    
##  Mean   : 0.05802          Mean   :  -6.09    Mean   : -2.493    
##  3rd Qu.: 0.00000          3rd Qu.:   9.00    3rd Qu.:  6.000    
##  Max.   :26.00000          Max.   : 109.00    Max.   : 75.000    
##                                                                  
##  filter_col_str       diabetes          age_group        
##  Length:17409       Length:17409       Length:17409      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
## 
# Display the structure of the dataset
str(medtronic_subset)
## spc_tbl_ [17,409 × 38] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ patient_id                          : num [1:17409] 1 5 9 11 15 29 45 54 57 62 ...
##  $ country                             : chr [1:17409] "Kenya" "Kenya" "Kenya" "Kenya" ...
##  $ age                                 : num [1:17409] 65 42 43 69 65 70 50 68 65 59 ...
##  $ facility                            : chr [1:17409] "Matiliku sub county hospital" "Tawa Sub-county Hospital" "Kibwezi Sub-county Hospital" "Kibwezi Sub-county Hospital" ...
##  $ diagnosis                           : chr [1:17409] "{Hypertension,\"Diabetes Mellitus Type 2\"}" "{Hypertension}" "{Hypertension}" "{Hypertension}" ...
##  $ sex                                 : chr [1:17409] "Female" "Female" "Female" "Male" ...
##  $ county                              : chr [1:17409] "Makueni" "Makueni" "Makueni" "Makueni" ...
##  $ enrollment_status                   : chr [1:17409] "Direct Enrollment" "Direct Enrollment" "Direct Enrollment" "Direct Enrollment" ...
##  $ enrollment_bmi                      : num [1:17409] 25.4 37.3 33.8 23.2 23.6 ...
##  $ latest_bmi                          : num [1:17409] 25.4 37.3 33.8 23.2 23.6 ...
##  $ baseline_systolic                   : num [1:17409] 165 211 151 125 146 117 140 87 156 157 ...
##  $ baseline_diastolic                  : num [1:17409] 74 124 87 81 72 70 90 59 91 85 ...
##  $ baseline_control_status             : chr [1:17409] "Uncontrolled" "Uncontrolled" "Uncontrolled" "Controlled" ...
##  $ baseline_grade                      : num [1:17409] 2 3 1 0 1 0 1 0 1 1 ...
##  $ enrollment_date                     : Date[1:17409], format: "2020-12-14" "2021-11-17" ...
##  $ baseline_assessment_date            : Date[1:17409], format: "2022-02-28" "2021-11-17" ...
##  $ anchor_date                         : Date[1:17409], format: "2023-02-28" "2022-11-17" ...
##  $ followup_systolic                   : num [1:17409] 137 136 146 119 113 123 136 109 149 120 ...
##  $ followup_diastolic                  : num [1:17409] 56 65 78 70 73 77 76 64 75 72 ...
##  $ followup_control_status             : chr [1:17409] "Controlled" "Controlled" "Uncontrolled" "Controlled" ...
##  $ followup_grade                      : num [1:17409] 0 0 1 0 0 0 0 0 1 0 ...
##  $ followup_date                       : Date[1:17409], format: "2023-02-14" "2022-11-03" ...
##  $ days_between_assessment_and_anchor  : num [1:17409] 14 14 6 27 23 19 6 60 40 3 ...
##  $ days_between_assessment_and_baseline: num [1:17409] 351 351 371 392 342 346 359 305 405 368 ...
##  $ number_followup_assessments         : num [1:17409] 3 1 0 3 0 11 4 3 1 5 ...
##  $ total_followup_assessments          : num [1:17409] 4 8 7 3 25 12 11 3 3 5 ...
##  $ number_followup_by_hs               : num [1:17409] 2 0 0 2 1 1 0 2 0 2 ...
##  $ number_medical_reviews              : num [1:17409] 0 0 0 0 0 0 0 0 0 0 ...
##  $ controlled_followup_days            : num [1:17409] 351 351 NA -1 342 -1 359 -1 NA 368 ...
##  $ number_uncontrolled_followups       : num [1:17409] 1 0 1 -1 0 -1 2 -1 2 2 ...
##  $ number_community_assessments        : num [1:17409] 0 0 0 0 0 1 0 0 0 2 ...
##  $ number_facility_assessments         : num [1:17409] 1 0 0 1 1 4 2 3 2 1 ...
##  $ number_feeder_assessments           : num [1:17409] 0 0 0 0 1 0 0 0 0 0 ...
##  $ change_in_systolic                  : num [1:17409] -28 -75 -5 -6 -33 6 -4 22 -7 -37 ...
##  $ change_in_diastolic                 : num [1:17409] -18 -59 -9 -11 1 7 -14 5 -16 -13 ...
##  $ filter_col_str                      : chr [1:17409] "Yes" "Yes" "Yes" "Yes" ...
##  $ diabetes                            : chr [1:17409] "Yes" "No" "No" "No" ...
##  $ age_group                           : chr [1:17409] "65-69" "40-44" "40-44" "65-69" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   patient_id = col_double(),
##   ..   country = col_character(),
##   ..   age = col_double(),
##   ..   facility = col_character(),
##   ..   diagnosis = col_character(),
##   ..   sex = col_character(),
##   ..   county = col_character(),
##   ..   enrollment_status = col_character(),
##   ..   enrollment_bmi = col_character(),
##   ..   latest_bmi = col_double(),
##   ..   baseline_systolic = col_double(),
##   ..   baseline_diastolic = col_double(),
##   ..   baseline_control_status = col_character(),
##   ..   baseline_grade = col_double(),
##   ..   enrollment_date = col_date(format = ""),
##   ..   baseline_assessment_date = col_date(format = ""),
##   ..   anchor_date = col_date(format = ""),
##   ..   followup_systolic = col_double(),
##   ..   followup_diastolic = col_double(),
##   ..   followup_control_status = col_character(),
##   ..   followup_grade = col_double(),
##   ..   followup_date = col_date(format = ""),
##   ..   days_between_assessment_and_anchor = col_double(),
##   ..   days_between_assessment_and_baseline = col_double(),
##   ..   number_followup_assessments = col_double(),
##   ..   total_followup_assessments = col_double(),
##   ..   number_followup_by_hs = col_double(),
##   ..   number_medical_reviews = col_double(),
##   ..   controlled_followup_days = col_double(),
##   ..   number_uncontrolled_followups = col_double(),
##   ..   number_community_assessments = col_double(),
##   ..   number_facility_assessments = col_double(),
##   ..   number_feeder_assessments = col_double(),
##   ..   change_in_systolic = col_double(),
##   ..   change_in_diastolic = col_double(),
##   ..   filter_col_str = col_character(),
##   ..   diabetes = col_character(),
##   ..   age_group = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Display the number of rows and columns in the dataset
dim(medtronic_subset)
## [1] 17409    38
# Display the column names of the dataset
colnames(medtronic_subset)
##  [1] "patient_id"                          
##  [2] "country"                             
##  [3] "age"                                 
##  [4] "facility"                            
##  [5] "diagnosis"                           
##  [6] "sex"                                 
##  [7] "county"                              
##  [8] "enrollment_status"                   
##  [9] "enrollment_bmi"                      
## [10] "latest_bmi"                          
## [11] "baseline_systolic"                   
## [12] "baseline_diastolic"                  
## [13] "baseline_control_status"             
## [14] "baseline_grade"                      
## [15] "enrollment_date"                     
## [16] "baseline_assessment_date"            
## [17] "anchor_date"                         
## [18] "followup_systolic"                   
## [19] "followup_diastolic"                  
## [20] "followup_control_status"             
## [21] "followup_grade"                      
## [22] "followup_date"                       
## [23] "days_between_assessment_and_anchor"  
## [24] "days_between_assessment_and_baseline"
## [25] "number_followup_assessments"         
## [26] "total_followup_assessments"          
## [27] "number_followup_by_hs"               
## [28] "number_medical_reviews"              
## [29] "controlled_followup_days"            
## [30] "number_uncontrolled_followups"       
## [31] "number_community_assessments"        
## [32] "number_facility_assessments"         
## [33] "number_feeder_assessments"           
## [34] "change_in_systolic"                  
## [35] "change_in_diastolic"                 
## [36] "filter_col_str"                      
## [37] "diabetes"                            
## [38] "age_group"
# Display the number of unique patients in the dataset
length(unique(medtronic_subset$patient_id))
## [1] 17409

4 Comparing the mean systolic blood pressure (SBP) at enrollment and follow-up

# Comparing the mean systolic blood pressure (SBP) at enrollment and follow-up
# Calculate the mean SBP at enrollment and follow-up
mean_sbp_enrollment <- mean(medtronic_subset$baseline_systolic, na.rm = TRUE)
mean_sbp_followup <- mean(medtronic_subset$followup_systolic, na.rm = TRUE)
# Calculate the mean difference
mean_diff <- mean_sbp_followup - mean_sbp_enrollment
# Create a data frame for the mean SBP at enrollment and follow-up
mean_sbp_df <- data.frame(
  Time = c("Enrollment", "Follow-up"),
  Mean_SBP = c(mean_sbp_enrollment, mean_sbp_followup)
)
# Create a bar plot for the mean SBP at enrollment and follow-up
ggplot(mean_sbp_df, aes(x = Time, y = Mean_SBP)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(title = "Mean Systolic Blood Pressure at Enrollment and Follow-up",
       x = "Time",
       y = "Mean SBP (mmHg)") +
  theme_minimal()

# Comparing the mean diastolic blood pressure (DBP) at enrollment and follow-up

# Comparing the mean diastolic blood pressure (DBP) at enrollment and follow-up
# Calculate the mean DBP at enrollment and follow-up
mean_dbp_enrollment <- mean(medtronic_subset$baseline_diastolic, na.rm = TRUE)
mean_dbp_followup <- mean(medtronic_subset$followup_diastolic, na.rm = TRUE)
# Calculate the mean difference
mean_diff_dbp <- mean_dbp_followup - mean_dbp_enrollment
# Create a data frame for the mean DBP at enrollment and follow-up
mean_dbp_df <- data.frame(
  Time = c("Enrollment", "Follow-up"),
  Mean_DBP = c(mean_dbp_enrollment, mean_dbp_followup)
)
# Create a bar plot for the mean DBP at enrollment and follow-up
ggplot(mean_dbp_df, aes(x = Time, y = Mean_DBP)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(title = "Mean Diastolic Blood Pressure at Enrollment and Follow-up",
       x = "Time",
       y = "Mean DBP (mmHg)") +
  theme_minimal()

# Comparing the mean body mass index (BMI) at enrollment and follow-up

# Comparing the mean body mass index (BMI) at enrollment and follow-up
# Calculate the mean BMI at enrollment and follow-up
mean_bmi_enrollment <- mean(medtronic_subset$enrollment_bmi, na.rm = TRUE)
mean_bmi_followup <- mean(medtronic_subset$latest_bmi, na.rm = TRUE)
# Calculate the mean difference
mean_diff_bmi <- mean_bmi_followup - mean_bmi_enrollment
# Create a data frame for the mean BMI at enrollment and follow-up
mean_bmi_df <- data.frame(
  Time = c("Enrollment", "Follow-up"),
  Mean_BMI = c(mean_bmi_enrollment, mean_bmi_followup)
)
# Create a bar plot for the mean BMI at enrollment and follow-up
ggplot(mean_bmi_df, aes(x = Time, y = Mean_BMI)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(title = "Mean Body Mass Index at Enrollment and Follow-up",
       x = "Time",
       y = "Mean BMI (kg/m^2)") +
  theme_minimal()

# Comparing the mean sbp at enrollment and follow-up by enrollment_status

# Comparing the mean sbp at enrollment and follow-up by enrollment_status
# Calculate the mean sbp at enrollment and follow-up by enrollment_status-add number of patients
mean_sbp_enrollment_status <- medtronic_subset %>%
  group_by(enrollment_status) %>%
  summarise(
    Mean_SBP_Enrollment = mean(baseline_systolic, na.rm = TRUE),
    Mean_SBP_Followup = mean(followup_systolic, na.rm = TRUE),
    Number_of_Patients = n()
  )
# Calculate the mean difference
mean_diff_sbp_enrollment_status <- mean_sbp_enrollment_status %>%
  mutate(Mean_Difference = Mean_SBP_Followup - Mean_SBP_Enrollment)
mean_diff_sbp_enrollment_status
## # A tibble: 2 × 5
##   enrollment_status Mean_SBP_Enrollment Mean_SBP_Followup Number_of_Patients
##   <chr>                           <dbl>             <dbl>              <int>
## 1 Direct Enrollment                142.              136.              15376
## 2 From Screening                   143.              135.               2033
## # ℹ 1 more variable: Mean_Difference <dbl>

5 Comparing mean sbp at enrollment and follow-up by county

# Comparing mean sbp at enrollment and follow-up by county
# Calculate the mean sbp at enrollment and follow-up by county add number of patients
mean_sbp_county <- medtronic_subset %>%
  group_by(county) %>%
  summarise(
    Mean_SBP_Enrollment = mean(baseline_systolic, na.rm = TRUE),
    Mean_SBP_Followup = mean(followup_systolic, na.rm = TRUE),
    Number_of_Patients = n()
  )
# Calculate the mean difference
mean_diff_sbp_county <- mean_sbp_county %>%
  mutate(Mean_Difference = Mean_SBP_Followup - Mean_SBP_Enrollment)

mean_diff_sbp_county
## # A tibble: 10 × 5
##    county    Mean_SBP_Enrollment Mean_SBP_Followup Number_of_Patients
##    <chr>                   <dbl>             <dbl>              <int>
##  1 Kakamega                 143.              140.               1193
##  2 Kilifi                   138.              134.               1702
##  3 Makueni                  142.              135.               8474
##  4 Meru                     144.              143.                248
##  5 Mombasa                  143.              135.               1632
##  6 Nairobi                  145.              137.                166
##  7 Nakuru                   146.              139.               2191
##  8 Nyandurua                143.              140.                329
##  9 Nyeri                    141.              139.               1280
## 10 Siaya                    141.              137.                194
## # ℹ 1 more variable: Mean_Difference <dbl>

6 Comparing mean sbp at enrollment and follow-up by sex-add patient counts

# Drop non-binary sex category because of few observations (Only 2)
# Drop rows where sex is not "Male" or "Female"
medtronic_subset <- medtronic_subset[medtronic_subset$sex %in% c("Male", "Female"), ]

mean_sbp_sex <- medtronic_subset %>% 
  group_by(sex) %>%
  summarise(
    Mean_SBP_Enrollment = mean(baseline_systolic, na.rm = TRUE),
    Mean_SBP_Followup = mean(followup_systolic, na.rm = TRUE),
    Number_of_Patients = n()
  )
# Calculate the mean difference
mean_diff_sbp_sex <- mean_sbp_sex %>% 
  mutate(Mean_Difference = Mean_SBP_Followup - Mean_SBP_Enrollment)
mean_diff_sbp_sex
## # A tibble: 2 × 5
##   sex   Mean_SBP_Enrollment Mean_SBP_Followup Number_of_Patients Mean_Difference
##   <chr>               <dbl>             <dbl>              <int>           <dbl>
## 1 Fema…                142.              136.              13010           -5.90
## 2 Male                 145.              138.               4397           -6.63

7 Testing whether the mean difference is statistically significant

# Testing whether the mean difference is statistically significant
t_test_result <- t.test(medtronic_subset$followup_systolic, medtronic_subset$baseline_systolic, paired = TRUE)

# Display the t-test result
t_test_result
## 
##  Paired t-test
## 
## data:  medtronic_subset$followup_systolic and medtronic_subset$baseline_systolic
## t = -32.918, df = 17406, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -6.449306 -5.724418
## sample estimates:
## mean difference 
##       -6.086862

8 Comparing the mean systolic blood pressure (SBP) at enrollment and follow-up by age group

# Comparing the mean systolic blood pressure (SBP) at enrollment and follow-up by five-year age groups
# Create a new variable for age group
medtronic_subset$age_group <- cut(medtronic_subset$age, 
                                   breaks = seq(0, 100, by = 5), 
                                   right = FALSE, 
                                   labels = paste(seq(0, 95, by = 5), seq(4, 99, by = 5), sep = "-"))
# Calculate the mean SBP at enrollment and follow-up by age group-add patient counts
mean_sbp_age_group <- medtronic_subset %>%
  group_by(age_group) %>%
  summarise(
    Mean_SBP_Enrollment = mean(baseline_systolic, na.rm = TRUE),
    Mean_SBP_Followup = mean(followup_systolic, na.rm = TRUE),
    Number_of_Patients = n()
  )
# Calculate the mean difference
mean_diff_sbp_age_group <- mean_sbp_age_group %>%
  mutate(Mean_Difference = Mean_SBP_Followup - Mean_SBP_Enrollment)
mean_diff_sbp_age_group
## # A tibble: 20 × 5
##    age_group Mean_SBP_Enrollment Mean_SBP_Followup Number_of_Patients
##    <fct>                   <dbl>             <dbl>              <int>
##  1 5-9                      134               130.                  2
##  2 10-14                    114.              130.                  3
##  3 15-19                    136.              129.                 12
##  4 20-24                    133.              126.                 30
##  5 25-29                    138.              133.                101
##  6 30-34                    139.              133.                212
##  7 35-39                    138.              133.                435
##  8 40-44                    140.              133.                747
##  9 45-49                    140.              134.               1202
## 10 50-54                    141.              135.               1906
## 11 55-59                    140.              135.               1880
## 12 60-64                    142.              136.               2636
## 13 65-69                    143.              137.               2492
## 14 70-74                    144.              138.               2680
## 15 75-79                    145.              138.               1364
## 16 80-84                    146.              138.                994
## 17 85-89                    147.              139.                412
## 18 90-94                    145.              139.                215
## 19 95-99                    149.              142.                 75
## 20 <NA>                     154.              138.                  9
## # ℹ 1 more variable: Mean_Difference <dbl>

9 Comparing the mean systolic blood pressure (SBP) at enrollment and follow-up by diabetes status

# Comparing the mean systolic blood pressure (SBP) at enrollment and follow-up by diabetes status
# Calculate the mean SBP at enrollment and follow-up by diabetes status
mean_sbp_diabetes <- medtronic_subset %>%
  group_by(diabetes) %>%
  summarise(
    Mean_SBP_Enrollment = mean(baseline_systolic, na.rm = TRUE),
    Mean_SBP_Followup = mean(followup_systolic, na.rm = TRUE),
    Number_of_Patients = n()
  )
# Calculate the mean difference
mean_diff_sbp_diabetes <- mean_sbp_diabetes %>%
  mutate(Mean_Difference = Mean_SBP_Followup - Mean_SBP_Enrollment)
mean_diff_sbp_diabetes
## # A tibble: 2 × 5
##   diabetes Mean_SBP_Enrollment Mean_SBP_Followup Number_of_Patients
##   <chr>                  <dbl>             <dbl>              <int>
## 1 No                      142.              136.              12806
## 2 Yes                     142.              138.               4601
## # ℹ 1 more variable: Mean_Difference <dbl>

10 Summarise the total number of followup assessments

# Summarise the total number of followup assessments
summary(medtronic_subset$total_followup_assessments)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   3.000   4.000   5.751   8.000  79.000
# Display the distribution of the total number of followup assessments-
# histogram
hist(medtronic_subset$total_followup_assessments, 
     main = "Distribution of Total Follow-up Assessments", 
     xlab = "Total Follow-up Assessments", 
     ylab = "Frequency", 
     col = "lightblue", 
     border = "black")

# Display the distribution of the total number of followup assessments-boxplot
boxplot(medtronic_subset$total_followup_assessments, 
        main = "Boxplot of Total Follow-up Assessments", 
        ylab = "Total Follow-up Assessments", 
        col = "lightblue", 
        border = "black")

11 Create number of followup assessment variable

# Create number of followup assessment variable- Categorise d based on those with five or more followup assessments
medtronic_subset <- medtronic_subset %>%
    mutate(
    num_followup_assess = case_when(
      total_followup_assessments >= 5 ~ "Five or more",
      TRUE ~ "< 5 "
    )
  )

12 Comparing the mean systolic blood pressure (SBP) at enrollment and follow-up by number of followup assessments

# Comparing the mean systolic blood pressure (SBP) at enrollment and follow-up by number of followup assessments
mean_sbp_followup_assessments <- medtronic_subset %>%
  group_by(num_followup_assess) %>%
  summarise(
    Mean_SBP_Enrollment = mean(baseline_systolic, na.rm = TRUE),
    Mean_SBP_Followup = mean(followup_systolic, na.rm = TRUE),
    Number_of_Patients = n()
  )
# Calculate the mean difference
mean_diff_sbp_followup_assessments <- mean_sbp_followup_assessments %>%
  mutate(Mean_Difference = Mean_SBP_Followup - Mean_SBP_Enrollment)
mean_diff_sbp_followup_assessments
## # A tibble: 2 × 5
##   num_followup_assess Mean_SBP_Enrollment Mean_SBP_Followup Number_of_Patients
##   <chr>                             <dbl>             <dbl>              <int>
## 1 "< 5 "                             142.              137.               9256
## 2 "Five or more"                     143.              135.               8151
## # ℹ 1 more variable: Mean_Difference <dbl>

13 Testing whether the differences between baseline and followup sbp between the groups is statistically significant

# Testing whether the differences between baseline and followup sbp between the groups is statistically significant
t.test(change_in_systolic ~ num_followup_assess, data = medtronic_subset)
## 
##  Welch Two Sample t-test
## 
## data:  change_in_systolic by num_followup_assess
## t = 6.4316, df = 17128, p-value = 1.296e-10
## alternative hypothesis: true difference in means between group < 5  and group Five or more is not equal to 0
## 95 percent confidence interval:
##  1.655077 3.106106
## sample estimates:
##         mean in group < 5  mean in group Five or more 
##                  -4.972126                  -7.352717
t.test(change_in_systolic ~ diabetes, data = medtronic_subset)
## 
##  Welch Two Sample t-test
## 
## data:  change_in_systolic by diabetes
## t = -6.148, df = 8260.7, p-value = 8.209e-10
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##  -3.366255 -1.738611
## sample estimates:
##  mean in group No mean in group Yes 
##         -6.761518         -4.209085
t.test(change_in_systolic ~ enrollment_status, data = medtronic_subset)
## 
##  Welch Two Sample t-test
## 
## data:  change_in_systolic by enrollment_status
## t = 3.828, df = 2607.4, p-value = 0.0001322
## alternative hypothesis: true difference in means between group Direct Enrollment and group From Screening is not equal to 0
## 95 percent confidence interval:
##  1.068750 3.313623
## sample estimates:
## mean in group Direct Enrollment    mean in group From Screening 
##                       -5.830948                       -8.022135

14 Comparing the mean systolic blood pressure (SBP) at enrollment and follow-up by number of followup assessments-Selecting individuals with only two followup assessments

# Comparing the mean systolic blood pressure (SBP) at enrollment and follow-up by number of followup assessments
# Create a subset of patients with only two followup assessments
# Create number of followup assessment variable- Categorise d based on those with two or more followup assessments
medtronic_subset <- medtronic_subset %>%
    mutate(
    assess_count = case_when(
      total_followup_assessments <= 1 ~ "One",
      TRUE ~ "Two or more "
    )
  )


# Create number of followup assessment variable- Cutoff for two or more and those with less
medtronic_subset <- medtronic_subset %>% 
  mutate (
    at_least_two_followup = case_when(
      total_followup_assessments >= 2 ~ "Two or more followups",
      TRUE ~ "Less than two followups"
    )
  )

15 Comparing the mean systolic blood pressure (SBP) at enrollment and follow-up by number of followup assessments-Comparing those with one followup assessment

mean_sbp_one_followup_assessments <- medtronic_subset %>%
  group_by(assess_count) %>%
  summarise(
    Mean_SBP_Enrollment = mean(baseline_systolic, na.rm = TRUE),
    Mean_SBP_Followup = mean(followup_systolic, na.rm = TRUE),
    Number_of_Patients = n()
  )
# Calculate the mean difference
mean_diff_sbp_one_followup_assessments <- mean_sbp_one_followup_assessments %>%
  mutate(Mean_Difference = Mean_SBP_Followup - Mean_SBP_Enrollment)
mean_diff_sbp_one_followup_assessments
## # A tibble: 2 × 5
##   assess_count   Mean_SBP_Enrollment Mean_SBP_Followup Number_of_Patients
##   <chr>                        <dbl>             <dbl>              <int>
## 1 "One"                         142.              137.               1364
## 2 "Two or more "                142.              136.              16043
## # ℹ 1 more variable: Mean_Difference <dbl>

16 Testing whether the differences between baseline and followup sbp between the groups is statistically significant

# Testing whether the differences between baseline and followup sbp between the groups is statistically significant
t.test(change_in_systolic ~ assess_count, data = medtronic_subset)
## 
##  Welch Two Sample t-test
## 
## data:  change_in_systolic by assess_count
## t = 2.6913, df = 1618, p-value = 0.007191
## alternative hypothesis: true difference in means between group One and group Two or more  is not equal to 0
## 95 percent confidence interval:
##  0.4904499 3.1265206
## sample estimates:
##          mean in group One mean in group Two or more  
##                  -4.420088                  -6.228573
t.test(change_in_systolic ~ diabetes, data = medtronic_subset)
## 
##  Welch Two Sample t-test
## 
## data:  change_in_systolic by diabetes
## t = -6.148, df = 8260.7, p-value = 8.209e-10
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##  -3.366255 -1.738611
## sample estimates:
##  mean in group No mean in group Yes 
##         -6.761518         -4.209085
t.test(change_in_systolic ~ enrollment_status, data = medtronic_subset)
## 
##  Welch Two Sample t-test
## 
## data:  change_in_systolic by enrollment_status
## t = 3.828, df = 2607.4, p-value = 0.0001322
## alternative hypothesis: true difference in means between group Direct Enrollment and group From Screening is not equal to 0
## 95 percent confidence interval:
##  1.068750 3.313623
## sample estimates:
## mean in group Direct Enrollment    mean in group From Screening 
##                       -5.830948                       -8.022135

17 Comparing the mean systolic blood pressure (SBP) at enrollment and follow-up by number of followup assessments-Comparing those with at least two followup assessments

mean_sbp_at_least_two_followup_assessments <- medtronic_subset %>%
  group_by(at_least_two_followup) %>%
  summarise(
    Mean_SBP_Enrollment = mean(baseline_systolic, na.rm = TRUE),
    Mean_SBP_Followup = mean(followup_systolic, na.rm = TRUE),
    Number_of_Patients = n()
  )
# Calculate the mean difference
mean_diff_sbp_at_least_two_followup_assessments <- mean_sbp_at_least_two_followup_assessments %>%
  mutate(Mean_Difference = Mean_SBP_Followup - Mean_SBP_Enrollment)
mean_diff_sbp_at_least_two_followup_assessments
## # A tibble: 2 × 5
##   at_least_two_followup Mean_SBP_Enrollment Mean_SBP_Followup Number_of_Patients
##   <chr>                               <dbl>             <dbl>              <int>
## 1 Less than two follow…                142.              137.               1364
## 2 Two or more followups                142.              136.              16043
## # ℹ 1 more variable: Mean_Difference <dbl>

18 Testing whether the differences between baseline and followup sbp between the groups is statistically significant

# Testing whether the differences between baseline and followup sbp between the groups is statistically significant
t.test(change_in_systolic ~ assess_count, data = medtronic_subset)
## 
##  Welch Two Sample t-test
## 
## data:  change_in_systolic by assess_count
## t = 2.6913, df = 1618, p-value = 0.007191
## alternative hypothesis: true difference in means between group One and group Two or more  is not equal to 0
## 95 percent confidence interval:
##  0.4904499 3.1265206
## sample estimates:
##          mean in group One mean in group Two or more  
##                  -4.420088                  -6.228573
t.test(change_in_systolic ~ diabetes, data = medtronic_subset)
## 
##  Welch Two Sample t-test
## 
## data:  change_in_systolic by diabetes
## t = -6.148, df = 8260.7, p-value = 8.209e-10
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##  -3.366255 -1.738611
## sample estimates:
##  mean in group No mean in group Yes 
##         -6.761518         -4.209085
t.test(change_in_systolic ~ enrollment_status, data = medtronic_subset)
## 
##  Welch Two Sample t-test
## 
## data:  change_in_systolic by enrollment_status
## t = 3.828, df = 2607.4, p-value = 0.0001322
## alternative hypothesis: true difference in means between group Direct Enrollment and group From Screening is not equal to 0
## 95 percent confidence interval:
##  1.068750 3.313623
## sample estimates:
## mean in group Direct Enrollment    mean in group From Screening 
##                       -5.830948                       -8.022135
t.test(change_in_systolic ~ at_least_two_followup, data = medtronic_subset)
## 
##  Welch Two Sample t-test
## 
## data:  change_in_systolic by at_least_two_followup
## t = 2.6913, df = 1618, p-value = 0.007191
## alternative hypothesis: true difference in means between group Less than two followups and group Two or more followups is not equal to 0
## 95 percent confidence interval:
##  0.4904499 3.1265206
## sample estimates:
## mean in group Less than two followups   mean in group Two or more followups 
##                             -4.420088                             -6.228573

19 Create subsets of patients by enrollment status

# Create subsets of patients by enrollment status
medtronic_direct_enrollment <- medtronic_subset %>%
  filter(enrollment_status == "Direct Enrollment")
medtronic_screening <- medtronic_subset %>%
  filter(enrollment_status == "From Screening")
# Display the number of patients in each subset
nrow(medtronic_direct_enrollment)
## [1] 15374
nrow(medtronic_screening)
## [1] 2033

20 Comparing the mean systolic blood pressure (SBP) at enrollment and follow-up by number of followup assessments-Comparing those with two or less followup assessments in screening dataset

# Comparing the mean systolic blood pressure (SBP) at enrollment and follow-up by number of followup assessments
mean_sbp_one_followup_assessments_screening <- medtronic_screening %>%
  group_by(assess_count) %>%
  summarise(
    Mean_SBP_Enrollment = mean(baseline_systolic, na.rm = TRUE),
    Mean_SBP_Followup = mean(followup_systolic, na.rm = TRUE),
    Number_of_Patients = n()
  )
# Calculate the mean difference
mean_diff_sbp_one_followup_assessments_screening <- mean_sbp_one_followup_assessments_screening %>%
  mutate(Mean_Difference = Mean_SBP_Followup - Mean_SBP_Enrollment)
mean_diff_sbp_one_followup_assessments_screening
## # A tibble: 2 × 5
##   assess_count   Mean_SBP_Enrollment Mean_SBP_Followup Number_of_Patients
##   <chr>                        <dbl>             <dbl>              <int>
## 1 "One"                         141.              137.                169
## 2 "Two or more "                144.              135.               1864
## # ℹ 1 more variable: Mean_Difference <dbl>

21 Testing whether the differences between baseline and followup sbp between the groups is statistically significant

# Testing whether the differences between baseline and followup sbp between the groups is statistically significant
t.test(change_in_systolic ~ assess_count, data = medtronic_screening)
## 
##  Welch Two Sample t-test
## 
## data:  change_in_systolic by assess_count
## t = 2.1467, df = 200.23, p-value = 0.03302
## alternative hypothesis: true difference in means between group One and group Two or more  is not equal to 0
## 95 percent confidence interval:
##  0.3378558 7.9581799
## sample estimates:
##          mean in group One mean in group Two or more  
##                  -4.218935                  -8.366953
t.test(change_in_systolic ~ diabetes, data = medtronic_screening)
## 
##  Welch Two Sample t-test
## 
## data:  change_in_systolic by diabetes
## t = -3.5855, df = 381.4, p-value = 0.0003801
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##  -8.369783 -2.441307
## sample estimates:
##  mean in group No mean in group Yes 
##         -8.763968         -3.358423

22 Comparing the mean systolic blood pressure (SBP) at enrollment and follow-up by number of followup assessments-Comparing those with one followup assessment in direct enrollment dataset

# Comparing the mean systolic blood pressure (SBP) at enrollment and follow-up by number of followup assessments
mean_sbp_one_followup_assessments_direct_enrollment <- medtronic_direct_enrollment %>%
  group_by(assess_count) %>%
  summarise(
    Mean_SBP_Enrollment = mean(baseline_systolic, na.rm = TRUE),
    Mean_SBP_Followup = mean(followup_systolic, na.rm = TRUE),
    Number_of_Patients = n()
  )
# Calculate the mean difference
mean_diff_sbp_one_followup_assessments_direct_enrollment <- mean_sbp_one_followup_assessments_direct_enrollment %>%
  mutate(Mean_Difference = Mean_SBP_Followup - Mean_SBP_Enrollment)
mean_diff_sbp_one_followup_assessments_direct_enrollment
## # A tibble: 2 × 5
##   assess_count   Mean_SBP_Enrollment Mean_SBP_Followup Number_of_Patients
##   <chr>                        <dbl>             <dbl>              <int>
## 1 "One"                         142.              137.               1195
## 2 "Two or more "                142.              136.              14179
## # ℹ 1 more variable: Mean_Difference <dbl>

23 Testing whether the differences between baseline and followup sbp between the groups is statistically significant

# Testing whether the differences between baseline and followup sbp between the groups is statistically significant
t.test(change_in_systolic ~ assess_count, data = medtronic_direct_enrollment)
## 
##  Welch Two Sample t-test
## 
## data:  change_in_systolic by assess_count
## t = 2.0907, df = 1416.1, p-value = 0.03673
## alternative hypothesis: true difference in means between group One and group Two or more  is not equal to 0
## 95 percent confidence interval:
##  0.09255293 2.90529096
## sample estimates:
##          mean in group One mean in group Two or more  
##                  -4.448536                  -5.947458
t.test(change_in_systolic ~ diabetes, data = medtronic_direct_enrollment)
## 
##  Welch Two Sample t-test
## 
## data:  change_in_systolic by diabetes
## t = -5.02, df = 8014.9, p-value = 5.278e-07
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##  -3.030880 -1.328565
## sample estimates:
##  mean in group No mean in group Yes 
##         -6.443721         -4.263998

24 Comparing the mean systolic blood pressure (SBP) at enrollment and follow-up by number of followup assessments-Comparing those with at least two followup assessments in screening dataset

# Comparing the mean systolic blood pressure (SBP) at enrollment and follow-up by number of followup assessments
mean_sbp_at_least_two_followup_assessments_screening <- medtronic_screening %>%
  group_by(at_least_two_followup) %>%
  summarise(
    Mean_SBP_Enrollment = mean(baseline_systolic, na.rm = TRUE),
    Mean_SBP_Followup = mean(followup_systolic, na.rm = TRUE),
    Number_of_Patients = n()
  )
# Calculate the mean difference
mean_diff_sbp_at_least_two_followup_assessments_screening <- mean_sbp_at_least_two_followup_assessments_screening %>%
  mutate(Mean_Difference = Mean_SBP_Followup - Mean_SBP_Enrollment)
mean_diff_sbp_at_least_two_followup_assessments_screening
## # A tibble: 2 × 5
##   at_least_two_followup Mean_SBP_Enrollment Mean_SBP_Followup Number_of_Patients
##   <chr>                               <dbl>             <dbl>              <int>
## 1 Less than two follow…                141.              137.                169
## 2 Two or more followups                144.              135.               1864
## # ℹ 1 more variable: Mean_Difference <dbl>

25 Testing whether the differences between baseline and followup sbp between the groups is statistically significant

# Testing whether the differences between baseline and followup sbp between the groups is statistically significant
t.test(change_in_systolic ~ assess_count, data = medtronic_screening)
## 
##  Welch Two Sample t-test
## 
## data:  change_in_systolic by assess_count
## t = 2.1467, df = 200.23, p-value = 0.03302
## alternative hypothesis: true difference in means between group One and group Two or more  is not equal to 0
## 95 percent confidence interval:
##  0.3378558 7.9581799
## sample estimates:
##          mean in group One mean in group Two or more  
##                  -4.218935                  -8.366953
t.test(change_in_systolic ~ diabetes, data = medtronic_screening)
## 
##  Welch Two Sample t-test
## 
## data:  change_in_systolic by diabetes
## t = -3.5855, df = 381.4, p-value = 0.0003801
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##  -8.369783 -2.441307
## sample estimates:
##  mean in group No mean in group Yes 
##         -8.763968         -3.358423
t.test(change_in_systolic ~ at_least_two_followup, data = medtronic_screening)
## 
##  Welch Two Sample t-test
## 
## data:  change_in_systolic by at_least_two_followup
## t = 2.1467, df = 200.23, p-value = 0.03302
## alternative hypothesis: true difference in means between group Less than two followups and group Two or more followups is not equal to 0
## 95 percent confidence interval:
##  0.3378558 7.9581799
## sample estimates:
## mean in group Less than two followups   mean in group Two or more followups 
##                             -4.218935                             -8.366953

26 Comparing the mean systolic blood pressure (SBP) at enrollment and follow-up by number of followup assessments-Comparing those with two or less followup assessments in direct enrollment dataset

# Comparing the mean systolic blood pressure (SBP) at enrollment and follow-up by number of followup assessments
mean_sbp_at_least_two_followup_assessments_direct_enrollment <- medtronic_direct_enrollment %>%
  group_by(at_least_two_followup) %>%
  summarise(
    Mean_SBP_Enrollment = mean(baseline_systolic, na.rm = TRUE),
    Mean_SBP_Followup = mean(followup_systolic, na.rm = TRUE),
    Number_of_Patients = n()
  )
# Calculate the mean difference
mean_diff_sbp_at_least_two_followup_assessments_direct_enrollment <- mean_sbp_at_least_two_followup_assessments_direct_enrollment %>%
  mutate(Mean_Difference = Mean_SBP_Followup - Mean_SBP_Enrollment)
mean_diff_sbp_at_least_two_followup_assessments_direct_enrollment
## # A tibble: 2 × 5
##   at_least_two_followup Mean_SBP_Enrollment Mean_SBP_Followup Number_of_Patients
##   <chr>                               <dbl>             <dbl>              <int>
## 1 Less than two follow…                142.              137.               1195
## 2 Two or more followups                142.              136.              14179
## # ℹ 1 more variable: Mean_Difference <dbl>

27 Testing whether the differences between baseline and followup sbp between the groups is statistically significant

# Testing whether the differences between baseline and followup sbp between the groups is statistically significant
t.test(change_in_systolic ~ assess_count, data = medtronic_direct_enrollment)
## 
##  Welch Two Sample t-test
## 
## data:  change_in_systolic by assess_count
## t = 2.0907, df = 1416.1, p-value = 0.03673
## alternative hypothesis: true difference in means between group One and group Two or more  is not equal to 0
## 95 percent confidence interval:
##  0.09255293 2.90529096
## sample estimates:
##          mean in group One mean in group Two or more  
##                  -4.448536                  -5.947458
t.test(change_in_systolic ~ diabetes, data = medtronic_direct_enrollment)
## 
##  Welch Two Sample t-test
## 
## data:  change_in_systolic by diabetes
## t = -5.02, df = 8014.9, p-value = 5.278e-07
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##  -3.030880 -1.328565
## sample estimates:
##  mean in group No mean in group Yes 
##         -6.443721         -4.263998
t.test(change_in_systolic ~ at_least_two_followup, data = medtronic_direct_enrollment)
## 
##  Welch Two Sample t-test
## 
## data:  change_in_systolic by at_least_two_followup
## t = 2.0907, df = 1416.1, p-value = 0.03673
## alternative hypothesis: true difference in means between group Less than two followups and group Two or more followups is not equal to 0
## 95 percent confidence interval:
##  0.09255293 2.90529096
## sample estimates:
## mean in group Less than two followups   mean in group Two or more followups 
##                             -4.448536                             -5.947458

28 Comparing the mean systolic blood pressure (SBP) at enrollment and follow-up by number of followup assessments-Comparing those with five or more followup assessments in screening dataset

# Comparing the mean systolic blood pressure (SBP) at enrollment and follow-up by number of followup assessments
mean_sbp_five_followup_assessments_screening <- medtronic_screening %>%
  group_by(num_followup_assess) %>%
  summarise(
    Mean_SBP_Enrollment = mean(baseline_systolic, na.rm = TRUE),
    Mean_SBP_Followup = mean(followup_systolic, na.rm = TRUE),
    Number_of_Patients = n()
  )
# Calculate the mean difference
mean_diff_sbp_five_followup_assessments_screening <- mean_sbp_five_followup_assessments_screening %>%
  mutate(Mean_Difference = Mean_SBP_Followup - Mean_SBP_Enrollment)
mean_diff_sbp_five_followup_assessments_screening
## # A tibble: 2 × 5
##   num_followup_assess Mean_SBP_Enrollment Mean_SBP_Followup Number_of_Patients
##   <chr>                             <dbl>             <dbl>              <int>
## 1 "< 5 "                             142.              136.               1064
## 2 "Five or more"                     145.              135.                969
## # ℹ 1 more variable: Mean_Difference <dbl>

29 Testing whether the differences between baseline and followup sbp between the groups is statistically significant

# Testing whether the differences between baseline and followup sbp between the groups is statistically significant
t.test(change_in_systolic ~ num_followup_assess, data = medtronic_screening)
## 
##  Welch Two Sample t-test
## 
## data:  change_in_systolic by num_followup_assess
## t = 4.4073, df = 2006.2, p-value = 1.102e-05
## alternative hypothesis: true difference in means between group < 5  and group Five or more is not equal to 0
## 95 percent confidence interval:
##  2.622849 6.828389
## sample estimates:
##         mean in group < 5  mean in group Five or more 
##                  -5.769737                 -10.495356
t.test(change_in_systolic ~ diabetes, data = medtronic_screening)
## 
##  Welch Two Sample t-test
## 
## data:  change_in_systolic by diabetes
## t = -3.5855, df = 381.4, p-value = 0.0003801
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##  -8.369783 -2.441307
## sample estimates:
##  mean in group No mean in group Yes 
##         -8.763968         -3.358423
t.test(change_in_systolic ~ assess_count, data = medtronic_screening)
## 
##  Welch Two Sample t-test
## 
## data:  change_in_systolic by assess_count
## t = 2.1467, df = 200.23, p-value = 0.03302
## alternative hypothesis: true difference in means between group One and group Two or more  is not equal to 0
## 95 percent confidence interval:
##  0.3378558 7.9581799
## sample estimates:
##          mean in group One mean in group Two or more  
##                  -4.218935                  -8.366953

30 Comparing the mean systolic blood pressure (SBP) at enrollment and follow-up by number of followup assessments-Comparing those with five or more followup assessments in direct enrollment dataset

# Comparing the mean systolic blood pressure (SBP) at enrollment and follow-up by number of followup assessments
mean_sbp_five_followup_assessments_direct_enrollment <- medtronic_direct_enrollment %>%
  group_by(num_followup_assess) %>%
  summarise(
    Mean_SBP_Enrollment = mean(baseline_systolic, na.rm = TRUE),
    Mean_SBP_Followup = mean(followup_systolic, na.rm = TRUE),
    Number_of_Patients = n()
  )
# Calculate the mean difference
mean_diff_sbp_five_followup_assessments_direct_enrollment <- mean_sbp_five_followup_assessments_direct_enrollment %>%
  mutate(Mean_Difference = Mean_SBP_Followup - Mean_SBP_Enrollment)
mean_diff_sbp_five_followup_assessments_direct_enrollment
## # A tibble: 2 × 5
##   num_followup_assess Mean_SBP_Enrollment Mean_SBP_Followup Number_of_Patients
##   <chr>                             <dbl>             <dbl>              <int>
## 1 "< 5 "                             142.              137.               8192
## 2 "Five or more"                     142.              136.               7182
## # ℹ 1 more variable: Mean_Difference <dbl>

31 Testing whether the differences between baseline and followup sbp between the groups is statistically significant

# Testing whether the differences between baseline and followup sbp between the groups is statistically significant
t.test(change_in_systolic ~ num_followup_assess, data = medtronic_direct_enrollment)
## 
##  Welch Two Sample t-test
## 
## data:  change_in_systolic by num_followup_assess
## t = 5.2274, df = 15123, p-value = 1.742e-07
## alternative hypothesis: true difference in means between group < 5  and group Five or more is not equal to 0
## 95 percent confidence interval:
##  1.287674 2.832687
## sample estimates:
##         mean in group < 5  mean in group Five or more 
##                  -4.868530                  -6.928711
t.test(change_in_systolic ~ diabetes, data = medtronic_direct_enrollment)
## 
##  Welch Two Sample t-test
## 
## data:  change_in_systolic by diabetes
## t = -5.02, df = 8014.9, p-value = 5.278e-07
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##  -3.030880 -1.328565
## sample estimates:
##  mean in group No mean in group Yes 
##         -6.443721         -4.263998
t.test(change_in_systolic ~ assess_count, data = medtronic_direct_enrollment)
## 
##  Welch Two Sample t-test
## 
## data:  change_in_systolic by assess_count
## t = 2.0907, df = 1416.1, p-value = 0.03673
## alternative hypothesis: true difference in means between group One and group Two or more  is not equal to 0
## 95 percent confidence interval:
##  0.09255293 2.90529096
## sample estimates:
##          mean in group One mean in group Two or more  
##                  -4.448536                  -5.947458
t.test(change_in_systolic ~ at_least_two_followup, data = medtronic_direct_enrollment)
## 
##  Welch Two Sample t-test
## 
## data:  change_in_systolic by at_least_two_followup
## t = 2.0907, df = 1416.1, p-value = 0.03673
## alternative hypothesis: true difference in means between group Less than two followups and group Two or more followups is not equal to 0
## 95 percent confidence interval:
##  0.09255293 2.90529096
## sample estimates:
## mean in group Less than two followups   mean in group Two or more followups 
##                             -4.448536                             -5.947458

###############Assessing the effect of followup assessments on systolic blood pressure change#######

32 Fitting a linear regression model to predict followup systolic blood pressure

# Fitting a linear regression model to predict change in systolic blood pressure per month
# Create a new variable for change in systolic blood pressure
medtronic_subset$change_in_systolic <- medtronic_subset$followup_systolic - medtronic_subset$baseline_systolic
# Fit a linear regression model to predict change in systolic blood pressure
lm_sbp_change <- lm(change_in_systolic~num_followup_assess, data = medtronic_screening)
# Display the summary of the linear regression model
summary(lm_sbp_change)
## 
## Call:
## lm(formula = change_in_systolic ~ num_followup_assess, data = medtronic_screening)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -93.50 -13.51   0.77  14.49  89.50 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      -5.7697     0.7396  -7.801 9.76e-15 ***
## num_followup_assessFive or more  -4.7256     1.0713  -4.411 1.08e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.13 on 2031 degrees of freedom
## Multiple R-squared:  0.009489,   Adjusted R-squared:  0.009001 
## F-statistic: 19.46 on 1 and 2031 DF,  p-value: 1.083e-05
# Display the coefficients of the linear regression model
coef(lm_sbp_change)
##                     (Intercept) num_followup_assessFive or more 
##                       -5.769737                       -4.725619
# Display the ANOVA table of the linear regression model
anova(lm_sbp_change)
## Analysis of Variance Table
## 
## Response: change_in_systolic
##                       Df  Sum Sq Mean Sq F value    Pr(>F)    
## num_followup_assess    1   11325 11325.2  19.457 1.083e-05 ***
## Residuals           2031 1182193   582.1                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Display the diagnostic plots of the linear regression model
par(mfrow = c(2, 2))
plot(lm_sbp_change)

# Adding other covariates

# Adding other covariates
lm_sbp_change_2 <- lm(change_in_systolic~num_followup_assess+age+sex+latest_bmi+diabetes, data = medtronic_screening)
# Display the summary of the linear regression model
summary(lm_sbp_change_2)
## 
## Call:
## lm(formula = change_in_systolic ~ num_followup_assess + age + 
##     sex + latest_bmi + diabetes, data = medtronic_screening)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -93.412 -13.718   1.135  14.439  90.860 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     -2.33270    2.65984  -0.877  0.38059    
## num_followup_assessFive or more -4.86393    1.07055  -4.543 5.86e-06 ***
## age                             -0.06557    0.04049  -1.619  0.10551    
## sexMale                         -1.19379    1.21762  -0.980  0.32699    
## latest_bmi                       0.01026    0.01821   0.563  0.57325    
## diabetesYes                      5.72624    1.55156   3.691  0.00023 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.04 on 2027 degrees of freedom
## Multiple R-squared:  0.01833,    Adjusted R-squared:  0.01591 
## F-statistic: 7.571 on 5 and 2027 DF,  p-value: 4.681e-07
# Display the coefficients of the linear regression model
coef(lm_sbp_change_2)
##                     (Intercept) num_followup_assessFive or more 
##                     -2.33269794                     -4.86392963 
##                             age                         sexMale 
##                     -0.06556565                     -1.19379261 
##                      latest_bmi                     diabetesYes 
##                      0.01025691                      5.72623962
# Display the ANOVA table of the linear regression model
anova(lm_sbp_change_2)
## Analysis of Variance Table
## 
## Response: change_in_systolic
##                       Df  Sum Sq Mean Sq F value    Pr(>F)    
## num_followup_assess    1   11325 11325.2 19.5932 1.009e-05 ***
## age                    1    1900  1900.1  3.2872 0.0699685 .  
## sex                    1     574   573.8  0.9928 0.3191877    
## latest_bmi             1     209   208.9  0.3614 0.5478166    
## diabetes               1    7873  7873.0 13.6208 0.0002296 ***
## Residuals           2027 1171637   578.0                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Display the diagnostic plots of the linear regression model
par(mfrow = c(2, 2))
plot(lm_sbp_change_2)

####################################################################################################

32.1 Trying Causal inference methods –> Five or more followup visits

33 Propensity score matching

# Propensity score matching
library(MatchIt)
# Create a new variable for enrollment status
medtronic_screening$num_followup_assess <- as.factor(medtronic_screening$num_followup_assess)
# Fit a propensity score model
ps_model <- matchit(num_followup_assess ~ age + sex + baseline_grade + diabetes, 
                    data = medtronic_screening, method = "nearest", distance = "logit")
# Display the summary of the propensity score model
summary(ps_model)
## 
## Call:
## matchit(formula = num_followup_assess ~ age + sex + baseline_grade + 
##     diabetes, data = medtronic_screening, method = "nearest", 
##     distance = "logit")
## 
## Summary of Balance for All Data:
##                Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## distance              0.4844        0.4695          0.2414     1.0793    0.0629
## age                  63.4964       62.3590          0.0893     0.8437    0.0159
## sexFemale             0.7430        0.7190          0.0550          .    0.0240
## sexMale               0.2570        0.2810         -0.0550          .    0.0240
## baseline_grade        0.9783        0.7970          0.1860     1.1205    0.0453
## diabetesNo            0.8452        0.8788         -0.0928          .    0.0336
## diabetesYes           0.1548        0.1212          0.0928          .    0.0336
##                eCDF Max
## distance         0.1257
## age              0.0438
## sexFemale        0.0240
## sexMale          0.0240
## baseline_grade   0.0913
## diabetesNo       0.0336
## diabetesYes      0.0336
## 
## Summary of Balance for Matched Data:
##                Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## distance              0.4844        0.4765          0.1291     1.1510    0.0350
## age                  63.4964       63.3106          0.0146     0.8952    0.0084
## sexFemale             0.7430        0.7410          0.0047          .    0.0021
## sexMale               0.2570        0.2590         -0.0047          .    0.0021
## baseline_grade        0.9783        0.8648          0.1164     1.0916    0.0284
## diabetesNo            0.8452        0.8669         -0.0599          .    0.0217
## diabetesYes           0.1548        0.1331          0.0599          .    0.0217
##                eCDF Max Std. Pair Dist.
## distance         0.0898          0.1294
## age              0.0310          0.9663
## sexFemale        0.0021          0.6896
## sexMale          0.0021          0.6896
## baseline_grade   0.0506          0.5503
## diabetesNo       0.0217          0.5392
## diabetesYes      0.0217          0.5392
## 
## Sample Sizes:
##           Control Treated
## All          1064     969
## Matched       969     969
## Unmatched      95       0
## Discarded       0       0
# Create a matched dataset
matched_data <- match.data(ps_model)
# Fit a linear regression model to predict change in systolic blood pressure
lm_sbp_change_matched <- lm(change_in_systolic ~ num_followup_assess, data = matched_data)
# Display the summary of the linear regression model
summary(lm_sbp_change_matched)
## 
## Call:
## lm(formula = change_in_systolic ~ num_followup_assess, data = matched_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -93.505 -13.505   0.809  14.809  89.495 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      -6.8091     0.7775  -8.758  < 2e-16 ***
## num_followup_assessFive or more  -3.6863     1.0995  -3.353 0.000816 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.2 on 1936 degrees of freedom
## Multiple R-squared:  0.005772,   Adjusted R-squared:  0.005259 
## F-statistic: 11.24 on 1 and 1936 DF,  p-value: 0.000816
# Display the coefficients of the linear regression model
coef(lm_sbp_change_matched)
##                     (Intercept) num_followup_assessFive or more 
##                       -6.809082                       -3.686275
# Display the ANOVA table of the linear regression model
anova(lm_sbp_change_matched)
## Analysis of Variance Table
## 
## Response: change_in_systolic
##                       Df  Sum Sq Mean Sq F value   Pr(>F)    
## num_followup_assess    1    6584  6583.7   11.24 0.000816 ***
## Residuals           1936 1133990   585.7                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Display the diagnostic plots of the linear regression model
par(mfrow = c(2, 2))
plot(lm_sbp_change_matched)

# Display the balance of covariates before and after matching
library(cobalt)
##  cobalt (Version 4.6.0, Build Date: 2025-04-15)
## 
## Attaching package: 'cobalt'
## The following object is masked from 'package:MatchIt':
## 
##     lalonde
bal.tab(ps_model)
## Balance Measures
##                    Type Diff.Adj
## distance       Distance   0.1291
## age             Contin.   0.0146
## sex_Male         Binary  -0.0021
## baseline_grade  Contin.   0.1164
## diabetes_Yes     Binary   0.0217
## 
## Sample sizes
##           Control Treated
## All          1064     969
## Matched       969     969
## Unmatched      95       0

34 Display the balance of covariates before and after matching

library(cobalt)
bal.tab(ps_model)
## Balance Measures
##                    Type Diff.Adj
## distance       Distance   0.1291
## age             Contin.   0.0146
## sex_Male         Binary  -0.0021
## baseline_grade  Contin.   0.1164
## diabetes_Yes     Binary   0.0217
## 
## Sample sizes
##           Control Treated
## All          1064     969
## Matched       969     969
## Unmatched      95       0

35 Plot the balance of covariates before and after matching

love.plot(ps_model, binary = "std")  # Love plot for standardized mean differences

# Create a plot comparing observed vs. counterfactual BP change for matched data:

# Create a plot comparing observed vs. counterfactual BP change for matched sample
library(ggplot2)
ggplot(matched_data, aes(x = num_followup_assess, y = change_in_systolic)) +
  geom_boxplot() +
  labs(title = "Effect of Follow-up on Systolic BP Change (Matched Sample)",
       x = "Follow-up Assessments",
       y = "Change in Systolic BP (mmHg)")

# Create a plot comparing observed vs. counterfactual BP change for matched sample

# Create a plot comparing observed vs. counterfactual BP change for control group
ggplot(matched_data, aes(x = num_followup_assess, y = change_in_systolic)) +
  geom_boxplot() +
  labs(title = "Effect of Follow-up on Systolic BP Change (Matched Sample)",
       x = "Follow-up Assessments",
       y = "Change in Systolic BP (mmHg)")

## Inverse Probability of Treatment Weighting (IPTW)

# Inverse Probability of Treatment Weighting (IPTW)
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
# Step 1: Estimate the propensity scores
ps <- glm(num_followup_assess ~ age + sex + baseline_grade + diabetes, 
          family = binomial(), data = medtronic_screening)$fitted.values
# Step 2: Create the weights
medtronic_screening$weights <- ifelse(medtronic_screening$num_followup_assess == "Five or more", 1/ps, 1/(1 - ps))
# Step 3: Create a survey design object
design <- svydesign(id = ~1, data = medtronic_screening, weights = ~weights)
# Step 4: Fit a linear regression model using the survey design object
iptw_model <- svyglm(change_in_systolic ~ num_followup_assess, design = design)
# Step 5: Display the summary of the linear regression model
summary(iptw_model)
## 
## Call:
## svyglm(formula = change_in_systolic ~ num_followup_assess, design = design)
## 
## Survey design:
## svydesign(id = ~1, data = medtronic_screening, weights = ~weights)
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      -7.1008     0.7822  -9.078   <2e-16 ***
## num_followup_assessFive or more  -1.9648     1.0879  -1.806   0.0711 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 585.9808)
## 
## Number of Fisher Scoring iterations: 2
# Step 6: Display the coefficients of the linear regression model
coef(iptw_model)
##                     (Intercept) num_followup_assessFive or more 
##                       -7.100807                       -1.964818
# Step 7: Display the ANOVA table of the linear regression model
anova(iptw_model)
## NULL
# Step 8: Display the diagnostic plots of the linear regression model
par(mfrow = c(2, 2))
plot(iptw_model)

# IPTW Balance Diagnostics

# IPTW Balance Diagnostics

library(cobalt)

bal.tab(num_followup_assess ~ age + sex + latest_bmi + diabetes, 
        data = medtronic_screening, 
        weights = medtronic_screening$weights,
        method = "weighting")
## Note: `s.d.denom` not specified; assuming "pooled".
## Balance Measures
##                 Type Diff.Adj
## age          Contin.   0.0002
## sex_Male      Binary  -0.0002
## latest_bmi   Contin.  -0.0267
## diabetes_Yes  Binary  -0.0001
## 
## Effective sample sizes
##               < 5  Five or more
## Unadjusted 1064.            969
## Adjusted   1047.88          954

36 Visualize the balance

love.plot(num_followup_assess ~ age + sex + baseline_grade + diabetes,
          data = medtronic_screening,
          weights = medtronic_screening$weights,
          method = "weighting")
## Note: `s.d.denom` not specified; assuming "pooled".
## Warning: Standardized mean differences and raw mean differences are present in
## the same plot. Use the `stars` argument to distinguish between them and
## appropriately label the x-axis. See `?love.plot` for details.

36.1 Apply Doubly Robust Estimation Method

# Load the survey package
library(survey)

# --- Existing IPTW Steps ---

# Step 1: Estimate the propensity scores
ps <- glm(num_followup_assess ~ age + sex + baseline_grade + diabetes, 
          family = binomial(), data = medtronic_screening)$fitted.values
# Step 2: Create the weights
medtronic_screening$weights <- ifelse(medtronic_screening$num_followup_assess == "Five or more", 1/ps, 1/(1 - ps))
# Step 3: Create a survey design object
design <- svydesign(id = ~1, data = medtronic_screening, weights = ~weights)
# Step 4: Fit a linear regression model using the survey design object
iptw_model <- svyglm(change_in_systolic ~ num_followup_assess, design = design)

# --- Doubly Robust Estimation Step ---

# Step 4: Fit a linear regression model using the survey design object
#         AND include the previously unbalanced covariates (age, diabetes)
#         in the outcome model.
iptw_doubly_robust_model <- svyglm(change_in_systolic ~ num_followup_assess + age + diabetes,
                                   design = design)

# Step 5: Display the summary of the doubly robust linear regression model
summary(iptw_doubly_robust_model)
## 
## Call:
## svyglm(formula = change_in_systolic ~ num_followup_assess + age + 
##     diabetes, design = design)
## 
## Survey design:
## svydesign(id = ~1, data = medtronic_screening, weights = ~weights)
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     -3.04615    2.60254  -1.170 0.241956    
## num_followup_assessFive or more -1.96429    1.08426  -1.812 0.070190 .  
## age                             -0.07612    0.04008  -1.899 0.057663 .  
## diabetesYes                      5.34718    1.54922   3.452 0.000569 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 581.5351)
## 
## Number of Fisher Scoring iterations: 2
# Step 6: Display the coefficients of the doubly robust linear regression model
coef(iptw_doubly_robust_model)
##                     (Intercept) num_followup_assessFive or more 
##                     -3.04615012                     -1.96428747 
##                             age                     diabetesYes 
##                     -0.07612023                      5.34718442
# Step 7: Display the ANOVA table of the doubly robust linear regression model
anova(iptw_doubly_robust_model)
## Anova table:  (Rao-Scott LRT)
## svyglm(formula = change_in_systolic ~ num_followup_assess, design = design)
##                      stats    DEff      df  ddf         p    
## num_followup_assess 1962.1  601.53    1.00 2031 0.0731049 .  
## age                 2131.8  584.11    1.00 2030 0.0580155 .  
## diabetes            6901.8  579.34    1.00 2029 0.0006081 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Step 8: Display the diagnostic plots of the doubly robust linear regression model
par(mfrow = c(2, 2))
plot(iptw_doubly_robust_model)

36.1.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.1

36.2 Trying Causal inference methods –> Two or more visits

37 Propensity score matching

# Propensity score matching
library(MatchIt)
# Create a new variable for enrollment status
medtronic_screening$assess_count <- as.factor(medtronic_screening$assess_count)

# OR Option B: Remove weight variable 
medtronic_screening$weights <- NULL
# Fit a propensity score model
ps_model_2 <- matchit(assess_count ~ age + sex + baseline_grade + diabetes, 
                    data = medtronic_screening, method = "nearest", distance = "logit")
## Warning: Fewer control units than treated units; not all treated units will get
## a match.
# Display the summary of the propensity score model
summary(ps_model_2)
## 
## Call:
## matchit(formula = assess_count ~ age + sex + baseline_grade + 
##     diabetes, data = medtronic_screening, method = "nearest", 
##     distance = "logit")
## 
## Summary of Balance for All Data:
##                Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## distance              0.9172        0.9135          0.2220     1.0300    0.0488
## age                  63.0397       61.3728          0.1264     0.7636    0.0241
## sexFemale             0.7318        0.7160          0.0356          .    0.0158
## sexMale               0.2682        0.2840         -0.0356          .    0.0158
## baseline_grade        0.8879        0.8343          0.0562     1.0201    0.0161
## diabetesNo            0.8584        0.9112         -0.1516          .    0.0529
## diabetesYes           0.1416        0.0888          0.1516          .    0.0529
##                eCDF Max
## distance         0.0963
## age              0.0664
## sexFemale        0.0158
## sexMale          0.0158
## baseline_grade   0.0410
## diabetesNo       0.0529
## diabetesYes      0.0529
## 
## Summary of Balance for Matched Data:
##                Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## distance              0.9502        0.9135          2.2311     0.0723    0.5148
## age                  68.6272       61.3728          0.5503     0.3653    0.0904
## sexFemale             0.8225        0.7160          0.2404          .    0.1065
## sexMale               0.1775        0.2840         -0.2404          .    0.1065
## baseline_grade        0.9467        0.8343          0.1181     0.9810    0.0340
## diabetesNo            0.0059        0.9112         -2.5965          .    0.9053
## diabetesYes           0.9941        0.0888          2.5965          .    0.9053
##                eCDF Max Std. Pair Dist.
## distance         0.9349          2.2311
## age              0.3018          0.8645
## sexFemale        0.1065          0.8548
## sexMale          0.1065          0.8548
## baseline_grade   0.1006          0.9879
## diabetesNo       0.9053          2.5965
## diabetesYes      0.9053          2.5965
## 
## Sample Sizes:
##           Control Treated
## All           169    1864
## Matched       169     169
## Unmatched       0    1695
## Discarded       0       0
# Create a matched dataset
matched_data_2 <- match.data(ps_model_2)
# Fit a linear regression model to predict change in systolic blood pressure
lm_sbp_change_matched_2 <- lm(change_in_systolic ~ assess_count, data = matched_data_2)
# Display the summary of the linear regression model
summary(lm_sbp_change_matched_2)
## 
## Call:
## lm(formula = change_in_systolic ~ assess_count, data = matched_data_2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -81.456 -16.287  -0.118  15.463  67.219 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                -4.219      1.900  -2.220   0.0271 *
## assess_countTwo or more    -4.325      2.688  -1.609   0.1085  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.71 on 336 degrees of freedom
## Multiple R-squared:  0.00765,    Adjusted R-squared:  0.004697 
## F-statistic:  2.59 on 1 and 336 DF,  p-value: 0.1085
# Display the coefficients of the linear regression model
coef(lm_sbp_change_matched_2)
##              (Intercept) assess_countTwo or more  
##                -4.218935                -4.325444
# Display the ANOVA table of the linear regression model
anova(lm_sbp_change_matched_2)
## Analysis of Variance Table
## 
## Response: change_in_systolic
##               Df Sum Sq Mean Sq F value Pr(>F)
## assess_count   1   1581 1580.95  2.5902 0.1085
## Residuals    336 205081  610.36
# Display the diagnostic plots of the linear regression model
par(mfrow = c(2, 2))
plot(lm_sbp_change_matched_2)

# Display the balance of covariates before and after matching
library(cobalt)
bal.tab(ps_model_2)
## Balance Measures
##                    Type Diff.Adj
## distance       Distance   2.2311
## age             Contin.   0.5503
## sex_Male         Binary  -0.1065
## baseline_grade  Contin.   0.1181
## diabetes_Yes     Binary   0.9053
## 
## Sample sizes
##           Control Treated
## All           169    1864
## Matched       169     169
## Unmatched       0    1695

38 Display the balance of covariates before and after matching

library(cobalt)
bal.tab(ps_model_2)
## Balance Measures
##                    Type Diff.Adj
## distance       Distance   2.2311
## age             Contin.   0.5503
## sex_Male         Binary  -0.1065
## baseline_grade  Contin.   0.1181
## diabetes_Yes     Binary   0.9053
## 
## Sample sizes
##           Control Treated
## All           169    1864
## Matched       169     169
## Unmatched       0    1695

39 Plot the balance of covariates before and after matching

love.plot(ps_model_2, binary = "std")  # Love plot for standardized mean differences

# Create a plot comparing observed vs. counterfactual BP change for matched data:

# Create a plot comparing observed vs. counterfactual BP change for matched sample
library(ggplot2)
ggplot(matched_data_2, aes(x = assess_count, y = change_in_systolic)) +
  geom_boxplot() +
  labs(title = "Effect of Follow-up on Systolic BP Change (Matched Sample)",
       x = "Follow-up Assessments",
       y = "Change in Systolic BP (mmHg)")

# Create a plot comparing observed vs. counterfactual BP change for matched sample

# Create a plot comparing observed vs. counterfactual BP change for control group
ggplot(matched_data_2, aes(x = assess_count, y = change_in_systolic)) +
  geom_boxplot() +
  labs(title = "Effect of Follow-up on Systolic BP Change (Matched Sample)",
       x = "Follow-up Assessments",
       y = "Change in Systolic BP (mmHg)")

## Inverse Probability of Treatment Weighting (IPTW)

# Inverse Probability of Treatment Weighting (IPTW)
library(survey)
# Step 1: Estimate the propensity scores
ps_2 <- glm(assess_count ~ age + sex + latest_bmi + diabetes, 
          family = binomial(), data = medtronic_screening)$fitted.values
# Step 2: Create the weights
medtronic_screening$weights_2 <- ifelse(medtronic_screening$assess_count == "> 2", 1/ps_2, 1/(1 - ps_2))
# Step 3: Create a survey design object
design <- svydesign(id = ~1, data = medtronic_screening, weights = ~weights_2)
# Step 4: Fit a linear regression model using the survey design object
iptw_model_2 <- svyglm(change_in_systolic ~ assess_count, design = design)
# Step 5: Display the summary of the linear regression model
summary(iptw_model_2)
## 
## Call:
## svyglm(formula = change_in_systolic ~ assess_count, design = design)
## 
## Survey design:
## svydesign(id = ~1, data = medtronic_screening, weights = ~weights_2)
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                -3.715      1.939  -1.916   0.0555 .
## assess_countTwo or more    -4.404      2.024  -2.176   0.0297 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 590.4661)
## 
## Number of Fisher Scoring iterations: 2
# Step 6: Display the coefficients of the linear regression model
coef(iptw_model_2)
##              (Intercept) assess_countTwo or more  
##                -3.714679                -4.403589
# Step 7: Display the ANOVA table of the linear regression model
anova(iptw_model_2)
## NULL
# Step 8: Display the diagnostic plots of the linear regression model
par(mfrow = c(2, 2))
plot(iptw_model_2)

## IPTW Balance Diagnostics

# IPTW Balance Diagnostics

library(cobalt)

bal.tab(assess_count ~ age + sex + latest_bmi + diabetes, 
        data = medtronic_screening, 
        weights = medtronic_screening$weights_2,
        method = "weighting")
## Note: `s.d.denom` not specified; assuming "pooled".
## Balance Measures
##                 Type Diff.Adj
## age          Contin.   0.0763
## sex_Male      Binary  -0.0185
## latest_bmi   Contin.  -0.0207
## diabetes_Yes  Binary   0.0715
## 
## Effective sample sizes
##               One Two or more 
## Unadjusted 169.        1864.  
## Adjusted   160.67      1765.67

40 Visualize the balance

love.plot(assess_count ~ age + sex + latest_bmi + diabetes,
          data = medtronic_screening,
          weights = medtronic_screening$weights_2,
          method = "weighting")
## Note: `s.d.denom` not specified; assuming "pooled".
## Warning: Standardized mean differences and raw mean differences are present in
## the same plot. Use the `stars` argument to distinguish between them and
## appropriately label the x-axis. See `?love.plot` for details.

40.1 Apply Doubly Robust Estimation Method

# Load the survey package
library(survey)

# --- Existing IPTW Steps ---

# Step 1: Estimate the propensity scores
# (Assuming 'num_followup_assess' is a factor with "Five or more" as one level)
ps_2 <- glm(assess_count ~ age + sex + latest_bmi + diabetes, 
          family = binomial(), data = medtronic_screening)$fitted.values

# Step 2: Create the weights
# Ensure 'num_followup_assess' is a factor and the levels are correctly identified
# For example, if "One" is the reference and "Five or more" is the treated group:

medtronic_screening$weights_2 <- ifelse(medtronic_screening$assess_count == "> 2", 1/ps_2, 1/(1 - ps_2))
# Step 3: Create a survey design object
design <- svydesign(id = ~1, data = medtronic_screening, weights = ~weights_2)
# Step 4: Fit a linear regression model using the survey design object
iptw_model_2 <- svyglm(change_in_systolic ~ assess_count, design = design)


# --- Doubly Robust Estimation Step ---

# Step 4: Fit a linear regression model using the survey design object
#         AND include the previously unbalanced covariates (age, diabetes)
#         in the outcome model.
iptw_doubly_robust_model <- svyglm(change_in_systolic ~ assess_count + age + diabetes,
                                   design = design)

# Step 5: Display the summary of the doubly robust linear regression model
summary(iptw_doubly_robust_model)
## 
## Call:
## svyglm(formula = change_in_systolic ~ assess_count + age + diabetes, 
##     design = design)
## 
## Survey design:
## svydesign(id = ~1, data = medtronic_screening, weights = ~weights_2)
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               1.02141    3.25499   0.314 0.753706    
## assess_countTwo or more  -4.69669    2.00897  -2.338 0.019491 *  
## age                      -0.08667    0.04219  -2.054 0.040081 *  
## diabetesYes               5.41021    1.53042   3.535 0.000417 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 584.3363)
## 
## Number of Fisher Scoring iterations: 2
# Step 6: Display the coefficients of the doubly robust linear regression model
coef(iptw_doubly_robust_model)
##              (Intercept) assess_countTwo or more                       age 
##               1.02141345              -4.69668600              -0.08666987 
##              diabetesYes 
##               5.41021430
# Step 7: Display the ANOVA table of the doubly robust linear regression model
anova(iptw_doubly_robust_model)
## Anova table:  (Rao-Scott LRT)
## svyglm(formula = change_in_systolic ~ assess_count, design = design)
##               stats    DEff      df  ddf         p    
## assess_count 2891.1  610.68    1.00 2031 0.0308532 *  
## age          2768.2  625.31    1.00 2030 0.0368268 *  
## diabetes     9687.5  775.19    1.00 2029 0.0004462 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Step 8: Display the diagnostic plots of the doubly robust linear regression model
par(mfrow = c(2, 2))
plot(iptw_doubly_robust_model)

library(drtmle)
## drtmle: TMLE with doubly robust inference
## Version: 1.1.2
# Encode treatment as binary: 0 = "<5", 1 = "Five or more"
medtronic_screening$treat <- ifelse(medtronic_screening$assess_count == "Five or more", 1, 0)

# 2. Ensure covariates are numeric or factors (no characters!)
medtronic_screening$sex <- as.factor(medtronic_screening$sex)
medtronic_screening$diabetes <- as.factor(medtronic_screening$diabetes)

# 3. Subset covariates (W), treatment (A), and outcome (Y)
W <- medtronic_screening[, c("age", "sex", "latest_bmi", "diabetes")]
W <- as.data.frame(W)
A <- medtronic_screening$treat
Y <- medtronic_screening$change_in_systolic


# # Fiting doubly robust TMLE with GLMs for outcome and treatment models
# dr_result <- drtmle(
#   W = W,
#   A = A,
#   Y = Y,
#   family = gaussian(),
#   glm_Q = ~  + age + sex + latest_bmi + diabetes,
#   glm_g = "A ~ age + sex + latest_bmi + diabetes"
# )
# 
# # Summarize the results
# summary(dr_result)

41 Create a subset of data with individuals with only one follow-up assessment and those with five or more follow-up assessments-To be used for causal inference methods

# Create a subset of data with individuals with only one follow-up assessment and those with five or more follow-up assessments- Use Total follow-up assessments variable and assign the values-drop those with two to four follow-up assessments and assign new dataset name-use screening dataset
medtronic_screening_subset <- medtronic_screening %>%
  filter(total_followup_assessments == 1 | total_followup_assessments >= 5) %>%
  mutate(
    num_followup_assess = case_when(
      total_followup_assessments == 1 ~ "One",
      total_followup_assessments >= 5 ~ "Five or more"
    )
  )
# Display the number of patients in the new subset
nrow(medtronic_screening_subset)
## [1] 1138

42 Comparing the mean systolic blood pressure (SBP) at enrollment and follow-up by number of followup assessments in the new subset

# # Comparing the mean systolic blood pressure (SBP) at enrollment and follow-up by number of followup assessments in the new subset
mean_sbp_one_vs_five_screening <- medtronic_screening_subset %>%
  group_by(num_followup_assess) %>%
  summarise(
    Mean_SBP_Enrollment = mean(baseline_systolic, na.rm = TRUE),
    Mean_SBP_Followup = mean(followup_systolic, na.rm = TRUE),
    Number_of_Patients = n()
  )
# Calculate the mean difference
mean_diff_sbp_one_vs_five_followup_assessments_screening <- mean_sbp_one_vs_five_screening %>%
  mutate(Mean_Difference = Mean_SBP_Followup - Mean_SBP_Enrollment)
mean_diff_sbp_one_vs_five_followup_assessments_screening
## # A tibble: 2 × 5
##   num_followup_assess Mean_SBP_Enrollment Mean_SBP_Followup Number_of_Patients
##   <chr>                             <dbl>             <dbl>              <int>
## 1 Five or more                       145.              135.                969
## 2 One                                141.              137.                169
## # ℹ 1 more variable: Mean_Difference <dbl>

43 Testing whether the differences between baseline and followup sbp between the groups is statistically significant

# Testing whether the differences between baseline and followup sbp between the groups is statistically significant
t.test(change_in_systolic ~ num_followup_assess, data = medtronic_screening_subset)
## 
##  Welch Two Sample t-test
## 
## data:  change_in_systolic by num_followup_assess
## t = -3.1264, df = 232.21, p-value = 0.001996
## alternative hypothesis: true difference in means between group Five or more and group One is not equal to 0
## 95 percent confidence interval:
##  -10.231804  -2.321039
## sample estimates:
## mean in group Five or more          mean in group One 
##                 -10.495356                  -4.218935

44 Fitting a linear regression model to predict change in systolic blood pressure

# Fitting a linear regression model to predict change in systolic blood pressure
lm_sbp_change_screening_subset <- lm(change_in_systolic ~ num_followup_assess, data = medtronic_screening_subset)
# Display the summary of the linear regression model
summary(lm_sbp_change_screening_subset)
## 
## Call:
## lm(formula = change_in_systolic ~ num_followup_assess, data = medtronic_screening_subset)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -93.505 -13.781   0.857  14.495  89.495 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -10.4954     0.7807 -13.444  < 2e-16 ***
## num_followup_assessOne   6.2764     2.0258   3.098  0.00199 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.3 on 1136 degrees of freedom
## Multiple R-squared:  0.008379,   Adjusted R-squared:  0.007506 
## F-statistic: 9.599 on 1 and 1136 DF,  p-value: 0.001994
# Display the coefficients of the linear regression model
coef(lm_sbp_change_screening_subset)
##            (Intercept) num_followup_assessOne 
##             -10.495356               6.276421
# Display the ANOVA table of the linear regression model
anova(lm_sbp_change_screening_subset)
## Analysis of Variance Table
## 
## Response: change_in_systolic
##                       Df Sum Sq Mean Sq F value   Pr(>F)   
## num_followup_assess    1   5669  5668.8  9.5991 0.001994 **
## Residuals           1136 670871   590.6                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Display the diagnostic plots of the linear regression model
par(mfrow = c(2, 2))
plot(lm_sbp_change_screening_subset)

# Adding other covariates

# Adding other covariates
lm_sbp_change_2_screening_subset <- lm(change_in_systolic~num_followup_assess+age+sex+latest_bmi+diabetes, data = medtronic_screening_subset)
# Display the summary of the linear regression model
summary(lm_sbp_change_2_screening_subset)
## 
## Call:
## lm(formula = change_in_systolic ~ num_followup_assess + age + 
##     sex + latest_bmi + diabetes, data = medtronic_screening_subset)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -93.25 -14.02   1.36  14.10  91.25 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -7.38721    3.62952  -2.035 0.042052 *  
## num_followup_assessOne  6.58511    2.02396   3.254 0.001173 ** 
## age                    -0.07193    0.05538  -1.299 0.194244    
## sexMale                -1.31438    1.65298  -0.795 0.426687    
## latest_bmi              0.02451    0.02178   1.125 0.260683    
## diabetesYes             7.21262    2.03990   3.536 0.000423 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.17 on 1132 degrees of freedom
## Multiple R-squared:  0.02263,    Adjusted R-squared:  0.01832 
## F-statistic: 5.243 on 5 and 1132 DF,  p-value: 9.124e-05
# Display the coefficients of the linear regression model
coef(lm_sbp_change_2_screening_subset)
##            (Intercept) num_followup_assessOne                    age 
##            -7.38721204             6.58511279            -0.07193071 
##                sexMale             latest_bmi            diabetesYes 
##            -1.31438445             0.02450555             7.21262220
# Display the ANOVA table of the linear regression model
anova(lm_sbp_change_2_screening_subset)
## Analysis of Variance Table
## 
## Response: change_in_systolic
##                       Df Sum Sq Mean Sq F value   Pr(>F)    
## num_followup_assess    1   5669  5668.8  9.7048 0.001884 ** 
## age                    1   1247  1247.5  2.1356 0.144190    
## sex                    1    429   429.4  0.7351 0.391430    
## latest_bmi             1    665   664.6  1.1377 0.286369    
## diabetes               1   7303  7302.5 12.5017 0.000423 ***
## Residuals           1132 661227   584.1                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Display the diagnostic plots of the linear regression model
par(mfrow = c(2, 2))
plot(lm_sbp_change_2_screening_subset)

## Applying some Causal inference methods –> One vs Five or more follow-up visits # Propensity score matching

# Propensity score matching
library(MatchIt)
# Create a new variable for enrollment status
medtronic_screening_subset$num_followup_assess <- as.factor(medtronic_screening_subset$num_followup_assess)

# Fit a propensity score model
ps_model_1_vs_5 <- matchit(num_followup_assess ~ age + sex + baseline_grade + diabetes, 
                    data = medtronic_screening_subset, method = "nearest", distance = "logit")
# Display the summary of the propensity score model
summary(ps_model_1_vs_5)
## 
## Call:
## matchit(formula = num_followup_assess ~ age + sex + baseline_grade + 
##     diabetes, data = medtronic_screening_subset, method = "nearest", 
##     distance = "logit")
## 
## Summary of Balance for All Data:
##                Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## distance              0.1590        0.1467          0.3167     1.0028    0.0755
## age                  61.3728       63.4964         -0.1408     1.4015    0.0295
## sexFemale             0.7160        0.7430         -0.0600          .    0.0271
## sexMale               0.2840        0.2570          0.0600          .    0.0271
## baseline_grade        0.8343        0.9783         -0.1527     0.9349    0.0360
## diabetesNo            0.9112        0.8452          0.2322          .    0.0660
## diabetesYes           0.0888        0.1548         -0.2322          .    0.0660
##                eCDF Max
## distance         0.1515
## age              0.0808
## sexFemale        0.0271
## sexMale          0.0271
## baseline_grade   0.0853
## diabetesNo       0.0660
## diabetesYes      0.0660
## 
## Summary of Balance for Matched Data:
##                Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## distance              0.1590        0.1591         -0.0009     0.9988    0.0006
## age                  61.3728       61.6627         -0.0192     1.1747    0.0137
## sexFemale             0.7160        0.7337         -0.0394          .    0.0178
## sexMale               0.2840        0.2663          0.0394          .    0.0178
## baseline_grade        0.8343        0.8343          0.0000     0.9614    0.0059
## diabetesNo            0.9112        0.9231         -0.0416          .    0.0118
## diabetesYes           0.0888        0.0769          0.0416          .    0.0118
##                eCDF Max Std. Pair Dist.
## distance         0.0118          0.0037
## age              0.0355          0.3048
## sexFemale        0.0178          0.3805
## sexMale          0.0178          0.3805
## baseline_grade   0.0118          0.2604
## diabetesNo       0.0118          0.1248
## diabetesYes      0.0118          0.1248
## 
## Sample Sizes:
##           Control Treated
## All           969     169
## Matched       169     169
## Unmatched     800       0
## Discarded       0       0
# Create a matched dataset
matched_data_1_vs_5 <- match.data(ps_model_1_vs_5)
# Fit a linear regression model to predict change in systolic blood pressure
lm_sbp_change_matched_1_vs_5 <- lm(change_in_systolic ~ num_followup_assess, data = matched_data_1_vs_5)
# Display the summary of the linear regression model
summary(lm_sbp_change_matched_1_vs_5)
## 
## Call:
## lm(formula = change_in_systolic ~ num_followup_assess, data = matched_data_1_vs_5)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -94.45 -13.78  -0.45  14.05  81.55 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -7.550      1.841  -4.102 5.15e-05 ***
## num_followup_assessOne    3.331      2.603   1.280    0.202    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.93 on 336 degrees of freedom
## Multiple R-squared:  0.004851,   Adjusted R-squared:  0.001889 
## F-statistic: 1.638 on 1 and 336 DF,  p-value: 0.2015
# Display the coefficients of the linear regression model
coef(lm_sbp_change_matched_1_vs_5)
##            (Intercept) num_followup_assessOne 
##              -7.550296               3.331361
# Display the ANOVA table of the linear regression model
anova(lm_sbp_change_matched_1_vs_5)
## Analysis of Variance Table
## 
## Response: change_in_systolic
##                      Df Sum Sq Mean Sq F value Pr(>F)
## num_followup_assess   1    938  937.78  1.6378 0.2015
## Residuals           336 192389  572.59
# Display the diagnostic plots of the linear regression model
par(mfrow = c(2, 2))
plot(lm_sbp_change_matched_1_vs_5)

# Display the balance of covariates before and after matching
library(cobalt)
bal.tab(ps_model_1_vs_5)
## Balance Measures
##                    Type Diff.Adj
## distance       Distance  -0.0009
## age             Contin.  -0.0192
## sex_Male         Binary   0.0178
## baseline_grade  Contin.   0.0000
## diabetes_Yes     Binary   0.0118
## 
## Sample sizes
##           Control Treated
## All           969     169
## Matched       169     169
## Unmatched     800       0

45 Display the balance of covariates before and after matching

library(cobalt)
bal.tab(ps_model_1_vs_5)
## Balance Measures
##                    Type Diff.Adj
## distance       Distance  -0.0009
## age             Contin.  -0.0192
## sex_Male         Binary   0.0178
## baseline_grade  Contin.   0.0000
## diabetes_Yes     Binary   0.0118
## 
## Sample sizes
##           Control Treated
## All           969     169
## Matched       169     169
## Unmatched     800       0

46 Plot the balance of covariates before and after matching

love.plot(ps_model_1_vs_5, binary = "std")  # Love plot for standardized mean differences

# Create a plot comparing observed vs. counterfactual BP change for matched data:

# Create a plot comparing observed vs. counterfactual BP change for matched sample
library(ggplot2)
ggplot(matched_data_1_vs_5, aes(x = num_followup_assess, y = change_in_systolic)) +
  geom_boxplot() +
  labs(title = "Effect of Follow-up on Systolic BP Change (Matched Sample)",
       x = "Follow-up Assessments",
       y = "Change in Systolic BP (mmHg)")

46.1 Inverse Probability of Treatment Weighting (IPTW)

# Inverse Probability of Treatment Weighting (IPTW)
library(survey)
# Step 1: Estimate the propensity scores
ps_1_vs_5 <- glm(num_followup_assess ~ age + sex + baseline_grade + diabetes, 
          family = binomial(), data = medtronic_screening_subset)$fitted.values
# Step 2: Create the weights
medtronic_screening_subset$weights_1_vs_5 <- ifelse(medtronic_screening_subset$num_followup_assess == "Five or more", 1/ps_1_vs_5, 1/(1 - ps_1_vs_5))
# Step 3: Create a survey design object
design <- svydesign(id = ~1, data = medtronic_screening_subset, weights = ~weights_1_vs_5)
# Step 4: Fit a linear regression model using the survey design object
iptw_model_1_vs_5 <- svyglm(change_in_systolic ~ num_followup_assess, design = design)
# Step 5: Display the summary of the linear regression model
summary(iptw_model_1_vs_5)
## 
## Call:
## svyglm(formula = change_in_systolic ~ num_followup_assess, design = design)
## 
## Survey design:
## svydesign(id = ~1, data = medtronic_screening_subset, weights = ~weights_1_vs_5)
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -12.3649     0.8823 -14.014  < 2e-16 ***
## num_followup_assessOne   8.3747     2.0233   4.139 3.74e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 634.3074)
## 
## Number of Fisher Scoring iterations: 2
# Step 6: Display the coefficients of the linear regression model
coef(iptw_model_1_vs_5)
##            (Intercept) num_followup_assessOne 
##             -12.364858               8.374726
# Step 7: Display the ANOVA table of the linear regression model
anova(iptw_model_1_vs_5)
## NULL
# Step 8: Display the diagnostic plots of the linear regression model
par(mfrow = c(2, 2))
plot(iptw_model_1_vs_5)

# IPTW Balance Diagnostics

# IPTW Balance Diagnostics

library(cobalt)

bal.tab(num_followup_assess ~ age + sex + latest_bmi + diabetes, 
        data = medtronic_screening_subset,, 
        weights = medtronic_screening_subset$weights_1_vs_5,
        method = "weighting")
## Note: `s.d.denom` not specified; assuming "pooled".
## Balance Measures
##                 Type Diff.Adj
## age          Contin.  -0.3040
## sex_Male      Binary   0.0543
## latest_bmi   Contin.   0.0627
## diabetes_Yes  Binary  -0.1585
## 
## Effective sample sizes
##            Five or more    One
## Unadjusted       969.   169.  
## Adjusted         874.92 168.63

47 Visualize the balance

love.plot(num_followup_assess ~ age + sex + baseline_grade + diabetes,
          data = medtronic_screening_subset,
          weights = medtronic_screening_subset$weights_1_vs_5,
          method = "weighting")
## Note: `s.d.denom` not specified; assuming "pooled".
## Warning: Standardized mean differences and raw mean differences are present in
## the same plot. Use the `stars` argument to distinguish between them and
## appropriately label the x-axis. See `?love.plot` for details.

## Apply Doubly Robust Estimation Method

# Load the survey package
library(survey)

# --- Existing IPTW Steps ---

# Step 1: Estimate the propensity scores
# (Assuming 'num_followup_assess' is a factor with "Five or more" as one level)
ps_1_vs_5 <- glm(num_followup_assess ~ age + sex + baseline_grade + diabetes,
                 family = binomial(), data = medtronic_screening_subset)$fitted.values

# Step 2: Create the weights
# Ensure 'num_followup_assess' is a factor and the levels are correctly identified
# For example, if "One" is the reference and "Five or more" is the treated group:
medtronic_screening_subset$weights_1_vs_5 <- ifelse(
    medtronic_screening_subset$num_followup_assess == "Five or more",
    1 / ps_1_vs_5,
    1 / (1 - ps_1_vs_5)
)

# Step 3: Create a survey design object
design <- svydesign(id = ~1, data = medtronic_screening_subset, weights = ~weights_1_vs_5)

# --- Doubly Robust Estimation Step ---

# Step 4: Fit a linear regression model using the survey design object
#         AND include the previously unbalanced covariates (age, diabetes)
#         in the outcome model.
iptw_doubly_robust_model <- svyglm(change_in_systolic ~ num_followup_assess + age + diabetes,
                                   design = design)

# Step 5: Display the summary of the doubly robust linear regression model
summary(iptw_doubly_robust_model)
## 
## Call:
## svyglm(formula = change_in_systolic ~ num_followup_assess + age + 
##     diabetes, design = design)
## 
## Survey design:
## svydesign(id = ~1, data = medtronic_screening_subset, weights = ~weights_1_vs_5)
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -5.61669    4.43271  -1.267 0.205380    
## num_followup_assessOne  8.99313    2.00876   4.477 8.34e-06 ***
## age                    -0.13084    0.07014  -1.866 0.062367 .  
## diabetesYes             7.40757    2.24213   3.304 0.000984 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 621.6061)
## 
## Number of Fisher Scoring iterations: 2
# Step 6: Display the coefficients of the doubly robust linear regression model
coef(iptw_doubly_robust_model)
##            (Intercept) num_followup_assessOne                    age 
##             -5.6166916              8.9931348             -0.1308387 
##            diabetesYes 
##              7.4075721
# Step 7: Display the ANOVA table of the doubly robust linear regression model
anova(iptw_doubly_robust_model)
## Anova table:  (Rao-Scott LRT)
## svyglm(formula = change_in_systolic ~ num_followup_assess, design = design)
##                       stats     DEff       df  ddf         p    
## num_followup_assess  2117.1   123.57     1.00 1136 4.047e-05 ***
## age                  3220.6   927.71     1.00 1135  0.064605 .  
## diabetes            11220.7  1027.99     1.00 1134  0.001049 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Step 8: Display the diagnostic plots of the doubly robust linear regression model
par(mfrow = c(2, 2))
plot(iptw_doubly_robust_model)

48 Assessing proportions of blood pressure control at follow-up by number of follow-up assessments

# Assessing proportions of blood pressure control at follow-up by number of follow-up assessments
bp_control_proportions <- medtronic_screening %>%
  group_by(assess_count) %>%
  summarise(
    Proportion_BP_Control = mean(followup_systolic < 140, na.rm = TRUE),
    Number_of_Patients = n()
  )
# Calculate the mean difference
bp_control_proportions <- bp_control_proportions %>%
  mutate(Mean_Difference = Proportion_BP_Control - mean(medtronic_screening$followup_systolic < 140, na.rm = TRUE))
bp_control_proportions
## # A tibble: 2 × 4
##   assess_count   Proportion_BP_Control Number_of_Patients Mean_Difference
##   <fct>                          <dbl>              <int>           <dbl>
## 1 "One"                          0.621                169        -0.0344 
## 2 "Two or more "                 0.659               1864         0.00312

49 Alluvial for changes in systolic blood pressure by number of follow-up assessments

# Alluvial for changes in systolic blood pressure by number of follow-up assessments
library(ggalluvial)
medtronic_screening %>%
  mutate(
    change_in_systolic = case_when(
      change_in_systolic < -10 ~ "Decrease > 10",
      change_in_systolic >= -10 & change_in_systolic <= 10 ~ "No Change",
      change_in_systolic > 10 ~ "Increase > 10"
    )
  ) %>%
  ggplot(aes(axis1 = assess_count, axis2 = change_in_systolic)) +
  geom_alluvium(aes(fill = change_in_systolic)) +
  geom_stratum() +
  geom_text(stat = "stratum", aes(label = after_stat(stratum))) +
  labs(title = "Changes in Systolic Blood Pressure by Number of Follow-up Assessments",
       x = "Number of Follow-up Assessments",
       y = "Change in Systolic Blood Pressure") +
  theme_minimal()

# Alluvial for changes in systolic blood pressure by number of follow-up assessments in direct enrollment dataset

# Alluvial for changes in systolic blood pressure by number of follow-up assessments in direct enrollment dataset
library(ggalluvial)
medtronic_direct_enrollment %>%
  mutate(
    change_in_systolic = case_when(
      change_in_systolic < -10 ~ "Decrease > 10",
      change_in_systolic >= -10 & change_in_systolic <= 10 ~ "No Change",
      change_in_systolic > 10 ~ "Increase > 10"
    )
  ) %>%
  ggplot(aes(axis1 = assess_count, axis2 = change_in_systolic)) +
  geom_alluvium(aes(fill = change_in_systolic)) +
  geom_stratum() +
  geom_text(stat = "stratum", aes(label = after_stat(stratum))) +
  labs(title = "Changes in Systolic Blood Pressure by Number of Follow-up Assessments (Direct Enrollment)",
       x = "Number of Follow-up Assessments",
       y = "Change in Systolic Blood Pressure") +
  theme_minimal()

# Alluvial for changes in systolic blood pressure by number of follow-up assessments in screening dataset
# Alluvial for changes in systolic blood pressure by number of follow-up assessments in screening dataset
medtronic_screening %>%
  mutate(
    change_in_systolic = case_when(
      change_in_systolic < -10 ~ "Decrease > 10",
      change_in_systolic >= -10 & change_in_systolic <= 10 ~ "No Change",
      change_in_systolic > 10 ~ "Increase > 10"
    )
  ) %>%
  ggplot(aes(axis1 = assess_count, axis2 = change_in_systolic)) +
  geom_alluvium(aes(fill = change_in_systolic)) +
  geom_stratum() +
  geom_text(stat = "stratum", aes(label = after_stat(stratum))) +
  labs(title = "Changes in Systolic Blood Pressure by Number of Follow-up Assessments (Screening)",
       x = "Number of Follow-up Assessments",
       y = "Change in Systolic Blood Pressure") +
  theme_minimal()

# Alluvial for transitions in blood pressure control from baseline and follow-up -Add percentage of patients in each category

# Alluvial for transitions in blood pressure control from baseline and follow-up -Add percentage of patients in each category-use baseline control and follow-up control variables
library(dplyr)
library(ggplot2)
library(ggalluvial)

medtronic_screening %>%
  mutate(
    baseline_control = ifelse(baseline_systolic < 140, "Controlled", "Uncontrolled"),
    followup_control = ifelse(followup_systolic < 140, "Controlled", "Uncontrolled")
  ) %>%
  group_by(baseline_control, followup_control) %>%
  summarise(Count = n(), .groups = "drop") %>%
  ggplot(aes(axis1 = baseline_control, axis2 = followup_control, y = Count)) +
  geom_alluvium(aes(fill = followup_control), width = 1/12) +
  geom_stratum(width = 1/12, fill = "grey", color = "black") +
  geom_text(stat = "stratum", aes(label = after_stat(stratum))) +
  scale_x_discrete(limits = c("Baseline", "Follow-up"), expand = c(.05, .05)) +
  scale_fill_manual(
    values = c("Controlled" = "green", "Uncontrolled" = "red"),
    name = "Follow-up Control Status"
  ) +
  labs(
    title = "Transitions in Blood Pressure Control from Baseline to Follow-up",
    x = "Timepoint",
    y = "Number of Individuals"
  ) +
  theme_minimal()
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

# Alluvial for transitions in blood pressure control from baseline and follow-up -Add percentage of patients in each category

library(dplyr)
library(ggplot2)
library(ggalluvial)

# Summarized data
df_summary <- medtronic_screening %>%
  mutate(
    baseline_control = ifelse(baseline_systolic < 140, "Controlled", "Uncontrolled"),
    followup_control = ifelse(followup_systolic < 140, "Controlled", "Uncontrolled")
  ) %>%
  group_by(baseline_control, followup_control) %>%
  summarise(Count = n(), .groups = "drop")

# Compute % for each axis
baseline_pct <- df_summary %>%
  group_by(baseline_control) %>%
  summarise(total = sum(Count)) %>%
  mutate(pct_label = paste0(baseline_control, " (", round(100 * total / sum(total), 1), "%)"))

followup_pct <- df_summary %>%
  group_by(followup_control) %>%
  summarise(total = sum(Count)) %>%
  mutate(pct_label = paste0(followup_control, " (", round(100 * total / sum(total), 1), "%)"))

# Named vectors to map stratum names to % labels
baseline_labels <- setNames(baseline_pct$pct_label, baseline_pct$baseline_control)
followup_labels <- setNames(followup_pct$pct_label, followup_pct$followup_control)

# Final plot
ggplot(df_summary, aes(axis1 = baseline_control, axis2 = followup_control, y = Count)) +
  geom_alluvium(aes(fill = followup_control), width = 1/12) +
  geom_stratum(width = 1/12, fill = "grey", color = "black") +
  geom_text(
    stat = "stratum",
    aes(label = ifelse(
      after_stat(x) == 1,
      baseline_labels[after_stat(stratum)],
      followup_labels[after_stat(stratum)]
    )),
    size = 4
  ) +
  scale_x_discrete(limits = c("Baseline", "Follow-up"), expand = c(.05, .05)) +
  scale_fill_manual(
    values = c("Controlled" = "green", "Uncontrolled" = "orange"),
    name = "Follow-up Control Status"
  ) +
  labs(
    title = "Transitions in Blood Pressure Control from Baseline to Follow-up_Enrolled from Screening",
    x = "Timepoint",
    y = "Number of Individuals"
  ) +
  theme_minimal()
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

# Alluvial for transitions in blood pressure control from baseline and follow-up -Add percentage of patients in each category-Enrollment dataset

# Alluvial for transitions in blood pressure control from baseline and follow-up -Add percentage of patients in each category-Enrollment dataset
# Summarized data
df_summary <- medtronic_direct_enrollment %>%
  mutate(
    baseline_control = ifelse(baseline_systolic < 140, "Controlled", "Uncontrolled"),
    followup_control = ifelse(followup_systolic < 140, "Controlled", "Uncontrolled")
  ) %>%
  group_by(baseline_control, followup_control) %>%
  summarise(Count = n(), .groups = "drop")

# Compute % for each axis
baseline_pct <- df_summary %>%
  group_by(baseline_control) %>%
  summarise(total = sum(Count)) %>%
  mutate(pct_label = paste0(baseline_control, " (", round(100 * total / sum(total), 1), "%)"))

followup_pct <- df_summary %>%
  group_by(followup_control) %>%
  summarise(total = sum(Count)) %>%
  mutate(pct_label = paste0(followup_control, " (", round(100 * total / sum(total), 1), "%)"))

# Named vectors to map stratum names to % labels
baseline_labels <- setNames(baseline_pct$pct_label, baseline_pct$baseline_control)
followup_labels <- setNames(followup_pct$pct_label, followup_pct$followup_control)

# Final plot
ggplot(df_summary, aes(axis1 = baseline_control, axis2 = followup_control, y = Count)) +
  geom_alluvium(aes(fill = followup_control), width = 1/12) +
  geom_stratum(width = 1/12, fill = "grey", color = "black") +
  geom_text(
    stat = "stratum",
    aes(label = ifelse(
      after_stat(x) == 1,
      baseline_labels[after_stat(stratum)],
      followup_labels[after_stat(stratum)]
    )),
    size = 4
  ) +
  scale_x_discrete(limits = c("Baseline", "Follow-up"), expand = c(.05, .05)) +
  scale_fill_manual(
    values = c("Controlled" = "green", "Uncontrolled" = "orange"),
    name = "Follow-up Control Status"
  ) +
  labs(
    title = "Transitions in Blood Pressure Control from Baseline to Follow-up_Direct Enrollment",
    x = "Timepoint",
    y = "Number of Individuals"
  ) +
  theme_minimal()
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.