# Set the knitr chunk default `echo = TRUE` for the chunks in this document.
knitr::opts_chunk$set(echo = TRUE)
library(rmarkdown)
library(kableExtra)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:kableExtra':
## 
##     group_rows
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
library(tidyr)
library(lubridate)  
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

#Clear the environment

# Clear the environment
# Removes every object that `ls()` lists in the current environment.
# NOTE(review): this does not detach packages or reset options, and it wipes
# the caller's workspace when the code is run interactively. Consider relying
# on a fresh R session instead -- confirm nothing depends on it before removing.
rm(list = ls())

1 Load the dataset

# Read the 12-month blood-pressure results CSV into `data_12mths`.
# NOTE(review): the path is absolute and machine-specific; the file must exist
# at exactly this location for the read to succeed.
data_12mths <- read_csv(
  file = "/Users/jamesoguta/Documents/James Oguta/My PhD Folder-2023-2025/PhD Data Analysis-Modelling/Medtronic Dataset/Quantitative Data/Kenya/MEDTRONIC Dataset/Latest/Outputs/bp_result_12mnths.csv"
)
## Rows: 61041 Columns: 36
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (11): country, facility, diagnosis, sex, county, enrollment_status, enr...
## dbl  (21): patient_id, age, latest_bmi, baseline_systolic, baseline_diastoli...
## date  (4): enrollment_date, baseline_assessment_date, anchor_date, followup_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows of the loaded data
head(data_12mths, n = 6L)
## # A tibble: 6 × 36
##   patient_id country   age facility     diagnosis sex   county enrollment_status
##        <dbl> <chr>   <dbl> <chr>        <chr>     <chr> <chr>  <chr>            
## 1          1 Kenya      65 Matiliku su… "{Hypert… Fema… Makue… Direct Enrollment
## 2          5 Kenya      42 Tawa Sub-co… "{Hypert… Fema… Makue… Direct Enrollment
## 3          6 Kenya      48 Kakamega Co… "{Hypert… Fema… Kakam… Direct Enrollment
## 4          9 Kenya      43 Kibwezi Sub… "{Hypert… Fema… Makue… Direct Enrollment
## 5         11 Kenya      69 Kibwezi Sub… "{Hypert… Male  Makue… Direct Enrollment
## 6         15 Kenya      65 Kilungu sub… "{Hypert… Fema… Makue… Direct Enrollment
## # ℹ 28 more variables: enrollment_bmi <chr>, latest_bmi <dbl>,
## #   baseline_systolic <dbl>, baseline_diastolic <dbl>,
## #   baseline_control_status <chr>, baseline_grade <dbl>,
## #   enrollment_date <date>, baseline_assessment_date <date>,
## #   anchor_date <date>, followup_systolic <dbl>, followup_diastolic <dbl>,
## #   followup_control_status <chr>, followup_grade <dbl>, followup_date <date>,
## #   days_between_assessment_and_anchor <dbl>, …

2 Check the structure of the dataset

# Print the compact structure (column types and leading values) of the data
str(object = data_12mths)
## spc_tbl_ [61,041 × 36] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ patient_id                          : num [1:61041] 1 5 6 9 11 15 16 17 29 30 ...
##  $ country                             : chr [1:61041] "Kenya" "Kenya" "Kenya" "Kenya" ...
##  $ age                                 : num [1:61041] 65 42 48 43 69 65 48 55 70 49 ...
##  $ facility                            : chr [1:61041] "Matiliku sub county hospital" "Tawa Sub-county Hospital" "Kakamega County General Hospital" "Kibwezi Sub-county Hospital" ...
##  $ diagnosis                           : chr [1:61041] "{Hypertension,\"Diabetes Mellitus Type 2\"}" "{Hypertension}" "{Hypertension,\"Diabetes Mellitus Type 2\"}" "{Hypertension}" ...
##  $ sex                                 : chr [1:61041] "Female" "Female" "Female" "Female" ...
##  $ county                              : chr [1:61041] "Makueni" "Makueni" "Kakamega" "Makueni" ...
##  $ enrollment_status                   : chr [1:61041] "Direct Enrollment" "Direct Enrollment" "Direct Enrollment" "Direct Enrollment" ...
##  $ enrollment_bmi                      : chr [1:61041] "25.4" "37.29" "29.87" "33.83" ...
##  $ latest_bmi                          : num [1:61041] 25.4 37.3 30.1 33.8 23.2 ...
##  $ baseline_systolic                   : num [1:61041] 165 211 136 151 125 146 119 133 117 185 ...
##  $ baseline_diastolic                  : num [1:61041] 74 124 83 87 81 72 75 86 70 86 ...
##  $ baseline_control_status             : chr [1:61041] "Uncontrolled" "Uncontrolled" "Controlled" "Uncontrolled" ...
##  $ baseline_grade                      : num [1:61041] 2 3 0 1 0 1 0 0 0 3 ...
##  $ enrollment_date                     : Date[1:61041], format: "2020-12-14" "2021-11-17" ...
##  $ baseline_assessment_date            : Date[1:61041], format: "2022-02-28" "2021-11-17" ...
##  $ anchor_date                         : Date[1:61041], format: "2023-02-28" "2022-11-17" ...
##  $ followup_systolic                   : num [1:61041] 137 136 NA 146 119 113 NA NA 123 NA ...
##  $ followup_diastolic                  : num [1:61041] 56 65 NA 78 70 73 NA NA 77 NA ...
##  $ followup_control_status             : chr [1:61041] "Controlled" "Controlled" NA "Uncontrolled" ...
##  $ followup_grade                      : num [1:61041] 0 0 NA 1 0 0 NA NA 0 NA ...
##  $ followup_date                       : Date[1:61041], format: "2023-02-14" "2022-11-03" ...
##  $ days_between_assessment_and_anchor  : num [1:61041] 14 14 NA 6 27 23 NA NA 19 NA ...
##  $ days_between_assessment_and_baseline: num [1:61041] 351 351 NA 371 392 342 NA NA 346 NA ...
##  $ number_followup_assessments         : num [1:61041] 3 1 NA 0 3 0 NA NA 11 NA ...
##  $ number_followup_by_hs               : num [1:61041] 2 0 NA 0 2 1 NA NA 1 NA ...
##  $ number_medical_reviews              : num [1:61041] 0 0 NA 0 0 0 NA NA 0 NA ...
##  $ controlled_followup_days            : num [1:61041] 351 351 -1 NA -1 342 -1 -1 -1 NA ...
##  $ number_uncontrolled_followups       : num [1:61041] 1 0 -1 1 -1 0 -1 -1 -1 NA ...
##  $ number_community_assessments        : num [1:61041] 0 0 NA 0 0 0 NA NA 1 NA ...
##  $ number_facility_assessments         : num [1:61041] 1 0 NA 0 1 1 NA NA 4 NA ...
##  $ number_feeder_assessments           : num [1:61041] 0 0 NA 0 0 1 NA NA 0 NA ...
##  $ change_in_systolic                  : num [1:61041] -28 -75 NA -5 -6 -33 NA NA 6 NA ...
##  $ change_in_diastolic                 : num [1:61041] -18 -59 NA -9 -11 1 NA NA 7 NA ...
##  $ filter_col_str                      : chr [1:61041] "Yes" "Yes" "No" "Yes" ...
##  $ diabetes                            : chr [1:61041] "Yes" "No" "Yes" "No" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   patient_id = col_double(),
##   ..   country = col_character(),
##   ..   age = col_double(),
##   ..   facility = col_character(),
##   ..   diagnosis = col_character(),
##   ..   sex = col_character(),
##   ..   county = col_character(),
##   ..   enrollment_status = col_character(),
##   ..   enrollment_bmi = col_character(),
##   ..   latest_bmi = col_double(),
##   ..   baseline_systolic = col_double(),
##   ..   baseline_diastolic = col_double(),
##   ..   baseline_control_status = col_character(),
##   ..   baseline_grade = col_double(),
##   ..   enrollment_date = col_date(format = ""),
##   ..   baseline_assessment_date = col_date(format = ""),
##   ..   anchor_date = col_date(format = ""),
##   ..   followup_systolic = col_double(),
##   ..   followup_diastolic = col_double(),
##   ..   followup_control_status = col_character(),
##   ..   followup_grade = col_double(),
##   ..   followup_date = col_date(format = ""),
##   ..   days_between_assessment_and_anchor = col_double(),
##   ..   days_between_assessment_and_baseline = col_double(),
##   ..   number_followup_assessments = col_double(),
##   ..   number_followup_by_hs = col_double(),
##   ..   number_medical_reviews = col_double(),
##   ..   controlled_followup_days = col_double(),
##   ..   number_uncontrolled_followups = col_double(),
##   ..   number_community_assessments = col_double(),
##   ..   number_facility_assessments = col_double(),
##   ..   number_feeder_assessments = col_double(),
##   ..   change_in_systolic = col_double(),
##   ..   change_in_diastolic = col_double(),
##   ..   filter_col_str = col_character(),
##   ..   diabetes = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

3 Check the column names

# List the column names of the data
names(data_12mths)
##  [1] "patient_id"                          
##  [2] "country"                             
##  [3] "age"                                 
##  [4] "facility"                            
##  [5] "diagnosis"                           
##  [6] "sex"                                 
##  [7] "county"                              
##  [8] "enrollment_status"                   
##  [9] "enrollment_bmi"                      
## [10] "latest_bmi"                          
## [11] "baseline_systolic"                   
## [12] "baseline_diastolic"                  
## [13] "baseline_control_status"             
## [14] "baseline_grade"                      
## [15] "enrollment_date"                     
## [16] "baseline_assessment_date"            
## [17] "anchor_date"                         
## [18] "followup_systolic"                   
## [19] "followup_diastolic"                  
## [20] "followup_control_status"             
## [21] "followup_grade"                      
## [22] "followup_date"                       
## [23] "days_between_assessment_and_anchor"  
## [24] "days_between_assessment_and_baseline"
## [25] "number_followup_assessments"         
## [26] "number_followup_by_hs"               
## [27] "number_medical_reviews"              
## [28] "controlled_followup_days"            
## [29] "number_uncontrolled_followups"       
## [30] "number_community_assessments"        
## [31] "number_facility_assessments"         
## [32] "number_feeder_assessments"           
## [33] "change_in_systolic"                  
## [34] "change_in_diastolic"                 
## [35] "filter_col_str"                      
## [36] "diabetes"

4 Check the dimensions of the dataset

# Report the number of rows and columns, in that order
c(nrow(data_12mths), ncol(data_12mths))
## [1] 61041    36

5 Check for missing values

# Count the NA entries in every column (named numeric vector, one per column)
missing_values <- data_12mths %>%
  is.na() %>%
  colSums()
missing_values
##                           patient_id                              country 
##                                    0                                    0 
##                                  age                             facility 
##                                    0                                    0 
##                            diagnosis                                  sex 
##                                    0                                    0 
##                               county                    enrollment_status 
##                                    0                                    0 
##                       enrollment_bmi                           latest_bmi 
##                                    0                                   18 
##                    baseline_systolic                   baseline_diastolic 
##                                    0                                    0 
##              baseline_control_status                       baseline_grade 
##                                    0                                    0 
##                      enrollment_date             baseline_assessment_date 
##                                    0                                    0 
##                          anchor_date                    followup_systolic 
##                                    0                                43617 
##                   followup_diastolic              followup_control_status 
##                                43617                                43617 
##                       followup_grade                        followup_date 
##                                43617                                43617 
##   days_between_assessment_and_anchor days_between_assessment_and_baseline 
##                                43617                                43617 
##          number_followup_assessments                number_followup_by_hs 
##                                43617                                43617 
##               number_medical_reviews             controlled_followup_days 
##                                43617                                29403 
##        number_uncontrolled_followups         number_community_assessments 
##                                24437                                43617 
##          number_facility_assessments            number_feeder_assessments 
##                                43617                                43617 
##                   change_in_systolic                  change_in_diastolic 
##                                43617                                43617 
##                       filter_col_str                             diabetes 
##                                    0                                    0

6 Check the summary of the dataset

# Per-column summary of the whole data set
summary(object = data_12mths)
##    patient_id       country               age           facility        
##  Min.   :     1   Length:61041       Min.   :  1.00   Length:61041      
##  1st Qu.: 42991   Class :character   1st Qu.: 53.00   Class :character  
##  Median :329626   Mode  :character   Median : 63.00   Mode  :character  
##  Mean   :255672                      Mean   : 62.32                     
##  3rd Qu.:435102                      3rd Qu.: 72.00                     
##  Max.   :594198                      Max.   :702.00                     
##                                                                         
##   diagnosis             sex               county          enrollment_status 
##  Length:61041       Length:61041       Length:61041       Length:61041      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  enrollment_bmi       latest_bmi      baseline_systolic baseline_diastolic
##  Length:61041       Min.   :   3.12   Min.   : 54.0     Min.   : 34.00    
##  Class :character   1st Qu.:  23.07   1st Qu.:127.0     1st Qu.: 75.00    
##  Mode  :character   Median :  26.04   Median :140.0     Median : 82.00    
##                     Mean   :  29.31   Mean   :142.2     Mean   : 83.24    
##                     3rd Qu.:  29.95   3rd Qu.:155.0     3rd Qu.: 91.00    
##                     Max.   :2110.71   Max.   :288.0     Max.   :234.00    
##                     NA's   :18                                            
##  baseline_control_status baseline_grade   enrollment_date     
##  Length:61041            Min.   :0.0000   Min.   :2018-04-18  
##  Class :character        1st Qu.:0.0000   1st Qu.:2021-07-09  
##  Mode  :character        Median :1.0000   Median :2022-04-08  
##                          Mean   :0.8605   Mean   :2022-03-01  
##                          3rd Qu.:1.0000   3rd Qu.:2022-12-02  
##                          Max.   :3.0000   Max.   :2024-07-29  
##                                                               
##  baseline_assessment_date  anchor_date         followup_systolic
##  Min.   :2018-04-18       Min.   :2019-04-18   Min.   : 70.0    
##  1st Qu.:2021-12-01       1st Qu.:2022-12-01   1st Qu.:124.0    
##  Median :2022-09-02       Median :2023-09-02   Median :134.0    
##  Mean   :2022-06-26       Mean   :2023-06-27   Mean   :136.3    
##  3rd Qu.:2023-03-23       3rd Qu.:2024-03-23   3rd Qu.:146.0    
##  Max.   :2024-07-29       Max.   :2025-07-29   Max.   :240.0    
##                                                NA's   :43617    
##  followup_diastolic followup_control_status followup_grade 
##  Min.   : 30.00     Length:61041            Min.   :0.0    
##  1st Qu.: 73.00     Class :character        1st Qu.:0.0    
##  Median : 80.00     Mode  :character        Median :0.0    
##  Mean   : 80.49                             Mean   :0.6    
##  3rd Qu.: 87.00                             3rd Qu.:1.0    
##  Max.   :153.00                             Max.   :3.0    
##  NA's   :43617                              NA's   :43617  
##  followup_date        days_between_assessment_and_anchor
##  Min.   :2019-03-18   Min.   : 0.00                     
##  1st Qu.:2023-02-21   1st Qu.:10.00                     
##  Median :2023-07-19   Median :23.00                     
##  Mean   :2023-05-31   Mean   :25.66                     
##  3rd Qu.:2023-12-05   3rd Qu.:40.00                     
##  Max.   :2024-07-29   Max.   :62.00                     
##  NA's   :43617        NA's   :43617                     
##  days_between_assessment_and_baseline number_followup_assessments
##  Min.   :303.0                        Min.   : 0.0               
##  1st Qu.:340.0                        1st Qu.: 1.0               
##  Median :364.0                        Median : 2.0               
##  Mean   :363.8                        Mean   : 2.9               
##  3rd Qu.:386.0                        3rd Qu.: 4.0               
##  Max.   :427.0                        Max.   :32.0               
##  NA's   :43617                        NA's   :43617              
##  number_followup_by_hs number_medical_reviews controlled_followup_days
##  Min.   : 0.00         Min.   :0.00           Min.   : -1.00          
##  1st Qu.: 0.00         1st Qu.:0.00           1st Qu.: -1.00          
##  Median : 1.00         Median :0.00           Median : -1.00          
##  Mean   : 1.31         Mean   :0.04           Mean   : 54.28          
##  3rd Qu.: 2.00         3rd Qu.:0.00           3rd Qu.: -1.00          
##  Max.   :28.00         Max.   :4.00           Max.   :427.00          
##  NA's   :43617         NA's   :43617          NA's   :29403           
##  number_uncontrolled_followups number_community_assessments
##  Min.   :-1.00                 Min.   : 0                  
##  1st Qu.:-1.00                 1st Qu.: 0                  
##  Median :-1.00                 Median : 0                  
##  Mean   :-0.29                 Mean   : 1                  
##  3rd Qu.: 0.00                 3rd Qu.: 1                  
##  Max.   :18.00                 Max.   :21                  
##  NA's   :24437                 NA's   :43617               
##  number_facility_assessments number_feeder_assessments change_in_systolic
##  Min.   : 0.00               Min.   : 0.00             Min.   :-146.00   
##  1st Qu.: 0.00               1st Qu.: 0.00             1st Qu.: -20.00   
##  Median : 1.00               Median : 0.00             Median :  -5.00   
##  Mean   : 1.37               Mean   : 0.06             Mean   :  -6.09   
##  3rd Qu.: 2.00               3rd Qu.: 0.00             3rd Qu.:   9.00   
##  Max.   :19.00               Max.   :26.00             Max.   : 109.00   
##  NA's   :43617               NA's   :43617             NA's   :43617     
##  change_in_diastolic filter_col_str       diabetes        
##  Min.   :-88.00      Length:61041       Length:61041      
##  1st Qu.:-11.00      Class :character   Class :character  
##  Median : -2.00      Mode  :character   Mode  :character  
##  Mean   : -2.49                                           
##  3rd Qu.:  6.00                                           
##  Max.   : 75.00                                           
##  NA's   :43617

7 Creating summary statistics for age

# Summary statistics for age: mean, median, SD, minimum and maximum.
# `across()` applies each named function to `age`; `.names` yields the
# columns mean_age, median_age, sd_age, min_age and max_age, in that order.
age_summary <- data_12mths %>%
  summarise(
    across(
      age,
      list(
        mean = function(x) mean(x, na.rm = TRUE),
        median = function(x) median(x, na.rm = TRUE),
        sd = function(x) sd(x, na.rm = TRUE),
        min = function(x) min(x, na.rm = TRUE),
        max = function(x) max(x, na.rm = TRUE)
      ),
      .names = "{.fn}_{.col}"
    )
  )
# Show the one-row summary
age_summary
## # A tibble: 1 × 5
##   mean_age median_age sd_age min_age max_age
##      <dbl>      <dbl>  <dbl>   <dbl>   <dbl>
## 1     62.3         63   14.1       1     702

8 Plotting the distribution of age

# Histogram of age using 5-year-wide bins
ggplot(data = data_12mths, mapping = aes(x = age)) +
  geom_histogram(alpha = 0.7, color = "black", fill = "blue", binwidth = 5) +
  ggtitle("Distribution of Age") +
  xlab("Age") +
  ylab("Frequency") +
  theme_minimal()

# Box plot of age
ggplot(data = data_12mths, mapping = aes(y = age)) +
  geom_boxplot(alpha = 0.7, color = "black", fill = "blue") +
  ggtitle("Box Plot of Age") +
  ylab("Age") +
  theme_minimal()

9 Excluding all patients aged above 100

# Keep only rows with age of 100 or below (rows where the condition is NA are
# dropped by `filter()` as well)
data_12mths <- filter(data_12mths, age <= 100)
# Box plot of age after the exclusion
ggplot(data = data_12mths, mapping = aes(y = age)) +
  geom_boxplot(alpha = 0.7, color = "black", fill = "blue") +
  ggtitle("Box Plot of Age (Excluding Patients Aged Above 100)") +
  ylab("Age") +
  theme_minimal()

# Creating summary statistics for age after excluding patients aged above 100

# Recompute the age summary (mean, median, SD, min, max) on the filtered data
age_summary <- summarise(
  data_12mths,
  mean_age = mean(age, na.rm = TRUE),
  median_age = median(age, na.rm = TRUE),
  sd_age = sd(age, na.rm = TRUE),
  min_age = min(age, na.rm = TRUE),
  max_age = max(age, na.rm = TRUE)
)
# Show the one-row summary
age_summary
## # A tibble: 1 × 5
##   mean_age median_age sd_age min_age max_age
##      <dbl>      <dbl>  <dbl>   <dbl>   <dbl>
## 1     62.3         63   13.8       1     100

10 Creating five-year age groups

# Creating five year age groups and name the categories
# `age_group` is built with `cut()` so it is a factor whose levels run in age
# order ("<20", "20-24", ..., "80-84", "85+"). A plain character column sorts
# alphabetically, which places "<20" after "85+" in grouped tables and on plot
# axes. Intervals are left-closed (`right = FALSE`): [20, 25) -> "20-24", and
# so on; everything below 20 is "<20" and everything from 85 up is "85+".
# A missing age yields a missing age group rather than being counted as "85+".
age_group_breaks <- c(-Inf, seq(20, 85, by = 5), Inf)
age_group_labels <- c(
  "<20",
  paste(seq(20, 80, by = 5), seq(24, 84, by = 5), sep = "-"),
  "85+"
)
data_12mths <- data_12mths %>%
  mutate(age_group = cut(
    age,
    breaks = age_group_breaks,
    labels = age_group_labels,
    right = FALSE
  ))


# Preview the first six rows now that `age_group` has been added
head(data_12mths, n = 6L)
## # A tibble: 6 × 37
##   patient_id country   age facility     diagnosis sex   county enrollment_status
##        <dbl> <chr>   <dbl> <chr>        <chr>     <chr> <chr>  <chr>            
## 1          1 Kenya      65 Matiliku su… "{Hypert… Fema… Makue… Direct Enrollment
## 2          5 Kenya      42 Tawa Sub-co… "{Hypert… Fema… Makue… Direct Enrollment
## 3          6 Kenya      48 Kakamega Co… "{Hypert… Fema… Kakam… Direct Enrollment
## 4          9 Kenya      43 Kibwezi Sub… "{Hypert… Fema… Makue… Direct Enrollment
## 5         11 Kenya      69 Kibwezi Sub… "{Hypert… Male  Makue… Direct Enrollment
## 6         15 Kenya      65 Kilungu sub… "{Hypert… Fema… Makue… Direct Enrollment
## # ℹ 29 more variables: enrollment_bmi <chr>, latest_bmi <dbl>,
## #   baseline_systolic <dbl>, baseline_diastolic <dbl>,
## #   baseline_control_status <chr>, baseline_grade <dbl>,
## #   enrollment_date <date>, baseline_assessment_date <date>,
## #   anchor_date <date>, followup_systolic <dbl>, followup_diastolic <dbl>,
## #   followup_control_status <chr>, followup_grade <dbl>, followup_date <date>,
## #   days_between_assessment_and_anchor <dbl>, …

11 Display the missing observations by variable

# Display the missing observations by variable
# Count NAs per column. `vapply()` pins the result to one integer per column,
# whereas `sapply()` can silently change its return type.
na_counts <- vapply(data_12mths, function(x) sum(is.na(x)), integer(1))
# Keep only the columns that actually have missing values
na_counts <- na_counts[na_counts > 0]
# Build the table without row names: carrying the column names over as row
# names makes every variable name print twice (once as the row label and once
# in the `variable` column).
missing_values <- data.frame(
  variable = names(na_counts),
  missing = unname(na_counts),
  row.names = NULL
)
# Display the missing values
missing_values
##                                                                  variable
## latest_bmi                                                     latest_bmi
## followup_systolic                                       followup_systolic
## followup_diastolic                                     followup_diastolic
## followup_control_status                           followup_control_status
## followup_grade                                             followup_grade
## followup_date                                               followup_date
## days_between_assessment_and_anchor     days_between_assessment_and_anchor
## days_between_assessment_and_baseline days_between_assessment_and_baseline
## number_followup_assessments                   number_followup_assessments
## number_followup_by_hs                               number_followup_by_hs
## number_medical_reviews                             number_medical_reviews
## controlled_followup_days                         controlled_followup_days
## number_uncontrolled_followups               number_uncontrolled_followups
## number_community_assessments                 number_community_assessments
## number_facility_assessments                   number_facility_assessments
## number_feeder_assessments                       number_feeder_assessments
## change_in_systolic                                     change_in_systolic
## change_in_diastolic                                   change_in_diastolic
##                                      missing
## latest_bmi                                17
## followup_systolic                      43559
## followup_diastolic                     43559
## followup_control_status                43559
## followup_grade                         43559
## followup_date                          43559
## days_between_assessment_and_anchor     43559
## days_between_assessment_and_baseline   43559
## number_followup_assessments            43559
## number_followup_by_hs                  43559
## number_medical_reviews                 43559
## controlled_followup_days               29371
## number_uncontrolled_followups          24409
## number_community_assessments           43559
## number_facility_assessments            43559
## number_feeder_assessments              43559
## change_in_systolic                     43559
## change_in_diastolic                    43559
# Display the missing values as a table
# Render as a striped HTML table that does not stretch to the full page width.
# `FALSE` is spelled out because the shorthand `F` is an ordinary variable that
# can be reassigned.
missing_values_table <- kable(
  missing_values,
  format = "html",
  caption = "Missing Values by Variable"
) %>%
  kable_styling("striped", full_width = FALSE)
# Display the table
missing_values_table
Missing Values by Variable
variable missing
latest_bmi latest_bmi 17
followup_systolic followup_systolic 43559
followup_diastolic followup_diastolic 43559
followup_control_status followup_control_status 43559
followup_grade followup_grade 43559
followup_date followup_date 43559
days_between_assessment_and_anchor days_between_assessment_and_anchor 43559
days_between_assessment_and_baseline days_between_assessment_and_baseline 43559
number_followup_assessments number_followup_assessments 43559
number_followup_by_hs number_followup_by_hs 43559
number_medical_reviews number_medical_reviews 43559
controlled_followup_days controlled_followup_days 29371
number_uncontrolled_followups number_uncontrolled_followups 24409
number_community_assessments number_community_assessments 43559
number_facility_assessments number_facility_assessments 43559
number_feeder_assessments number_feeder_assessments 43559
change_in_systolic change_in_systolic 43559
change_in_diastolic change_in_diastolic 43559

12 Summarize the systolic blood pressure of patients

# Frequency of each distinct baseline systolic reading, most common first.
# `count()` groups by the value, tallies the rows into a column named `count`
# and sorts in descending order of that tally.
systolic_bp <- data_12mths %>%
  count(baseline_systolic, name = "count", sort = TRUE)
# Show the frequency table
systolic_bp
## # A tibble: 190 × 2
##    baseline_systolic count
##                <dbl> <int>
##  1               135  1357
##  2               136  1308
##  3               130  1272
##  4               138  1270
##  5               133  1249
##  6               140  1244
##  7               142  1224
##  8               132  1203
##  9               134  1202
## 10               137  1198
## # ℹ 180 more rows
# Display the summary of systolic blood pressure as a table
# Striped HTML table, not stretched to full width. `FALSE` is spelled out
# because the shorthand `F` is an ordinary variable that can be reassigned.
systolic_bp_table <- kable(
  systolic_bp,
  format = "html",
  caption = "Summary of Systolic Blood Pressure"
) %>%
  kable_styling("striped", full_width = FALSE)
# Display the table (left disabled, as in the original)
# systolic_bp_table

13 Generate summary statistics for the systolic blood pressure

# Overall baseline systolic summary: mean, median, SD, minimum and maximum
systolic_bp_summary <- summarise(
  data_12mths,
  mean = mean(baseline_systolic, na.rm = TRUE),
  median = median(baseline_systolic, na.rm = TRUE),
  sd = sd(baseline_systolic, na.rm = TRUE),
  min = min(baseline_systolic, na.rm = TRUE),
  max = max(baseline_systolic, na.rm = TRUE)
)
# Show the one-row summary
systolic_bp_summary
## # A tibble: 1 × 5
##    mean median    sd   min   max
##   <dbl>  <dbl> <dbl> <dbl> <dbl>
## 1  142.    140  21.7    54   288
# Display the summary statistics for the systolic blood pressure as a table
# Striped HTML table, not stretched to full width. `FALSE` is spelled out
# because the shorthand `F` is an ordinary variable that can be reassigned.
systolic_bp_summary_table <- kable(
  systolic_bp_summary,
  format = "html",
  caption = "Summary Statistics for Systolic Blood Pressure-All Patients"
) %>%
  kable_styling("striped", full_width = FALSE)
# Display the table
systolic_bp_summary_table
Summary Statistics for Systolic Blood Pressure-All Patients
mean median sd min max
142.1662 140 21.67617 54 288

14 Plot the distribution of systolic blood pressure

# Histogram of baseline systolic blood pressure using bins of width 5
ggplot(data = data_12mths, mapping = aes(x = baseline_systolic)) +
  geom_histogram(alpha = 0.7, color = "black", fill = "blue", binwidth = 5) +
  ggtitle("Distribution of Systolic Blood Pressure") +
  xlab("Systolic Blood Pressure") +
  ylab("Frequency") +
  theme_minimal()

# Box plot of baseline systolic blood pressure, used to spot outlying readings
ggplot(data = data_12mths, mapping = aes(y = baseline_systolic)) +
  geom_boxplot(alpha = 0.7, color = "black", fill = "blue") +
  ggtitle("Box Plot of Systolic Blood Pressure") +
  ylab("Systolic Blood Pressure") +
  theme_minimal()

# Summarise the systolic blood pressure of patients by age group, including patient counts

# Baseline systolic summary per age group, with the number of rows per group.
# `across()` applies each named function to `baseline_systolic`; `.names`
# yields the columns mean, median, sd, min and max after `count`.
systolic_bp_medtronic_baseline <- data_12mths %>%
  group_by(age_group) %>%
  summarise(
    count = n(),
    across(
      baseline_systolic,
      list(
        mean = function(x) mean(x, na.rm = TRUE),
        median = function(x) median(x, na.rm = TRUE),
        sd = function(x) sd(x, na.rm = TRUE),
        min = function(x) min(x, na.rm = TRUE),
        max = function(x) max(x, na.rm = TRUE)
      ),
      .names = "{.fn}"
    )
  )

# Show the per-group summary
systolic_bp_medtronic_baseline
## # A tibble: 15 × 7
##    age_group count  mean median    sd   min   max
##    <chr>     <int> <dbl>  <dbl> <dbl> <dbl> <dbl>
##  1 20-24       175  136.    136  20.4    90   232
##  2 25-29       443  137.    135  20.1    69   216
##  3 30-34       942  139.    137  20.0    86   240
##  4 35-39      1797  138.    136  20.5    80   250
##  5 40-44      2953  139.    137  20.6    69   252
##  6 45-49      4314  140.    138  20.2    65   246
##  7 50-54      6577  141.    138  20.9    74   260
##  8 55-59      6580  141.    139  21.0    80   288
##  9 60-64      8761  142.    140  21.7    54   285
## 10 65-69      8120  143.    141  21.5    70   239
## 11 70-74      9065  144.    142  22.0    76   269
## 12 75-79      4953  145.    142  22.3    76   257
## 13 80-84      3466  145.    143  23.0    76   263
## 14 85+        2695  145.    143  24.0    74   231
## 15 <20         127  131.    130  24.3    71   215
# Write the per-age-group summary to a CSV file in the working directory,
# without a row-name column
write.csv(
  x = systolic_bp_medtronic_baseline,
  file = "systolic_bp_medtronic_baseline.csv",
  row.names = FALSE
)
# Display the summary of systolic blood pressure by age group as a table
# Striped HTML table, not stretched to full width. `FALSE` is spelled out
# because the shorthand `F` is an ordinary variable that can be reassigned.
systolic_bp_medtronic_baseline_table <- kable(
  systolic_bp_medtronic_baseline,
  format = "html",
  caption = "Summary of Systolic Blood Pressure by Age Group"
) %>%
  kable_styling("striped", full_width = FALSE)

# Display the table
systolic_bp_medtronic_baseline_table
Summary of Systolic Blood Pressure by Age Group
age_group count mean median sd min max
20-24 175 136.0457 136 20.43375 90 232
25-29 443 137.1919 135 20.14746 69 216
30-34 942 138.8270 137 20.04360 86 240
35-39 1797 138.2387 136 20.47420 80 250
40-44 2953 139.2167 137 20.63008 69 252
45-49 4314 139.6354 138 20.24455 65 246
50-54 6577 140.5965 138 20.86798 74 260
55-59 6580 140.9512 139 21.04281 80 288
60-64 8761 142.0468 140 21.73070 54 285
65-69 8120 142.9201 141 21.51000 70 239
70-74 9065 144.0042 142 21.96746 76 269
75-79 4953 144.5750 142 22.27816 76 257
80-84 3466 145.2822 143 23.01210 76 263
85+ 2695 145.2935 143 23.99778 74 231
<20 127 130.5591 130 24.29568 71 215

15 Plot the systolic blood pressure by age group

# Bar chart of the mean baseline systolic value per age group; `geom_col()`
# draws bar heights directly from the pre-computed `mean` column
ggplot(
  data = systolic_bp_medtronic_baseline,
  mapping = aes(x = age_group, y = mean)
) +
  geom_col(alpha = 0.7, color = "black", fill = "blue") +
  ggtitle("Mean Systolic Blood Pressure by Age Group") +
  xlab("Age Group") +
  ylab("Mean Systolic Blood Pressure") +
  theme_minimal()

# Box plots of baseline systolic blood pressure, one box per age group
ggplot(
  data = data_12mths,
  mapping = aes(x = age_group, y = baseline_systolic)
) +
  geom_boxplot(alpha = 0.7, color = "black", fill = "blue") +
  ggtitle("Box Plot of Systolic Blood Pressure by Age Group") +
  xlab("Age Group") +
  ylab("Systolic Blood Pressure") +
  theme_minimal()

16 Create subsets of patients from the dataset, selecting only ID, baseline SBP and age

# Create subsets of patients from the dataset, keeping only the patient id,
# baseline SBP (renamed to `systolic`), age and age group.
#
# The four subsets share the same column selection, labelling and export
# steps, so those steps live in one helper instead of four copies.

# Build one subset and save it.
#   data          : data frame with patient_id, baseline_systolic, age and
#                   age_group columns (already filtered to the rows wanted).
#   dataset_label : single string stored in the `dataset` column and used as
#                   the CSV file name ("<dataset_label>.csv", written to the
#                   working directory without row names). `data` is expected
#                   not to contain a column called `dataset_label`.
# Returns the subset that was written.
make_sbp_subset <- function(data, dataset_label) {
  subset_tbl <- data %>%
    select(patient_id, systolic = baseline_systolic, age, age_group) %>%
    mutate(dataset = dataset_label)
  write.csv(subset_tbl, paste0(dataset_label, ".csv"), row.names = FALSE)
  subset_tbl
}

# All patients
medtronic_subset_all <- make_sbp_subset(data_12mths, "medtronic_subset_all")

# Patients enrolled from screening
medtronic_subset_screening <- data_12mths %>%
  filter(enrollment_status == "From Screening") %>%
  make_sbp_subset("medtronic_subset_screening")

# Patients enrolled directly (on care)
medtronic_subset_direct <- data_12mths %>%
  filter(enrollment_status == "Direct Enrollment") %>%
  make_sbp_subset("medtronic_subset_direct")

# Patients with diabetes
medtronic_subset_diabetes <- data_12mths %>%
  filter(diabetes == "Yes") %>%
  make_sbp_subset("medtronic_subset_diabetes")