Setup

Load libraries

library(tidyverse, quietly = TRUE)
#library(tidymodels, quietly = TRUE)
library(psych, quietly = TRUE)
library(DataExplorer, quietly = TRUE)
library(flextable, quietly = TRUE)

Load data

getwd()

## [1] "C:/Users/tsapa/Downloads"

# Folder that holds frmgham2.csv on the original author's machine.
data_dir <- "C:/Users/tsapa/Downloads"

# Only switch directories when that folder exists. An unconditional setwd()
# aborts the script with "cannot change working directory" on any other
# machine; when the folder is absent the file is looked up in the current
# working directory instead.
if (dir.exists(data_dir)) {
  setwd(data_dir)
}

# Read in the dataset, failing early with a clear message if it is missing
input_file <- "frmgham2.csv"
if (!file.exists(input_file)) {
  stop("Cannot find '", input_file, "' in ", getwd(), call. = FALSE)
}
df_frmgham2 <- read.csv(input_file)

# Display the structure of the raw data (column names, types, first values)
str(df_frmgham2)

## 'data.frame':    11627 obs. of  39 variables:
##  $ RANDID  : int  2448 2448 6238 6238 6238 9428 9428 10552 10552 11252 ...
##  $ SEX     : int  1 1 2 2 2 1 1 2 2 2 ...
##  $ TOTCHOL : int  195 209 250 260 237 245 283 225 232 285 ...
##  $ AGE     : int  39 52 46 52 58 48 54 61 67 46 ...
##  $ SYSBP   : num  106 121 121 105 108 ...
##  $ DIABP   : num  70 66 81 69.5 66 80 89 95 109 84 ...
##  $ CURSMOKE: int  0 0 0 0 0 1 1 1 1 1 ...
##  $ CIGPDAY : int  0 0 0 0 0 20 30 30 20 23 ...
##  $ BMI     : num  27 NA 28.7 29.4 28.5 ...
##  $ DIABETES: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ BPMEDS  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HEARTRTE: int  80 69 95 80 80 75 75 65 60 85 ...
##  $ GLUCOSE : int  77 92 76 86 71 70 87 103 89 85 ...
##  $ educ    : int  4 4 2 2 2 1 1 3 3 3 ...
##  $ PREVCHD : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PREVAP  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PREVMI  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PREVSTRK: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PREVHYP : int  0 0 0 0 0 0 0 1 1 0 ...
##  $ TIME    : int  0 4628 0 2156 4344 0 2199 0 1977 0 ...
##  $ PERIOD  : int  1 3 1 2 3 1 2 1 2 1 ...
##  $ HDLC    : int  NA 31 NA NA 54 NA NA NA NA NA ...
##  $ LDLC    : int  NA 178 NA NA 141 NA NA NA NA NA ...
##  $ DEATH   : int  0 0 0 0 0 0 0 1 1 0 ...
##  $ ANGINA  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HOSPMI  : int  1 1 0 0 0 0 0 0 0 0 ...
##  $ MI_FCHD : int  1 1 0 0 0 0 0 0 0 0 ...
##  $ ANYCHD  : int  1 1 0 0 0 0 0 0 0 0 ...
##  $ STROKE  : int  0 0 0 0 0 0 0 1 1 0 ...
##  $ CVD     : int  1 1 0 0 0 0 0 1 1 0 ...
##  $ HYPERTEN: int  0 0 0 0 0 0 0 1 1 1 ...
##  $ TIMEAP  : int  8766 8766 8766 8766 8766 8766 8766 2956 2956 8766 ...
##  $ TIMEMI  : int  6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
##  $ TIMEMIFC: int  6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
##  $ TIMECHD : int  6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
##  $ TIMESTRK: int  8766 8766 8766 8766 8766 8766 8766 2089 2089 8766 ...
##  $ TIMECVD : int  6438 6438 8766 8766 8766 8766 8766 2089 2089 8766 ...
##  $ TIMEDTH : int  8766 8766 8766 8766 8766 8766 8766 2956 2956 8766 ...
##  $ TIMEHYP : int  8766 8766 8766 8766 8766 8766 8766 0 0 4285 ...

# Plot the missing-value profile of every column in the raw data
DataExplorer::plot_missing(df_frmgham2)

# Descriptive statistics for every column except the RANDID identifier.
# The trimmed mean, MAD, skew and kurtosis columns of describe() are left out.
df_frmgham2 %>%
  dplyr::select(-RANDID) %>%
  psych::describe() %>%
  dplyr::select(-trimmed, -mad, -skew, -kurtosis)

##          vars     n    mean      sd  median    min    max   range    se
## SEX         1 11627    1.57    0.50    2.00   1.00    2.0    1.00  0.00
## TOTCHOL     2 11218  241.16   45.37  238.00 107.00  696.0  589.00  0.43
## AGE         3 11627   54.79    9.56   54.00  32.00   81.0   49.00  0.09
## SYSBP       4 11627  136.32   22.80  132.00  83.50  295.0  211.50  0.21
## DIABP       5 11627   83.04   11.66   82.00  30.00  150.0  120.00  0.11
## CURSMOKE    6 11627    0.43    0.50    0.00   0.00    1.0    1.00  0.00
## CIGPDAY     7 11548    8.25   12.19    0.00   0.00   90.0   90.00  0.11
## BMI         8 11575   25.88    4.10   25.48  14.43   56.8   42.37  0.04
## DIABETES    9 11627    0.05    0.21    0.00   0.00    1.0    1.00  0.00
## BPMEDS     10 11034    0.09    0.28    0.00   0.00    1.0    1.00  0.00
## HEARTRTE   11 11621   76.78   12.46   75.00  37.00  220.0  183.00  0.12
## GLUCOSE    12 10187   84.12   24.99   80.00  39.00  478.0  439.00  0.25
## educ       13 11332    1.99    1.03    2.00   1.00    4.0    3.00  0.01
## PREVCHD    14 11627    0.07    0.26    0.00   0.00    1.0    1.00  0.00
## PREVAP     15 11627    0.05    0.23    0.00   0.00    1.0    1.00  0.00
## PREVMI     16 11627    0.03    0.18    0.00   0.00    1.0    1.00  0.00
## PREVSTRK   17 11627    0.01    0.11    0.00   0.00    1.0    1.00  0.00
## PREVHYP    18 11627    0.46    0.50    0.00   0.00    1.0    1.00  0.00
## TIME       19 11627 1957.02 1758.78 2156.00   0.00 4854.0 4854.00 16.31
## PERIOD     20 11627    1.90    0.81    2.00   1.00    3.0    2.00  0.01
## HDLC       21  3027   49.36   15.63   48.00  10.00  189.0  179.00  0.28
## LDLC       22  3026  176.47   46.86  173.00  20.00  565.0  545.00  0.85
## DEATH      23 11627    0.30    0.46    0.00   0.00    1.0    1.00  0.00
## ANGINA     24 11627    0.16    0.37    0.00   0.00    1.0    1.00  0.00
## HOSPMI     25 11627    0.10    0.30    0.00   0.00    1.0    1.00  0.00
## MI_FCHD    26 11627    0.15    0.36    0.00   0.00    1.0    1.00  0.00
## ANYCHD     27 11627    0.27    0.44    0.00   0.00    1.0    1.00  0.00
## STROKE     28 11627    0.09    0.29    0.00   0.00    1.0    1.00  0.00
## CVD        29 11627    0.25    0.43    0.00   0.00    1.0    1.00  0.00
## HYPERTEN   30 11627    0.74    0.44    1.00   0.00    1.0    1.00  0.00
## TIMEAP     31 11627 7241.56 2477.78 8766.00   0.00 8766.0 8766.00 22.98
## TIMEMI     32 11627 7593.85 2136.73 8766.00   0.00 8766.0 8766.00 19.82
## TIMEMIFC   33 11627 7543.04 2192.12 8766.00   0.00 8766.0 8766.00 20.33
## TIMECHD    34 11627 7008.15 2641.34 8766.00   0.00 8766.0 8766.00 24.50
## TIMESTRK   35 11627 7660.88 2011.08 8766.00   0.00 8766.0 8766.00 18.65
## TIMECVD    36 11627 7166.08 2541.67 8766.00   0.00 8766.0 8766.00 23.57
## TIMEDTH    37 11627 7854.10 1788.37 8766.00  26.00 8766.0 8740.00 16.59
## TIMEHYP    38 11627 3598.96 3464.16 2429.00   0.00 8766.0 8766.00 32.13

# Number of distinct RANDID values (rows sharing a RANDID are counted once)
unique_randids <- dplyr::n_distinct(df_frmgham2$RANDID)

print(unique_randids)

## [1] 4434

# Print the first six rows (the last six rows are printed further below)
head(df_frmgham2)

##   RANDID SEX TOTCHOL AGE SYSBP DIABP CURSMOKE CIGPDAY   BMI DIABETES BPMEDS
## 1   2448   1     195  39 106.0  70.0        0       0 26.97        0      0
## 2   2448   1     209  52 121.0  66.0        0       0    NA        0      0
## 3   6238   2     250  46 121.0  81.0        0       0 28.73        0      0
## 4   6238   2     260  52 105.0  69.5        0       0 29.43        0      0
## 5   6238   2     237  58 108.0  66.0        0       0 28.50        0      0
## 6   9428   1     245  48 127.5  80.0        1      20 25.34        0      0
##   HEARTRTE GLUCOSE educ PREVCHD PREVAP PREVMI PREVSTRK PREVHYP TIME PERIOD HDLC
## 1       80      77    4       0      0      0        0       0    0      1   NA
## 2       69      92    4       0      0      0        0       0 4628      3   31
## 3       95      76    2       0      0      0        0       0    0      1   NA
## 4       80      86    2       0      0      0        0       0 2156      2   NA
## 5       80      71    2       0      0      0        0       0 4344      3   54
## 6       75      70    1       0      0      0        0       0    0      1   NA
##   LDLC DEATH ANGINA HOSPMI MI_FCHD ANYCHD STROKE CVD HYPERTEN TIMEAP TIMEMI
## 1   NA     0      0      1       1      1      0   1        0   8766   6438
## 2  178     0      0      1       1      1      0   1        0   8766   6438
## 3   NA     0      0      0       0      0      0   0        0   8766   8766
## 4   NA     0      0      0       0      0      0   0        0   8766   8766
## 5  141     0      0      0       0      0      0   0        0   8766   8766
## 6   NA     0      0      0       0      0      0   0        0   8766   8766
##   TIMEMIFC TIMECHD TIMESTRK TIMECVD TIMEDTH TIMEHYP
## 1     6438    6438     8766    6438    8766    8766
## 2     6438    6438     8766    6438    8766    8766
## 3     8766    8766     8766    8766    8766    8766
## 4     8766    8766     8766    8766    8766    8766
## 5     8766    8766     8766    8766    8766    8766
## 6     8766    8766     8766    8766    8766    8766

# Print the last six rows of the raw data
tail(df_frmgham2)

##        RANDID SEX TOTCHOL AGE SYSBP DIABP CURSMOKE CIGPDAY   BMI DIABETES
## 11622 9998212   1     185  40   141    98        0       0 25.60        0
## 11623 9998212   1     173  46   126    82        0       0 19.17        0
## 11624 9998212   1     153  52   143    89        0       0 25.74        0
## 11625 9999312   2     196  39   133    86        1      30 20.91        0
## 11626 9999312   2     240  46   138    79        1      20 26.39        0
## 11627 9999312   2      NA  50   147    96        1      10 24.19        0
##       BPMEDS HEARTRTE GLUCOSE educ PREVCHD PREVAP PREVMI PREVSTRK PREVHYP TIME
## 11622      0       67      72    3       0      0      0        0       1    0
## 11623      0       70      NA    3       0      0      0        0       1 2333
## 11624      0       65      72    3       0      0      0        0       1 4538
## 11625      0       85      80    3       0      0      0        0       0    0
## 11626      0       90      83    3       0      0      0        0       0 2390
## 11627      0       94      NA    3       0      0      0        0       1 4201
##       PERIOD HDLC LDLC DEATH ANGINA HOSPMI MI_FCHD ANYCHD STROKE CVD HYPERTEN
## 11622      1   NA   NA     0      0      0       0      0      0   0        1
## 11623      2   NA   NA     0      0      0       0      0      0   0        1
## 11624      3   30  123     0      0      0       0      0      0   0        1
## 11625      1   NA   NA     0      0      0       0      0      0   0        1
## 11626      2   NA   NA     0      0      0       0      0      0   0        1
## 11627      3   NA   NA     0      0      0       0      0      0   0        1
##       TIMEAP TIMEMI TIMEMIFC TIMECHD TIMESTRK TIMECVD TIMEDTH TIMEHYP
## 11622   8766   8766     8766    8766     8766    8766    8766       0
## 11623   8766   8766     8766    8766     8766    8766    8766       0
## 11624   8766   8766     8766    8766     8766    8766    8766       0
## 11625   8766   8766     8766    8766     8766    8766    8766    4201
## 11626   8766   8766     8766    8766     8766    8766    8766    4201
## 11627   8766   8766     8766    8766     8766    8766    8766    4201

Data Cleaning Functions

# Flag values that lie inside the Tukey fences (1.5 * IQR beyond the quartiles).
#
# x: numeric vector; may contain NA.
# Returns a logical vector the same length as x: TRUE when the value lies in
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR], FALSE when it lies outside, and NA where x
# itself is NA (dplyr::filter() drops such rows).
is_not_outlier <- function(x) {
  # na.rm = TRUE: quantile() stops with an error on missing values otherwise.
  # names = FALSE keeps the "25%"/"75%" labels out of the result.
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)

  # Named iqr_width rather than IQR so stats::IQR() is not shadowed.
  iqr_width <- quartiles[2] - quartiles[1]
  lower_bound <- quartiles[1] - 1.5 * iqr_width
  upper_bound <- quartiles[2] + 1.5 * iqr_width

  x >= lower_bound & x <= upper_bound
}

# Build the analysis dataset from the recoded Framingham data frame.
#
# df: data frame containing LDLC, HDLC and the seven continuous measures
#     filtered below.
# Returns df without LDLC/HDLC, restricted to rows that have no missing value
# in any remaining column and whose continuous measures all pass
# is_not_outlier().
clean_data <- function(df) {
  # LDLC and HDLC go first: in the raw data only about 3,000 of 11,627 values
  # are present, so keeping them would make na.omit() discard most rows.
  without_lipids <- dplyr::select(df, -LDLC, -HDLC)

  # Complete cases only (na.omit() records the dropped rows in "na.action")
  complete_rows <- na.omit(without_lipids)

  # Keep a row only when every listed measure lies inside its IQR fences
  dplyr::filter(
    complete_rows,
    dplyr::if_all(
      c(TOTCHOL, AGE, SYSBP, DIABP, BMI, HEARTRTE, GLUCOSE),
      is_not_outlier
    )
  )
}

Data Transformation

# Recode the integer-coded categorical columns into labelled character columns
# and drop the coded originals. case_when() has no fallback branch, so any code
# other than the ones listed (including NA) becomes NA in the new column.
final_data <- df_frmgham2 %>%
  mutate(
    # Sex
    Gender = case_when(
      SEX == 1 ~ "Male",
      SEX == 2 ~ "Female"
    ),

    # Education (disabled). The data frame stores this column as `educ`, so the
    # EDUCATION references below would need renaming before re-enabling.
    # `Education Level` = case_when(
    #   EDUCATION == 1 ~ "0 - 11th Grade",
    #   EDUCATION == 2 ~ "High School Diploma/GED",
    #   EDUCATION == 3 ~ "Some College/Vocational School",
    #   EDUCATION == 4 ~ "College degree (BA, BS) or higher"
    # ),

    # Smoking
    Smoker = case_when(
      CURSMOKE == 0 ~ "Not current smoker",
      CURSMOKE == 1 ~ "Smoker"
    ),
    # Cigarettes per day (disabled)
    # `Cigarettes/per day` = case_when(
    #   CIGPDAY == 0 ~ "Not current smoker",
    #   CIGPDAY >= 1 ~ "Cigarettes Per Day"
    # ),

    # Diabetes
    Diabetic = case_when(
      DIABETES == 0 ~ "Non Diabetic",
      DIABETES == 1 ~ "Diabetic"
    ),

    # Previous conditions
    `PREVCHD - Coronary Disease` = case_when(
      PREVCHD == 0 ~ "Free of coronary disease",
      PREVCHD == 1 ~ "Prevalent coronary disease"
    ),
    `PREVAP - Angina Pectoris` = case_when(
      PREVAP == 0 ~ "Free of disease angina pectoris",
      PREVAP == 1 ~ "Prevalent disease angina pectoris"
    ),
    `PREVMI - Myocardial Infarction` = case_when(
      PREVMI == 0 ~ "Free of prevalent myocardial infarction",
      PREVMI == 1 ~ "Prevalent myocardial infarction"
    ),
    `PREVSTRK - Stroke History` = case_when(
      PREVSTRK == 0 ~ "No stroke history",
      PREVSTRK == 1 ~ "History of stroke"
    ),
    `PREVHYP - Hypertension` = case_when(
      PREVHYP == 0 ~ "No prevalent hypertension",
      PREVHYP == 1 ~ "Prevalent hypertension"
    )
  ) %>%
  # Remove the coded source columns that now have a labelled counterpart.
  # CIGPDAY and educ stay because their recodes above are disabled.
  dplyr::select(
    -SEX, -CURSMOKE, -DIABETES,
    -PREVCHD, -PREVAP, -PREVMI, -PREVSTRK, -PREVHYP
  )

Clean the Data

# Apply the cleaning function to the recoded data, then report rows x columns
df_clean <- clean_data(final_data)
dim(df_clean)

## [1] 8154   37

# Inspect the column types of the cleaned data
str(df_clean)

## 'data.frame':    8154 obs. of  37 variables:
##  $ RANDID                        : int  2448 6238 6238 6238 9428 9428 10552 10552 11252 11252 ...
##  $ TOTCHOL                       : int  195 250 260 237 245 283 225 232 285 343 ...
##  $ AGE                           : int  39 46 52 58 48 54 61 67 46 51 ...
##  $ SYSBP                         : num  106 121 105 108 128 ...
##  $ DIABP                         : num  70 81 69.5 66 80 89 95 109 84 77 ...
##  $ CIGPDAY                       : int  0 0 0 0 20 30 30 20 23 30 ...
##  $ BMI                           : num  27 28.7 29.4 28.5 25.3 ...
##  $ BPMEDS                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HEARTRTE                      : int  80 95 80 80 75 75 65 60 85 90 ...
##  $ GLUCOSE                       : int  77 76 86 71 70 87 103 89 85 72 ...
##  $ educ                          : int  4 2 2 2 1 1 3 3 3 3 ...
##  $ TIME                          : int  0 0 2156 4344 0 2199 0 1977 0 2072 ...
##  $ PERIOD                        : int  1 1 2 3 1 2 1 2 1 2 ...
##  $ DEATH                         : int  0 0 0 0 0 0 1 1 0 0 ...
##  $ ANGINA                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HOSPMI                        : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ MI_FCHD                       : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ ANYCHD                        : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ STROKE                        : int  0 0 0 0 0 0 1 1 0 0 ...
##  $ CVD                           : int  1 0 0 0 0 0 1 1 0 0 ...
##  $ HYPERTEN                      : int  0 0 0 0 0 0 1 1 1 1 ...
##  $ TIMEAP                        : int  8766 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
##  $ TIMEMI                        : int  6438 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
##  $ TIMEMIFC                      : int  6438 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
##  $ TIMECHD                       : int  6438 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
##  $ TIMESTRK                      : int  8766 8766 8766 8766 8766 8766 2089 2089 8766 8766 ...
##  $ TIMECVD                       : int  6438 8766 8766 8766 8766 8766 2089 2089 8766 8766 ...
##  $ TIMEDTH                       : int  8766 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
##  $ TIMEHYP                       : int  8766 8766 8766 8766 8766 8766 0 0 4285 4285 ...
##  $ Gender                        : chr  "Male" "Female" "Female" "Female" ...
##  $ Smoker                        : chr  "Not current smoker" "Not current smoker" "Not current smoker" "Not current smoker" ...
##  $ Diabetic                      : chr  "Non Diabetic" "Non Diabetic" "Non Diabetic" "Non Diabetic" ...
##  $ PREVCHD - Coronary Disease    : chr  "Free of coronary disease" "Free of coronary disease" "Free of coronary disease" "Free of coronary disease" ...
##  $ PREVAP - Angina Pectoris      : chr  "Free of disease angina pectoris" "Free of disease angina pectoris" "Free of disease angina pectoris" "Free of disease angina pectoris" ...
##  $ PREVMI - Myocardial Infarction: chr  "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" ...
##  $ PREVSTRK - Stroke History     : chr  "No stroke history" "No stroke history" "No stroke history" "No stroke history" ...
##  $ PREVHYP - Hypertension        : chr  "No prevalent hypertension" "No prevalent hypertension" "No prevalent hypertension" "No prevalent hypertension" ...
##  - attr(*, "na.action")= 'omit' Named int [1:2317] 2 12 19 28 29 36 41 53 55 66 ...
##   ..- attr(*, "names")= chr [1:2317] "2" "12" "19" "28" ...

Descriptive Statistics Table

Raw data

# Descriptive statistics for the continuous measures in the raw data.
# Every statistic, including N, is computed on the non-missing values of each
# variable.
df_frmgham2 %>%
  pivot_longer(
    cols = c(
      TOTCHOL, AGE, SYSBP, DIABP, BMI, HEARTRTE, HDLC, LDLC, GLUCOSE
    )
  ) %>%
  group_by(name) %>%
  summarise(
    # Count non-missing values. n() would report the full row count (11,627)
    # for every variable, hiding that e.g. HDLC and LDLC are mostly missing.
    N = sum(!is.na(value)),
    Mean = mean(value, na.rm = TRUE),
    SD = sd(value, na.rm = TRUE),
    Median = median(value, na.rm = TRUE),
    Min = min(value, na.rm = TRUE),
    Max = max(value, na.rm = TRUE),
    Q1 = quantile(value, probs = 0.25, na.rm = TRUE),
    Q3 = quantile(value, probs = 0.75, na.rm = TRUE),
    IQR = Q3 - Q1,
    # Tukey fences, the same rule is_not_outlier() applies during cleaning
    Lower_bound = Q1 - 1.5 * IQR,
    Upper_bound = Q3 + 1.5 * IQR
  ) %>%
  # Anonymous function instead of across(..., round, 2): passing extra
  # arguments through `...` is deprecated as of dplyr 1.1.0.
  mutate(across(where(is.numeric), \(x) round(x, 2))) %>%
  flextable() %>%
  autofit()

name	N	Mean	SD	Median	Min	Max	Q1	Q3	IQR	Lower_bound	Upper_bound
AGE	11,627	54.79	9.56	54.00	32.00	81.0	48.00	62.00	14.00	27.00	83.00
BMI	11,575	25.88	4.10	25.48	14.43	56.8	23.09	28.07	4.98	15.63	35.53
DIABP	11,627	83.04	11.66	82.00	30.00	150.0	75.00	90.00	15.00	52.50	112.50
GLUCOSE	10,187	84.12	24.99	80.00	39.00	478.0	72.00	89.00	17.00	46.50	114.50
HDLC	3,027	49.36	15.63	48.00	10.00	189.0	39.00	58.00	19.00	10.50	86.50
HEARTRTE	11,621	76.78	12.46	75.00	37.00	220.0	69.00	85.00	16.00	45.00	109.00
LDLC	3,026	176.47	46.86	173.00	20.00	565.0	145.00	205.00	60.00	55.00	295.00
SYSBP	11,627	136.32	22.80	132.00	83.50	295.0	120.00	149.00	29.00	76.50	192.50
TOTCHOL	11,218	241.16	45.37	238.00	107.00	696.0	210.00	268.00	58.00	123.00	355.00

Cleaned data

# Descriptive statistics for the continuous measures after cleaning.
# HDLC and LDLC are absent here because clean_data() drops them.
df_clean %>%
  pivot_longer(
    cols = c(TOTCHOL, AGE, SYSBP, DIABP, BMI, HEARTRTE, GLUCOSE)
  ) %>%
  group_by(name) %>%
  summarise(
    # Non-missing count; df_clean has no missing values after na.omit(), so
    # this equals the row count.
    N = sum(!is.na(value)),
    Mean = mean(value, na.rm = TRUE),
    SD = sd(value, na.rm = TRUE),
    Median = median(value, na.rm = TRUE),
    Min = min(value, na.rm = TRUE),
    Max = max(value, na.rm = TRUE),
    Q1 = quantile(value, probs = 0.25, na.rm = TRUE),
    Q3 = quantile(value, probs = 0.75, na.rm = TRUE),
    IQR = Q3 - Q1,
    # Fences recomputed on the cleaned data, so they are narrower than the
    # fences that were used to filter the raw data.
    Lower_bound = Q1 - 1.5 * IQR,
    Upper_bound = Q3 + 1.5 * IQR
  ) %>%
  # Anonymous function instead of across(..., round, 2): passing extra
  # arguments through `...` is deprecated as of dplyr 1.1.0.
  mutate(across(where(is.numeric), \(x) round(x, 2))) %>%
  flextable() %>%
  autofit()

name	N	Mean	SD	Median	Min	Max	Q1	Q3	IQR	Lower_bound	Upper_bound
AGE	8,154	53.80	9.41	53.00	32.00	81.00	47.00	61.00	14.00	26.00	82.00
BMI	8,154	25.46	3.45	25.26	15.92	35.33	23.05	27.72	4.67	16.05	34.72
DIABP	8,154	81.86	10.47	81.00	53.00	112.50	74.00	88.50	14.50	52.25	110.25
GLUCOSE	8,154	79.60	11.92	78.00	47.00	114.00	71.00	87.00	16.00	47.00	111.00
HEARTRTE	8,154	75.65	11.21	75.00	45.00	109.00	68.00	82.00	14.00	47.00	103.00
SYSBP	8,154	132.79	19.33	130.00	83.50	190.00	119.00	145.00	26.00	80.00	184.00
TOTCHOL	8,154	239.26	41.40	238.00	124.00	356.00	210.00	266.00	56.00	126.00	350.00

Write cleaned data

# Save the cleaned dataset in the working directory. write.csv() keeps its
# default row.names = TRUE, so the file starts with a column of row labels.
write.csv(df_clean, file = "FHS_cleaned.csv")

# Show the structure of the data that was just written
str(df_clean)

## 'data.frame':    8154 obs. of  37 variables:
##  $ RANDID                        : int  2448 6238 6238 6238 9428 9428 10552 10552 11252 11252 ...
##  $ TOTCHOL                       : int  195 250 260 237 245 283 225 232 285 343 ...
##  $ AGE                           : int  39 46 52 58 48 54 61 67 46 51 ...
##  $ SYSBP                         : num  106 121 105 108 128 ...
##  $ DIABP                         : num  70 81 69.5 66 80 89 95 109 84 77 ...
##  $ CIGPDAY                       : int  0 0 0 0 20 30 30 20 23 30 ...
##  $ BMI                           : num  27 28.7 29.4 28.5 25.3 ...
##  $ BPMEDS                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HEARTRTE                      : int  80 95 80 80 75 75 65 60 85 90 ...
##  $ GLUCOSE                       : int  77 76 86 71 70 87 103 89 85 72 ...
##  $ educ                          : int  4 2 2 2 1 1 3 3 3 3 ...
##  $ TIME                          : int  0 0 2156 4344 0 2199 0 1977 0 2072 ...
##  $ PERIOD                        : int  1 1 2 3 1 2 1 2 1 2 ...
##  $ DEATH                         : int  0 0 0 0 0 0 1 1 0 0 ...
##  $ ANGINA                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HOSPMI                        : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ MI_FCHD                       : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ ANYCHD                        : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ STROKE                        : int  0 0 0 0 0 0 1 1 0 0 ...
##  $ CVD                           : int  1 0 0 0 0 0 1 1 0 0 ...
##  $ HYPERTEN                      : int  0 0 0 0 0 0 1 1 1 1 ...
##  $ TIMEAP                        : int  8766 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
##  $ TIMEMI                        : int  6438 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
##  $ TIMEMIFC                      : int  6438 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
##  $ TIMECHD                       : int  6438 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
##  $ TIMESTRK                      : int  8766 8766 8766 8766 8766 8766 2089 2089 8766 8766 ...
##  $ TIMECVD                       : int  6438 8766 8766 8766 8766 8766 2089 2089 8766 8766 ...
##  $ TIMEDTH                       : int  8766 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
##  $ TIMEHYP                       : int  8766 8766 8766 8766 8766 8766 0 0 4285 4285 ...
##  $ Gender                        : chr  "Male" "Female" "Female" "Female" ...
##  $ Smoker                        : chr  "Not current smoker" "Not current smoker" "Not current smoker" "Not current smoker" ...
##  $ Diabetic                      : chr  "Non Diabetic" "Non Diabetic" "Non Diabetic" "Non Diabetic" ...
##  $ PREVCHD - Coronary Disease    : chr  "Free of coronary disease" "Free of coronary disease" "Free of coronary disease" "Free of coronary disease" ...
##  $ PREVAP - Angina Pectoris      : chr  "Free of disease angina pectoris" "Free of disease angina pectoris" "Free of disease angina pectoris" "Free of disease angina pectoris" ...
##  $ PREVMI - Myocardial Infarction: chr  "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" ...
##  $ PREVSTRK - Stroke History     : chr  "No stroke history" "No stroke history" "No stroke history" "No stroke history" ...
##  $ PREVHYP - Hypertension        : chr  "No prevalent hypertension" "No prevalent hypertension" "No prevalent hypertension" "No prevalent hypertension" ...
##  - attr(*, "na.action")= 'omit' Named int [1:2317] 2 12 19 28 29 36 41 53 55 66 ...
##   ..- attr(*, "names")= chr [1:2317] "2" "12" "19" "28" ...

Data Splitting

# Split the data into training and testing sets.
# Fix the random seed first so the random partition drawn by initial_split()
# further below is reproducible from run to run.
set.seed(123)

# Earlier version of the split, kept for reference only. It differed from the
# active code by converting CVD into a factor column named `heart_attack`.
# data_split <- df_clean |> 
#   dplyr::select(-c(TIME, TIMEAP, TIMEMI, TIMEMIFC, TIMECHD, TIMESTRK, TIMECVD, TIMEDTH, TIMEHYP, RANDID, DEATH, ANGINA, HOSPMI, MI_FCHD, ANYCHD, STROKE, HYPERTEN, `PREVCHD - Coronary Disease`, `PREVAP - Angina Pectoris`, `PREVMI - Myocardial Infarction`, `PREVSTRK - Stroke History`, `PREVHYP - Hypertension`)) |> 
#   #mutate_at(.vars = c(6, 9:17), .funs = as.factor) |> 
#   mutate(heart_attack = as.factor(CVD)) %>%
#   dplyr::select(-CVD) |> 
#   initial_split(prop = 0.75)

# rsample provides initial_split(), training() and testing() used below
library(rsample)

## Warning: package 'rsample' was built under R version 4.3.3

# Perform the data split: 75% of the rows go to training, 25% to testing.
# Before splitting, drop RANDID, the TIME* follow-up columns, every event
# column except CVD, and the labelled PREV* columns.
# NOTE(review): rows are sampled individually after RANDID is removed, and the
# raw data holds 11,627 rows for 4,434 distinct RANDID values, so rows of one
# RANDID can land in both partitions. Confirm this is intended; a grouped split
# such as rsample::group_initial_split() would keep each RANDID together.
data_split <- df_clean %>%
  dplyr::select(
    -dplyr::all_of(c(
      "TIME", "TIMEAP", "TIMEMI", "TIMEMIFC", "TIMECHD", "TIMESTRK",
      "TIMECVD", "TIMEDTH", "TIMEHYP", "RANDID",
      "DEATH", "ANGINA", "HOSPMI", "MI_FCHD", "ANYCHD", "STROKE", "HYPERTEN",
      "PREVCHD - Coronary Disease", "PREVAP - Angina Pectoris",
      "PREVMI - Myocardial Infarction", "PREVSTRK - Stroke History",
      "PREVHYP - Hypertension"
    ))
  ) %>%
  # Optional factor conversions, currently disabled:
  # mutate_at(.vars = c(6, 9:17), .funs = as.factor) %>%
  # mutate(CVD = as.factor(CVD)) %>%
  rsample::initial_split(prop = 0.75)
# Training partition: extract it, then store it as CSV and as RDS
train_data <- rsample::training(data_split)
readr::write_csv(train_data, "train2.csv")
saveRDS(train_data, file = "train_data.rds")

# Testing partition: same treatment, followed by a structure check
test_data <- rsample::testing(data_split)
readr::write_csv(test_data, "test2.csv")
saveRDS(test_data, file = "test_data.rds")
str(test_data)

## 'data.frame':    2039 obs. of  15 variables:
##  $ TOTCHOL : int  260 237 225 232 343 220 313 260 258 294 ...
##  $ AGE     : int  52 58 61 67 51 55 45 52 49 46 ...
##  $ SYSBP   : num  105 108 150 183 109 ...
##  $ DIABP   : num  69.5 66 95 109 77 106 71 89 102 94 ...
##  $ CIGPDAY : int  0 0 30 20 30 0 20 0 0 15 ...
##  $ BMI     : num  29.4 28.5 28.6 30.2 23.5 ...
##  $ BPMEDS  : int  0 0 0 0 0 1 0 0 1 0 ...
##  $ HEARTRTE: int  80 80 65 60 90 86 79 76 75 98 ...
##  $ GLUCOSE : int  86 71 103 89 72 81 78 79 74 64 ...
##  $ educ    : int  2 2 3 3 3 2 2 1 1 1 ...
##  $ PERIOD  : int  2 3 1 2 2 3 1 1 2 1 ...
##  $ CVD     : int  0 0 1 1 0 1 0 0 0 0 ...
##  $ Gender  : chr  "Female" "Female" "Female" "Female" ...
##  $ Smoker  : chr  "Not current smoker" "Not current smoker" "Smoker" "Smoker" ...
##  $ Diabetic: chr  "Non Diabetic" "Non Diabetic" "Non Diabetic" "Non Diabetic" ...
##  - attr(*, "na.action")= 'omit' Named int [1:2317] 2 12 19 28 29 36 41 53 55 66 ...
##   ..- attr(*, "names")= chr [1:2317] "2" "12" "19" "28" ...

# Cross-tabulate CVD (rows) against HOSPMI (columns) in the cleaned data;
# useNA = "no" leaves missing values out of the table.
table(df_clean$CVD, df_clean$HOSPMI, useNA = "no")

##    
##        0    1
##   0 6232   55
##   1 1167  700

# Same cross-tabulation in long form: one row per CVD/HOSPMI combination
dplyr::count(df_clean, CVD, HOSPMI)

##   CVD HOSPMI    n
## 1   0      0 6232
## 2   0      1   55
## 3   1      0 1167
## 4   1      1  700

Part 1 - Data Cleaning

Joyce D. Williams

2024-02-24

Setup

Load libraries

Load data

Data Cleaning Functions

Data Transformation

Clean the Data

Descriptive Statistics Table

Raw data

Cleaned data

Write cleaned data

Data Splitting