Setup

Load libraries

library(tidyverse, quietly = TRUE)
#library(tidymodels, quietly = TRUE)
library(psych, quietly = TRUE)
library(DataExplorer, quietly = TRUE)
library(flextable, quietly = TRUE)

Load data

getwd()

## [1] "C:/Users/tsapa/Downloads"

# NOTE(review): this hard-coded, user-specific working directory ties the
# script to one machine; every relative path used later in the file is
# resolved against it.
setwd("C:/Users/tsapa/Downloads")

# Import the raw CSV from the working directory set above.
df_frmgham2 <- read.csv(file = "frmgham2.csv")

# Show each column's type together with its first few values.
str(object = df_frmgham2)

## 'data.frame':    11627 obs. of  39 variables:
##  $ RANDID  : int  2448 2448 6238 6238 6238 9428 9428 10552 10552 11252 ...
##  $ SEX     : int  1 1 2 2 2 1 1 2 2 2 ...
##  $ TOTCHOL : int  195 209 250 260 237 245 283 225 232 285 ...
##  $ AGE     : int  39 52 46 52 58 48 54 61 67 46 ...
##  $ SYSBP   : num  106 121 121 105 108 ...
##  $ DIABP   : num  70 66 81 69.5 66 80 89 95 109 84 ...
##  $ CURSMOKE: int  0 0 0 0 0 1 1 1 1 1 ...
##  $ CIGPDAY : int  0 0 0 0 0 20 30 30 20 23 ...
##  $ BMI     : num  27 NA 28.7 29.4 28.5 ...
##  $ DIABETES: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ BPMEDS  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HEARTRTE: int  80 69 95 80 80 75 75 65 60 85 ...
##  $ GLUCOSE : int  77 92 76 86 71 70 87 103 89 85 ...
##  $ educ    : int  4 4 2 2 2 1 1 3 3 3 ...
##  $ PREVCHD : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PREVAP  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PREVMI  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PREVSTRK: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PREVHYP : int  0 0 0 0 0 0 0 1 1 0 ...
##  $ TIME    : int  0 4628 0 2156 4344 0 2199 0 1977 0 ...
##  $ PERIOD  : int  1 3 1 2 3 1 2 1 2 1 ...
##  $ HDLC    : int  NA 31 NA NA 54 NA NA NA NA NA ...
##  $ LDLC    : int  NA 178 NA NA 141 NA NA NA NA NA ...
##  $ DEATH   : int  0 0 0 0 0 0 0 1 1 0 ...
##  $ ANGINA  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HOSPMI  : int  1 1 0 0 0 0 0 0 0 0 ...
##  $ MI_FCHD : int  1 1 0 0 0 0 0 0 0 0 ...
##  $ ANYCHD  : int  1 1 0 0 0 0 0 0 0 0 ...
##  $ STROKE  : int  0 0 0 0 0 0 0 1 1 0 ...
##  $ CVD     : int  1 1 0 0 0 0 0 1 1 0 ...
##  $ HYPERTEN: int  0 0 0 0 0 0 0 1 1 1 ...
##  $ TIMEAP  : int  8766 8766 8766 8766 8766 8766 8766 2956 2956 8766 ...
##  $ TIMEMI  : int  6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
##  $ TIMEMIFC: int  6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
##  $ TIMECHD : int  6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
##  $ TIMESTRK: int  8766 8766 8766 8766 8766 8766 8766 2089 2089 8766 ...
##  $ TIMECVD : int  6438 6438 8766 8766 8766 8766 8766 2089 2089 8766 ...
##  $ TIMEDTH : int  8766 8766 8766 8766 8766 8766 8766 2956 2956 8766 ...
##  $ TIMEHYP : int  8766 8766 8766 8766 8766 8766 8766 0 0 4285 ...

# Restrict the data to rows where PERIOD equals 3. Base `[` subsetting is used
# on purpose: it keeps the original row names of the retained rows.
df_frmgham2 <- df_frmgham2[df_frmgham2[["PERIOD"]] == 3, , drop = FALSE]

# Missing-value overview plot (DataExplorer).
plot_missing(df_frmgham2)

# psych::describe() summary of every column except the RANDID identifier,
# with the trimmed / mad / skew / kurtosis columns removed from the result.
describe(dplyr::select(df_frmgham2, -RANDID)) %>%
  dplyr::select(-c(trimmed, mad, skew, kurtosis))

##          vars    n    mean      sd  median     min    max   range    se
## SEX         1 3263    1.57    0.49    2.00    1.00    2.0    1.00  0.01
## TOTCHOL     2 3049  236.71   44.45  234.00  112.00  625.0  513.00  0.80
## AGE         3 3263   60.65    8.30   60.00   44.00   81.0   37.00  0.15
## SYSBP       4 3263  140.22   22.93  137.00   86.00  267.0  181.00  0.40
## DIABP       5 3263   81.79   11.27   80.00   30.00  130.0  100.00  0.20
## CURSMOKE    6 3263    0.34    0.47    0.00    0.00    1.0    1.00  0.01
## CIGPDAY     7 3249    6.77   11.63    0.00    0.00   80.0   80.00  0.20
## BMI         8 3246   25.89    4.08   25.46   14.43   56.8   42.37  0.07
## DIABETES    9 3263    0.08    0.27    0.00    0.00    1.0    1.00  0.00
## BPMEDS     10 2817    0.15    0.36    0.00    0.00    1.0    1.00  0.01
## HEARTRTE   11 3259   77.36   12.49   76.00   37.00  150.0  113.00  0.22
## GLUCOSE    12 2701   89.78   28.16   84.00   46.00  478.0  432.00  0.54
## educ       13 3181    2.01    1.03    2.00    1.00    4.0    3.00  0.02
## PREVCHD    14 3263    0.11    0.31    0.00    0.00    1.0    1.00  0.01
## PREVAP     15 3263    0.08    0.27    0.00    0.00    1.0    1.00  0.00
## PREVMI     16 3263    0.05    0.21    0.00    0.00    1.0    1.00  0.00
## PREVSTRK   17 3263    0.02    0.14    0.00    0.00    1.0    1.00  0.00
## PREVHYP    18 3263    0.60    0.49    1.00    0.00    1.0    1.00  0.01
## TIME       19 3263 4353.67   95.10 4361.00 3748.00 4854.0 1106.00  1.66
## PERIOD     20 3263    3.00    0.00    3.00    3.00    3.0    0.00  0.00
## HDLC       21 3027   49.36   15.63   48.00   10.00  189.0  179.00  0.28
## LDLC       22 3026  176.47   46.86  173.00   20.00  565.0  545.00  0.85
## DEATH      23 3263    0.24    0.43    0.00    0.00    1.0    1.00  0.01
## ANGINA     24 3263    0.16    0.37    0.00    0.00    1.0    1.00  0.01
## HOSPMI     25 3263    0.09    0.29    0.00    0.00    1.0    1.00  0.01
## MI_FCHD    26 3263    0.14    0.35    0.00    0.00    1.0    1.00  0.01
## ANYCHD     27 3263    0.26    0.44    0.00    0.00    1.0    1.00  0.01
## STROKE     28 3263    0.09    0.28    0.00    0.00    1.0    1.00  0.00
## CVD        29 3263    0.23    0.42    0.00    0.00    1.0    1.00  0.01
## HYPERTEN   30 3263    0.75    0.43    1.00    0.00    1.0    1.00  0.01
## TIMEAP     31 3263 7678.69 2058.41 8766.00    0.00 8766.0 8766.00 36.03
## TIMEMI     32 3263 8034.80 1589.87 8766.00    0.00 8766.0 8766.00 27.83
## TIMEMIFC   33 3263 7988.03 1662.14 8766.00    0.00 8766.0 8766.00 29.10
## TIMECHD    34 3263 7445.92 2279.22 8766.00    0.00 8766.0 8766.00 39.90
## TIMESTRK   35 3263 8104.21 1392.10 8766.00    0.00 8766.0 8766.00 24.37
## TIMECVD    36 3263 7609.05 2134.32 8766.00    0.00 8766.0 8766.00 37.36
## TIMEDTH    37 3263 8286.02 1045.52 8766.00 4182.00 8766.0 4584.00 18.30
## TIMEHYP    38 3263 3834.51 3476.47 2941.00    0.00 8766.0 8766.00 60.86

# Number of distinct RANDID values left in the filtered data.
unique_randids <- nrow(distinct(df_frmgham2, RANDID))

print(unique_randids)

## [1] 3263

# Print first and last few rows
head(df_frmgham2)

##    RANDID SEX TOTCHOL AGE SYSBP DIABP CURSMOKE CIGPDAY   BMI DIABETES BPMEDS
## 2    2448   1     209  52   121    66        0       0    NA        0      0
## 5    6238   2     237  58   108    66        0       0 28.50        0      0
## 12  11252   2      NA  58   155    90        1      30 24.61        0      0
## 15  11263   2     220  55   180   106        0       0 31.17        1      1
## 20  12806   2     320  57   110    46        1      30 22.02        0      0
## 23  14367   1     280  64   168   100        0       0 25.72        0      0
##    HEARTRTE GLUCOSE educ PREVCHD PREVAP PREVMI PREVSTRK PREVHYP TIME PERIOD
## 2        69      92    4       0      0      0        0       0 4628      3
## 5        80      71    2       0      0      0        0       0 4344      3
## 12       74      NA    3       0      0      0        0       1 4285      3
## 15       86      81    2       0      0      0        0       1 4351      3
## 20       75      87    2       0      0      0        0       0 4289      3
## 23       92      82    1       0      0      0        0       1 4438      3
##    HDLC LDLC DEATH ANGINA HOSPMI MI_FCHD ANYCHD STROKE CVD HYPERTEN TIMEAP
## 2    31  178     0      0      1       1      1      0   1        0   8766
## 5    54  141     0      0      0       0      0      0   0        0   8766
## 12   NA   NA     0      0      0       0      0      0   0        1   8766
## 15   46  135     0      0      0       1      1      0   1        1   8766
## 20   34  286     0      0      0       0      0      0   0        1   8766
## 23   44  236     0      0      0       0      0      0   0        1   8766
##    TIMEMI TIMEMIFC TIMECHD TIMESTRK TIMECVD TIMEDTH TIMEHYP
## 2    6438     6438    6438     8766    6438    8766    8766
## 5    8766     8766    8766     8766    8766    8766    8766
## 12   8766     8766    8766     8766    8766    8766    4285
## 15   8766     5719    5719     8766    5719    8766       0
## 20   8766     8766    8766     8766    8766    8766    8679
## 23   8766     8766    8766     8766    8766    8766       0

tail(df_frmgham2)

##        RANDID SEX TOTCHOL AGE SYSBP DIABP CURSMOKE CIGPDAY   BMI DIABETES
## 11607 9982118   1     219  70 163.5    75        0       0 25.26        0
## 11613 9989287   1      NA  63 120.0    80        1      50 17.38        0
## 11616 9990894   2     228  60 205.0   100        1      20 22.37        0
## 11619 9993179   2     251  56 145.0    92        1      35 21.97        0
## 11624 9998212   1     153  52 143.0    89        0       0 25.74        0
## 11627 9999312   2      NA  50 147.0    96        1      10 24.19        0
##       BPMEDS HEARTRTE GLUCOSE educ PREVCHD PREVAP PREVMI PREVSTRK PREVHYP TIME
## 11607      0       65     103    3       0      0      0        0       1 4429
## 11613      0       76      NA    3       0      0      0        0       0 4439
## 11616      0       80      91    2       0      0      0        0       1 4417
## 11619      1       95      90    1       0      0      0        0       1 4396
## 11624      0       65      72    3       0      0      0        0       1 4538
## 11627      0       94      NA    3       0      0      0        0       1 4201
##       PERIOD HDLC LDLC DEATH ANGINA HOSPMI MI_FCHD ANYCHD STROKE CVD HYPERTEN
## 11607      3   50  169     1      0      0       0      0      1   1        1
## 11613      3   NA   NA     1      0      0       0      0      0   0        0
## 11616      3   47  181     1      0      0       0      0      0   0        1
## 11619      3   70  181     1      0      0       0      0      0   0        1
## 11624      3   30  123     0      0      0       0      0      0   0        1
## 11627      3   NA   NA     0      0      0       0      0      0   0        1
##       TIMEAP TIMEMI TIMEMIFC TIMECHD TIMESTRK TIMECVD TIMEDTH TIMEHYP
## 11607   8457   8457     8457    8457     8346    8346    8457       0
## 11613   7746   7746     7746    7746     7746    7746    7746    7746
## 11616   6433   6433     6433    6433     6433    6433    6433    2219
## 11619   6729   6729     6729    6729     6729    6729    6729    4396
## 11624   8766   8766     8766    8766     8766    8766    8766       0
## 11627   8766   8766     8766    8766     8766    8766    8766    4201

Data Cleaning Functions

#' Flag the values of a vector that lie inside the IQR fences
#'
#' Returns `TRUE` for values to KEEP (it does not remove anything itself):
#' a value is kept when it falls within
#' `[Q1 - k * IQR, Q3 + k * IQR]`, where Q1/Q3 are the 25th/75th percentiles
#' of the non-missing values of `x`.
#'
#' @param x A numeric vector.
#' @param k Fence multiplier applied to the interquartile range. Defaults to
#'   1.5, the value that was previously hard-coded.
#' @return A logical vector the same length as `x`: `TRUE` inside the fences,
#'   `FALSE` outside, `NA` where `x` is `NA`. An empty `x` gives `logical(0)`.
is_not_outlier <- function(x, k = 1.5) {
  # na.rm = TRUE: without it quantile() stops with an error as soon as x
  # contains a single NA. names = FALSE: keeps the "25%"/"75%" labels from
  # leaking into the returned vector for length-1 input.
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)

  # Named `spread` rather than `IQR` so stats::IQR() is not shadowed.
  spread <- quartiles[2] - quartiles[1]
  lower_bound <- quartiles[1] - k * spread
  upper_bound <- quartiles[2] + k * spread

  x >= lower_bound & x <= upper_bound
}

# Build the analysis data set from `df`:
#   1. drop the LDLC and HDLC columns,
#   2. drop every row that still contains a missing value,
#   3. keep only rows whose TOTCHOL, AGE, SYSBP, DIABP, BMI, HEARTRTE and
#      GLUCOSE values all pass is_not_outlier().
clean_data <- function(df) {
  screened_vars <- c(
    "TOTCHOL", "AGE", "SYSBP", "DIABP", "BMI", "HEARTRTE", "GLUCOSE"
  )

  without_ldl_hdl <- dplyr::select(df, -c(LDLC, HDLC))
  complete_rows <- na.omit(without_ldl_hdl)

  dplyr::filter(
    complete_rows,
    if_all(all_of(screened_vars), is_not_outlier)
  )
}

Data Transformation

# Turn the coded indicator columns into labelled text columns, then drop the
# coded originals. case_when() is used without a fallback branch, so any code
# other than the ones listed (including NA) yields NA in the new column.
final_data <- df_frmgham2 %>%
  mutate(
    # Sex
    Gender = case_when(
      SEX == 1 ~ "Male",
      SEX == 2 ~ "Female"
    ),

    # Education (disabled)
    # `Education Level` = case_when(
    #   EDUCATION == 1 ~ "0 - 11th Grade",
    #   EDUCATION == 2 ~ "High School Diploma/GED",
    #   EDUCATION == 3 ~ "Some College/Vocational School",
    #   EDUCATION == 4 ~ "College degree (BA, BS) or higher"
    # ),

    # Smoking
    Smoker = case_when(
      CURSMOKE == 0 ~ "Not current smoker",
      CURSMOKE == 1 ~ "Smoker"
    ),
    # Cigarettes per day (disabled)
    # `Cigarettes/per day` = case_when(
    #   CIGPDAY == 0 ~ "Not current smoker",
    #   CIGPDAY >= 1 ~ "Cigarettes Per Day"
    # ),

    # Diabetes
    Diabetic = case_when(
      DIABETES == 0 ~ "Non Diabetic",
      DIABETES == 1 ~ "Diabetic"
    ),

    # Previous conditions
    `PREVCHD - Coronary Disease` = case_when(
      PREVCHD == 0 ~ "Free of coronary disease",
      PREVCHD == 1 ~ "Prevalent coronary disease"
    ),
    `PREVAP - Angina Pectoris` = case_when(
      PREVAP == 0 ~ "Free of disease angina pectoris",
      PREVAP == 1 ~ "Prevalent disease angina pectoris"
    ),
    `PREVMI - Myocardial Infarction` = case_when(
      PREVMI == 0 ~ "Free of prevalent myocardial infarction",
      PREVMI == 1 ~ "Prevalent myocardial infarction"
    ),
    `PREVSTRK - Stroke History` = case_when(
      PREVSTRK == 0 ~ "No stroke history",
      PREVSTRK == 1 ~ "History of stroke"
    ),
    `PREVHYP - Hypertension` = case_when(
      PREVHYP == 0 ~ "No prevalent hypertension",
      PREVHYP == 1 ~ "Prevalent hypertension"
    )
  ) %>%
  # Remove the coded source columns now that labelled versions exist.
  # (Earlier variant, disabled, also dropped EDUCATION and CIGPDAY.)
  dplyr::select(
    !c(SEX, CURSMOKE, DIABETES, PREVCHD, PREVAP, PREVMI, PREVSTRK, PREVHYP)
  )

Clean the Data

df_clean <- clean_data(final_data)
dim(df_clean)

## [1] 1991   37

str(df_clean)

## 'data.frame':    1991 obs. of  37 variables:
##  $ RANDID                        : int  6238 11263 14367 24721 33077 34689 36459 40435 45464 47561 ...
##  $ TOTCHOL                       : int  237 220 280 264 215 212 162 226 236 290 ...
##  $ AGE                           : int  58 55 64 51 60 49 53 54 64 56 ...
##  $ SYSBP                         : num  108 180 168 141 144 ...
##  $ DIABP                         : num  66 106 100 81 80 96 101 75 89 70 ...
##  $ CIGPDAY                       : int  0 0 0 15 10 10 0 40 20 40 ...
##  $ BMI                           : num  28.5 31.2 25.7 24.8 23 ...
##  $ BPMEDS                        : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ HEARTRTE                      : int  80 86 92 85 57 82 105 85 90 100 ...
##  $ GLUCOSE                       : int  71 81 82 97 91 84 78 102 80 90 ...
##  $ educ                          : int  2 2 1 2 3 2 2 2 3 2 ...
##  $ TIME                          : int  4344 4351 4438 4408 4383 4289 4411 4372 4368 4071 ...
##  $ PERIOD                        : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ DEATH                         : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ ANGINA                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HOSPMI                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ MI_FCHD                       : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ ANYCHD                        : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ STROKE                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CVD                           : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ HYPERTEN                      : int  0 1 1 1 1 1 1 1 1 1 ...
##  $ TIMEAP                        : int  8766 8766 8766 6411 8766 8766 8766 8766 8766 8766 ...
##  $ TIMEMI                        : int  8766 8766 8766 6411 8766 8766 8766 8766 8766 8766 ...
##  $ TIMEMIFC                      : int  8766 5719 8766 6411 8766 8766 8766 8766 8766 8766 ...
##  $ TIMECHD                       : int  8766 5719 8766 6411 8766 8766 8766 8766 8766 8766 ...
##  $ TIMESTRK                      : int  8766 8766 8766 6411 8766 8766 8766 8766 8766 8766 ...
##  $ TIMECVD                       : int  8766 5719 8766 6411 8766 8766 8766 8766 8766 8766 ...
##  $ TIMEDTH                       : int  8766 8766 8766 6411 8766 8766 8766 8766 8766 8766 ...
##  $ TIMEHYP                       : int  8766 0 0 4408 0 2157 1469 5933 2177 0 ...
##  $ Gender                        : chr  "Female" "Female" "Male" "Female" ...
##  $ Smoker                        : chr  "Not current smoker" "Not current smoker" "Not current smoker" "Smoker" ...
##  $ Diabetic                      : chr  "Non Diabetic" "Diabetic" "Non Diabetic" "Non Diabetic" ...
##  $ PREVCHD - Coronary Disease    : chr  "Free of coronary disease" "Free of coronary disease" "Free of coronary disease" "Free of coronary disease" ...
##  $ PREVAP - Angina Pectoris      : chr  "Free of disease angina pectoris" "Free of disease angina pectoris" "Free of disease angina pectoris" "Free of disease angina pectoris" ...
##  $ PREVMI - Myocardial Infarction: chr  "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" ...
##  $ PREVSTRK - Stroke History     : chr  "No stroke history" "No stroke history" "No stroke history" "No stroke history" ...
##  $ PREVHYP - Hypertension        : chr  "No prevalent hypertension" "Prevalent hypertension" "Prevalent hypertension" "Prevalent hypertension" ...
##  - attr(*, "na.action")= 'omit' Named int [1:1009] 1 3 8 15 25 26 28 29 33 35 ...
##   ..- attr(*, "names")= chr [1:1009] "2" "12" "29" "55" ...

Descriptive Statistics Table

Raw data

# Descriptive statistics for the continuous measurements in the filtered
# (pre-cleaning) data, one row per variable, rendered as a flextable.
# Lower_bound / Upper_bound are the Q1 - 1.5 * IQR and Q3 + 1.5 * IQR fences.
df_frmgham2 %>%
  pivot_longer(
    cols = c(TOTCHOL, AGE, SYSBP, DIABP, BMI, HEARTRTE, HDLC, LDLC, GLUCOSE)
  ) %>%
  group_by(name) %>%
  summarise(
    # Count observed values only. n() counts rows including NA, which made
    # every variable report the full row count even though the other
    # statistics are computed with na.rm = TRUE on fewer values.
    N = sum(!is.na(value)),
    Mean = mean(value, na.rm = TRUE),
    SD = sd(value, na.rm = TRUE),
    Median = median(value, na.rm = TRUE),
    Min = min(value, na.rm = TRUE),
    Max = max(value, na.rm = TRUE),
    Q1 = quantile(value, probs = 0.25, na.rm = TRUE),
    Q3 = quantile(value, probs = 0.75, na.rm = TRUE),
    IQR = Q3 - Q1,
    Lower_bound = Q1 - 1.5 * IQR,
    Upper_bound = Q3 + 1.5 * IQR
  ) %>%
  # Anonymous function instead of across(..., round, 2): passing extra
  # arguments through `...` is deprecated as of dplyr 1.1.0.
  mutate(across(where(is.numeric), \(x) round(x, 2))) %>%
  flextable() %>%
  autofit()

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(where(is.numeric), round, 2)`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
## 
##   # Previously
##   across(a:b, mean, na.rm = TRUE)
## 
##   # Now
##   across(a:b, \(x) mean(x, na.rm = TRUE))

name	N	Mean	SD	Median	Min	Max	Q1	Q3	IQR	Lower_bound	Upper_bound
AGE	3,263	60.65	8.30	60.00	44.00	81.0	54.00	67.00	13.00	34.50	86.50
BMI	3,263	25.89	4.08	25.46	14.43	56.8	23.19	28.06	4.87	15.89	35.36
DIABP	3,263	81.79	11.27	80.00	30.00	130.0	74.00	89.00	15.00	51.50	111.50
GLUCOSE	3,263	89.78	28.16	84.00	46.00	478.0	76.00	96.00	20.00	46.00	126.00
HDLC	3,263	49.36	15.63	48.00	10.00	189.0	39.00	58.00	19.00	10.50	86.50
HEARTRTE	3,263	77.36	12.49	76.00	37.00	150.0	70.00	85.00	15.00	47.50	107.50
LDLC	3,263	176.47	46.86	173.00	20.00	565.0	145.00	205.00	60.00	55.00	295.00
SYSBP	3,263	140.22	22.93	137.00	86.00	267.0	123.00	154.00	31.00	76.50	200.50
TOTCHOL	3,263	236.71	44.45	234.00	112.00	625.0	206.00	264.00	58.00	119.00	351.00

Cleaned data

# Descriptive statistics for the continuous measurements in the cleaned data,
# one row per variable, rendered as a flextable.
# Lower_bound / Upper_bound are the Q1 - 1.5 * IQR and Q3 + 1.5 * IQR fences
# recomputed on the cleaned values.
df_clean %>%
  pivot_longer(
    cols = c(TOTCHOL, AGE, SYSBP, DIABP, BMI, HEARTRTE, GLUCOSE)
  ) %>%
  group_by(name) %>%
  summarise(
    # Number of observed (non-missing) values; equals the row count as long
    # as the input has no NA in these columns.
    N = sum(!is.na(value)),
    Mean = mean(value, na.rm = TRUE),
    SD = sd(value, na.rm = TRUE),
    Median = median(value, na.rm = TRUE),
    Min = min(value, na.rm = TRUE),
    Max = max(value, na.rm = TRUE),
    Q1 = quantile(value, probs = 0.25, na.rm = TRUE),
    Q3 = quantile(value, probs = 0.75, na.rm = TRUE),
    IQR = Q3 - Q1,
    Lower_bound = Q1 - 1.5 * IQR,
    Upper_bound = Q3 + 1.5 * IQR
  ) %>%
  # Anonymous function instead of across(..., round, 2): passing extra
  # arguments through `...` is deprecated as of dplyr 1.1.0.
  mutate(across(where(is.numeric), \(x) round(x, 2))) %>%
  flextable() %>%
  autofit()

name	N	Mean	SD	Median	Min	Max	Q1	Q3	IQR	Lower_bound	Upper_bound
AGE	1,991	59.95	8.23	59.00	44.00	81.00	53.00	67.00	14.00	32.00	88.00
BMI	1,991	25.47	3.38	25.23	16.58	34.84	23.10	27.62	4.52	16.33	34.39
DIABP	1,991	80.41	10.50	80.00	52.00	110.00	72.75	87.00	14.25	51.38	108.38
GLUCOSE	1,991	84.19	13.81	83.00	46.00	125.00	75.00	92.00	17.00	49.50	117.50
HEARTRTE	1,991	76.54	11.18	76.00	48.00	106.00	70.00	85.00	15.00	47.50	107.50
SYSBP	1,991	136.88	20.32	135.00	86.00	199.00	121.00	150.00	29.00	77.50	193.50
TOTCHOL	1,991	235.66	40.68	235.00	134.00	353.00	206.00	263.00	57.00	120.50	348.50

Write cleaned data

# Save the cleaned data as CSV in the working directory. write.csv() is left
# at its defaults, so row names are written as the first column.
write.csv(df_clean, file = "FHS_cleaned.csv")

# Show the structure of what was just written.
str(df_clean)

## 'data.frame':    1991 obs. of  37 variables:
##  $ RANDID                        : int  6238 11263 14367 24721 33077 34689 36459 40435 45464 47561 ...
##  $ TOTCHOL                       : int  237 220 280 264 215 212 162 226 236 290 ...
##  $ AGE                           : int  58 55 64 51 60 49 53 54 64 56 ...
##  $ SYSBP                         : num  108 180 168 141 144 ...
##  $ DIABP                         : num  66 106 100 81 80 96 101 75 89 70 ...
##  $ CIGPDAY                       : int  0 0 0 15 10 10 0 40 20 40 ...
##  $ BMI                           : num  28.5 31.2 25.7 24.8 23 ...
##  $ BPMEDS                        : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ HEARTRTE                      : int  80 86 92 85 57 82 105 85 90 100 ...
##  $ GLUCOSE                       : int  71 81 82 97 91 84 78 102 80 90 ...
##  $ educ                          : int  2 2 1 2 3 2 2 2 3 2 ...
##  $ TIME                          : int  4344 4351 4438 4408 4383 4289 4411 4372 4368 4071 ...
##  $ PERIOD                        : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ DEATH                         : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ ANGINA                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HOSPMI                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ MI_FCHD                       : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ ANYCHD                        : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ STROKE                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CVD                           : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ HYPERTEN                      : int  0 1 1 1 1 1 1 1 1 1 ...
##  $ TIMEAP                        : int  8766 8766 8766 6411 8766 8766 8766 8766 8766 8766 ...
##  $ TIMEMI                        : int  8766 8766 8766 6411 8766 8766 8766 8766 8766 8766 ...
##  $ TIMEMIFC                      : int  8766 5719 8766 6411 8766 8766 8766 8766 8766 8766 ...
##  $ TIMECHD                       : int  8766 5719 8766 6411 8766 8766 8766 8766 8766 8766 ...
##  $ TIMESTRK                      : int  8766 8766 8766 6411 8766 8766 8766 8766 8766 8766 ...
##  $ TIMECVD                       : int  8766 5719 8766 6411 8766 8766 8766 8766 8766 8766 ...
##  $ TIMEDTH                       : int  8766 8766 8766 6411 8766 8766 8766 8766 8766 8766 ...
##  $ TIMEHYP                       : int  8766 0 0 4408 0 2157 1469 5933 2177 0 ...
##  $ Gender                        : chr  "Female" "Female" "Male" "Female" ...
##  $ Smoker                        : chr  "Not current smoker" "Not current smoker" "Not current smoker" "Smoker" ...
##  $ Diabetic                      : chr  "Non Diabetic" "Diabetic" "Non Diabetic" "Non Diabetic" ...
##  $ PREVCHD - Coronary Disease    : chr  "Free of coronary disease" "Free of coronary disease" "Free of coronary disease" "Free of coronary disease" ...
##  $ PREVAP - Angina Pectoris      : chr  "Free of disease angina pectoris" "Free of disease angina pectoris" "Free of disease angina pectoris" "Free of disease angina pectoris" ...
##  $ PREVMI - Myocardial Infarction: chr  "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" ...
##  $ PREVSTRK - Stroke History     : chr  "No stroke history" "No stroke history" "No stroke history" "No stroke history" ...
##  $ PREVHYP - Hypertension        : chr  "No prevalent hypertension" "Prevalent hypertension" "Prevalent hypertension" "Prevalent hypertension" ...
##  - attr(*, "na.action")= 'omit' Named int [1:1009] 1 3 8 15 25 26 28 29 33 35 ...
##   ..- attr(*, "names")= chr [1:1009] "2" "12" "29" "55" ...

Data Splitting

# Split the data into training and testing sets
set.seed(123)

# data_split <- df_clean |> 
#   dplyr::select(-c(TIME, TIMEAP, TIMEMI, TIMEMIFC, TIMECHD, TIMESTRK, TIMECVD, TIMEDTH, TIMEHYP, RANDID, DEATH, ANGINA, HOSPMI, MI_FCHD, ANYCHD, STROKE, HYPERTEN, `PREVCHD - Coronary Disease`, `PREVAP - Angina Pectoris`, `PREVMI - Myocardial Infarction`, `PREVSTRK - Stroke History`, `PREVHYP - Hypertension`)) |> 
#   #mutate_at(.vars = c(6, 9:17), .funs = as.factor) |> 
#   mutate(heart_attack = as.factor(CVD)) %>%
#   dplyr::select(-CVD) |> 
#   initial_split(prop = 0.75)

library(rsample)

## Warning: package 'rsample' was built under R version 4.3.3

# Perform the data split
# Random 75% / 25% train/test split of the cleaned data. Before splitting,
# the TIME* columns, RANDID, the event columns other than CVD, and the
# labelled PREV* columns are removed.
# Optional steps that are currently disabled (enable and adapt if needed):
#   mutate_at(.vars = c(6, 9:17), .funs = as.factor)
#   mutate(CVD = as.factor(CVD))
data_split <- initial_split(
  dplyr::select(
    df_clean,
    -c(
      TIME, TIMEAP, TIMEMI, TIMEMIFC, TIMECHD, TIMESTRK, TIMECVD, TIMEDTH,
      TIMEHYP, RANDID, DEATH, ANGINA, HOSPMI, MI_FCHD, ANYCHD, STROKE,
      HYPERTEN,
      `PREVCHD - Coronary Disease`, `PREVAP - Angina Pectoris`,
      `PREVMI - Myocardial Infarction`, `PREVSTRK - Stroke History`,
      `PREVHYP - Hypertension`
    )
  ),
  prop = 0.75
)

# Training portion: stored both as CSV and as RDS.
train_data <- training(data_split)
write_csv(train_data, "train2.csv")
saveRDS(train_data, file = "train_data.rds")

# Testing portion: stored both as CSV and as RDS, then inspected.
test_data <- testing(data_split)
write_csv(test_data, "test2.csv")
saveRDS(test_data, file = "test_data.rds")
str(test_data)

## 'data.frame':    498 obs. of  15 variables:
##  $ TOTCHOL : int  280 162 206 230 251 216 231 255 216 223 ...
##  $ AGE     : int  64 53 55 49 59 56 65 51 68 60 ...
##  $ SYSBP   : num  168 152 129 142 132 ...
##  $ DIABP   : num  100 101 85 90.5 77 70 70 98 84 100 ...
##  $ CIGPDAY : int  0 0 50 35 20 20 0 17 20 0 ...
##  $ BMI     : num  25.7 26.4 26.4 24.3 19.3 ...
##  $ BPMEDS  : int  0 0 0 0 0 0 1 0 0 1 ...
##  $ HEARTRTE: int  92 105 84 70 82 72 70 84 77 77 ...
##  $ GLUCOSE : int  82 78 69 61 76 49 86 90 70 86 ...
##  $ educ    : int  1 2 4 3 2 2 2 2 1 1 ...
##  $ PERIOD  : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ CVD     : int  0 0 0 1 0 1 0 0 0 0 ...
##  $ Gender  : chr  "Male" "Male" "Male" "Male" ...
##  $ Smoker  : chr  "Not current smoker" "Not current smoker" "Smoker" "Smoker" ...
##  $ Diabetic: chr  "Non Diabetic" "Non Diabetic" "Non Diabetic" "Non Diabetic" ...
##  - attr(*, "na.action")= 'omit' Named int [1:1009] 1 3 8 15 25 26 28 29 33 35 ...
##   ..- attr(*, "names")= chr [1:1009] "2" "12" "29" "55" ...

table(df_clean$CVD, df_clean$HOSPMI, useNA = "no")

##    
##        0    1
##   0 1547   15
##   1  260  169

df_clean |> 
  count(CVD, HOSPMI)

##   CVD HOSPMI    n
## 1   0      0 1547
## 2   0      1   15
## 3   1      0  260
## 4   1      1  169

Part 1 - Data Cleaning

JW

2024-02-24

Setup

Load libraries

Load data

Data Cleaning Functions

Data Transformation

Clean the Data

Descriptive Statistics Table

Raw data

Cleaned data

Write cleaned data

Data Splitting