Setup

Load libraries

library(tidyverse, quietly = TRUE)
library(DataExplorer, quietly = TRUE)
library(GGally, quietly = TRUE)

Load data

# Import the cleaned FHS table from the working directory
df_clean <- read.csv(file = "FHS_cleaned.csv")

# Compact overview: one line per column with its type and first values
str(object = df_clean)
## 'data.frame':    8154 obs. of  38 variables:
##  $ X                             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ RANDID                        : int  2448 6238 6238 6238 9428 9428 10552 10552 11252 11252 ...
##  $ TOTCHOL                       : int  195 250 260 237 245 283 225 232 285 343 ...
##  $ AGE                           : int  39 46 52 58 48 54 61 67 46 51 ...
##  $ SYSBP                         : num  106 121 105 108 128 ...
##  $ DIABP                         : num  70 81 69.5 66 80 89 95 109 84 77 ...
##  $ CIGPDAY                       : int  0 0 0 0 20 30 30 20 23 30 ...
##  $ BMI                           : num  27 28.7 29.4 28.5 25.3 ...
##  $ BPMEDS                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HEARTRTE                      : int  80 95 80 80 75 75 65 60 85 90 ...
##  $ GLUCOSE                       : int  77 76 86 71 70 87 103 89 85 72 ...
##  $ educ                          : int  4 2 2 2 1 1 3 3 3 3 ...
##  $ TIME                          : int  0 0 2156 4344 0 2199 0 1977 0 2072 ...
##  $ PERIOD                        : int  1 1 2 3 1 2 1 2 1 2 ...
##  $ DEATH                         : int  0 0 0 0 0 0 1 1 0 0 ...
##  $ ANGINA                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HOSPMI                        : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ MI_FCHD                       : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ ANYCHD                        : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ STROKE                        : int  0 0 0 0 0 0 1 1 0 0 ...
##  $ CVD                           : int  1 0 0 0 0 0 1 1 0 0 ...
##  $ HYPERTEN                      : int  0 0 0 0 0 0 1 1 1 1 ...
##  $ TIMEAP                        : int  8766 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
##  $ TIMEMI                        : int  6438 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
##  $ TIMEMIFC                      : int  6438 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
##  $ TIMECHD                       : int  6438 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
##  $ TIMESTRK                      : int  8766 8766 8766 8766 8766 8766 2089 2089 8766 8766 ...
##  $ TIMECVD                       : int  6438 8766 8766 8766 8766 8766 2089 2089 8766 8766 ...
##  $ TIMEDTH                       : int  8766 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
##  $ TIMEHYP                       : int  8766 8766 8766 8766 8766 8766 0 0 4285 4285 ...
##  $ Gender                        : chr  "Male" "Female" "Female" "Female" ...
##  $ Smoker                        : chr  "Not current smoker" "Not current smoker" "Not current smoker" "Not current smoker" ...
##  $ Diabetic                      : chr  "Non Diabetic" "Non Diabetic" "Non Diabetic" "Non Diabetic" ...
##  $ PREVCHD...Coronary.Disease    : chr  "Free of coronary disease" "Free of coronary disease" "Free of coronary disease" "Free of coronary disease" ...
##  $ PREVAP...Angina.Pectoris      : chr  "Free of disease angina pectoris" "Free of disease angina pectoris" "Free of disease angina pectoris" "Free of disease angina pectoris" ...
##  $ PREVMI...Myocardial.Infarction: chr  "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" ...
##  $ PREVSTRK...Stroke.History     : chr  "No stroke history" "No stroke history" "No stroke history" "No stroke history" ...
##  $ PREVHYP...Hypertension        : chr  "No prevalent hypertension" "No prevalent hypertension" "No prevalent hypertension" "No prevalent hypertension" ...

Exploratory data analysis (EDA)

Missing data analysis

# Missing-value profile of every column in the dataset
plot_missing(df_clean)

Histograms

Numerical Variables (excluding “TIME” and the “TIMExxx” columns)

# Histograms of the non-time variables.
# X is a running row number and RANDID an identifier that repeats across
# rows; neither is a measurement, so both are dropped here, exactly as the
# correlation heatmap and scatter plot matrix further down already do.
# Columns whose names match "TIM" are plotted in their own section.
df_clean %>%
  dplyr::select(-c(X, RANDID)) %>%
  dplyr::select(!matches("TIM")) %>%
  plot_histogram()

Numerical Variables (“TIME” and the “TIMExxx” columns only)

# Histograms restricted to the columns whose names match "TIM"
df_clean |>
  dplyr::select(-RANDID) |>
  dplyr::select(matches("TIM")) |>
  plot_histogram()

Bar Charts for Categorical Variables

# Bar charts for all discrete columns at once
df_clean %>%
  plot_bar()

# Making one plot at a time: narrow the data to a single column first.
# select() is namespace-qualified like everywhere else in this report, so
# the chunk keeps working if another attached package masks select().
df_clean %>%
  dplyr::select(Gender) %>%
  plot_bar()

Boxplots for Numerical Variables

Function to Create Boxplots

#' Draw one boxplot per selected column
#'
#' Stacks the selected columns into long format (`name` / `value`) and draws
#' a boxplot for each of them in its own facet, three facets per row, with
#' axis scales that are free to differ between facets.
#'
#' @param data A data frame that contains the columns to plot.
#' @param columns Column selection handed to `all_of()`, e.g. a character
#'   vector of column names.
#' @return A ggplot object.
create_boxplot <- function(data, columns) {
  # Long format: one row per (original row, selected column) pair
  long_data <- pivot_longer(data, cols = all_of(columns))

  ggplot(long_data, aes(x = name, y = value)) +
    geom_boxplot() +
    # The facet strip already names the variable, so no x-axis title
    labs(x = NULL) +
    theme_bw(base_size = 14) +
    facet_wrap(vars(name), scales = "free", ncol = 3)
}

# Numeric columns to display as boxplots
key_columns <- c(
  "TOTCHOL", "AGE", "SYSBP", "DIABP", "BMI", "HEARTRTE", "GLUCOSE"
)

# One boxplot facet per selected column
create_boxplot(data = df_clean, columns = key_columns)

Correlation Analysis

Heatmap

# Correlation heatmap of the numeric, non-time columns.
# One select() does all the filtering: start from the numeric columns, then
# remove X, RANDID and every column whose name matches "TIM".
df_clean |>
  dplyr::select(where(is.numeric), -c(X, RANDID), -matches("TIM")) |>
  plot_correlation()

Scatter Plot Matrix

# Prepare the data and create a ggpairs plot
df_clean %>%
  # Remove unwanted columns
  dplyr::select(-c(X, RANDID, BPMEDS, PERIOD, DEATH:HYPERTEN), CVD) %>%
  # Remove columns with names matching "TIM"
  dplyr::select(!matches("TIM")) %>%
  # Keep only numeric columns and CVD
  dplyr::select(where(is.numeric), CVD) %>%
  # Convert CVD to factor
  mutate(CVD = as.factor(CVD)) %>%
  # Create a ggpairs plot
  ggpairs(mapping = aes(color = CVD))

Time Variables Heatmap

# Correlation heatmap of the columns whose names match "TIM"
df_clean |>
  dplyr::select(-RANDID) |>
  dplyr::select(matches("TIM")) |>
  plot_correlation()