Setup

Load libraries

library(tidyverse, quietly = TRUE)
library(DataExplorer, quietly = TRUE)
library(GGally, quietly = TRUE)

Load data

# Load the cleaned dataset from CSV into a data frame
df_clean <- read.csv(file = "FHS_cleaned.csv")

# Print a compact overview: one line per column with its type and first values
str(object = df_clean)
## 'data.frame':    1991 obs. of  38 variables:
##  $ X                             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ RANDID                        : int  6238 11263 14367 24721 33077 34689 36459 40435 45464 47561 ...
##  $ TOTCHOL                       : int  237 220 280 264 215 212 162 226 236 290 ...
##  $ AGE                           : int  58 55 64 51 60 49 53 54 64 56 ...
##  $ SYSBP                         : num  108 180 168 141 144 ...
##  $ DIABP                         : num  66 106 100 81 80 96 101 75 89 70 ...
##  $ CIGPDAY                       : int  0 0 0 15 10 10 0 40 20 40 ...
##  $ BMI                           : num  28.5 31.2 25.7 24.8 23 ...
##  $ BPMEDS                        : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ HEARTRTE                      : int  80 86 92 85 57 82 105 85 90 100 ...
##  $ GLUCOSE                       : int  71 81 82 97 91 84 78 102 80 90 ...
##  $ educ                          : int  2 2 1 2 3 2 2 2 3 2 ...
##  $ TIME                          : int  4344 4351 4438 4408 4383 4289 4411 4372 4368 4071 ...
##  $ PERIOD                        : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ DEATH                         : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ ANGINA                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HOSPMI                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ MI_FCHD                       : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ ANYCHD                        : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ STROKE                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CVD                           : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ HYPERTEN                      : int  0 1 1 1 1 1 1 1 1 1 ...
##  $ TIMEAP                        : int  8766 8766 8766 6411 8766 8766 8766 8766 8766 8766 ...
##  $ TIMEMI                        : int  8766 8766 8766 6411 8766 8766 8766 8766 8766 8766 ...
##  $ TIMEMIFC                      : int  8766 5719 8766 6411 8766 8766 8766 8766 8766 8766 ...
##  $ TIMECHD                       : int  8766 5719 8766 6411 8766 8766 8766 8766 8766 8766 ...
##  $ TIMESTRK                      : int  8766 8766 8766 6411 8766 8766 8766 8766 8766 8766 ...
##  $ TIMECVD                       : int  8766 5719 8766 6411 8766 8766 8766 8766 8766 8766 ...
##  $ TIMEDTH                       : int  8766 8766 8766 6411 8766 8766 8766 8766 8766 8766 ...
##  $ TIMEHYP                       : int  8766 0 0 4408 0 2157 1469 5933 2177 0 ...
##  $ Gender                        : chr  "Female" "Female" "Male" "Female" ...
##  $ Smoker                        : chr  "Not current smoker" "Not current smoker" "Not current smoker" "Smoker" ...
##  $ Diabetic                      : chr  "Non Diabetic" "Diabetic" "Non Diabetic" "Non Diabetic" ...
##  $ PREVCHD...Coronary.Disease    : chr  "Free of coronary disease" "Free of coronary disease" "Free of coronary disease" "Free of coronary disease" ...
##  $ PREVAP...Angina.Pectoris      : chr  "Free of disease angina pectoris" "Free of disease angina pectoris" "Free of disease angina pectoris" "Free of disease angina pectoris" ...
##  $ PREVMI...Myocardial.Infarction: chr  "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" ...
##  $ PREVSTRK...Stroke.History     : chr  "No stroke history" "No stroke history" "No stroke history" "No stroke history" ...
##  $ PREVHYP...Hypertension        : chr  "No prevalent hypertension" "Prevalent hypertension" "Prevalent hypertension" "Prevalent hypertension" ...

Exploratory data analysis (EDA)

Missing data analysis

# Missing-value profile for every column of the dataset
plot_missing(df_clean)

Histograms

Numerical Variables (excluding “TIMXXX”)

# Histograms of the continuous columns whose name does not match "TIM".
# Both identifier-like columns are dropped first: RANDID, and X, which looks
# like the row index carried over from the CSV (1, 2, 3, ... in the str()
# output) and would otherwise get a meaningless histogram of its own. This
# matches the correlation chunks of this file, which also remove X and RANDID.
df_clean %>%
  dplyr::select(-c(X, RANDID)) %>%
  dplyr::select(!matches("TIM")) %>%
  plot_histogram()

Numerical Variables (“TIMXXX” only)

# Histograms restricted to the columns whose name matches "TIM"
# (RANDID is excluded explicitly, as in the original two-step selection)
df_clean |>
  dplyr::select(matches("TIM") & !RANDID) |>
  plot_histogram()

Bar Charts for Categorical Variables

# Bar charts for all discrete columns at once
plot_bar(df_clean)

# A single variable on its own: keep only Gender, then plot it
df_clean |>
  dplyr::select(Gender) |>
  plot_bar()

Boxplots for Numerical Variables

Function to Create Boxplots

# Draw one boxplot per requested column, each in its own facet.
#
# data:    data frame containing the columns to plot.
# columns: character vector of column names; these columns are stacked into
#          long format (default `name` / `value` columns) before plotting.
# Returns the resulting ggplot object.
create_boxplot <- function(data, columns) {
  # Stack the selected columns so that `name` identifies the source column
  long_data <- pivot_longer(data, cols = all_of(columns))

  ggplot(long_data, aes(x = name, y = value)) +
    geom_boxplot() +
    labs(x = NULL) +
    theme_bw(base_size = 14) +
    # Free scales: each facet gets axis ranges fitted to its own variable
    facet_wrap(~name, scales = "free", ncol = 3)
}

# Numeric columns to show in the boxplot grid
key_columns <- c(
  "TOTCHOL", "AGE", "SYSBP", "DIABP", "BMI", "HEARTRTE", "GLUCOSE"
)

# One faceted boxplot per column listed above
create_boxplot(data = df_clean, columns = key_columns)

Correlation Analysis

Heatmap

# Correlation heatmap of the numeric columns, after dropping X and RANDID
# and leaving out every column whose name matches "TIM".
# NOTE(review): the "standard deviation is zero" warning emitted by this chunk
# suggests at least one constant column is still included (PERIOD looks
# constant in the str() output) - confirm and consider dropping it.
df_clean |>
  dplyr::select(-c(X, RANDID)) |>
  dplyr::select(where(is.numeric) & !matches("TIM")) |>
  plot_correlation()
## Warning in cor(x = structure(list(TOTCHOL = c(237L, 220L, 280L, 264L, 215L, :
## the standard deviation is zero
## Warning: Removed 36 rows containing missing values or values outside the scale range
## (`geom_text()`).

Scatter Plot Matrix

# Build the pairs-plot input step by step, then draw it coloured by CVD
df_clean |>
  # Drop the index/ID columns, BPMEDS, PERIOD and the DEATH:HYPERTEN range;
  # CVD lies inside that range, so it is explicitly added back
  dplyr::select(-c(X, RANDID, BPMEDS, PERIOD, DEATH:HYPERTEN), CVD) |>
  # Discard every column whose name matches "TIM"
  dplyr::select(!matches("TIM")) |>
  # Retain the numeric columns (CVD is named explicitly as well)
  dplyr::select(where(is.numeric), CVD) |>
  # Treat CVD as a grouping factor rather than a number
  mutate(CVD = as.factor(CVD)) |>
  # Scatter plot matrix with one colour per CVD level
  ggpairs(mapping = aes(color = CVD))

Time Variables Heatmap

# Correlation heatmap restricted to the columns whose name matches "TIM"
# (RANDID is excluded explicitly, as in the original two-step selection)
df_clean |>
  dplyr::select(matches("TIM") & !RANDID) |>
  plot_correlation()