Setup

Load libraries

library(tidyverse, quietly = TRUE)
library(DataExplorer, quietly = TRUE)
library(GGally, quietly = TRUE)

Load data

# Load the cleaned data set from a CSV file (path is relative to the
# working directory)
df_clean <- utils::read.csv(file = "FHS_cleaned.csv")

# Print a compact overview of every column: name, type and first values
utils::str(object = df_clean)

## 'data.frame':    8154 obs. of  38 variables:
##  $ X                             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ RANDID                        : int  2448 6238 6238 6238 9428 9428 10552 10552 11252 11252 ...
##  $ TOTCHOL                       : int  195 250 260 237 245 283 225 232 285 343 ...
##  $ AGE                           : int  39 46 52 58 48 54 61 67 46 51 ...
##  $ SYSBP                         : num  106 121 105 108 128 ...
##  $ DIABP                         : num  70 81 69.5 66 80 89 95 109 84 77 ...
##  $ CIGPDAY                       : int  0 0 0 0 20 30 30 20 23 30 ...
##  $ BMI                           : num  27 28.7 29.4 28.5 25.3 ...
##  $ BPMEDS                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HEARTRTE                      : int  80 95 80 80 75 75 65 60 85 90 ...
##  $ GLUCOSE                       : int  77 76 86 71 70 87 103 89 85 72 ...
##  $ educ                          : int  4 2 2 2 1 1 3 3 3 3 ...
##  $ TIME                          : int  0 0 2156 4344 0 2199 0 1977 0 2072 ...
##  $ PERIOD                        : int  1 1 2 3 1 2 1 2 1 2 ...
##  $ DEATH                         : int  0 0 0 0 0 0 1 1 0 0 ...
##  $ ANGINA                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HOSPMI                        : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ MI_FCHD                       : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ ANYCHD                        : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ STROKE                        : int  0 0 0 0 0 0 1 1 0 0 ...
##  $ CVD                           : int  1 0 0 0 0 0 1 1 0 0 ...
##  $ HYPERTEN                      : int  0 0 0 0 0 0 1 1 1 1 ...
##  $ TIMEAP                        : int  8766 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
##  $ TIMEMI                        : int  6438 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
##  $ TIMEMIFC                      : int  6438 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
##  $ TIMECHD                       : int  6438 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
##  $ TIMESTRK                      : int  8766 8766 8766 8766 8766 8766 2089 2089 8766 8766 ...
##  $ TIMECVD                       : int  6438 8766 8766 8766 8766 8766 2089 2089 8766 8766 ...
##  $ TIMEDTH                       : int  8766 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
##  $ TIMEHYP                       : int  8766 8766 8766 8766 8766 8766 0 0 4285 4285 ...
##  $ Gender                        : chr  "Male" "Female" "Female" "Female" ...
##  $ Smoker                        : chr  "Not current smoker" "Not current smoker" "Not current smoker" "Not current smoker" ...
##  $ Diabetic                      : chr  "Non Diabetic" "Non Diabetic" "Non Diabetic" "Non Diabetic" ...
##  $ PREVCHD...Coronary.Disease    : chr  "Free of coronary disease" "Free of coronary disease" "Free of coronary disease" "Free of coronary disease" ...
##  $ PREVAP...Angina.Pectoris      : chr  "Free of disease angina pectoris" "Free of disease angina pectoris" "Free of disease angina pectoris" "Free of disease angina pectoris" ...
##  $ PREVMI...Myocardial.Infarction: chr  "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" ...
##  $ PREVSTRK...Stroke.History     : chr  "No stroke history" "No stroke history" "No stroke history" "No stroke history" ...
##  $ PREVHYP...Hypertension        : chr  "No prevalent hypertension" "No prevalent hypertension" "No prevalent hypertension" "No prevalent hypertension" ...

Exploratory data analysis (EDA)

Missing data analysis

# Visualise the missing-value profile of every column
plot_missing(df_clean)

Histograms

Numerical Variables (excluding “TIMXXX”)

# Histograms of the continuous variables, excluding the time columns.
# Both X and RANDID are dropped first: X appears to be the row index left
# over from the CSV export and RANDID an identifier (TODO confirm), so a
# histogram of either carries no information. This matches the columns
# excluded by the correlation heatmap later in this report.
df_clean %>%
  dplyr::select(-c(X, RANDID)) %>%
  dplyr::select(!matches("TIM")) %>%
  plot_histogram()

Numerical Variables (“TIMXXX” only)

# Histograms of the time columns only. matches("TIM") is a positive
# selection, so columns such as RANDID are already left out and no separate
# drop step is needed.
df_clean %>%
  dplyr::select(matches("TIM")) %>%
  plot_histogram()

Bar Charts for Categorical Variables

# Bar charts for all discrete columns at once
plot_bar(df_clean)

# Bar chart for a single column (here: Gender)
plot_bar(dplyr::select(df_clean, Gender))

Boxplots for Numerical Variables

Function to Create Boxplots

#' Draw one boxplot per selected column
#'
#' Reshapes the chosen columns to long format (`name` / `value`) and draws a
#' boxplot for each one in its own facet. Facet scales are free, so every
#' panel gets an axis range suited to its own variable.
#'
#' @param data A data frame containing the columns to plot.
#' @param columns Columns to plot, passed to `all_of()`. They are stacked
#'   into a single `value` column, so they must have compatible types.
#' @param ncol Number of columns in the facet grid. Defaults to 3.
#' @param base_size Base font size passed to `theme_bw()`. Defaults to 14.
#' @return A ggplot object.
create_boxplot <- function(data, columns, ncol = 3, base_size = 14) {
  data %>%
    pivot_longer(cols = all_of(columns)) %>%
    ggplot(aes(x = name, y = value)) +
    geom_boxplot() +
    # The facet strip already names the variable, so drop the x-axis title
    labs(x = NULL) +
    theme_bw(base_size = base_size) +
    facet_wrap(~name, scales = "free", ncol = ncol)
}

# Numeric measurements to show in the boxplot panel
key_columns <- c(
  "TOTCHOL", "AGE", "SYSBP", "DIABP",
  "BMI", "HEARTRTE", "GLUCOSE"
)

# Draw the boxplots for those columns
create_boxplot(data = df_clean, columns = key_columns)

Correlation Analysis

Heatmap

# Correlation heatmap of the numeric columns, leaving out X, RANDID and
# every column whose name matches "TIM"
df_clean |>
  dplyr::select(-X, -RANDID) |>
  dplyr::select(-matches("TIM")) |>
  dplyr::select(where(is.numeric)) |>
  plot_correlation()

Scatter Plot Matrix

# Pairwise plot matrix of the numeric variables, coloured by CVD
df_clean |>
  # Drop X, RANDID, BPMEDS, PERIOD and the DEATH..HYPERTEN range; that range
  # contains CVD, so CVD is listed again to put it back
  dplyr::select(-c(X, RANDID, BPMEDS, PERIOD, DEATH:HYPERTEN), CVD) |>
  # Discard every column whose name matches "TIM"
  dplyr::select(-matches("TIM")) |>
  # Retain the numeric columns together with CVD
  dplyr::select(where(is.numeric), CVD) |>
  # Turn CVD into a factor so it defines discrete colour groups
  dplyr::mutate(CVD = as.factor(CVD)) |>
  ggpairs(mapping = aes(color = CVD))

Time Variables Heatmap

# Correlation heatmap restricted to the time columns. matches("TIM") is a
# positive selection, so columns such as RANDID are already left out and no
# separate drop step is needed.
df_clean %>%
  dplyr::select(matches("TIM")) %>%
  plot_correlation()

Part 2 - Exploratory Data Analysis (EDA)

Joyce D. Williams

2024-02-24

Setup

Load libraries

Load data

Exploratory data analysis (EDA)

Missing data analysis

Histograms

Numerical Variables (excluding “TIMXXX”)

Numerical Variables (“TIMXXX” only)

Bar Charts for Categorical Variables

Boxplots for Numerical Variables

Function to Create Boxplots

Correlation Analysis

Heatmap

Scatter Plot Matrix

Time Variables Heatmap