Setup

Load libraries

library(tidyverse, quietly = TRUE)
library(DataExplorer, quietly = TRUE)
library(GGally, quietly = TRUE)

Load data

# Import the cleaned FHS extract from the working directory
df_clean <- read.csv(file = "FHS_cleaned.csv")

# Print a compact overview of every column (type and leading values)
str(object = df_clean)

## 'data.frame':    1991 obs. of  38 variables:
##  $ X                             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ RANDID                        : int  6238 11263 14367 24721 33077 34689 36459 40435 45464 47561 ...
##  $ TOTCHOL                       : int  237 220 280 264 215 212 162 226 236 290 ...
##  $ AGE                           : int  58 55 64 51 60 49 53 54 64 56 ...
##  $ SYSBP                         : num  108 180 168 141 144 ...
##  $ DIABP                         : num  66 106 100 81 80 96 101 75 89 70 ...
##  $ CIGPDAY                       : int  0 0 0 15 10 10 0 40 20 40 ...
##  $ BMI                           : num  28.5 31.2 25.7 24.8 23 ...
##  $ BPMEDS                        : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ HEARTRTE                      : int  80 86 92 85 57 82 105 85 90 100 ...
##  $ GLUCOSE                       : int  71 81 82 97 91 84 78 102 80 90 ...
##  $ educ                          : int  2 2 1 2 3 2 2 2 3 2 ...
##  $ TIME                          : int  4344 4351 4438 4408 4383 4289 4411 4372 4368 4071 ...
##  $ PERIOD                        : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ DEATH                         : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ ANGINA                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HOSPMI                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ MI_FCHD                       : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ ANYCHD                        : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ STROKE                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CVD                           : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ HYPERTEN                      : int  0 1 1 1 1 1 1 1 1 1 ...
##  $ TIMEAP                        : int  8766 8766 8766 6411 8766 8766 8766 8766 8766 8766 ...
##  $ TIMEMI                        : int  8766 8766 8766 6411 8766 8766 8766 8766 8766 8766 ...
##  $ TIMEMIFC                      : int  8766 5719 8766 6411 8766 8766 8766 8766 8766 8766 ...
##  $ TIMECHD                       : int  8766 5719 8766 6411 8766 8766 8766 8766 8766 8766 ...
##  $ TIMESTRK                      : int  8766 8766 8766 6411 8766 8766 8766 8766 8766 8766 ...
##  $ TIMECVD                       : int  8766 5719 8766 6411 8766 8766 8766 8766 8766 8766 ...
##  $ TIMEDTH                       : int  8766 8766 8766 6411 8766 8766 8766 8766 8766 8766 ...
##  $ TIMEHYP                       : int  8766 0 0 4408 0 2157 1469 5933 2177 0 ...
##  $ Gender                        : chr  "Female" "Female" "Male" "Female" ...
##  $ Smoker                        : chr  "Not current smoker" "Not current smoker" "Not current smoker" "Smoker" ...
##  $ Diabetic                      : chr  "Non Diabetic" "Diabetic" "Non Diabetic" "Non Diabetic" ...
##  $ PREVCHD...Coronary.Disease    : chr  "Free of coronary disease" "Free of coronary disease" "Free of coronary disease" "Free of coronary disease" ...
##  $ PREVAP...Angina.Pectoris      : chr  "Free of disease angina pectoris" "Free of disease angina pectoris" "Free of disease angina pectoris" "Free of disease angina pectoris" ...
##  $ PREVMI...Myocardial.Infarction: chr  "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" ...
##  $ PREVSTRK...Stroke.History     : chr  "No stroke history" "No stroke history" "No stroke history" "No stroke history" ...
##  $ PREVHYP...Hypertension        : chr  "No prevalent hypertension" "Prevalent hypertension" "Prevalent hypertension" "Prevalent hypertension" ...

Exploratory data analysis (EDA)

Missing data analysis

# Chart how much of each column is missing
plot_missing(df_clean)

Histograms

Numerical Variables (excluding “TIMXXX”)

# Histograms of the numeric columns whose names do not match "TIM".
# Both ID-like columns are dropped first: RANDID, and X, which read.csv()
# imported alongside the data and which simply counts 1, 2, 3, ... in the
# str() preview. Leaving X in would add a histogram of row numbers.
df_clean %>%
  dplyr::select(-c(X, RANDID)) %>%
  dplyr::select(!matches("TIM")) %>%
  plot_histogram()

Numerical Variables (“TIMXXX” only)

# Histograms restricted to the columns whose names match "TIM"
df_clean |>
  dplyr::select(-RANDID) |>
  dplyr::select(matches("TIM")) |>
  plot_histogram()

Bar Charts for Categorical Variables

# Bar charts for all discrete columns in a single call
plot_bar(df_clean)

# A single variable on its own
df_clean |>
  dplyr::select(Gender) |>
  plot_bar()

Boxplots for Numerical Variables

Function to Create Boxplots

# Draw one boxplot panel per requested column.
#
# Arguments:
#   data    - data frame holding the columns to plot.
#   columns - column names (or positions) in `data`; every selected column
#             must be numeric.
#
# Returns a ggplot object: one facet per column, three facets per row, each
# facet with its own axis scales.
#
# Raises an error if `data` is not a data frame, if a named column is absent
# from `data`, or if a selected column is not numeric.
create_boxplot <- function(data, columns) {
  # Validate up front so callers get a readable message rather than a
  # tidyselect / vctrs error from deep inside the pipeline.
  if (!is.data.frame(data)) {
    stop("`data` must be a data frame.", call. = FALSE)
  }
  if (is.character(columns)) {
    absent <- setdiff(columns, names(data))
    if (length(absent) > 0L) {
      stop(
        "Columns not found in `data`: ", paste(absent, collapse = ", "),
        call. = FALSE
      )
    }
  }
  is_num <- vapply(data[columns], is.numeric, logical(1))
  if (!all(is_num)) {
    stop(
      "Columns must be numeric: ",
      paste(names(is_num)[!is_num], collapse = ", "),
      call. = FALSE
    )
  }

  data %>%
    # Keep only the requested columns before reshaping: unused columns are
    # not repeated once per pivoted column, and an existing `name` or
    # `value` column elsewhere in `data` cannot clash with the pivot output.
    dplyr::select(all_of(columns)) %>%
    pivot_longer(cols = all_of(columns)) %>%
    ggplot(aes(x = name, y = value)) +
    geom_boxplot() +
    labs(x = NULL) +
    theme_bw(base_size = 14) +
    facet_wrap(~name, scales = "free", ncol = 3)
}

# Numeric columns to include in the boxplot grid
key_columns <- c(
  "TOTCHOL", "AGE", "SYSBP", "DIABP",
  "BMI", "HEARTRTE", "GLUCOSE"
)

# Draw the faceted boxplots
create_boxplot(data = df_clean, columns = key_columns)

Correlation Analysis

Heatmap

# Correlation heatmap of the numeric columns, leaving out X, RANDID and
# every column whose name matches "TIM".
# NOTE(review): PERIOD shows a single repeated value in the str() preview; a
# constant column like that would explain the zero-standard-deviation
# warning printed below - confirm before relying on the PERIOD row/column.
df_clean |>
  dplyr::select(-c(X, RANDID), -matches("TIM")) |>
  dplyr::select(where(is.numeric)) |>
  plot_correlation()

## Warning in cor(x = structure(list(TOTCHOL = c(237L, 220L, 280L, 264L, 215L, :
## the standard deviation is zero

## Warning: Removed 36 rows containing missing values or values outside the scale range
## (`geom_text()`).

Scatter Plot Matrix

# Pairwise plot matrix of the remaining numeric columns, coloured by CVD
df_clean |>
  # Drop X, RANDID, BPMEDS, PERIOD and the DEATH:HYPERTEN range; CVD sits
  # inside that range, so it is added back explicitly
  dplyr::select(-c(X, RANDID, BPMEDS, PERIOD, DEATH:HYPERTEN), CVD) |>
  # Drop every column whose name matches "TIM"
  dplyr::select(-matches("TIM")) |>
  # Retain the numeric columns, with CVD listed explicitly
  dplyr::select(where(is.numeric), CVD) |>
  # Treat CVD as a categorical variable so it can drive the colour mapping
  dplyr::mutate(CVD = factor(CVD)) |>
  ggpairs(mapping = aes(colour = CVD))

Time Variables Heatmap

# Correlation heatmap limited to the columns whose names match "TIM"
df_clean |>
  dplyr::select(-RANDID) |>
  dplyr::select(matches("TIM")) |>
  plot_correlation()

Part 2 - Exploratory Data Analysis (EDA)

Joyce D. Williams

2024-02-24

Setup

Load libraries

Load data

Exploratory data analysis (EDA)

Missing data analysis

Histograms

Numerical Variables (excluding “TIMXXX”)

Numerical Variables (“TIMXXX” only)

Bar Charts for Categorical Variables

Boxplots for Numerical Variables

Function to Create Boxplots

Correlation Analysis

Heatmap

Scatter Plot Matrix

Time Variables Heatmap