library(tidyverse, quietly = TRUE)
library(DataExplorer, quietly = TRUE)
library(GGally, quietly = TRUE)
# Load the cleaned dataset from the working directory.
df_clean <- read.csv(file = "FHS_cleaned.csv")
# Print a compact overview of every column (type and first few values).
str(object = df_clean)
## 'data.frame': 8154 obs. of 38 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ RANDID : int 2448 6238 6238 6238 9428 9428 10552 10552 11252 11252 ...
## $ TOTCHOL : int 195 250 260 237 245 283 225 232 285 343 ...
## $ AGE : int 39 46 52 58 48 54 61 67 46 51 ...
## $ SYSBP : num 106 121 105 108 128 ...
## $ DIABP : num 70 81 69.5 66 80 89 95 109 84 77 ...
## $ CIGPDAY : int 0 0 0 0 20 30 30 20 23 30 ...
## $ BMI : num 27 28.7 29.4 28.5 25.3 ...
## $ BPMEDS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HEARTRTE : int 80 95 80 80 75 75 65 60 85 90 ...
## $ GLUCOSE : int 77 76 86 71 70 87 103 89 85 72 ...
## $ educ : int 4 2 2 2 1 1 3 3 3 3 ...
## $ TIME : int 0 0 2156 4344 0 2199 0 1977 0 2072 ...
## $ PERIOD : int 1 1 2 3 1 2 1 2 1 2 ...
## $ DEATH : int 0 0 0 0 0 0 1 1 0 0 ...
## $ ANGINA : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HOSPMI : int 1 0 0 0 0 0 0 0 0 0 ...
## $ MI_FCHD : int 1 0 0 0 0 0 0 0 0 0 ...
## $ ANYCHD : int 1 0 0 0 0 0 0 0 0 0 ...
## $ STROKE : int 0 0 0 0 0 0 1 1 0 0 ...
## $ CVD : int 1 0 0 0 0 0 1 1 0 0 ...
## $ HYPERTEN : int 0 0 0 0 0 0 1 1 1 1 ...
## $ TIMEAP : int 8766 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
## $ TIMEMI : int 6438 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
## $ TIMEMIFC : int 6438 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
## $ TIMECHD : int 6438 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
## $ TIMESTRK : int 8766 8766 8766 8766 8766 8766 2089 2089 8766 8766 ...
## $ TIMECVD : int 6438 8766 8766 8766 8766 8766 2089 2089 8766 8766 ...
## $ TIMEDTH : int 8766 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
## $ TIMEHYP : int 8766 8766 8766 8766 8766 8766 0 0 4285 4285 ...
## $ Gender : chr "Male" "Female" "Female" "Female" ...
## $ Smoker : chr "Not current smoker" "Not current smoker" "Not current smoker" "Not current smoker" ...
## $ Diabetic : chr "Non Diabetic" "Non Diabetic" "Non Diabetic" "Non Diabetic" ...
## $ PREVCHD...Coronary.Disease : chr "Free of coronary disease" "Free of coronary disease" "Free of coronary disease" "Free of coronary disease" ...
## $ PREVAP...Angina.Pectoris : chr "Free of disease angina pectoris" "Free of disease angina pectoris" "Free of disease angina pectoris" "Free of disease angina pectoris" ...
## $ PREVMI...Myocardial.Infarction: chr "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" ...
## $ PREVSTRK...Stroke.History : chr "No stroke history" "No stroke history" "No stroke history" "No stroke history" ...
## $ PREVHYP...Hypertension : chr "No prevalent hypertension" "No prevalent hypertension" "No prevalent hypertension" "No prevalent hypertension" ...
# Plot the share of missing values for every column.
plot_missing(df_clean)
# Histograms of the columns outside the TIME* family.
# X (the row index written out with the CSV) and RANDID are identifiers,
# not measurements, so both are dropped before plotting.
df_clean %>%
  dplyr::select(-c(X, RANDID)) %>%
  dplyr::select(!matches("TIM")) %>%
  plot_histogram()
# Histograms restricted to the columns whose names match "TIM".
df_clean |>
  dplyr::select(matches("TIM"), -RANDID) |>
  plot_histogram()
# Bar charts for the whole data frame in a single call.
plot_bar(df_clean)
# The same chart for one column only, to get a single plot at a time.
df_clean |>
  dplyr::select(Gender) |>
  plot_bar()
#' Draw one boxplot panel per selected column
#'
#' Reshapes `columns` of `data` to long format with `pivot_longer()` and draws
#' a boxplot for each column in its own facet. Facets use free scales, so each
#' panel gets its own axis range.
#'
#' @param data A data frame containing the columns to plot.
#' @param columns Columns to plot, passed to `all_of()` (for example a
#'   character vector of column names).
#' @param ncol Number of facet columns in the panel grid. Defaults to 3.
#' @param base_size Base font size handed to `theme_bw()`. Defaults to 14.
#' @return A ggplot object.
create_boxplot <- function(data, columns, ncol = 3, base_size = 14) {
  # pivot_longer() default output names: `name` holds the source column,
  # `value` holds the measurement.
  long_data <- pivot_longer(data, cols = all_of(columns))

  ggplot(long_data, aes(x = name, y = value)) +
    geom_boxplot() +
    # The facet strip already labels each panel, so drop the x-axis title.
    labs(x = NULL) +
    theme_bw(base_size = base_size) +
    facet_wrap(~name, scales = "free", ncol = ncol)
}
# Numeric columns to show as boxplots.
key_columns <- c(
  "TOTCHOL", "AGE", "SYSBP", "DIABP",
  "BMI", "HEARTRTE", "GLUCOSE"
)
# One boxplot panel per column listed above.
create_boxplot(data = df_clean, columns = key_columns)
# Correlation plot of the numeric columns, leaving out the identifiers
# (X, RANDID) and every column whose name matches "TIM".
df_clean |>
  dplyr::select(
    where(is.numeric),
    -c(X, RANDID),
    -matches("TIM")
  ) |>
  plot_correlation()
# Pairwise plot of the numeric measurements, coloured by CVD status.
df_clean |>
  dplyr::select(
    # Start from the numeric columns ...
    where(is.numeric),
    # ... drop the identifiers, BPMEDS, PERIOD and the DEATH:HYPERTEN range ...
    -c(X, RANDID, BPMEDS, PERIOD, DEATH:HYPERTEN),
    # ... drop every column whose name matches "TIM" ...
    -matches("TIM"),
    # ... then add CVD back (it sits inside DEATH:HYPERTEN) as the last column.
    CVD
  ) |>
  # ggpairs needs a discrete variable for the colour grouping.
  mutate(CVD = as.factor(CVD)) |>
  ggpairs(mapping = aes(color = CVD))
# Correlation plot restricted to the columns whose names match "TIM".
df_clean |>
  dplyr::select(matches("TIM"), -RANDID) |>
  plot_correlation()