library(tidyverse, quietly = TRUE)
library(DataExplorer, quietly = TRUE)
library(GGally, quietly = TRUE)
# Load the cleaned dataset from the working directory into `df_clean`.
df_clean <- utils::read.csv(file = "FHS_cleaned.csv")
# Print each column's name, type and first few values.
utils::str(df_clean)
## 'data.frame': 1991 obs. of 38 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ RANDID : int 6238 11263 14367 24721 33077 34689 36459 40435 45464 47561 ...
## $ TOTCHOL : int 237 220 280 264 215 212 162 226 236 290 ...
## $ AGE : int 58 55 64 51 60 49 53 54 64 56 ...
## $ SYSBP : num 108 180 168 141 144 ...
## $ DIABP : num 66 106 100 81 80 96 101 75 89 70 ...
## $ CIGPDAY : int 0 0 0 15 10 10 0 40 20 40 ...
## $ BMI : num 28.5 31.2 25.7 24.8 23 ...
## $ BPMEDS : int 0 1 0 0 0 0 0 0 0 0 ...
## $ HEARTRTE : int 80 86 92 85 57 82 105 85 90 100 ...
## $ GLUCOSE : int 71 81 82 97 91 84 78 102 80 90 ...
## $ educ : int 2 2 1 2 3 2 2 2 3 2 ...
## $ TIME : int 4344 4351 4438 4408 4383 4289 4411 4372 4368 4071 ...
## $ PERIOD : int 3 3 3 3 3 3 3 3 3 3 ...
## $ DEATH : int 0 0 0 1 0 0 0 0 0 0 ...
## $ ANGINA : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HOSPMI : int 0 0 0 0 0 0 0 0 0 0 ...
## $ MI_FCHD : int 0 1 0 0 0 0 0 0 0 0 ...
## $ ANYCHD : int 0 1 0 0 0 0 0 0 0 0 ...
## $ STROKE : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CVD : int 0 1 0 0 0 0 0 0 0 0 ...
## $ HYPERTEN : int 0 1 1 1 1 1 1 1 1 1 ...
## $ TIMEAP : int 8766 8766 8766 6411 8766 8766 8766 8766 8766 8766 ...
## $ TIMEMI : int 8766 8766 8766 6411 8766 8766 8766 8766 8766 8766 ...
## $ TIMEMIFC : int 8766 5719 8766 6411 8766 8766 8766 8766 8766 8766 ...
## $ TIMECHD : int 8766 5719 8766 6411 8766 8766 8766 8766 8766 8766 ...
## $ TIMESTRK : int 8766 8766 8766 6411 8766 8766 8766 8766 8766 8766 ...
## $ TIMECVD : int 8766 5719 8766 6411 8766 8766 8766 8766 8766 8766 ...
## $ TIMEDTH : int 8766 8766 8766 6411 8766 8766 8766 8766 8766 8766 ...
## $ TIMEHYP : int 8766 0 0 4408 0 2157 1469 5933 2177 0 ...
## $ Gender : chr "Female" "Female" "Male" "Female" ...
## $ Smoker : chr "Not current smoker" "Not current smoker" "Not current smoker" "Smoker" ...
## $ Diabetic : chr "Non Diabetic" "Diabetic" "Non Diabetic" "Non Diabetic" ...
## $ PREVCHD...Coronary.Disease : chr "Free of coronary disease" "Free of coronary disease" "Free of coronary disease" "Free of coronary disease" ...
## $ PREVAP...Angina.Pectoris : chr "Free of disease angina pectoris" "Free of disease angina pectoris" "Free of disease angina pectoris" "Free of disease angina pectoris" ...
## $ PREVMI...Myocardial.Infarction: chr "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" ...
## $ PREVSTRK...Stroke.History : chr "No stroke history" "No stroke history" "No stroke history" "No stroke history" ...
## $ PREVHYP...Hypertension : chr "No prevalent hypertension" "Prevalent hypertension" "Prevalent hypertension" "Prevalent hypertension" ...
# Missing-value profile of every column in the data set.
plot_missing(df_clean)
# Histograms of the continuous columns whose names do not match "TIM".
# `X` is dropped together with `RANDID`, as the later correlation and pairs
# plots already do: in the str() output X simply counts 1, 2, 3, ... (it
# looks like a row index written into the CSV), so its histogram carries no
# information. RANDID is presumably a participant identifier.
df_clean |>
  dplyr::select(-c(X, RANDID)) |>
  dplyr::select(!matches("TIM")) |>
  plot_histogram()
# Histograms restricted to the columns whose names match "TIM"
# (RANDID is explicitly excluded, as before).
df_clean |>
  dplyr::select(matches("TIM") & !RANDID) |>
  plot_histogram()
# Bar charts for the discrete columns of the whole data set.
plot_bar(df_clean)
# Same chart type, one variable at a time (here: Gender only).
plot_bar(dplyr::select(df_clean, Gender))
#' Draw one boxplot panel per selected column
#'
#' The selected columns are stacked into long format (`name` / `value`) and
#' each one is drawn in its own facet with free scales, three panels per row.
#'
#' @param data A data frame containing the columns to plot.
#' @param columns Character vector of column names in `data`; it is passed to
#'   `all_of()`, so a name that is not present raises an error.
#' @return A ggplot object.
create_boxplot <- function(data, columns) {
  long_data <- pivot_longer(data, cols = all_of(columns))

  boxes <- ggplot(long_data, aes(x = name, y = value)) +
    geom_boxplot() +
    labs(x = NULL) +
    theme_bw(base_size = 14)

  # Free scales: the variables are not required to share units or ranges.
  boxes + facet_wrap(~name, scales = "free", ncol = 3)
}
# Numeric columns to show in the boxplot grid
key_columns <- c(
  "TOTCHOL", "AGE", "SYSBP", "DIABP",
  "BMI", "HEARTRTE", "GLUCOSE"
)
# Draw one boxplot panel per key column
create_boxplot(data = df_clean, columns = key_columns)
# Correlation heatmap of the numeric columns, excluding X, RANDID and every
# column whose name matches "TIM".
df_clean |>
  dplyr::select(-c(X, RANDID)) |>
  dplyr::select(!matches("TIM")) |>
  dplyr::select(where(is.numeric)) |>
  # A column that holds a single distinct value has zero standard deviation,
  # so cor() returns NA for every pair involving it and the heatmap loses
  # those labels. Drop such columns before correlating. (Presumably PERIOD is
  # the constant one in this extract -- confirm against the data.)
  dplyr::select(
    where(function(col) dplyr::n_distinct(col, na.rm = TRUE) > 1L)
  ) |>
  plot_correlation()
## Output recorded BEFORE the constant-column filter above was added:
## Warning in cor(x = structure(list(TOTCHOL = c(237L, 220L, 280L, 264L, 215L, :
## the standard deviation is zero
## Warning: Removed 36 rows containing missing values or values outside the scale range
## (`geom_text()`).
# Pairs plot of the remaining numeric columns, coloured by CVD status
df_clean |>
  # Drop X, RANDID, BPMEDS, PERIOD and the DEATH:HYPERTEN range of columns,
  # then add CVD back (it lies inside that range)
  dplyr::select(-c(X, RANDID, BPMEDS, PERIOD, DEATH:HYPERTEN), CVD) |>
  # Drop every column whose name matches "TIM"
  dplyr::select(!matches("TIM")) |>
  # Restrict to numeric columns; CVD is named explicitly so it is kept
  dplyr::select(where(is.numeric), CVD) |>
  # CVD must be a factor to act as a discrete colour grouping
  dplyr::mutate(CVD = as.factor(CVD)) |>
  # Draw the scatterplot matrix
  ggpairs(mapping = aes(color = CVD))
# Correlation heatmap restricted to the columns whose names match "TIM"
# (RANDID is explicitly excluded, as before).
df_clean |>
  dplyr::select(matches("TIM") & !RANDID) |>
  plot_correlation()