library(tidyverse, quietly = TRUE)
#library(tidymodels, quietly = TRUE)
library(psych, quietly = TRUE)
library(DataExplorer, quietly = TRUE)
library(flextable, quietly = TRUE)
# Report the current working directory, then switch to the folder that holds
# the input file.
# NOTE(review): the path below is specific to one machine; every relative
# read/write later in the script resolves against it.
getwd()
## [1] "C:/Users/tsapa/Downloads"
setwd(dir = "C:/Users/tsapa/Downloads")
# Load the CSV from the working directory into a data frame
df_frmgham2 <- read.csv(file = "frmgham2.csv")
# Show the column types and the leading values of every column
str(object = df_frmgham2)
## 'data.frame': 11627 obs. of 39 variables:
## $ RANDID : int 2448 2448 6238 6238 6238 9428 9428 10552 10552 11252 ...
## $ SEX : int 1 1 2 2 2 1 1 2 2 2 ...
## $ TOTCHOL : int 195 209 250 260 237 245 283 225 232 285 ...
## $ AGE : int 39 52 46 52 58 48 54 61 67 46 ...
## $ SYSBP : num 106 121 121 105 108 ...
## $ DIABP : num 70 66 81 69.5 66 80 89 95 109 84 ...
## $ CURSMOKE: int 0 0 0 0 0 1 1 1 1 1 ...
## $ CIGPDAY : int 0 0 0 0 0 20 30 30 20 23 ...
## $ BMI : num 27 NA 28.7 29.4 28.5 ...
## $ DIABETES: int 0 0 0 0 0 0 0 0 0 0 ...
## $ BPMEDS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HEARTRTE: int 80 69 95 80 80 75 75 65 60 85 ...
## $ GLUCOSE : int 77 92 76 86 71 70 87 103 89 85 ...
## $ educ : int 4 4 2 2 2 1 1 3 3 3 ...
## $ PREVCHD : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PREVAP : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PREVMI : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PREVSTRK: int 0 0 0 0 0 0 0 0 0 0 ...
## $ PREVHYP : int 0 0 0 0 0 0 0 1 1 0 ...
## $ TIME : int 0 4628 0 2156 4344 0 2199 0 1977 0 ...
## $ PERIOD : int 1 3 1 2 3 1 2 1 2 1 ...
## $ HDLC : int NA 31 NA NA 54 NA NA NA NA NA ...
## $ LDLC : int NA 178 NA NA 141 NA NA NA NA NA ...
## $ DEATH : int 0 0 0 0 0 0 0 1 1 0 ...
## $ ANGINA : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HOSPMI : int 1 1 0 0 0 0 0 0 0 0 ...
## $ MI_FCHD : int 1 1 0 0 0 0 0 0 0 0 ...
## $ ANYCHD : int 1 1 0 0 0 0 0 0 0 0 ...
## $ STROKE : int 0 0 0 0 0 0 0 1 1 0 ...
## $ CVD : int 1 1 0 0 0 0 0 1 1 0 ...
## $ HYPERTEN: int 0 0 0 0 0 0 0 1 1 1 ...
## $ TIMEAP : int 8766 8766 8766 8766 8766 8766 8766 2956 2956 8766 ...
## $ TIMEMI : int 6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
## $ TIMEMIFC: int 6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
## $ TIMECHD : int 6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
## $ TIMESTRK: int 8766 8766 8766 8766 8766 8766 8766 2089 2089 8766 ...
## $ TIMECVD : int 6438 6438 8766 8766 8766 8766 8766 2089 2089 8766 ...
## $ TIMEDTH : int 8766 8766 8766 8766 8766 8766 8766 2956 2956 8766 ...
## $ TIMEHYP : int 8766 8766 8766 8766 8766 8766 8766 0 0 4285 ...
# Keep only the rows whose PERIOD equals 3 (base subsetting, so the original
# row names are retained)
df_frmgham2 <- df_frmgham2[df_frmgham2[["PERIOD"]] == 3, ]
# Plot the missing-value profile of the remaining rows
df_frmgham2 |>
  plot_missing()
# Descriptive statistics for every column except the identifier, with the
# trimmed, mad, skew and kurtosis columns removed from the result
df_frmgham2 |>
  dplyr::select(-RANDID) |>
  describe() |>
  dplyr::select(-c(trimmed, mad, skew, kurtosis))
## vars n mean sd median min max range se
## SEX 1 3263 1.57 0.49 2.00 1.00 2.0 1.00 0.01
## TOTCHOL 2 3049 236.71 44.45 234.00 112.00 625.0 513.00 0.80
## AGE 3 3263 60.65 8.30 60.00 44.00 81.0 37.00 0.15
## SYSBP 4 3263 140.22 22.93 137.00 86.00 267.0 181.00 0.40
## DIABP 5 3263 81.79 11.27 80.00 30.00 130.0 100.00 0.20
## CURSMOKE 6 3263 0.34 0.47 0.00 0.00 1.0 1.00 0.01
## CIGPDAY 7 3249 6.77 11.63 0.00 0.00 80.0 80.00 0.20
## BMI 8 3246 25.89 4.08 25.46 14.43 56.8 42.37 0.07
## DIABETES 9 3263 0.08 0.27 0.00 0.00 1.0 1.00 0.00
## BPMEDS 10 2817 0.15 0.36 0.00 0.00 1.0 1.00 0.01
## HEARTRTE 11 3259 77.36 12.49 76.00 37.00 150.0 113.00 0.22
## GLUCOSE 12 2701 89.78 28.16 84.00 46.00 478.0 432.00 0.54
## educ 13 3181 2.01 1.03 2.00 1.00 4.0 3.00 0.02
## PREVCHD 14 3263 0.11 0.31 0.00 0.00 1.0 1.00 0.01
## PREVAP 15 3263 0.08 0.27 0.00 0.00 1.0 1.00 0.00
## PREVMI 16 3263 0.05 0.21 0.00 0.00 1.0 1.00 0.00
## PREVSTRK 17 3263 0.02 0.14 0.00 0.00 1.0 1.00 0.00
## PREVHYP 18 3263 0.60 0.49 1.00 0.00 1.0 1.00 0.01
## TIME 19 3263 4353.67 95.10 4361.00 3748.00 4854.0 1106.00 1.66
## PERIOD 20 3263 3.00 0.00 3.00 3.00 3.0 0.00 0.00
## HDLC 21 3027 49.36 15.63 48.00 10.00 189.0 179.00 0.28
## LDLC 22 3026 176.47 46.86 173.00 20.00 565.0 545.00 0.85
## DEATH 23 3263 0.24 0.43 0.00 0.00 1.0 1.00 0.01
## ANGINA 24 3263 0.16 0.37 0.00 0.00 1.0 1.00 0.01
## HOSPMI 25 3263 0.09 0.29 0.00 0.00 1.0 1.00 0.01
## MI_FCHD 26 3263 0.14 0.35 0.00 0.00 1.0 1.00 0.01
## ANYCHD 27 3263 0.26 0.44 0.00 0.00 1.0 1.00 0.01
## STROKE 28 3263 0.09 0.28 0.00 0.00 1.0 1.00 0.00
## CVD 29 3263 0.23 0.42 0.00 0.00 1.0 1.00 0.01
## HYPERTEN 30 3263 0.75 0.43 1.00 0.00 1.0 1.00 0.01
## TIMEAP 31 3263 7678.69 2058.41 8766.00 0.00 8766.0 8766.00 36.03
## TIMEMI 32 3263 8034.80 1589.87 8766.00 0.00 8766.0 8766.00 27.83
## TIMEMIFC 33 3263 7988.03 1662.14 8766.00 0.00 8766.0 8766.00 29.10
## TIMECHD 34 3263 7445.92 2279.22 8766.00 0.00 8766.0 8766.00 39.90
## TIMESTRK 35 3263 8104.21 1392.10 8766.00 0.00 8766.0 8766.00 24.37
## TIMECVD 36 3263 7609.05 2134.32 8766.00 0.00 8766.0 8766.00 37.36
## TIMEDTH 37 3263 8286.02 1045.52 8766.00 4182.00 8766.0 4584.00 18.30
## TIMEHYP 38 3263 3834.51 3476.47 2941.00 0.00 8766.0 8766.00 60.86
# Number of distinct RANDID values among the remaining rows
unique_randids <- length(unique(df_frmgham2[["RANDID"]]))
print(unique_randids)
## [1] 3263
# Show the leading rows of the data frame
head(df_frmgham2, n = 6L)
## RANDID SEX TOTCHOL AGE SYSBP DIABP CURSMOKE CIGPDAY BMI DIABETES BPMEDS
## 2 2448 1 209 52 121 66 0 0 NA 0 0
## 5 6238 2 237 58 108 66 0 0 28.50 0 0
## 12 11252 2 NA 58 155 90 1 30 24.61 0 0
## 15 11263 2 220 55 180 106 0 0 31.17 1 1
## 20 12806 2 320 57 110 46 1 30 22.02 0 0
## 23 14367 1 280 64 168 100 0 0 25.72 0 0
## HEARTRTE GLUCOSE educ PREVCHD PREVAP PREVMI PREVSTRK PREVHYP TIME PERIOD
## 2 69 92 4 0 0 0 0 0 4628 3
## 5 80 71 2 0 0 0 0 0 4344 3
## 12 74 NA 3 0 0 0 0 1 4285 3
## 15 86 81 2 0 0 0 0 1 4351 3
## 20 75 87 2 0 0 0 0 0 4289 3
## 23 92 82 1 0 0 0 0 1 4438 3
## HDLC LDLC DEATH ANGINA HOSPMI MI_FCHD ANYCHD STROKE CVD HYPERTEN TIMEAP
## 2 31 178 0 0 1 1 1 0 1 0 8766
## 5 54 141 0 0 0 0 0 0 0 0 8766
## 12 NA NA 0 0 0 0 0 0 0 1 8766
## 15 46 135 0 0 0 1 1 0 1 1 8766
## 20 34 286 0 0 0 0 0 0 0 1 8766
## 23 44 236 0 0 0 0 0 0 0 1 8766
## TIMEMI TIMEMIFC TIMECHD TIMESTRK TIMECVD TIMEDTH TIMEHYP
## 2 6438 6438 6438 8766 6438 8766 8766
## 5 8766 8766 8766 8766 8766 8766 8766
## 12 8766 8766 8766 8766 8766 8766 4285
## 15 8766 5719 5719 8766 5719 8766 0
## 20 8766 8766 8766 8766 8766 8766 8679
## 23 8766 8766 8766 8766 8766 8766 0
# Show the trailing rows of the data frame
tail(df_frmgham2, n = 6L)
## RANDID SEX TOTCHOL AGE SYSBP DIABP CURSMOKE CIGPDAY BMI DIABETES
## 11607 9982118 1 219 70 163.5 75 0 0 25.26 0
## 11613 9989287 1 NA 63 120.0 80 1 50 17.38 0
## 11616 9990894 2 228 60 205.0 100 1 20 22.37 0
## 11619 9993179 2 251 56 145.0 92 1 35 21.97 0
## 11624 9998212 1 153 52 143.0 89 0 0 25.74 0
## 11627 9999312 2 NA 50 147.0 96 1 10 24.19 0
## BPMEDS HEARTRTE GLUCOSE educ PREVCHD PREVAP PREVMI PREVSTRK PREVHYP TIME
## 11607 0 65 103 3 0 0 0 0 1 4429
## 11613 0 76 NA 3 0 0 0 0 0 4439
## 11616 0 80 91 2 0 0 0 0 1 4417
## 11619 1 95 90 1 0 0 0 0 1 4396
## 11624 0 65 72 3 0 0 0 0 1 4538
## 11627 0 94 NA 3 0 0 0 0 1 4201
## PERIOD HDLC LDLC DEATH ANGINA HOSPMI MI_FCHD ANYCHD STROKE CVD HYPERTEN
## 11607 3 50 169 1 0 0 0 0 1 1 1
## 11613 3 NA NA 1 0 0 0 0 0 0 0
## 11616 3 47 181 1 0 0 0 0 0 0 1
## 11619 3 70 181 1 0 0 0 0 0 0 1
## 11624 3 30 123 0 0 0 0 0 0 0 1
## 11627 3 NA NA 0 0 0 0 0 0 0 1
## TIMEAP TIMEMI TIMEMIFC TIMECHD TIMESTRK TIMECVD TIMEDTH TIMEHYP
## 11607 8457 8457 8457 8457 8346 8346 8457 0
## 11613 7746 7746 7746 7746 7746 7746 7746 7746
## 11616 6433 6433 6433 6433 6433 6433 6433 2219
## 11619 6729 6729 6729 6729 6729 6729 6729 4396
## 11624 8766 8766 8766 8766 8766 8766 8766 0
## 11627 8766 8766 8766 8766 8766 8766 8766 4201
# Flag the elements of a numeric vector that lie inside the 1.5 * IQR fences.
#
# Args:
#   x: numeric vector; may contain NA.
#
# Returns:
#   Logical vector of the same length as `x`: TRUE when the element lies in
#   [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] (both ends inclusive), FALSE when it lies
#   outside, and NA when the element itself is NA.
#
# The quartiles are computed with na.rm = TRUE so that missing values no
# longer make quantile() stop with an error; NA elements simply stay NA in the
# result, which dplyr::filter() treats as "drop the row".
is_not_outlier <- function(x) {
  # Both quartiles in one call; names = FALSE keeps the result free of the
  # "25%"/"75%" labels quantile() would otherwise attach.
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  # Named iqr_width rather than IQR to avoid shadowing stats::IQR()
  iqr_width <- quartiles[2] - quartiles[1]
  lower_bound <- quartiles[1] - 1.5 * iqr_width
  upper_bound <- quartiles[2] + 1.5 * iqr_width
  x >= lower_bound & x <= upper_bound
}
# Build the analysis data set from a data frame `df`:
#   1. drop the LDLC and HDLC columns,
#   2. drop every row that still has a missing value in any column,
#   3. keep only the rows for which is_not_outlier() is TRUE in all of
#      TOTCHOL, AGE, SYSBP, DIABP, BMI, HEARTRTE and GLUCOSE.
# The outlier test in step 3 is evaluated on the rows that survive step 2.
clean_data <- function(df) {
  without_lipids <- dplyr::select(df, -c(LDLC, HDLC))
  complete_rows <- na.omit(without_lipids)
  filter(
    complete_rows,
    if_all(
      c(TOTCHOL, AGE, SYSBP, DIABP, BMI, HEARTRTE, GLUCOSE),
      is_not_outlier
    )
  )
}
# Add a text-label column for each coded indicator, then drop the coded
# originals. Codes other than the ones listed become NA because case_when()
# is given no fallback branch. educ and CIGPDAY are not relabelled and stay in
# their numeric coding.
final_data <- df_frmgham2 |>
  mutate(
    # Sex
    "Gender" = case_when(
      SEX == 1 ~ "Male",
      SEX == 2 ~ "Female"
    ),
    # Smoking
    "Smoker" = case_when(
      CURSMOKE == 0 ~ "Not current smoker",
      CURSMOKE == 1 ~ "Smoker"
    ),
    # Diabetes
    "Diabetic" = case_when(
      DIABETES == 0 ~ "Non Diabetic",
      DIABETES == 1 ~ "Diabetic"
    ),
    # Previous conditions
    "PREVCHD - Coronary Disease" = case_when(
      PREVCHD == 0 ~ "Free of coronary disease",
      PREVCHD == 1 ~ "Prevalent coronary disease"
    ),
    "PREVAP - Angina Pectoris" = case_when(
      PREVAP == 0 ~ "Free of disease angina pectoris",
      PREVAP == 1 ~ "Prevalent disease angina pectoris"
    ),
    "PREVMI - Myocardial Infarction" = case_when(
      PREVMI == 0 ~ "Free of prevalent myocardial infarction",
      PREVMI == 1 ~ "Prevalent myocardial infarction"
    ),
    "PREVSTRK - Stroke History" = case_when(
      PREVSTRK == 0 ~ "No stroke history",
      PREVSTRK == 1 ~ "History of stroke"
    ),
    "PREVHYP - Hypertension" = case_when(
      PREVHYP == 0 ~ "No prevalent hypertension",
      PREVHYP == 1 ~ "Prevalent hypertension"
    )
  ) |>
  # The coded source columns are no longer needed once the labels exist
  dplyr::select(
    -c(SEX, CURSMOKE, DIABETES, PREVCHD, PREVAP, PREVMI, PREVSTRK, PREVHYP)
  )
# Run the cleaning steps on the labelled data
df_clean <- final_data |>
  clean_data()
# Rows and columns remaining after cleaning
dim(df_clean)
## [1] 1991 37
# Column types and leading values of the cleaned data
str(object = df_clean)
## 'data.frame': 1991 obs. of 37 variables:
## $ RANDID : int 6238 11263 14367 24721 33077 34689 36459 40435 45464 47561 ...
## $ TOTCHOL : int 237 220 280 264 215 212 162 226 236 290 ...
## $ AGE : int 58 55 64 51 60 49 53 54 64 56 ...
## $ SYSBP : num 108 180 168 141 144 ...
## $ DIABP : num 66 106 100 81 80 96 101 75 89 70 ...
## $ CIGPDAY : int 0 0 0 15 10 10 0 40 20 40 ...
## $ BMI : num 28.5 31.2 25.7 24.8 23 ...
## $ BPMEDS : int 0 1 0 0 0 0 0 0 0 0 ...
## $ HEARTRTE : int 80 86 92 85 57 82 105 85 90 100 ...
## $ GLUCOSE : int 71 81 82 97 91 84 78 102 80 90 ...
## $ educ : int 2 2 1 2 3 2 2 2 3 2 ...
## $ TIME : int 4344 4351 4438 4408 4383 4289 4411 4372 4368 4071 ...
## $ PERIOD : int 3 3 3 3 3 3 3 3 3 3 ...
## $ DEATH : int 0 0 0 1 0 0 0 0 0 0 ...
## $ ANGINA : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HOSPMI : int 0 0 0 0 0 0 0 0 0 0 ...
## $ MI_FCHD : int 0 1 0 0 0 0 0 0 0 0 ...
## $ ANYCHD : int 0 1 0 0 0 0 0 0 0 0 ...
## $ STROKE : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CVD : int 0 1 0 0 0 0 0 0 0 0 ...
## $ HYPERTEN : int 0 1 1 1 1 1 1 1 1 1 ...
## $ TIMEAP : int 8766 8766 8766 6411 8766 8766 8766 8766 8766 8766 ...
## $ TIMEMI : int 8766 8766 8766 6411 8766 8766 8766 8766 8766 8766 ...
## $ TIMEMIFC : int 8766 5719 8766 6411 8766 8766 8766 8766 8766 8766 ...
## $ TIMECHD : int 8766 5719 8766 6411 8766 8766 8766 8766 8766 8766 ...
## $ TIMESTRK : int 8766 8766 8766 6411 8766 8766 8766 8766 8766 8766 ...
## $ TIMECVD : int 8766 5719 8766 6411 8766 8766 8766 8766 8766 8766 ...
## $ TIMEDTH : int 8766 8766 8766 6411 8766 8766 8766 8766 8766 8766 ...
## $ TIMEHYP : int 8766 0 0 4408 0 2157 1469 5933 2177 0 ...
## $ Gender : chr "Female" "Female" "Male" "Female" ...
## $ Smoker : chr "Not current smoker" "Not current smoker" "Not current smoker" "Smoker" ...
## $ Diabetic : chr "Non Diabetic" "Diabetic" "Non Diabetic" "Non Diabetic" ...
## $ PREVCHD - Coronary Disease : chr "Free of coronary disease" "Free of coronary disease" "Free of coronary disease" "Free of coronary disease" ...
## $ PREVAP - Angina Pectoris : chr "Free of disease angina pectoris" "Free of disease angina pectoris" "Free of disease angina pectoris" "Free of disease angina pectoris" ...
## $ PREVMI - Myocardial Infarction: chr "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" ...
## $ PREVSTRK - Stroke History : chr "No stroke history" "No stroke history" "No stroke history" "No stroke history" ...
## $ PREVHYP - Hypertension : chr "No prevalent hypertension" "Prevalent hypertension" "Prevalent hypertension" "Prevalent hypertension" ...
## - attr(*, "na.action")= 'omit' Named int [1:1009] 1 3 8 15 25 26 28 29 33 35 ...
## ..- attr(*, "names")= chr [1:1009] "2" "12" "29" "55" ...
# Per-variable descriptive statistics and 1.5 * IQR fences for the data before
# cleaning, rendered as a flextable.
# N counts the non-missing values of each variable, i.e. the values that Mean,
# SD and the quantiles (all computed with na.rm = TRUE) are actually based on.
df_frmgham2 %>%
  pivot_longer(
    cols = c(TOTCHOL, AGE, SYSBP, DIABP, BMI, HEARTRTE, HDLC, LDLC, GLUCOSE)
  ) %>%
  group_by(name) %>%
  summarise(
    N = sum(!is.na(value)),
    Mean = mean(value, na.rm = TRUE),
    SD = sd(value, na.rm = TRUE),
    Median = median(value, na.rm = TRUE),
    Min = min(value, na.rm = TRUE),
    Max = max(value, na.rm = TRUE),
    Q1 = quantile(value, probs = 0.25, na.rm = TRUE),
    Q3 = quantile(value, probs = 0.75, na.rm = TRUE),
    IQR = Q3 - Q1,
    Lower_bound = Q1 - 1.5 * IQR,
    Upper_bound = Q3 + 1.5 * IQR
  ) %>%
  # Anonymous function instead of across(..., round, 2): passing extra
  # arguments through across()'s `...` is deprecated as of dplyr 1.1.0.
  mutate(across(where(is.numeric), \(x) round(x, 2))) %>%
  flextable() %>%
  autofit()
name | N | Mean | SD | Median | Min | Max | Q1 | Q3 | IQR | Lower_bound | Upper_bound |
---|---|---|---|---|---|---|---|---|---|---|---|
AGE | 3,263 | 60.65 | 8.30 | 60.00 | 44.00 | 81.0 | 54.00 | 67.00 | 13.00 | 34.50 | 86.50 |
BMI | 3,246 | 25.89 | 4.08 | 25.46 | 14.43 | 56.8 | 23.19 | 28.06 | 4.87 | 15.89 | 35.36 |
DIABP | 3,263 | 81.79 | 11.27 | 80.00 | 30.00 | 130.0 | 74.00 | 89.00 | 15.00 | 51.50 | 111.50 |
GLUCOSE | 2,701 | 89.78 | 28.16 | 84.00 | 46.00 | 478.0 | 76.00 | 96.00 | 20.00 | 46.00 | 126.00 |
HDLC | 3,027 | 49.36 | 15.63 | 48.00 | 10.00 | 189.0 | 39.00 | 58.00 | 19.00 | 10.50 | 86.50 |
HEARTRTE | 3,259 | 77.36 | 12.49 | 76.00 | 37.00 | 150.0 | 70.00 | 85.00 | 15.00 | 47.50 | 107.50 |
LDLC | 3,026 | 176.47 | 46.86 | 173.00 | 20.00 | 565.0 | 145.00 | 205.00 | 60.00 | 55.00 | 295.00 |
SYSBP | 3,263 | 140.22 | 22.93 | 137.00 | 86.00 | 267.0 | 123.00 | 154.00 | 31.00 | 76.50 | 200.50 |
TOTCHOL | 3,049 | 236.71 | 44.45 | 234.00 | 112.00 | 625.0 | 206.00 | 264.00 | 58.00 | 119.00 | 351.00 |
# Per-variable descriptive statistics and 1.5 * IQR fences for the cleaned
# data, rendered as a flextable.
# N counts the non-missing values of each variable, i.e. the values that Mean,
# SD and the quantiles (all computed with na.rm = TRUE) are based on.
df_clean %>%
  pivot_longer(
    cols = c(TOTCHOL, AGE, SYSBP, DIABP, BMI, HEARTRTE, GLUCOSE)
  ) %>%
  group_by(name) %>%
  summarise(
    N = sum(!is.na(value)),
    Mean = mean(value, na.rm = TRUE),
    SD = sd(value, na.rm = TRUE),
    Median = median(value, na.rm = TRUE),
    Min = min(value, na.rm = TRUE),
    Max = max(value, na.rm = TRUE),
    Q1 = quantile(value, probs = 0.25, na.rm = TRUE),
    Q3 = quantile(value, probs = 0.75, na.rm = TRUE),
    IQR = Q3 - Q1,
    Lower_bound = Q1 - 1.5 * IQR,
    Upper_bound = Q3 + 1.5 * IQR
  ) %>%
  # Anonymous function instead of across(..., round, 2): passing extra
  # arguments through across()'s `...` is deprecated as of dplyr 1.1.0.
  mutate(across(where(is.numeric), \(x) round(x, 2))) %>%
  flextable() %>%
  autofit()
name | N | Mean | SD | Median | Min | Max | Q1 | Q3 | IQR | Lower_bound | Upper_bound |
---|---|---|---|---|---|---|---|---|---|---|---|
AGE | 1,991 | 59.95 | 8.23 | 59.00 | 44.00 | 81.00 | 53.00 | 67.00 | 14.00 | 32.00 | 88.00 |
BMI | 1,991 | 25.47 | 3.38 | 25.23 | 16.58 | 34.84 | 23.10 | 27.62 | 4.52 | 16.33 | 34.39 |
DIABP | 1,991 | 80.41 | 10.50 | 80.00 | 52.00 | 110.00 | 72.75 | 87.00 | 14.25 | 51.38 | 108.38 |
GLUCOSE | 1,991 | 84.19 | 13.81 | 83.00 | 46.00 | 125.00 | 75.00 | 92.00 | 17.00 | 49.50 | 117.50 |
HEARTRTE | 1,991 | 76.54 | 11.18 | 76.00 | 48.00 | 106.00 | 70.00 | 85.00 | 15.00 | 47.50 | 107.50 |
SYSBP | 1,991 | 136.88 | 20.32 | 135.00 | 86.00 | 199.00 | 121.00 | 150.00 | 29.00 | 77.50 | 193.50 |
TOTCHOL | 1,991 | 235.66 | 40.68 | 235.00 | 134.00 | 353.00 | 206.00 | 263.00 | 57.00 | 120.50 | 348.50 |
# Save the cleaned data to the working directory; write.csv() is left at its
# defaults, so row names are written as the first column
write.csv(df_clean, file = "FHS_cleaned.csv")
# Column types and leading values of the data frame that was just saved
str(object = df_clean)
## 'data.frame': 1991 obs. of 37 variables:
## $ RANDID : int 6238 11263 14367 24721 33077 34689 36459 40435 45464 47561 ...
## $ TOTCHOL : int 237 220 280 264 215 212 162 226 236 290 ...
## $ AGE : int 58 55 64 51 60 49 53 54 64 56 ...
## $ SYSBP : num 108 180 168 141 144 ...
## $ DIABP : num 66 106 100 81 80 96 101 75 89 70 ...
## $ CIGPDAY : int 0 0 0 15 10 10 0 40 20 40 ...
## $ BMI : num 28.5 31.2 25.7 24.8 23 ...
## $ BPMEDS : int 0 1 0 0 0 0 0 0 0 0 ...
## $ HEARTRTE : int 80 86 92 85 57 82 105 85 90 100 ...
## $ GLUCOSE : int 71 81 82 97 91 84 78 102 80 90 ...
## $ educ : int 2 2 1 2 3 2 2 2 3 2 ...
## $ TIME : int 4344 4351 4438 4408 4383 4289 4411 4372 4368 4071 ...
## $ PERIOD : int 3 3 3 3 3 3 3 3 3 3 ...
## $ DEATH : int 0 0 0 1 0 0 0 0 0 0 ...
## $ ANGINA : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HOSPMI : int 0 0 0 0 0 0 0 0 0 0 ...
## $ MI_FCHD : int 0 1 0 0 0 0 0 0 0 0 ...
## $ ANYCHD : int 0 1 0 0 0 0 0 0 0 0 ...
## $ STROKE : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CVD : int 0 1 0 0 0 0 0 0 0 0 ...
## $ HYPERTEN : int 0 1 1 1 1 1 1 1 1 1 ...
## $ TIMEAP : int 8766 8766 8766 6411 8766 8766 8766 8766 8766 8766 ...
## $ TIMEMI : int 8766 8766 8766 6411 8766 8766 8766 8766 8766 8766 ...
## $ TIMEMIFC : int 8766 5719 8766 6411 8766 8766 8766 8766 8766 8766 ...
## $ TIMECHD : int 8766 5719 8766 6411 8766 8766 8766 8766 8766 8766 ...
## $ TIMESTRK : int 8766 8766 8766 6411 8766 8766 8766 8766 8766 8766 ...
## $ TIMECVD : int 8766 5719 8766 6411 8766 8766 8766 8766 8766 8766 ...
## $ TIMEDTH : int 8766 8766 8766 6411 8766 8766 8766 8766 8766 8766 ...
## $ TIMEHYP : int 8766 0 0 4408 0 2157 1469 5933 2177 0 ...
## $ Gender : chr "Female" "Female" "Male" "Female" ...
## $ Smoker : chr "Not current smoker" "Not current smoker" "Not current smoker" "Smoker" ...
## $ Diabetic : chr "Non Diabetic" "Diabetic" "Non Diabetic" "Non Diabetic" ...
## $ PREVCHD - Coronary Disease : chr "Free of coronary disease" "Free of coronary disease" "Free of coronary disease" "Free of coronary disease" ...
## $ PREVAP - Angina Pectoris : chr "Free of disease angina pectoris" "Free of disease angina pectoris" "Free of disease angina pectoris" "Free of disease angina pectoris" ...
## $ PREVMI - Myocardial Infarction: chr "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" ...
## $ PREVSTRK - Stroke History : chr "No stroke history" "No stroke history" "No stroke history" "No stroke history" ...
## $ PREVHYP - Hypertension : chr "No prevalent hypertension" "Prevalent hypertension" "Prevalent hypertension" "Prevalent hypertension" ...
## - attr(*, "na.action")= 'omit' Named int [1:1009] 1 3 8 15 25 26 28 29 33 35 ...
## ..- attr(*, "names")= chr [1:1009] "2" "12" "29" "55" ...
# Fix the random seed before the train/test partition is drawn
set.seed(123)
library(rsample)
## Warning: package 'rsample' was built under R version 4.3.3
# Drop the identifier, the TIME* columns, the other outcome indicators and the
# PREV* label columns, then split the remaining columns 75% / 25%.
# CVD is kept as an integer column; no factor conversion is applied here.
data_split <- initial_split(
  data = dplyr::select(
    df_clean,
    -c(
      TIME, TIMEAP, TIMEMI, TIMEMIFC, TIMECHD, TIMESTRK, TIMECVD, TIMEDTH,
      TIMEHYP, RANDID, DEATH, ANGINA, HOSPMI, MI_FCHD, ANYCHD, STROKE,
      HYPERTEN,
      `PREVCHD - Coronary Disease`, `PREVAP - Angina Pectoris`,
      `PREVMI - Myocardial Infarction`, `PREVSTRK - Stroke History`,
      `PREVHYP - Hypertension`
    )
  ),
  prop = 0.75
)
# Extract the training partition and store it as CSV and as RDS
train_data <- training(data_split)
write_csv(train_data, "train2.csv")
saveRDS(train_data, file = "train_data.rds")
# Extract the testing partition and store it the same way
test_data <- testing(data_split)
write_csv(test_data, "test2.csv")
saveRDS(test_data, file = "test_data.rds")
# Column types and leading values of the testing partition
str(object = test_data)
## 'data.frame': 498 obs. of 15 variables:
## $ TOTCHOL : int 280 162 206 230 251 216 231 255 216 223 ...
## $ AGE : int 64 53 55 49 59 56 65 51 68 60 ...
## $ SYSBP : num 168 152 129 142 132 ...
## $ DIABP : num 100 101 85 90.5 77 70 70 98 84 100 ...
## $ CIGPDAY : int 0 0 50 35 20 20 0 17 20 0 ...
## $ BMI : num 25.7 26.4 26.4 24.3 19.3 ...
## $ BPMEDS : int 0 0 0 0 0 0 1 0 0 1 ...
## $ HEARTRTE: int 92 105 84 70 82 72 70 84 77 77 ...
## $ GLUCOSE : int 82 78 69 61 76 49 86 90 70 86 ...
## $ educ : int 1 2 4 3 2 2 2 2 1 1 ...
## $ PERIOD : int 3 3 3 3 3 3 3 3 3 3 ...
## $ CVD : int 0 0 0 1 0 1 0 0 0 0 ...
## $ Gender : chr "Male" "Male" "Male" "Male" ...
## $ Smoker : chr "Not current smoker" "Not current smoker" "Smoker" "Smoker" ...
## $ Diabetic: chr "Non Diabetic" "Non Diabetic" "Non Diabetic" "Non Diabetic" ...
## - attr(*, "na.action")= 'omit' Named int [1:1009] 1 3 8 15 25 26 28 29 33 35 ...
## ..- attr(*, "names")= chr [1:1009] "2" "12" "29" "55" ...
# Cross-tabulate CVD (rows) against HOSPMI (columns), leaving NA out
table(df_clean[["CVD"]], df_clean[["HOSPMI"]], useNA = "no")
##
## 0 1
## 0 1547 15
## 1 260 169
# The same counts in long form, one row per CVD/HOSPMI combination
count(df_clean, CVD, HOSPMI)
## CVD HOSPMI n
## 1 0 0 1547
## 2 0 1 15
## 3 1 0 260
## 4 1 1 169