library(tidyverse, quietly = TRUE)
#library(tidymodels, quietly = TRUE)
library(psych, quietly = TRUE)
library(DataExplorer, quietly = TRUE)
library(flextable, quietly = TRUE)
# Show the working directory the session started in
getwd()
## [1] "C:/Users/tsapa/Downloads"
# NOTE(review): hard-coded, machine-specific path. Every relative read and
# write below resolves against it, so the script runs as-is only where this
# directory exists.
setwd("C:/Users/tsapa/Downloads")
# Read the dataset from the working directory into a data frame
df_frmgham2 <- read.csv("frmgham2.csv")
# Display the structure: column names, types and leading values
str(df_frmgham2)
## 'data.frame': 11627 obs. of 39 variables:
## $ RANDID : int 2448 2448 6238 6238 6238 9428 9428 10552 10552 11252 ...
## $ SEX : int 1 1 2 2 2 1 1 2 2 2 ...
## $ TOTCHOL : int 195 209 250 260 237 245 283 225 232 285 ...
## $ AGE : int 39 52 46 52 58 48 54 61 67 46 ...
## $ SYSBP : num 106 121 121 105 108 ...
## $ DIABP : num 70 66 81 69.5 66 80 89 95 109 84 ...
## $ CURSMOKE: int 0 0 0 0 0 1 1 1 1 1 ...
## $ CIGPDAY : int 0 0 0 0 0 20 30 30 20 23 ...
## $ BMI : num 27 NA 28.7 29.4 28.5 ...
## $ DIABETES: int 0 0 0 0 0 0 0 0 0 0 ...
## $ BPMEDS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HEARTRTE: int 80 69 95 80 80 75 75 65 60 85 ...
## $ GLUCOSE : int 77 92 76 86 71 70 87 103 89 85 ...
## $ educ : int 4 4 2 2 2 1 1 3 3 3 ...
## $ PREVCHD : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PREVAP : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PREVMI : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PREVSTRK: int 0 0 0 0 0 0 0 0 0 0 ...
## $ PREVHYP : int 0 0 0 0 0 0 0 1 1 0 ...
## $ TIME : int 0 4628 0 2156 4344 0 2199 0 1977 0 ...
## $ PERIOD : int 1 3 1 2 3 1 2 1 2 1 ...
## $ HDLC : int NA 31 NA NA 54 NA NA NA NA NA ...
## $ LDLC : int NA 178 NA NA 141 NA NA NA NA NA ...
## $ DEATH : int 0 0 0 0 0 0 0 1 1 0 ...
## $ ANGINA : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HOSPMI : int 1 1 0 0 0 0 0 0 0 0 ...
## $ MI_FCHD : int 1 1 0 0 0 0 0 0 0 0 ...
## $ ANYCHD : int 1 1 0 0 0 0 0 0 0 0 ...
## $ STROKE : int 0 0 0 0 0 0 0 1 1 0 ...
## $ CVD : int 1 1 0 0 0 0 0 1 1 0 ...
## $ HYPERTEN: int 0 0 0 0 0 0 0 1 1 1 ...
## $ TIMEAP : int 8766 8766 8766 8766 8766 8766 8766 2956 2956 8766 ...
## $ TIMEMI : int 6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
## $ TIMEMIFC: int 6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
## $ TIMECHD : int 6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
## $ TIMESTRK: int 8766 8766 8766 8766 8766 8766 8766 2089 2089 8766 ...
## $ TIMECVD : int 6438 6438 8766 8766 8766 8766 8766 2089 2089 8766 ...
## $ TIMEDTH : int 8766 8766 8766 8766 8766 8766 8766 2956 2956 8766 ...
## $ TIMEHYP : int 8766 8766 8766 8766 8766 8766 8766 0 0 4285 ...
# Plot the missing-value profile of every column in the raw data
plot_missing(df_frmgham2)
# Descriptive statistics for every column except RANDID, with the trimmed
# mean, MAD, skewness and kurtosis columns removed from the result
df_frmgham2 |>
  dplyr::select(!RANDID) |>
  describe() |>
  dplyr::select(!c(trimmed, mad, skew, kurtosis))
## vars n mean sd median min max range se
## SEX 1 11627 1.57 0.50 2.00 1.00 2.0 1.00 0.00
## TOTCHOL 2 11218 241.16 45.37 238.00 107.00 696.0 589.00 0.43
## AGE 3 11627 54.79 9.56 54.00 32.00 81.0 49.00 0.09
## SYSBP 4 11627 136.32 22.80 132.00 83.50 295.0 211.50 0.21
## DIABP 5 11627 83.04 11.66 82.00 30.00 150.0 120.00 0.11
## CURSMOKE 6 11627 0.43 0.50 0.00 0.00 1.0 1.00 0.00
## CIGPDAY 7 11548 8.25 12.19 0.00 0.00 90.0 90.00 0.11
## BMI 8 11575 25.88 4.10 25.48 14.43 56.8 42.37 0.04
## DIABETES 9 11627 0.05 0.21 0.00 0.00 1.0 1.00 0.00
## BPMEDS 10 11034 0.09 0.28 0.00 0.00 1.0 1.00 0.00
## HEARTRTE 11 11621 76.78 12.46 75.00 37.00 220.0 183.00 0.12
## GLUCOSE 12 10187 84.12 24.99 80.00 39.00 478.0 439.00 0.25
## educ 13 11332 1.99 1.03 2.00 1.00 4.0 3.00 0.01
## PREVCHD 14 11627 0.07 0.26 0.00 0.00 1.0 1.00 0.00
## PREVAP 15 11627 0.05 0.23 0.00 0.00 1.0 1.00 0.00
## PREVMI 16 11627 0.03 0.18 0.00 0.00 1.0 1.00 0.00
## PREVSTRK 17 11627 0.01 0.11 0.00 0.00 1.0 1.00 0.00
## PREVHYP 18 11627 0.46 0.50 0.00 0.00 1.0 1.00 0.00
## TIME 19 11627 1957.02 1758.78 2156.00 0.00 4854.0 4854.00 16.31
## PERIOD 20 11627 1.90 0.81 2.00 1.00 3.0 2.00 0.01
## HDLC 21 3027 49.36 15.63 48.00 10.00 189.0 179.00 0.28
## LDLC 22 3026 176.47 46.86 173.00 20.00 565.0 545.00 0.85
## DEATH 23 11627 0.30 0.46 0.00 0.00 1.0 1.00 0.00
## ANGINA 24 11627 0.16 0.37 0.00 0.00 1.0 1.00 0.00
## HOSPMI 25 11627 0.10 0.30 0.00 0.00 1.0 1.00 0.00
## MI_FCHD 26 11627 0.15 0.36 0.00 0.00 1.0 1.00 0.00
## ANYCHD 27 11627 0.27 0.44 0.00 0.00 1.0 1.00 0.00
## STROKE 28 11627 0.09 0.29 0.00 0.00 1.0 1.00 0.00
## CVD 29 11627 0.25 0.43 0.00 0.00 1.0 1.00 0.00
## HYPERTEN 30 11627 0.74 0.44 1.00 0.00 1.0 1.00 0.00
## TIMEAP 31 11627 7241.56 2477.78 8766.00 0.00 8766.0 8766.00 22.98
## TIMEMI 32 11627 7593.85 2136.73 8766.00 0.00 8766.0 8766.00 19.82
## TIMEMIFC 33 11627 7543.04 2192.12 8766.00 0.00 8766.0 8766.00 20.33
## TIMECHD 34 11627 7008.15 2641.34 8766.00 0.00 8766.0 8766.00 24.50
## TIMESTRK 35 11627 7660.88 2011.08 8766.00 0.00 8766.0 8766.00 18.65
## TIMECVD 36 11627 7166.08 2541.67 8766.00 0.00 8766.0 8766.00 23.57
## TIMEDTH 37 11627 7854.10 1788.37 8766.00 26.00 8766.0 8740.00 16.59
## TIMEHYP 38 11627 3598.96 3464.16 2429.00 0.00 8766.0 8766.00 32.13
# Number of distinct RANDID values in the raw data
unique_randids <- dplyr::n_distinct(df_frmgham2$RANDID)
print(unique_randids)
## [1] 4434
# Print the first rows of the raw data (head() default: six rows)
head(df_frmgham2)
## RANDID SEX TOTCHOL AGE SYSBP DIABP CURSMOKE CIGPDAY BMI DIABETES BPMEDS
## 1 2448 1 195 39 106.0 70.0 0 0 26.97 0 0
## 2 2448 1 209 52 121.0 66.0 0 0 NA 0 0
## 3 6238 2 250 46 121.0 81.0 0 0 28.73 0 0
## 4 6238 2 260 52 105.0 69.5 0 0 29.43 0 0
## 5 6238 2 237 58 108.0 66.0 0 0 28.50 0 0
## 6 9428 1 245 48 127.5 80.0 1 20 25.34 0 0
## HEARTRTE GLUCOSE educ PREVCHD PREVAP PREVMI PREVSTRK PREVHYP TIME PERIOD HDLC
## 1 80 77 4 0 0 0 0 0 0 1 NA
## 2 69 92 4 0 0 0 0 0 4628 3 31
## 3 95 76 2 0 0 0 0 0 0 1 NA
## 4 80 86 2 0 0 0 0 0 2156 2 NA
## 5 80 71 2 0 0 0 0 0 4344 3 54
## 6 75 70 1 0 0 0 0 0 0 1 NA
## LDLC DEATH ANGINA HOSPMI MI_FCHD ANYCHD STROKE CVD HYPERTEN TIMEAP TIMEMI
## 1 NA 0 0 1 1 1 0 1 0 8766 6438
## 2 178 0 0 1 1 1 0 1 0 8766 6438
## 3 NA 0 0 0 0 0 0 0 0 8766 8766
## 4 NA 0 0 0 0 0 0 0 0 8766 8766
## 5 141 0 0 0 0 0 0 0 0 8766 8766
## 6 NA 0 0 0 0 0 0 0 0 8766 8766
## TIMEMIFC TIMECHD TIMESTRK TIMECVD TIMEDTH TIMEHYP
## 1 6438 6438 8766 6438 8766 8766
## 2 6438 6438 8766 6438 8766 8766
## 3 8766 8766 8766 8766 8766 8766
## 4 8766 8766 8766 8766 8766 8766
## 5 8766 8766 8766 8766 8766 8766
## 6 8766 8766 8766 8766 8766 8766
# Print the last rows of the raw data (tail() default: six rows)
tail(df_frmgham2)
## RANDID SEX TOTCHOL AGE SYSBP DIABP CURSMOKE CIGPDAY BMI DIABETES
## 11622 9998212 1 185 40 141 98 0 0 25.60 0
## 11623 9998212 1 173 46 126 82 0 0 19.17 0
## 11624 9998212 1 153 52 143 89 0 0 25.74 0
## 11625 9999312 2 196 39 133 86 1 30 20.91 0
## 11626 9999312 2 240 46 138 79 1 20 26.39 0
## 11627 9999312 2 NA 50 147 96 1 10 24.19 0
## BPMEDS HEARTRTE GLUCOSE educ PREVCHD PREVAP PREVMI PREVSTRK PREVHYP TIME
## 11622 0 67 72 3 0 0 0 0 1 0
## 11623 0 70 NA 3 0 0 0 0 1 2333
## 11624 0 65 72 3 0 0 0 0 1 4538
## 11625 0 85 80 3 0 0 0 0 0 0
## 11626 0 90 83 3 0 0 0 0 0 2390
## 11627 0 94 NA 3 0 0 0 0 1 4201
## PERIOD HDLC LDLC DEATH ANGINA HOSPMI MI_FCHD ANYCHD STROKE CVD HYPERTEN
## 11622 1 NA NA 0 0 0 0 0 0 0 1
## 11623 2 NA NA 0 0 0 0 0 0 0 1
## 11624 3 30 123 0 0 0 0 0 0 0 1
## 11625 1 NA NA 0 0 0 0 0 0 0 1
## 11626 2 NA NA 0 0 0 0 0 0 0 1
## 11627 3 NA NA 0 0 0 0 0 0 0 1
## TIMEAP TIMEMI TIMEMIFC TIMECHD TIMESTRK TIMECVD TIMEDTH TIMEHYP
## 11622 8766 8766 8766 8766 8766 8766 8766 0
## 11623 8766 8766 8766 8766 8766 8766 8766 0
## 11624 8766 8766 8766 8766 8766 8766 8766 0
## 11625 8766 8766 8766 8766 8766 8766 8766 4201
## 11626 8766 8766 8766 8766 8766 8766 8766 4201
## 11627 8766 8766 8766 8766 8766 8766 8766 4201
# Flag the values of a numeric vector that lie inside the IQR fences.
#
# The fences are Q1 - k * (Q3 - Q1) and Q3 + k * (Q3 - Q1), where Q1 and Q3
# are the 25th and 75th percentiles of the non-missing values of `x`.
# Nothing is removed here; the result is a mask for the caller to filter on.
#
# Args:
#   x: Numeric vector.
#   k: Fence multiplier applied to the interquartile range. Defaults to 1.5,
#      the value that was previously hard-coded.
#
# Returns:
#   Logical vector of the same length as `x`: TRUE when the value is within
#   the fences (bounds inclusive), FALSE when it is outside, NA where `x` is
#   NA.
is_not_outlier <- function(x, k = 1.5) {
  # na.rm = TRUE: quantile() stops with an error on missing values otherwise.
  # names = FALSE: keeps the "25%"/"75%" labels out of the returned mask.
  quartiles <- stats::quantile(
    x,
    probs = c(0.25, 0.75),
    na.rm = TRUE,
    names = FALSE
  )
  # Deliberately not named `IQR`, which would mask stats::IQR() in this scope.
  iqr_width <- quartiles[2] - quartiles[1]
  lower_bound <- quartiles[1] - k * iqr_width
  upper_bound <- quartiles[2] + k * iqr_width
  x >= lower_bound & x <= upper_bound
}
# Build the analysis dataset from the (recoded) Framingham data frame.
#
# Steps, in order:
#   1. Drop the LDLC and HDLC columns.
#   2. Drop every row that has a missing value in any remaining column.
#   3. Keep only rows whose TOTCHOL, AGE, SYSBP, DIABP, BMI, HEARTRTE and
#      GLUCOSE values are all flagged TRUE by is_not_outlier(); the fences
#      are computed per column on the rows that survived step 2.
#
# Args:
#   df: Data frame containing at least the columns named above.
#
# Returns:
#   The filtered data frame, without LDLC/HDLC and without missing values.
clean_data <- function(df) {
  df %>%
    dplyr::select(-c(LDLC, HDLC)) %>%
    # drop_na() removes the same rows as na.omit() but does not attach an
    # `na.action` attribute; that attribute (indices of the dropped rows) was
    # being carried into every downstream data frame and saved file.
    tidyr::drop_na() %>%
    dplyr::filter(
      dplyr::if_all(
        c(TOTCHOL, AGE, SYSBP, DIABP, BMI, HEARTRTE, GLUCOSE),
        is_not_outlier
      )
    )
}
# Recode the coded indicator columns into labelled text columns, then drop
# the numeric originals. A value not matched by any case_when() branch
# (including NA) becomes NA in the new column.
final_data <- df_frmgham2 |>
  mutate(
    # Sex
    Gender = case_when(
      SEX == 1 ~ "Male",
      SEX == 2 ~ "Female"
    ),
    # Education (disabled)
    # NOTE(review): the loaded data has a column `educ`, not `EDUCATION`.
    # `Education Level` = case_when(
    #   EDUCATION == 1 ~ "0 - 11th Grade",
    #   EDUCATION == 2 ~ "High School Diploma/GED",
    #   EDUCATION == 3 ~ "Some College/Vocational School",
    #   EDUCATION == 4 ~ "College degree (BA, BS) or higher"
    # ),
    # Smoking
    Smoker = case_when(
      CURSMOKE == 0 ~ "Not current smoker",
      CURSMOKE == 1 ~ "Smoker"
    ),
    # Cigarettes per day (disabled)
    # `Cigarettes/per day` = case_when(
    #   CIGPDAY == 0 ~ "Not current smoker",
    #   CIGPDAY >= 1 ~ "Cigarettes Per Day"
    # ),
    # Diabetes
    Diabetic = case_when(
      DIABETES == 0 ~ "Non Diabetic",
      DIABETES == 1 ~ "Diabetic"
    ),
    # Previous conditions
    `PREVCHD - Coronary Disease` = case_when(
      PREVCHD == 0 ~ "Free of coronary disease",
      PREVCHD == 1 ~ "Prevalent coronary disease"
    ),
    `PREVAP - Angina Pectoris` = case_when(
      PREVAP == 0 ~ "Free of disease angina pectoris",
      PREVAP == 1 ~ "Prevalent disease angina pectoris"
    ),
    `PREVMI - Myocardial Infarction` = case_when(
      PREVMI == 0 ~ "Free of prevalent myocardial infarction",
      PREVMI == 1 ~ "Prevalent myocardial infarction"
    ),
    `PREVSTRK - Stroke History` = case_when(
      PREVSTRK == 0 ~ "No stroke history",
      PREVSTRK == 1 ~ "History of stroke"
    ),
    `PREVHYP - Hypertension` = case_when(
      PREVHYP == 0 ~ "No prevalent hypertension",
      PREVHYP == 1 ~ "Prevalent hypertension"
    )
  ) |>
  # Variant for when the disabled recodes above are switched back on:
  # dplyr::select(-c(SEX, EDUCATION, CURSMOKE, CIGPDAY, DIABETES, PREVCHD, PREVAP, PREVMI, PREVSTRK, PREVHYP))
  dplyr::select(
    !c(SEX, CURSMOKE, DIABETES, PREVCHD, PREVAP, PREVMI, PREVSTRK, PREVHYP)
  )
# Run the cleaning step on the recoded data, then inspect size and structure
df_clean <- final_data |> clean_data()
dim(df_clean)
## [1] 8154 37
str(df_clean)
## 'data.frame': 8154 obs. of 37 variables:
## $ RANDID : int 2448 6238 6238 6238 9428 9428 10552 10552 11252 11252 ...
## $ TOTCHOL : int 195 250 260 237 245 283 225 232 285 343 ...
## $ AGE : int 39 46 52 58 48 54 61 67 46 51 ...
## $ SYSBP : num 106 121 105 108 128 ...
## $ DIABP : num 70 81 69.5 66 80 89 95 109 84 77 ...
## $ CIGPDAY : int 0 0 0 0 20 30 30 20 23 30 ...
## $ BMI : num 27 28.7 29.4 28.5 25.3 ...
## $ BPMEDS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HEARTRTE : int 80 95 80 80 75 75 65 60 85 90 ...
## $ GLUCOSE : int 77 76 86 71 70 87 103 89 85 72 ...
## $ educ : int 4 2 2 2 1 1 3 3 3 3 ...
## $ TIME : int 0 0 2156 4344 0 2199 0 1977 0 2072 ...
## $ PERIOD : int 1 1 2 3 1 2 1 2 1 2 ...
## $ DEATH : int 0 0 0 0 0 0 1 1 0 0 ...
## $ ANGINA : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HOSPMI : int 1 0 0 0 0 0 0 0 0 0 ...
## $ MI_FCHD : int 1 0 0 0 0 0 0 0 0 0 ...
## $ ANYCHD : int 1 0 0 0 0 0 0 0 0 0 ...
## $ STROKE : int 0 0 0 0 0 0 1 1 0 0 ...
## $ CVD : int 1 0 0 0 0 0 1 1 0 0 ...
## $ HYPERTEN : int 0 0 0 0 0 0 1 1 1 1 ...
## $ TIMEAP : int 8766 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
## $ TIMEMI : int 6438 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
## $ TIMEMIFC : int 6438 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
## $ TIMECHD : int 6438 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
## $ TIMESTRK : int 8766 8766 8766 8766 8766 8766 2089 2089 8766 8766 ...
## $ TIMECVD : int 6438 8766 8766 8766 8766 8766 2089 2089 8766 8766 ...
## $ TIMEDTH : int 8766 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
## $ TIMEHYP : int 8766 8766 8766 8766 8766 8766 0 0 4285 4285 ...
## $ Gender : chr "Male" "Female" "Female" "Female" ...
## $ Smoker : chr "Not current smoker" "Not current smoker" "Not current smoker" "Not current smoker" ...
## $ Diabetic : chr "Non Diabetic" "Non Diabetic" "Non Diabetic" "Non Diabetic" ...
## $ PREVCHD - Coronary Disease : chr "Free of coronary disease" "Free of coronary disease" "Free of coronary disease" "Free of coronary disease" ...
## $ PREVAP - Angina Pectoris : chr "Free of disease angina pectoris" "Free of disease angina pectoris" "Free of disease angina pectoris" "Free of disease angina pectoris" ...
## $ PREVMI - Myocardial Infarction: chr "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" ...
## $ PREVSTRK - Stroke History : chr "No stroke history" "No stroke history" "No stroke history" "No stroke history" ...
## $ PREVHYP - Hypertension : chr "No prevalent hypertension" "No prevalent hypertension" "No prevalent hypertension" "No prevalent hypertension" ...
## - attr(*, "na.action")= 'omit' Named int [1:2317] 2 12 19 28 29 36 41 53 55 66 ...
## ..- attr(*, "names")= chr [1:2317] "2" "12" "19" "28" ...
# Summary statistics for the continuous measurements in the raw data, one
# output row per variable, rendered as an auto-sized flextable.
# Mean, SD, median, min, max and the quartiles ignore missing values;
# Lower_bound / Upper_bound are the 1.5 * IQR fences.
# NOTE(review): N is n(), the number of rows per variable, so it still counts
# rows whose value is missing (HDLC and LDLC report the full 11,627).
df_frmgham2 %>%
  pivot_longer(
    cols = c(TOTCHOL, AGE, SYSBP, DIABP, BMI, HEARTRTE, HDLC, LDLC, GLUCOSE)
  ) %>%
  group_by(name) %>%
  summarise(
    N = n(),
    Mean = mean(value, na.rm = TRUE),
    SD = sd(value, na.rm = TRUE),
    Median = median(value, na.rm = TRUE),
    Min = min(value, na.rm = TRUE),
    Max = max(value, na.rm = TRUE),
    Q1 = quantile(value, probs = 0.25, na.rm = TRUE),
    Q3 = quantile(value, probs = 0.75, na.rm = TRUE),
    IQR = Q3 - Q1,
    Lower_bound = Q1 - 1.5 * IQR,
    Upper_bound = Q3 + 1.5 * IQR
  ) %>%
  # Anonymous function instead of across(..., round, 2): passing extra
  # arguments through `...` is deprecated as of dplyr 1.1.0.
  mutate(across(where(is.numeric), \(x) round(x, 2))) %>%
  flextable() %>%
  autofit()
name | N | Mean | SD | Median | Min | Max | Q1 | Q3 | IQR | Lower_bound | Upper_bound |
---|---|---|---|---|---|---|---|---|---|---|---|
AGE | 11,627 | 54.79 | 9.56 | 54.00 | 32.00 | 81.0 | 48.00 | 62.00 | 14.00 | 27.00 | 83.00 |
BMI | 11,627 | 25.88 | 4.10 | 25.48 | 14.43 | 56.8 | 23.09 | 28.07 | 4.98 | 15.63 | 35.53 |
DIABP | 11,627 | 83.04 | 11.66 | 82.00 | 30.00 | 150.0 | 75.00 | 90.00 | 15.00 | 52.50 | 112.50 |
GLUCOSE | 11,627 | 84.12 | 24.99 | 80.00 | 39.00 | 478.0 | 72.00 | 89.00 | 17.00 | 46.50 | 114.50 |
HDLC | 11,627 | 49.36 | 15.63 | 48.00 | 10.00 | 189.0 | 39.00 | 58.00 | 19.00 | 10.50 | 86.50 |
HEARTRTE | 11,627 | 76.78 | 12.46 | 75.00 | 37.00 | 220.0 | 69.00 | 85.00 | 16.00 | 45.00 | 109.00 |
LDLC | 11,627 | 176.47 | 46.86 | 173.00 | 20.00 | 565.0 | 145.00 | 205.00 | 60.00 | 55.00 | 295.00 |
SYSBP | 11,627 | 136.32 | 22.80 | 132.00 | 83.50 | 295.0 | 120.00 | 149.00 | 29.00 | 76.50 | 192.50 |
TOTCHOL | 11,627 | 241.16 | 45.37 | 238.00 | 107.00 | 696.0 | 210.00 | 268.00 | 58.00 | 123.00 | 355.00 |
# Summary statistics for the continuous measurements in the cleaned data,
# one output row per variable, rendered as an auto-sized flextable.
# Lower_bound / Upper_bound are the 1.5 * IQR fences recomputed on df_clean.
df_clean %>%
  pivot_longer(
    cols = c(TOTCHOL, AGE, SYSBP, DIABP, BMI, HEARTRTE, GLUCOSE)
  ) %>%
  group_by(name) %>%
  summarise(
    N = n(),
    Mean = mean(value, na.rm = TRUE),
    SD = sd(value, na.rm = TRUE),
    Median = median(value, na.rm = TRUE),
    Min = min(value, na.rm = TRUE),
    Max = max(value, na.rm = TRUE),
    Q1 = quantile(value, probs = 0.25, na.rm = TRUE),
    Q3 = quantile(value, probs = 0.75, na.rm = TRUE),
    IQR = Q3 - Q1,
    Lower_bound = Q1 - 1.5 * IQR,
    Upper_bound = Q3 + 1.5 * IQR
  ) %>%
  # Anonymous function instead of across(..., round, 2): passing extra
  # arguments through `...` is deprecated as of dplyr 1.1.0.
  mutate(across(where(is.numeric), \(x) round(x, 2))) %>%
  flextable() %>%
  autofit()
name | N | Mean | SD | Median | Min | Max | Q1 | Q3 | IQR | Lower_bound | Upper_bound |
---|---|---|---|---|---|---|---|---|---|---|---|
AGE | 8,154 | 53.80 | 9.41 | 53.00 | 32.00 | 81.00 | 47.00 | 61.00 | 14.00 | 26.00 | 82.00 |
BMI | 8,154 | 25.46 | 3.45 | 25.26 | 15.92 | 35.33 | 23.05 | 27.72 | 4.67 | 16.05 | 34.72 |
DIABP | 8,154 | 81.86 | 10.47 | 81.00 | 53.00 | 112.50 | 74.00 | 88.50 | 14.50 | 52.25 | 110.25 |
GLUCOSE | 8,154 | 79.60 | 11.92 | 78.00 | 47.00 | 114.00 | 71.00 | 87.00 | 16.00 | 47.00 | 111.00 |
HEARTRTE | 8,154 | 75.65 | 11.21 | 75.00 | 45.00 | 109.00 | 68.00 | 82.00 | 14.00 | 47.00 | 103.00 |
SYSBP | 8,154 | 132.79 | 19.33 | 130.00 | 83.50 | 190.00 | 119.00 | 145.00 | 26.00 | 80.00 | 184.00 |
TOTCHOL | 8,154 | 239.26 | 41.40 | 238.00 | 124.00 | 356.00 | 210.00 | 266.00 | 56.00 | 126.00 | 350.00 |
# Save the cleaned data as CSV in the working directory. row.names is left at
# its default, so the row names are written as well.
write.csv(df_clean, "FHS_cleaned.csv")
# Show the structure of the data frame that was just written
str(df_clean)
## 'data.frame': 8154 obs. of 37 variables:
## $ RANDID : int 2448 6238 6238 6238 9428 9428 10552 10552 11252 11252 ...
## $ TOTCHOL : int 195 250 260 237 245 283 225 232 285 343 ...
## $ AGE : int 39 46 52 58 48 54 61 67 46 51 ...
## $ SYSBP : num 106 121 105 108 128 ...
## $ DIABP : num 70 81 69.5 66 80 89 95 109 84 77 ...
## $ CIGPDAY : int 0 0 0 0 20 30 30 20 23 30 ...
## $ BMI : num 27 28.7 29.4 28.5 25.3 ...
## $ BPMEDS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HEARTRTE : int 80 95 80 80 75 75 65 60 85 90 ...
## $ GLUCOSE : int 77 76 86 71 70 87 103 89 85 72 ...
## $ educ : int 4 2 2 2 1 1 3 3 3 3 ...
## $ TIME : int 0 0 2156 4344 0 2199 0 1977 0 2072 ...
## $ PERIOD : int 1 1 2 3 1 2 1 2 1 2 ...
## $ DEATH : int 0 0 0 0 0 0 1 1 0 0 ...
## $ ANGINA : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HOSPMI : int 1 0 0 0 0 0 0 0 0 0 ...
## $ MI_FCHD : int 1 0 0 0 0 0 0 0 0 0 ...
## $ ANYCHD : int 1 0 0 0 0 0 0 0 0 0 ...
## $ STROKE : int 0 0 0 0 0 0 1 1 0 0 ...
## $ CVD : int 1 0 0 0 0 0 1 1 0 0 ...
## $ HYPERTEN : int 0 0 0 0 0 0 1 1 1 1 ...
## $ TIMEAP : int 8766 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
## $ TIMEMI : int 6438 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
## $ TIMEMIFC : int 6438 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
## $ TIMECHD : int 6438 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
## $ TIMESTRK : int 8766 8766 8766 8766 8766 8766 2089 2089 8766 8766 ...
## $ TIMECVD : int 6438 8766 8766 8766 8766 8766 2089 2089 8766 8766 ...
## $ TIMEDTH : int 8766 8766 8766 8766 8766 8766 2956 2956 8766 8766 ...
## $ TIMEHYP : int 8766 8766 8766 8766 8766 8766 0 0 4285 4285 ...
## $ Gender : chr "Male" "Female" "Female" "Female" ...
## $ Smoker : chr "Not current smoker" "Not current smoker" "Not current smoker" "Not current smoker" ...
## $ Diabetic : chr "Non Diabetic" "Non Diabetic" "Non Diabetic" "Non Diabetic" ...
## $ PREVCHD - Coronary Disease : chr "Free of coronary disease" "Free of coronary disease" "Free of coronary disease" "Free of coronary disease" ...
## $ PREVAP - Angina Pectoris : chr "Free of disease angina pectoris" "Free of disease angina pectoris" "Free of disease angina pectoris" "Free of disease angina pectoris" ...
## $ PREVMI - Myocardial Infarction: chr "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" "Free of prevalent myocardial infarction" ...
## $ PREVSTRK - Stroke History : chr "No stroke history" "No stroke history" "No stroke history" "No stroke history" ...
## $ PREVHYP - Hypertension : chr "No prevalent hypertension" "No prevalent hypertension" "No prevalent hypertension" "No prevalent hypertension" ...
## - attr(*, "na.action")= 'omit' Named int [1:2317] 2 12 19 28 29 36 41 53 55 66 ...
## ..- attr(*, "names")= chr [1:2317] "2" "12" "19" "28" ...
# Split the modelling columns into training and testing sets
# Fixed seed so the random split below is reproducible
set.seed(123)
library(rsample)
## Warning: package 'rsample' was built under R version 4.3.3
# Drop the follow-up time columns, RANDID, the other outcome indicators and
# the labelled PREV* columns, then assign 75% of the rows to training.
# An earlier, disabled variant of this step also converted CVD to a factor
# (as `heart_attack`) and some columns to factors via mutate_at(); neither
# conversion is applied here.
# NOTE(review): RANDID repeats across rows (4,434 distinct ids in the 11,627
# raw rows) and is dropped before the row-wise split, so rows of one RANDID
# can end up in both partitions -- confirm this is intended, or consider a
# grouped split.
data_split <- df_clean |>
  dplyr::select(
    !c(
      TIME, TIMEAP, TIMEMI, TIMEMIFC, TIMECHD, TIMESTRK, TIMECVD, TIMEDTH,
      TIMEHYP, RANDID, DEATH, ANGINA, HOSPMI, MI_FCHD, ANYCHD, STROKE,
      HYPERTEN,
      `PREVCHD - Coronary Disease`,
      `PREVAP - Angina Pectoris`,
      `PREVMI - Myocardial Infarction`,
      `PREVSTRK - Stroke History`,
      `PREVHYP - Hypertension`
    )
  ) |>
  initial_split(prop = 0.75)
# Training partition: extract it, then persist as CSV and as RDS
train_data <- training(data_split)
write_csv(train_data, "train2.csv")
saveRDS(train_data, file = "train_data.rds")
# Testing partition: same treatment, followed by a structure check
test_data <- testing(data_split)
write_csv(test_data, "test2.csv")
saveRDS(test_data, file = "test_data.rds")
str(test_data)
## 'data.frame': 2039 obs. of 15 variables:
## $ TOTCHOL : int 260 237 225 232 343 220 313 260 258 294 ...
## $ AGE : int 52 58 61 67 51 55 45 52 49 46 ...
## $ SYSBP : num 105 108 150 183 109 ...
## $ DIABP : num 69.5 66 95 109 77 106 71 89 102 94 ...
## $ CIGPDAY : int 0 0 30 20 30 0 20 0 0 15 ...
## $ BMI : num 29.4 28.5 28.6 30.2 23.5 ...
## $ BPMEDS : int 0 0 0 0 0 1 0 0 1 0 ...
## $ HEARTRTE: int 80 80 65 60 90 86 79 76 75 98 ...
## $ GLUCOSE : int 86 71 103 89 72 81 78 79 74 64 ...
## $ educ : int 2 2 3 3 3 2 2 1 1 1 ...
## $ PERIOD : int 2 3 1 2 2 3 1 1 2 1 ...
## $ CVD : int 0 0 1 1 0 1 0 0 0 0 ...
## $ Gender : chr "Female" "Female" "Female" "Female" ...
## $ Smoker : chr "Not current smoker" "Not current smoker" "Smoker" "Smoker" ...
## $ Diabetic: chr "Non Diabetic" "Non Diabetic" "Non Diabetic" "Non Diabetic" ...
## - attr(*, "na.action")= 'omit' Named int [1:2317] 2 12 19 28 29 36 41 53 55 66 ...
## ..- attr(*, "names")= chr [1:2317] "2" "12" "19" "28" ...
# Cross-tabulate CVD (rows) against HOSPMI (columns) in the cleaned data;
# useNA = "no" leaves missing values out of the table
table(df_clean$CVD, df_clean$HOSPMI, useNA = "no")
##
## 0 1
## 0 6232 55
## 1 1167 700
# The same CVD x HOSPMI counts in long form, one row per combination
count(df_clean, CVD, HOSPMI)
## CVD HOSPMI n
## 1 0 0 6232
## 2 0 1 55
## 3 1 0 1167
## 4 1 1 700